Practicals - motif occurrences - multitesting corrections

In the previous tutorial on word probabilities, we used the binomial distribution to test the significance of a given oligonucleotide in a given sequence.

We extend now this problem by performing a systematic analysis of all oligonucleotides of a given size observed in a sequence set. This will lead us to perform several thousands of tests, with an important consequence on the risk of false positives. We will apply various multi-testing corrections, and compare their effects.

Exercise

We counted the occurrences of all possible hexanucleotides in the promoters of 30 genes involved in methionine metabolism in the yeast Saccharomyces cerevisiae.

Loading the R configuration file

Before running the scripts, you should load the configuration and utilities. For this you can simply run the R command
source('http://pedagogix-tagc.univ-mrs.fr/courses/statistics_bioinformatics/R-files/config.R')
    

Loading the data files

Hexanucleotide occurrences in promoters of the MET genes

## Load word occurrences in promoters of the MET genes
dir.oligos <- file.path(dir.data, 'exceptional_motifs', 'Saccharomyces_cerevisiae', 'MET')
file.oligos <- 'MET_up-noorf_6nt-ovlp-1str.tab'
oligos <- read.delim(file.path(dir.oligos, file.oligos), comment.char=';', header=T, row.names=1)
row.names(oligos) <- oligos$id
oligos <- oligos[,-1] ## Suppress the first column, redundant with the row name

## Check the content of the file
dim(oligos) ## Check the dimensions of the data frame
oligos[1:10,] ## Check the first 10 rows
    

Computing the binomial P-value

For computing the binomial P-value, we need the following parameters:
## The number of trials is the sum of all word occurrences
N <- sum(oligos$occ)

## Compute the binomial P-value
Pval.binom <- pbinom(q=oligos$occ - 1,size=N, prob=oligos$exp_freq, lower.tail=F)

## Add this Pval as a column to the data frame
oligos$Pval.binom <- Pval.binom
print(oligos[1:10,]) ## Check the first 10 rows
    

Compare expected and observed occurrences

## Store the prior probabilities in a separate variable for convenience
p <- oligos$exp_freq

## Add a column with expected frequencies to the data frame
oligos$x.exp <- N*p
oligos[1:10,] ## Check the first 10 rows

## Draw a plot with expected versus observed occurrences
plot(oligos$x.exp, 
     oligos$occ, 
     col='blue',
     xlab='Expected occurrences', ylab='Observed occurrences',
     panel.first=c(grid(col='#BBBBBB',lty='solid'),
		abline(a=0, b=1,col='darkgreen',lty='solid'),
		abline(h=0,col='black',lty='solid'),
		abline(v=0,col='black',lty='solid')
		)
     )
    

Compute observed/expected ratio

## Add a column to the oligo table with the observed/expected ratio
oligos$ratio <- oligos$occ / oligos$x.exp
oligos[1:10,] ## Check the first 10 rows

## Sort rows of the input file according to the ratio
ratio.order <- order(oligos$ratio, decreasing=TRUE)

## Show the indexes of the 10 oligos with the highest obs/
ratio.order[1:10]

## Show the names of these oligos
row.names(oligos)[ratio.order[1:10]]

## Show the 10 oligos with the highest obs/exp ratio
oligos[ratio.order[1:10],]
    

Log-likelihood

## Compute the log-likelihood
oligos$log.like <- oligos$x.exp *log(oligos$occ/oligos$x.exp)
oligos[1:10,] ## Check the first 10 rows


## Compare obs/exp ratio and log-likellihood
plot(oligos$ratio, 
     oligos$log.like, col='darkblue', xlab='obs/exp ratio', ylab='log-likelihood',
     panel.first=c(grid(col='#BBBBBB', lty='solid'),
		   abline(h=0, col='brown'), abline(v=1, col='brown')))

## Compare  log-likellihood and binomial P-value
plot(oligos$log.like, 
     oligos$Pval.binom, log='y',
     col='darkblue', ylab='Binomial P-value', xlab='log-likelihood',
     panel.first=c(grid(col='#BBBBBB', lty='solid'),
		   abline(h=1, col='brown'), abline(v=0, col='brown')))

Multi-testing corrections

E-value

## Number of tests
T <- nrow(oligos)

## Compute the E-value
oligos$Eval.binom <- oligos$Pval.binom*T
oligos[1:10,] ## Check the first 10 rows

## Sort the oligo table as a function of the E-value
oligos <- oligos[order(oligos$Eval.binom, decreasing=F),]
oligos[1:10,] ## Check the first 10 rows

FWER

## Compute the FWER
oligos$FWER <- 1 - (1 - oligos$Pval.binom)^T
oligos[1:10,] ## Check the first 10 rows

## Compare E-value and FWER
plot(oligos$Eval.binom, oligos$FWER,log='xy', xlab='E-value', ylab='FWER',
     panel.first=c(grid(col='#BBBBBB', lty='solid'),
		   abline(h=1, col='brown'), abline(v=c(1,T), col='brown'),
	       abline(a=0,b=1,col='#00DD00')))

Q-value (estimator of FDR)

Script with more sophisticated treatment

A script with a more complete treatment of this data set is provided in the scripts repository.