################################################################
##
## R file for classifying genes on the basis of their upstream
## patterns
##
################################################################

## source ("~/research/seq_analysis/yeast/upstream_classifications/R-files/nit_met_pho.R")

################################################################
## Load libraries
library(mda)
library(mva)
library(class)
library(e1071)

################################################################
## load some utilities
## dir.R.files <- "~/research/R-files/"
source(file.path(dir.R.files, "config.R"))
source(file.path(dir.util, 'util_chip_analysis.R'))
source(file.path(dir.util, 'util.R'))
source(file.path(dir.util, 'util_distances.R'))
source(file.path(dir.util, 'util_sda.R'))
source(file.path(dir.util, 'util_mva.R'))
source(file.path(dir.util, 'discrim_poisson.R'))

################################################################
## Parameters

## input.genes <- 64 ## families with 64 genes, those used in the Bioinformatics paper. Beware : these families contain  few genes for which there is no direct evidence of pho or met regulation. 

## input parameters
#input.prefix <- "NIT_MET_PHO"
input.prefix <- "NIT_MET_PHO_together"
#input.genes <- 55 ## families with 55 genes, for which there are experimental evidence of the TF binding site
input.genes <- 64 ## families with 64 genes, used in the published manuscripts. For some of these genes, the evidence is indirect

rand.sequences <- T ### use randomly generated sequences as control group
rand.selection <- F ### use random gene selections as control group

## ##############################################################
## Permutation test
## Beware: this is not necessary for clustering, since it is
## unsupervised, the permutation is done when the confusion table is
## calculated. It is only useful for supervised classification

permut.test <- F ### permute object labels (permutation test)

## Bootstrapping options
bootstrap.columns <- F ### bootstrap on the columns to assess the robustness
bootstrap.rows <- F ### bootstrap on the columns to assess the robustness

dir.main <- paste(dir.home,'research/seq_analysis/yeast/upstream_classifications',sep='/')
dir.data <- paste(dir.main,'data',sep='/')
dir.results <- paste(dir.main,'results',sep='/')
dir.figures <- paste(dir.results,'figures',sep='/')

setwd(dir.main)



## ##############################################################
## default parameters
verbosity <- 2
export <- T
export.formats <- c("pdf")
metric.name <- "poisson.similarity"
clustering.method <- "ward"
clustering.methods <- c("ward", "complete", "average",  "single")
##clustering.methods <- c("ward", "complete", "average",  "single", "mcquitty")
## clustering.methods <- c("complete", "ward")
rand.number <- 30


################################################################
## Metric names have to be specified within the loops because
## Mahalanobis is sometimes included, sometimes not
metric.names <-  c(
                   "correlation.coefficient.d",
                   "euclidian.dist",
                   "manhattan.dist",
                   "park.similarity.d",
                   "poisson.dissimilarity.distinct",
#                   "poisson.dissimilarity.distinct.product",
                   "poisson.dissimilarity.over",
#                   "poisson.dissimilarity.over.product",
                   "poisson.mixed.distinct.d",
                   "poisson.mixed.distinct.product.d",
                   "poisson.mixed.over.d",
                   "poisson.mixed.over.product",
                   "poisson.mixed.over.product.d",
                   "poisson.similarity.d",
                   "poisson.similarity.product.d",

                   ### similrity metrics
#                   "park.similarity",
#                   "poisson.mixed.distinct",
#                   "poisson.mixed.distinct.product",
#                   "poisson.mixed.over",
#                   "poisson.mixed.over.product",
#                   "poisson.similarity",
#                   "poisson.similarity.product",
                   )
################################################################
### Iterate over the different parameters
for (input.genes in c(64, 55)) {
for (bootstrap.columns in c(F,T)) {
for (bootstrap.rows in c(F)) {
for (input.prefix in c("NIT_MET_PHO_together","NIT_MET_PHO")) {
for (rand.sequences in c(T,F)) {
for (rand.selection in c(T,F)) {

################################################################
## load data
source(paste(dir.main, 'R-files/load_nit_met_pho.R',sep='/'))

## ##############################################################
## Plot points on the 2 principal component axes, and color group
## labels
if (!(bootstrap.rows || bootstrap.columns)) {
    dir.results <- paste(dir.main,'results/PCA',sep='/')
    dir.figures <- paste(dir.results,'figures',sep='/')
 
    #plot.prcomp.groups(x,group.labels, plot.NA=F,group.palette=group.colors,group.symbols=group.labels)
    pc <- plot.prcomp.groups(x,group.labels, plot.NA=F,group.palette=group.colors,label.ref=F,xleg=10,return.pc=T)
    setwd(dir.figures); export.plot (file.prefix=paste(prefix, "PCA_2dim", sep="_"),export.formats=export.formats,width=8,height=8)
    pc.all <- plot.prcomp.groups(x.all,group.labels.all, plot.NA=T,gtoup.labels=group.colors, label.ref=T,scale.axes=F,return.pc=T)
    setwd(dir.figures); export.plot (file.prefix=paste(prefix, "PCA_2dim_all", sep="_"),export.formats=export.formats,width=8,height=8)
}

## ##############################################################
## test different distance metrics and clustering methods
## ##############################################################

verbose("Calculating poisson-based metrics", 1)

#### Poisson-based distance metrics
verbose("    poisson-based dissimilarity", 2)
poisson.dissimilarity.distinct <- dist.poisson(x,over=F)
poisson.dissimilarity.over <- dist.poisson(x,over=T)
poisson.dissimilarity.over.product <- dist.poisson(x,over=T,product=T)
poisson.dissimilarity.distinct.product <- dist.poisson(x,over=F,product=T)

#### Poisson-based similarity metrics
verbose("    poisson-based similarity", 2)
poisson.similarity <- sim.poisson(x,product=F)
poisson.similarity.product <- sim.poisson(x,product=T)

#### mixed metrix (combine distance and similarity)
verbose("    poisson-based mixed metrics", 2)
alpha <- 1 ### weighting between similarity and distance for additive models
beta <- 0 ### constant for additive models
poisson.mixed.over <- sim.poisson.mixed(x,alpha=alpha,beta=beta,over=T,product=F)
poisson.mixed.distinct <- sim.poisson.mixed(x,alpha=alpha,beta=beta,over=F,product=F)
poisson.mixed.over.product <- sim.poisson.mixed(x,over=T,product=T)
poisson.mixed.distinct.product <- sim.poisson.mixed(x,over=F,product=T)

#### Park metric
verbose("Calculating Park similarity", 1)
park.similarity <- sim.park(x)

## classical distance metrics
verbose("Calculating other metrics", 1)
verbose ("Euclidian distance", 2); euclidian.dist <- dist(x,method="euclidian")
verbose ("Manhattan distance", 2); manhattan.dist <- dist(x,method="manhattan")

## ##############################################################
## Calculate the Mahalanobis distance. This cannot be done with
## bootstrapped columns because the covariance matrix is singular.
n <- dim(x)[1]
p <- dim(x)[2]
if ((!bootstrap.columns) &&
    (!bootstrap.columns)) {
  verbose ("Mahalanobis distance", 2)
  mahalanobis.dist <- matrix(nrow=n,ncol=n)
  for (i in 1:n) {
     mahalanobis.dist[i,] <- mahalanobis(x,as.matrix(x)[i,],cov(x))
  }
  metric.names <- c(metric.names, "mahalanobis.dist")
  mahalanobis.dist <- as.dist(mahalanobis.dist)
}

correlation.coefficient <- cor(t(x))

################################################################
## similarities are converted to pseudo-distances by the transformation 
##       dist = k - sim
## k is set to max(sim), and varies thus with the distance
## convert similarities to distances
poisson.mixed.over.d <- max(poisson.mixed.over) - poisson.mixed.over 
poisson.mixed.distinct.d <- max(poisson.mixed.distinct) - poisson.mixed.distinct 
poisson.mixed.over.product.d <- max(poisson.mixed.over.product) - poisson.mixed.over.product 
poisson.mixed.distinct.product.d <- max(poisson.mixed.distinct.product) - poisson.mixed.distinct.product 
poisson.similarity.d <- max(poisson.similarity) - poisson.similarity 
poisson.similarity.product.d <- max(poisson.similarity.product)- poisson.similarity.product
park.similarity.d <- max(park.similarity) - park.similarity
correlation.coefficient.d <- max(correlation.coefficient) - correlation.coefficient

################################################################
# Plot and export the different trees
#
verbose("Hierarchical clustering", 1)
par(cex=0.8)
par(font=2)

dir.results <- paste(dir.main,'results/clustering/text_files',sep='/')
dir.figures <- paste(dir.main,'results/clustering/figures',sep='/')

setwd(dir.figures)
#prefix <- "pattern_counts"
for (clustering.method in clustering.methods) {
  for (metric.name in metric.names) {
    dist.mat <- get(metric.name)
    tree <- hclust(dist(dist.mat),method=clustering.method)

    ## plot the tree
    plot(tree,main=paste(metric.name,clustering.method,sep=" - "), xlab="", labels=full.labels)
    if (export) {
      setwd(dir.figures); export.plot (file.prefix=paste(prefix,  metric.name,clustering.method,sep="_"),export.formats=export.formats, width=15,height=8)
    }

    ## Compare known class with clustering result
    if (export) {
      setwd(dir.results)
      sink(paste(prefix, "_", 
			   metric.name, "_", clustering.method, 
			   "_confusion",".txt", sep=""))
    }
    print (paste("Distance metric:", metric.name))
    print (paste("Clustering method:", clustering.method))
    for (k in c(3,4,5)) {
      print (paste("Confusion table, pruning with k=", k))
      print (table(cutree(tree,k=k),group.labels))
    }
    if (export) sink()
  }
}
par(font=1)
par(cex=1)
## ##############################################################
## calculate confusion tables, hit rates, in order to select the
## method which gave the best hit rate with this dataset 
verbose("Calculating confusion tables with hierarchical clustering", 1)

k <- 3
if (rand.sequences) { k <- k + 1}
if (rand.selection) { k <- k + 1}

metric.name <- "poisson.similarity.product.d"
#metric.name <- "poisson.mixed.distinct.product.d"
#metric.name <- "park.similarity.d"
#metric.name <- "euclidian.dist"
#metric.name <- "poisson.dissimilarity.over"
clustering.method <- "ward"

  
## ##############################################################
## Compare confusion tables between the different agglomeration rules
## and metrics

#### prepare a matrix for storing confusion tables
confusion.tables <- data.frame(matrix(nrow=length(metric.names)*length(clustering.methods),
                                      ncol=length(groups)^2 + 6
                                      ))
## row names
rnames <- vector()
for (cm in sort(clustering.methods)) {
  for (mn in sort(metric.names)) {
    rnames <- c(rnames, paste (mn, cm) )
  }
}
row.names(confusion.tables) <- rnames

## column names
cnames <- c("metric", "clustering", "permuted")
for (g1 in groups) {
  for (g2 in groups) {
    cnames <- c(cnames, paste (g2, g1, sep= " > "))
  }
}
cnames <- c(cnames, "correct", "wrong", "hit rate")
names(confusion.tables) <- cnames

################################################################
## calculate and store confusion tables
for (clustering.method in clustering.methods) {
  for (metric.name in metric.names) {

    tree <- hclust(dist(get(metric.name)),method=clustering.method)
    ## plot and export the tree
#    par(cex=0.8)
#    par(font=2)
#    plot(tree,main=paste(metric.name,clustering.method,sep=" - "), xlab="", labels=full.labels)
#    if (export) {
#      setwd (dir.figures); export.plot (file.prefix=paste(prefix, metric.name,clustering.method,sep="_"),export.formats=export.formats, width=15,height=8)
#    }
#    par(font=1)
#    par(cex=1)

    for (permuted in c(F,T)) {

        ## match clustering results and prior groups
	if (permuted) {
            check.labels <- sample(as.vector(group.labels))
	} else {
            check.labels <- group.labels
	}
        (tab <- table(cutree(tree,k=k),check.labels))
        (class.match <- matchClasses(tab,method="exact"))
        (match.comp <- compareMatchedClasses(cutree(tree,k=k),check.labels, method="exact"))
        (hit.rate <- match.comp$diag)
    
	## row name
        rname <- paste(metric.name, clustering.method)
	if (permuted) {
	    rname <- paste(rname, "permuted")
	}

	## Parameters of this row
        confusion.tables[rname, "clustering"] <- clustering.method
        confusion.tables[rname, "metric"] <- metric.name
        confusion.tables[rname, "hit rate"] <- hit.rate
        confusion.tables[rname, "permuted"] <- permuted
    
        correct <- 0
        wrong <- 0
        for (g1 in groups) {
          for (c in 1:k) {
            g2 <- class.match[c]
            cname <- paste (g1, g2, sep= " > ")
            m <- tab[c,g1]
            if (g1 == g2) {
              correct <- correct + m
            } else {
              wrong <- wrong + m
            }
            confusion.tables[rname,cname] <- m
          }
        }
        confusion.tables[rname, "correct"] <- correct
        confusion.tables[rname, "wrong"] <- wrong
      }
    }
}

confusion.tables <- confusion.tables[order(confusion.tables[,"hit rate"], decreasing=T),]
setwd (dir.results); export.object(confusion.tables, file.prefix=paste(prefix,"conf_tables",sep='_'), export.format='table')



################################################################
# Calculate average inter- and intra-group distances
#
verbose("Calculating inter-group distances", 1)
dir.results <- paste(dir.main,'results/group_distances',sep='/')
dir.figures <- paste(dir.results,'figures',sep='/')

gn <- length(groups) ## number of groups
mn <- length(metric.names) ## number of metrics
group.distances <- data.frame(matrix(nrow=mn,
                                        ncol=gn*(gn+1)
                                     ))
row.names(group.distances) <- metric.names
inter.groups <- vector()
intra.groups <- vector()
for (g1 in groups) {
  for (g2 in groups) {
    if (g2 >= g1) {
      cname <- paste (g1, g2,sep=".")
      if (g2 == g1) {
        intra.groups <- c(intra.groups, cname)
      } else {
        inter.groups <- c(inter.groups, cname)
      }
    }
  }
}
cnames <- c(intra.groups,inter.groups)
cnames <- c(cnames, paste(c(intra.groups,inter.groups),"std",sep='.'))
names(group.distances) <- cnames
plot.histo <- F

for (metric.name in metric.names) {
  dist.mat <- as.matrix(get(metric.name))
  b <- pretty(dist.mat,n=40)
  
  par(mfrow=c(gn,gn))
  for (g in groups) {
    ## mean intra-group distance
                                        #    group.distances[metric.name,g] <- mean(as.vector(dist.mat[group.labels == g,group.labels == g]))
    
    ## inter-group distances
    for (g2 in groups) {
      cn <- paste (g, g2,sep=".")
      group.dist <- as.vector(dist.mat[group.labels == g,group.labels == g2])
      ## calculate mean group distance
      if (g2 >= g) {
        group.distances[metric.name,cn] <- mean(group.dist,na.omit=T)
        group.distances[metric.name,paste(cn,"std",sep='.')] <- sqrt(var(group.dist))
      }

      ## plot histogram with group distances
      if (plot.histo) {
        if (g==g2) {
          c='#0000ff'
        } else {
          c='#ff0000'
        }
        h <- hist(group.dist,breaks=b,main=cn,col=c)
      }
    }
  }
  par(mfrow=c(1,1))
  
  setwd(dir.figures); export.plot (file.prefix=paste(prefix, "group_distance", metric.name, sep="_"),export.formats=export.formats)
  
}

group.distances$intra.group <- apply(group.distances[,intra.groups],1,mean)
group.distances$inter.group <- apply(group.distances[,inter.groups],1,mean)
group.distances$separation <- group.distances$inter.group/group.distances$intra.group
group.distances$separation.inv <- 1/group.distances$separation
group.distances$best <- apply(group.distances[,c("separation","separation.inv")],1,max)

group.distances$intra.group.std <- apply(group.distances[,paste(intra.groups, "std", sep='.')],1,mean)
group.distances$inter.group.std <- apply(group.distances[,paste(inter.groups, "std", sep='.')],1,mean)

setwd (dir.results); export.object(group.distances, file.prefix=paste(prefix,"group_distances",sep='_'), export.format='table')

## ##############################################################
## plot.svd
verbose("Plotting on SVD axes", 1)
par(mai=c(1,1,1,1))
for (metric.name in metric.names) {
    verbose(paste("    SVD with", metric.name), 2)
    dist.mat <- get(metric.name)
    z <- plot.svd(x,group.labels,sim=dist.mat, main=paste("SVD with matrix of ", metric.name))
    setwd(dir.figures); export.plot (file.prefix=paste(prefix, metric.name, "SVD", sep="_"),export.formats=export.formats,width=8,height=8)
}

## ##############################################################
## Stepwise discriminant analysis
##
verbose("Performing discriminant analysis", 1)
dir.results <- paste(dir.main,'results/discriminant_analysis/text_files',sep='/')
dir.figures <- paste(dir.main,'results/discriminant_analysis/figures',sep='/')

pda.eval <- compare.stepwise.pda.methods(x,group.labels,data.title=data.title, qda=F,with.pc=F)
setwd(dir.figures); export.plot (file.prefix=paste(prefix, "PDA", "error_rates",sep="_"), export.formats=export.formats)
setwd(dir.results); export.object(pda.eval, file=paste(prefix, "stepwise_PDA", sep="_"),export.formats="print")


x.selected <- x[,pda.eval$stepwise.lda$best.vars]
mlda <- pda(x.selected,group.labels,pda.method="lda",loo=T)


## ##############################################################
##
## Poisson-based discriminant analysis
##
verbose("Performing Poisson-based supervised classification", 1)
dp <- discrim.poisson (x, group.labels,CV=F)
dp.cv <- discrim.poisson (x, group.labels,CV=T)
setwd(dir.results); export.object(dp.cv, file=paste(prefix, "poisson_superv_classif_LOO", sep="_"),export.formats="print")

## Use the subset of variables selected by stepwise lda for poisson discriminant analysis
dp.cv.selected <- discrim.poisson (x.selected, group.labels,CV=T)
setwd(dir.results); export.object(dp.cv.selected, file=paste(prefix, "_selected_poisson_superv_classif_LOO", sep="_"),export.formats="print")

################################################################
# Flexible Discriminant Analysis

#### extract discriminant variables
verbose("Flexible discriminant analysis", 1)
verbose("Beware: appareently this si an internal validation", 1)
mfda <- fda(group.labels ~.,data=y)

setwd (dir.results); export.object(mfda, file.prefix=paste(prefix,"fda_internal",sep='_'), export.format='print')

mfda
names(mfda)
confusion(mfda)
plot(mfda)
coef(mfda)

attach(mfda)
percent.explained
means			
values			
values/(1-values)	# eigenvalues
prior
fit

if (length(groups) == 2) {
   hist(fit$fitted.values,breaks=30)
} else if (length(groups) == 3) {
  plot(fit$fitted.values,type="n")
  text(fit$fitted.values,group.labels)
}

result	<- data.frame(true = group.labels,
                      predict = predict(mfda),
                      fitted.value = fit$fitted.value,
                      post = predict(mfda,type="post"),
                      variates = predict(mfda,type="variates"),
                      row.names = names(group.labels))

#result
result[result$predict != result$group.labels,]

detach(mfda)

setwd (dir.results); export.object(result, file.prefix=paste(prefix,"fda_internal_table",sep='_'), export.format='table')



priors.all <- c(CTL = length(group.labels.all)-150,
                NIT = 50,
                PHO = 50,
                MET = 50)/length(group.labels.all)

pred.all <- predict.fda(mfda,x.all,prior=priors.all)
table(pred.all)
confusion(pred.all,group.labels.all)
setwd (dir.results); export.object(pred.all, file.prefix=paste(prefix,"fda_predictions",sep='_'), export.format='table')

#### leave-one-out validation
loo.pred <- vector(mode="character",l=length(group.labels))
names(loo.pred) <- names(group.labels)
n <- dim(y)[1]
for (i in 1:n) {
  test <- y[i,]
  train <- y[-i,] ### leave-one-out validation
#  train <- y ### internal validation
  mfda1 <- fda(group.labels ~ ., train)
  pred <- predict(mfda,test)
  loo.pred[i] <- levels(pred)[pred]
  
}
fda.loo.conf <- confusion(loo.pred,group.labels)
setwd (dir.results); export.object(fda.loo.conf, file.prefix=paste(prefix,"fda_predictions",sep='_'), export.format='table')

## ##############################################################
## k-nearest-neighbours classification
##
verbose ("K nearest neighbour classification", 1)
for (k in c(1:20)) {
  cl <- knn.cv(x,group.labels,k=k)
  (conf <- confusion(cl,group.labels))
  cl.selected <- knn.cv(x.selected,group.labels,k=k)
  conf.selected <- confusion(cl.selected,group.labels)
  print (paste (k, attr(conf, "error"), attr(conf.selected, "error")))
}


################################################################
## End of all iterations
}
}
}
}
}
}
