################################################################
#
# R file for analyzing nitrogen-regulated genes
#

# WARNING: we need more memory than the default value, so R should be called with the following parameters
#  R --vsize=20M 	# on unix
#  rgui --vsize=20M	# on windows


# mda supplies fda() and confusion(), which are used in the discriminant
# analysis section further down.
library(mda)

# Load the course-wide configuration (the commented-out line is the same
# file at an alternative, Windows-mounted location).
#source("/win/home/jvanheld/enseignement/statistics_bioinformatics/R-files/config.R")
source("~/enseignement/statistics_bioinformatics/R-files/config.R")
# util.chip.analysis is not defined in this file; presumably config.R sets it
# to the path of a utility script (plot.svd is used below and is not defined
# here either) -- TODO confirm.
source(util.chip.analysis)
# Every relative "data/..." path read below is resolved against this directory.
setwd("~/research/regulatory_sequence_analysis/upstream_classifications")



################################################################
# Prepare data

# Pattern-count table for the "N" set; every row is tagged with the label "N".
data.N <- read.table(
	"data/nitrogen/N/N_pattern_count.tab",
	header = TRUE,
	as.is = TRUE
)
data.N$group <- rep("N", nrow(data.N))

# Pattern-count table for the "C" set; every row is tagged with the label "C".
data.C <- read.table(
	"data/nitrogen/C/C_pattern_count.tab",
	header = TRUE,
	as.is = TRUE
)
data.C$group <- rep("C", nrow(data.C))

#### test for debugging

# Pattern-count table for the "O" set; its rows get a missing (NA) label.
data.O <- read.table(
	"data/nitrogen/O/O_pattern_count.tab",
	header = TRUE,
	as.is = TRUE
)
data.O$group <- rep(NA, nrow(data.O))

# Row identifiers (first column) of the two labelled tables, N first, then C.
# The identifiers of data.O are deliberately left out:
#	data.O[,1],
orf.names <- c(data.N[[1]], data.C[[1]])

# Labelled data set: the rows of data.N followed by the rows of data.C,
# indexed by orf.names. Column 1 (the identifier, already used as row name)
# and column 3 are left out; the trailing "group" column carries the label.
# A negative index is used instead of c(2, 4:length(x)) so that the selection
# cannot run backwards when a table has fewer than 4 columns.
known <- data.frame(
	rbind(
		data.N[, -c(1, 3), drop = FALSE],
		data.C[, -c(1, 3), drop = FALSE]
	),
	row.names = orf.names
)

# Split by row parity: even rows are used for training, odd rows for testing.
# A logical mask replaces the former 2*(1:(n/2)) index arithmetic, which
# addressed a non-existent row 2 (yielding an all-NA row) when `known` had
# 0 or 1 rows; for 2 or more rows the split is unchanged.
is.even.row <- seq_len(nrow(known)) %% 2 == 0
training <- known[is.even.row, ]
testing <- known[!is.even.row, ]

dimnames(training)
dimnames(testing)

# Unlabelled rows to be classified, indexed by their identifier (column 1).
# NOTE(review): unlike `known`, this keeps column 3 (and the all-NA "group"
# column) -- confirm whether column 3 should be dropped here as well.
to.classify <- data.frame(
	data.O[, -1, drop = FALSE],
	row.names = data.O[, 1]
)

################################################################
# plot.svd

# One colour per row of `known`: green for group "N", red for the rest.
# known$group is read directly rather than through attach(known): an attached
# column is silently masked by any global variable named `group`, and the
# data frame stays on the search path if the plot fails before detach().
colors <- ifelse(known$group == "N", "#00FF00", "#FF0000")

# Pass every column except the last one ("group") to plot.svd.
# The former index 1:length(known)-1 parsed as (1:length(known))-1, i.e.
# 0:(n-1), and only selected the right columns because index 0 is dropped.
# plot.svd is not defined in this file (presumably it comes from the sourced
# utility script -- verify).
plot.svd(known[-length(known)], col = colors)

################################################################
#
# Principal component analysis
#
# PCA on every column of `known` except the last one (the "group" label).
pc <- prcomp(known[, -length(known)])

# One colour per row: green for group "N", red for the rest.
# known$group is read directly rather than through attach(known), which can
# be masked by a global `group` and is left attached if the plot errors.
colors <- ifelse(known$group == "N", "#00FF00", "#FF0000")

# Scatter plot of the first two principal components.
# panel.first is evaluated before the points are drawn, so the grid and the
# two zero lines end up underneath them; c() merely bundles the three calls.
plot(pc$x[, "PC1"], pc$x[, "PC2"],
		type	= "p",
		pch	= 19,
		col	= colors,
		xlab	= "PC1",
		ylab	= "PC2",
		panel.first	= c(grid(col = 1),
					abline(h = 0, col = 1),
					abline(v = 0, col = 1))
		)

################################################################
# flexible discriminant analysis

#### extract discriminant variables
# Fit the flexible discriminant model on the training rows: "group" is the
# response and every other column of `training` is a predictor.
mfda <- fda(formula = group ~ ., data = training)

# Inspect the fitted object.
mfda			# printed representation of the fit
names(mfda)		# names of the components stored in the fit
confusion(mfda)		# confusion table of the fit
plot(mfda)		# plot method of the fitted object
coef(mfda)		# coefficients of the fit

# Components of the fitted model are read as mfda$... instead of through
# attach(mfda): names such as `fit`, `values`, `means` and `prior` are easily
# masked by global variables, and the object would stay on the search path
# if any statement failed before detach().
mfda$percent.explained
mfda$means
mfda$values
mfda$values / (1 - mfda$values)	# eigenvalues
mfda$prior
mfda$fit
hist(mfda$fit$fitted.values, breaks = 30)

# One row per training gene: true label, predicted class, fitted values,
# posterior probabilities and discriminant variates.
# The component is spelled out as `fitted.values` (the former
# `fit$fitted.value` relied on partial matching of `$`); the column label
# `fitted.value` of the result is kept as before.
result <- data.frame(
	true		= training$group,
	predict		= predict(mfda),
	fitted.value	= mfda$fit$fitted.values,
	post		= predict(mfda, type = "post"),
	variates	= predict(mfda, type = "variates"),
	row.names	= rownames(training)
)
result

# Misclassified training genes. Both columns are compared as character so
# that factors with different level sets cannot make `!=` fail.
result[as.character(result$predict) != as.character(result$true), ]

# Call the generic rather than the S3 method predict.fda() directly, so the
# call does not depend on the method being visible on the search path.
predict(mfda)

