################################################################
#
# Load different data types and prepare them for multivariate
# analysis
#

## Load the utilities from util_mva.R
## (presumably where verbose() is defined -- TODO confirm)
source(file.path(dir.util, "util_mva.R"))
verbose("reading the data", 1)

#### remember the current working directory; the end of this script
#### switches back to it
prev.wd <- getwd()

#### fall back to expression data when no data type was chosen beforehand
if (!exists("data.type")) data.type <- "expression"

#### directories (all derived from dir.home, which must be defined by the caller)
dir.met.pho <- file.path(dir.home, "research/discriminant_analysis/met_pho")
dir.data    <- file.path(dir.met.pho, "data", "R-data")
dir.oligos  <- file.path(dir.met.pho, "results", "oligos")

#################################################################
##
## Prepare the data for analysis
##

## ##############################################################
## Load the data table matching data.type.
##
## Every supported branch defines:
##   data        the table to analyse, rows named after the first
##               column of the input file
##   data.title  title describing the data set
##   groups      names of the training groups; a file <group>.orfs is
##               read for each of them further down in this script
## The oligo_counts branch additionally defines orf.gene (ORF -> gene
## name correspondence).
##
## An unsupported data.type aborts the script immediately: continuing
## would either fail later with an unrelated message or silently reuse
## a `data` object left over in the workspace.

## ##############################################################
## Gene expression data from Ogawa et al.
if (data.type == "expression") {
  verbose("Loading PHO expression data", 1)
  setwd(dir.data)
  ## Load gene expression data
  pho.file <- 'expression.pho.tab' # store the location of the gene expression file in a variable
  pho <- read.table(pho.file, header = TRUE, row.names = 1) # read the gene expression file
  ##  names(pho)                 # check column names
  ##  row.names(pho)[1:20]       # check the 20 first row names
  ## Drop the first data column and every row containing a missing value.
  ## drop = FALSE keeps a data frame even if a single column remains.
  expr <- na.omit(pho[, -1, drop = FALSE])
  ##  names(expr) <- paste("chip",1:8,sep="")
  data <- expr
  data.title <- "phosphate expression data"
  groups <- c("PHO", "CTL")

## ##############################################################
## scores obtained by scanning upstream sequences with position-specific scoring matrices
} else if (data.type == "matrix_scores") {
  ## top matrix scores in upstream sequences
  ## Whole-genome pattern matching with different position weight
  ## matrices : Pho4, Met4, Met31, Pho4.cacgtg and Pho4.cacgtt.
  ## For each matrix, the 3 top scores are reported.
  setwd(dir.data)
  ##  match.file <- 'all_matrices_allup800_top3.tab' # store the location of the data file in a variable
  match.file <- 'all_matrices_top3.tab' # store the location of the data file in a variable
  matches <- read.table(match.file, header = TRUE, row.names = 1) # read the data file
  ##  names(matches)             # check column names
  ##  row.names(matches)[1:20]   # check the 20 first row names
  ## Drop the first data column and every row containing a missing value
  data <- na.omit(data.frame(matches[,-1]))
  data.title <- "PHO and MET upstream motifs"
  groups <- c("PHO", "MET", "CTL")

## ##############################################################
## oligonucleotide counts
} else if (data.type == "oligo_counts") {
  ## pattern counts in upstream sequences
  setwd(dir.oligos)
  count.file <- 'oligos_6nt_allup_noTY.cts'
  ##  count.file <- 'oligos_6nt_allup.counts' # store the location of the data file in a variable
  ##  count.file <- '6nt_oligos_allup.counts'
  data <- read.table(count.file, header = TRUE, row.names = 1) # read the data file
  ##  names(data)                # check column names
  ##  row.names(data)[1:20]      # check the 20 first row names
  data.title <- "PHO and MET oligo counts"
  groups <- c("PHO", "MET", "CTL")
  ## read the correspondence between ORF and gene name
  ## (the file is looked up in dir.oligos, the current working directory)
  orf.gene <- read.table('orf_genes.tab', header = TRUE, row.names = 1)
} else {
  stop("ERROR: UNKNOWN DATA TYPE '", data.type,
       "' (supported: expression, matrix_scores, oligo_counts)",
       call. = FALSE)
}

## ##############################################################
## Check data and measure dimensions
##
## Row names are upper-cased so that they can be matched against the
## upper-cased identifiers read from the group files further down.
row.names(data) <- toupper(row.names(data))
data <- check.data(data)   # check.data() is defined outside this file
n <- nrow(data)            # number of rows
p <- ncol(data)            # number of columns
## TRUE/FALSE are used rather than T/F: T and F are ordinary variables
## that any sourced file or workspace object may have reassigned.
if (data.type == "oligo_counts") {
  ## NOTE(review): 19 looks like a cap on the number of variables used for
  ## oligo counts, and do.qda like a switch for quadratic discriminant
  ## analysis -- confirm against the scripts that read max.p and do.qda.
  max.p <- 19
  do.qda <- FALSE
} else {
  max.p <- p
  do.qda <- TRUE
}

## ##############################################################
## read the composition of the training groups
##
## group.labels holds one entry per row of data (named after the row);
## entries stay NA for rows that belong to no training group.
## For each group g, the members are the row names of the file <g>.orfs
## in dir.data. For oligo counts, these ORF identifiers are first
## translated through the first column of orf.gene.
group.labels <- rep(NA, length.out = n)
names(group.labels) <- row.names(data)
setwd(dir.data)
for (g in groups) {
  group.file <- paste0(g, ".orfs")
  verbose(group.file, 1)
  group.description <- read.table(group.file, header = TRUE, row.names = 1, sep = '\t')
  group.orfs <- toupper(row.names(group.description))
  if (data.type == "oligo_counts") {
    ## as.character(): if the column was read as a factor, indexing
    ## with it would use the integer codes instead of the names
    genes <- as.character(orf.gene[group.orfs, 1])
    members <- genes
  } else {
    members <- group.orfs
  }
  ## Only label rows that exist in data. Assigning through a name that is
  ## absent from names(group.labels) (or through NA) would append new
  ## elements, leaving group.labels longer than n and no longer aligned
  ## with the rows of data.
  known <- !is.na(members) & members %in% names(group.labels)
  if (!all(known)) {
    verbose(paste(sum(!known), "member(s) of group", g,
                  "not found in the data table and skipped"), 2)
  }
  group.labels[members[known]] <- g
}

## ##############################################################
## Label vectors for the different classification problems

## 3-group classification: the training labels as they were read
group.labels.3groups <- group.labels

## 2-group classification, MET versus the rest:
## entries labelled PHO are relabelled as CTL
group.labels.MET <- replace(group.labels, which(group.labels == "PHO"), "CTL")

## 2-group classification, PHO versus the rest:
## entries labelled MET are relabelled as CTL
group.labels.PHO <- replace(group.labels, which(group.labels == "MET"), "CTL")


#### use gene names instead of ORF names (disabled)
##if (data.type == "matrix_scores") {
##  row.names(data) <- matches[row.names(data),"gene"]
##  names(group.labels) <- row.names(data)
##}

#### go back to the working directory saved in prev.wd
verbose(prev.wd, 3)
setwd(prev.wd)



