################################################################
## cluster_motifs.R
##
## This script takes as input a matrix comparison file (that can be
## generated by compare-matrices), performs hierarchical
## clustering.
##
## It returns the resulting tree in json format, which can be loaded
## by the d3 library for display purposes.

## TO DO
## - Define an argument infile.consensus, providing a 3-columns file with the logo ID + its consensus in direct and reverse complementary strands. This consensus file should be exported by comapre-matrices.
## -> this consensus could e used to display trees in R, without requiring the

## Load required libraries
library("RJSONIO")
library("ctc")
library("dendroextras")

## Redefine the main directory (this should be adapted to local configuration)
dir.main <- getwd()

dir.rsat <- Sys.getenv("RSAT")
if (dir.rsat == "") {
  stop("The environment variable RSAT is not defined.")
}

## Load some libraries
source(file.path(dir.rsat, 'R-scripts/config.R'))
source(file.path(dir.rsat, 'R-scripts/cluster_motifs_lib.R'))


## Options
plot.tree <- FALSE
export <- 'json'


################################################################
## Read arguments from the command line.
##
## Arguments passed on the command line will over-write the default
## arguments specified above.
args = commandArgs(trailingOnly=TRUE);
#print("Parsing command-line arguments")
if (length(args >= 1)) {
  print(args)
  for(i in 1:length(args)){
    eval(parse(text=args[[i]]))
  }
}

## Check parameters
check.param()


##################################
## Read matrix comparison table
global.compare.matrices.table <<- read.csv(infile, sep = "\t", comment.char = ";")
names(global.compare.matrices.table)[1] <- sub("^X.", "", names(global.compare.matrices.table)[1])


################################################################
## Read description table 
global.description.table <<- read.csv(description.file, sep = "\t", comment.char = ";")
if(length(global.description.table$id) == 2*length(unique(global.description.table$id))){
  global.description.table <- global.description.table[1:length(unique(global.description.table$id)),]
}
## In reference to the names, order alphabetically the description table
global.description.table <- global.description.table[order(global.description.table$name),]
global.description.table$n <- 1:length(global.description.table$n)

matrix.labels <-  as.vector(global.description.table$label)
names(matrix.labels) <- as.vector(global.description.table$id)


## Check that the compare-matrices table contains the required score column
if (length(grep(pattern=score, names(global.compare.matrices.table))) < 1) {
  stop(paste(sep="", "Input file (", infile, ") does not contain the score column (", score, ")."))
}

## Convert distance table into a distance matrix, required by hclust
distances.objects <- build.distance.matrix(global.compare.matrices.table)
dist.table <- distances.objects[[1]]
dist.matrix <- distances.objects[[2]]

## Export the distance table
write.table(dist.table, file = distance.table, quote = FALSE, row.names = TRUE, col.names=NA, sep = "\t")


##############################################
### Build the tree by hierarchical clustering, and export it in Newick format
tree <<- hclust(dist.matrix, method = hclust.method)
tree$labels <- as.vector(global.description.table$label)
system(paste("mkdir -p ",out.prefix, "_trees/", sep = ""))


if (export == "newick") {
  temp.tree <- tree
  temp.tree[[2]] <- round(tree[[2]], digits = 3)
  newick.file <- paste(out.prefix, "_trees/tree.newick", sep = "")
  verbose(paste("Exporing newick file", newick.file), 1)
  write(hc2Newick(temp.tree, flat = TRUE), file=newick.file)
  rm(temp.tree)
}

#######################################
### Creates and parse the json file
halfway.tree <- hclustToTree(tree)
jsonTree <- toJSON(halfway.tree)

## Fix some little technical issues for JSON compatibility with the tree display javascript
jsonTree <- gsub("\\],", "\\]", jsonTree, perl = TRUE)
jsonTree <- paste("{\n\"name\": \"\",\n\"children\":", jsonTree, "}", sep = "")
jsonTree <- gsub("\n\"order\":\\s+\\d+", "", jsonTree, perl = TRUE)
#jsonTree <- gsub(",\\s*\n\\s*}", "\n}", jsonTree, perl = TRUE)

#############################
### Prints the .json file 
json.file <- paste(out.prefix, "_trees/tree.json", sep="")
verbose(paste("JSON tree file", json.file), 1)
writeLines(jsonTree, con=json.file)


#############################################################
## Bottom-up traversal of the tree to orientate the logos
## Initialize global variables
tree$labels <- paste(as.vector(global.description.table$consensus), 1:length(global.description.table$consensus))
merge.level <- 1
motifs.info <<- list()
internal.nodes.attributes <<- list()
forest.nb <<- 1
forest.list <- list()

compare.matrices.table <- global.compare.matrices.table
description.table <- global.description.table

## Saves the nodes clustered on each level
## of the merge
merge.levels.leaves <<- leaves.per.node(tree)

## Saves the attributes of the merge
internal.nodes.attributes <<- list()

#########################################
## Traversing the tree: read the merge of
## the hclust tree and align the leaves.
## Bottom-up traversal of the tree to orientate the logos
for (merge.level in 1:nrow(tree$merge)) { 
  
  child1 <- tree$merge[merge.level,1]
  child2 <- tree$merge[merge.level,2]

  internal.nodes.attributes[[paste("merge_level_", merge.level)]][["merge_level"]] <- merge.level
  internal.nodes.attributes[[paste("merge_level_", merge.level)]][["method"]] <- hclust.method

  ########################################
  ## Case 1: merging between two leaves ##
  ########################################
   if ((child1 < 0) && (child2 < 0)) {
     align.two.leaves(child1, child2)
   }


  ############################################
  ## Case 2: merging a motif with a cluster ##
  ############################################
  if(((child1 < 0) && (child2 > 0)) || ((child1 > 0) && (child2 < 0))){
    align.leave.and.cluster(child1, child2)
  }


  ##########################################
  ## Case 3: merging between two clusters ##
  ##########################################
  if ((child1 > 0) && (child2 > 0)) {
    align.clusters(child1, child2)
  }
}

## Split the tree into forest
forest <<- cutree(tree, k = forest.nb)
ids.forest <<- list()

## Get IDs of the forest
for( lvl in 1:length(table(forest))){
  ids.forest[[paste("forest_", lvl, sep = "")]] <- get.id(as.numeric(which(forest == lvl)))
}

## Fill the downstream end 
motifs.info <- fill.downstream(motifs.info, ids.forest)
forest.list[[paste("forest_", 1, sep = "")]] <- motifs.info

## Reset the labels
for(nb in 1:length(tree$labels)){

  ## Add the aligned consensus
  tree$labels[nb] <- paste(motifs.info[[get.id(nb)]][["consensus"]], sep = "   ")
  
  ## Add the new labels
  for(label in labels){
    if(label == "consensus"){
      next
    } else if(label == "id"){
      tree$labels[nb] <- paste(tree$labels[nb], get.id(nb), sep = " ")
    } else if(label == "number"){
      tree$labels[nb] <- paste(tree$labels[nb], nb, sep = " ")
    } else if(label == "strand"){
      tree$labels[nb] <- paste(tree$labels[nb], motifs.info[[get.id(nb)]][["strand"]], sep = " ")
    }
  }
}

## Colour the branches
tree.dendro <- as.dendrogram(tree)
tree.dendro <- color_clusters(tree.dendro, k = forest.nb, col = rainbow, groupLabels = TRUE)

## Get the aligment width, to calculate the limits of the plot
alignment.width <- sapply(tree$labels, function(X){
  nchar(X)
})
alignment.width <- max(alignment.width)
mar4 <- alignment.width - 18

## Export the tree with the aligment
plot.format <- "pdf" ## Default for testing inside the loop
for (plot.format in c("pdf", "png")) {
  w.inches <- 15 ## width in inches
  h.inches <- round(0.7* length(motifs.info)) ## height in inches
  #h.inches <- 8 ## height in inches
  resol <- 72 ## Screen resolution
  tree.drawing.file <- paste(sep="", out.prefix, "_consensus_tree.", plot.format)
  verbose(paste("Exporting hclust tree drawing", tree.drawing.file), 1)
  if (plot.format == "pdf") {
    pdf(file=tree.drawing.file, width=w.inches, height=h.inches)
  } else if (plot.format == "png") {
    png(filename=tree.drawing.file, width=w.inches*resol, height=h.inches*resol)
  }

  par(mar=c(3,2,1,mar4),family="mono")
  plot(tree.dendro, horiz=TRUE, main = paste("Aligned consensus tree; labels:" ,paste(labels, collapse = ","), sep = " "))
  dev.off()
}


#########################
##  Produce the internal nodes attributes table
internal.nodes.attributes.table <- lapply(internal.nodes.attributes, function(X){
  return(c(X[["merge_level"]], X[["method"]], X[["min_score"]], X[["max_score"]], X[["median_score"]], X[["alignment_status"]], X[["cluster_1"]], X[["cluster_2"]]))
})
internal.nodes.attributes.table <- t(data.frame(internal.nodes.attributes.table))
colnames(internal.nodes.attributes.table) <- c("#merge_level", "method", "min_score", "max_score", "median_score", "alignment_status", "cluster_1", "cluster_2")
attributes.file <- paste(sep="", out.prefix, "_internal_nodes_attributes.tab")
write.table(internal.nodes.attributes.table, file = attributes.file, sep = "\t", quote = FALSE, row.names = FALSE)
verbose(paste("Exporting merge attributes table", attributes.file), 1)


#####################################################
## Produce the forests: when a pair of clusters is
## not aligned, it is splited and each part (forest)
## is realigned and printed in pdf and png
if(forest.nb > 1){

  ## Creates a folder with where the separated information
  ## of each cluster will be stored
  system(paste("mkdir -p ", out.prefix, "_clusters_information", sep = ""))
  clusters.info.folder <<- paste(out.prefix, "_clusters_information", sep = "")
  
  global.motifs.info <<- motifs.info
  forest <<- cutree(tree, k = forest.nb)
  forest.list <- list()
  ids.forest <<- list()
  
  ## Get IDs of the forest
  for( lvl in 1:length(table(forest))){
    ids.forest[[paste("forest_", lvl, sep = "")]] <- get.id(as.numeric(which(forest == lvl)))
  }

  for(nb in 1:length(table(forest))){
    
    cluster.nb <<- nb 
    #verbose(paste("Exploring the cluster generated: ", nb ), 1)
    rm(compare.matrices.table)
    rm(description.table)
    rm(tree)
    rm(motifs.info)
    rm(merge.level.leaves)
    rm(internal.nodes.attributes)
    internal.nodes.attributes <<- list()

    ## Creates an individual folder for each cluster
    system(paste("mkdir -p ", clusters.info.folder, "/cluster_", cluster.nb, sep = ""))
    cluster.folder <<- paste(clusters.info.folder, "/cluster_", cluster.nb, sep = "")

    ## JvH: open an error log file
    error.file <- file.path(cluster.folder, "errors.txt")
    error.log <- file(error.file)

    ## JvH: open a command log file
    command.file <- file.path(cluster.folder, "commands.sh")
    command.log <- file(command.file)

    ## Collect the IDs of each cluster
    ids <- ids.forest[[paste("forest_", nb, sep = "")]]

    ## Skips the hierarchical clustering step if the cluster
    ## is a single node
    if(length(ids) < 2){
      forest.list[[nb]] <- NULL
      forest.list[[paste("cluster_", nb, sep = "")]][[ids]] <- global.motifs.info[[ids]]
      forest.list[[paste("cluster_", nb, sep = "")]][[ids]][["consensus"]] <- gsub("-", "", forest.list[[paste("cluster_", nb, sep = "")]][[ids]][["consensus"]])
      forest.list[[paste("cluster_", nb, sep = "")]][[ids]][["number"]] <- as.numeric(1)
      forest.list[[paste("cluster_", nb, sep = "")]][[ids]][["spacer"]] <- as.numeric(0)
      forest.list[[paste("cluster_", nb, sep = "")]][[ids]][["offset_down"]] <- as.numeric(0)

      ## Crete a JSON file for a trees with a single node
      ## This is required because the cannot be used the hclust to Josn function
      label.single.node <- as.vector(global.description.table[global.description.table$id == ids, ]$label) 
      JSON.single.node <- paste("{\n\"name\": \"\",\n\"children\":[\n{\n \"label\": \"", label.single.node, "\",\n}\n]\n}", sep = "")
      json.file <- paste(out.prefix, "_trees/tree_cluster_", nb,".json", sep="")
      verbose(paste("JSON tree file", json.file), 1)
      writeLines(JSON.single.node, con=json.file)

      ## For consistency, print the empty file
      ## It will be erased later
      JSON.empty <- ";Empty_file\n"
      JSON.clusters.table.file <- paste(sep="", cluster.folder, "/levels_JSON_cluster_", cluster.nb,"_table.tab")
  write.table(JSON.empty, file = JSON.clusters.table.file, sep = "\t", quote = FALSE, row.names = FALSE)

      ## For consistency, Create the folder with the merged consensuses
      system(paste("mkdir -p ", cluster.folder, "/merged_consensuses", sep = ""))
      flag <- system(paste("ls ", cluster.folder, "/merged_consensuses", "/ | wc -l", sep = ""), intern = TRUE)
      if(flag >= 1){
## DEBUG: JvH        system(paste("rm -r ", cluster.folder, "/merged_consensuses", "/*", sep = ""))
      }
      next
    }

    #############################################
    ## Align the internal cluster of they have
    ## more than a single node
    
    ## New comparison table
    compare.matrices.table <<- global.compare.matrices.table[which((global.compare.matrices.table[,"id1"] %in% ids & global.compare.matrices.table[,"id2"] %in% ids)),]
    
    compare.matrices.table$name1 <- as.vector(compare.matrices.table$name1)
    compare.matrices.table$name2 <- as.vector(compare.matrices.table$name2)
    
    ## New description table
    description.table <<- global.description.table[global.description.table[,"id"] %in% ids, ]
    ## names(description.table)[1] <- sub("^X.", "", names(description.table)[1])
    ## In reference to the names, order alphabetically the description table
    description.table <- description.table[order(as.vector(description.table$name)),]
    description.table$n <- 1:length(description.table$n)
    matrix.labels <-  as.vector(description.table$label)
    names(matrix.labels) <- as.vector(description.table$id)
    
    ################################################################
    ## Build a distance matrix from the distance score list, this
    ## distance matrix is required for hclust
    distances.objects <- build.distance.matrix(compare.matrices.table)
    dist.table <- distances.objects[[1]]
    dist.matrix <- distances.objects[[2]]
    

    #############################################
    ## Runs and plot the hierarchical cluster
    tree <<- hclust(dist.matrix, method = hclust.method)
    tree$labels <- as.vector(description.table$label)
    #tree$labels <- paste(as.vector(description.table$consensus), 1:length(description.table$consensus))
    
    ######################################
    ## Creates and parse the json file
    halfway.tree <- hclustToTree(tree)
    jsonTree <- toJSON(halfway.tree)

    ## Fix some little technical issues for JSON compatibility with the tree display javascript
    jsonTree <- gsub("\\],", "\\]", jsonTree, perl = TRUE)
    jsonTree <- paste("{\n\"name\": \"\",\n\"children\":", jsonTree, "}", sep = "")
    jsonTree <- gsub("\n\"order\":\\s+\\d+", "", jsonTree, perl = TRUE)
    ## jsonTree <- gsub(",\\s*\n\\s*}", "\n}", jsonTree, perl = TRUE)
    
    ############################
    ## Prints the .json file 
    json.file <- paste(out.prefix, "_trees/tree_cluster_", nb,".json", sep="")
    verbose(paste("JSON tree file", json.file), 1)
    writeLines(jsonTree, con=json.file)

    motifs.info <<- list()

    ## Saves the nodes clustered on each level of the merge
    merge.levels.leaves <<- leaves.per.node(tree)

    ##################
    ## Produce the table to add the merged
    ## consensuses into the logo tree
    JSON.clusters()

    #########################################
    ## Traversing the tree: read the merge of
    ## the hclust tree and align the leaves.
    ## Bottom-up traversal of the tree to orientate the logos
    for (merge.level in 1:nrow(tree$merge)) {
      alignment.alignment.level <<- 0
      child1 <- tree$merge[merge.level,1]
      child2 <- tree$merge[merge.level,2]
      
      internal.nodes.attributes[[paste("merge_level_", merge.level)]][["merge_level"]] <- merge.level
      internal.nodes.attributes[[paste("merge_level_", merge.level)]][["method"]] <- hclust.method
      
      ########################################
      ## Case 1: merging between two leaves ##
      ########################################
      if ((child1 < 0) && (child2 < 0)) {
        align.two.leaves(child1, child2)
      }
      
      ############################################
      ## Case 2: merging a motif with a cluster ##
      ############################################
      if(((child1 < 0) && (child2 > 0)) || ((child1 > 0) && (child2 < 0))){
        align.leave.and.cluster(child1, child2)
      }
      
      ##########################################
      ## Case 3: merging between two clusters ##
      ##########################################
      if ((child1 > 0) && (child2 > 0)) {
        align.clusters(child1, child2)
      }

      
      ## Create the files with the aligned matrices
      single.mat.files <<- NULL
      merge.consensus.info <<- NULL
      verbose(paste("Merging the matrices of merge level: ", merge.level ), 1)
      aligned.matrices.to.merge(merge.level)

    }
  
  
    ## Fill the downstream end 
    motifs.info <- fill.downstream.forest(motifs.info)
    
    ## Reset the labels
    for(lab.nb in 1:length(tree$labels)){

      ## Add the aligned consensus
      tree$labels[lab.nb] <- paste(motifs.info[[get.id(lab.nb)]][["consensus"]], sep = "   ")
      
      ## Add the new labels
      for(label in labels){
        if(label == "consensus"){
          next
        } else if(label == "id"){
          tree$labels[lab.nb] <- paste(tree$labels[lab.nb], get.id(lab.nb), sep = " ")
        } else if(label == "number"){
          tree$labels[lab.nb] <- paste(tree$labels[lab.nb], lab.nb, sep = " ")
        } else if(label == "strand"){
          tree$labels[lab.nb] <- paste(tree$labels[lab.nb], motifs.info[[get.id(lab.nb)]][["strand"]], sep = " ")
        }
      }
    }
    
    ## for(x in 1:length(tree$labels)){
    ##   tree$labels[x] <- paste(motifs.info[[get.id(x)]][["consensus"]], x, sep = " ")
    ## }

    
    ## Get the aligment width, to calculate the limits of the plot
    alignment.width <- sapply(tree$labels, function(X){
      nchar(X)
    })
    alignment.width <- max(alignment.width)
    mar4 <- alignment.width - 20

    ## Export the tree with the aligment
    plot.format <- "pdf" ## Default for testing inside the loop
    for (plot.format in c("pdf", "png")) {
      ## w.inches <- 10 ## width in inches
      ## h.inches <- 7 ## height in inches
      w.inches <- 15 ## width in inches
      h.inches <- 7 ## height in inches
      resol <- 72 ## Screen resolution
      tree.drawing.file <- paste(sep="", out.prefix, "_consensus_tree_forest_", cluster.nb, ".", plot.format)
      if (plot.format == "pdf") {
        pdf(file=tree.drawing.file, width=w.inches, height=h.inches)
      } else if (plot.format == "png") {
        png(filename=tree.drawing.file, width=w.inches*resol, height=h.inches*resol)
      }
      ## dev.new(width=10, height=7)
      par(mar=c(3,2,2,mar4),family="mono")
      plot(as.dendrogram(tree), horiz=TRUE, main = paste("Aligned consensus tree cluster", cluster.nb, ";labels:" ,paste(labels, collapse = ","), sep = " "))
      dev.off()
    }
    
    forest.list[[paste("cluster_", nb, sep = "")]] <- motifs.info
  }
}


#################################
## Produce the aligment table
if(forest.nb > 1){
  alignment.table <- sapply(forest.list, function(X){
    sapply(X, function(Y){
      return(c(Y[["number"]], Y[["strand"]], Y[["spacer"]], Y[["offset_down"]], Y[["consensus"]]))
    })
  })
} else{
  alignment.table <- lapply(forest.list[[1]], function(X){
    return(c(X[["number"]], X[["strand"]], X[["spacer"]], X[["offset_down"]], X[["consensus"]]))
  })
}
alignment.table <- as.data.frame(t(data.frame(alignment.table)))

## Produce the column ID
temp <- unlist(strsplit(rownames(alignment.table), "cluster_\\d+."))
alignment.table$id <- as.vector(temp[which(temp != "")])

## Produce the column Width
width.tmp <- unlist(sapply(forest.list, function(X){
  sapply(X, function(Y){
    return( nchar(Y[["consensus"]]))
  })
}))
names(width.tmp) <- NULL
alignment.table$width <- width.tmp

## Produce the column Forest_ID
forest.names <- names(forest.list)
forest.id <- vector()
for(name in forest.names){
  forest.id <- append(forest.id, rep(name, length(forest.list[[name]])))
}
alignment.table$forest <- forest.id

##  Re-order the table and export it
alignment.table <- alignment.table[,c(6, 8, 2:4, 7, 5)]
colnames(alignment.table) <- c("#id", "cluster", "strand", "offset_up", "offset_down", "width", "aligned_consensus")
alignment.file <- paste(sep="", out.prefix, "_alignment_table.tab")
write.table(alignment.table, file = alignment.file, sep = "\t", quote = FALSE, row.names = FALSE)

## JvH: Close the error log file
close(command.log)
close(error.log)
