#!/usr/bin/env perl
############################################################
#
# $Id: matrix-clustering,v 1.5 2013/02/19 05:35:16 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

matrix-complexity

=head1 VERSION

$program_version

=head1 DESCRIPTION

Taking as input a set of position-specific scoring matrices, this program
permutes each motif N times and, based on the distribution of comparison scores
(Ncor), estimates whether the motif has a low or high complexity.

=head1 DEPENDENCIES


=head1 AUTHORS

=head2 Implementation

=over

=item Jaime Castro <jcastro@lcg.unam.mx>

=back

=head2 Conception

=over

=item Jaime Castro

=back

=head1 CATEGORY

util

=head1 USAGE

matrix-complexity [-matrix inputfile] [-o outputfile] [-v ] [...]


=head1 OUTPUT FORMAT

=head1 SEE ALSO

=over

=item I<compare-matrices>

The program I<compare-matrices> is used by I<cluster-matrices> to
measure pairwise similarities and define the best alignment (offset,
strand) between each pair of matrices.

=back

=head1 WISH LIST

=cut

BEGIN {
  ## Make the "lib/" directory located next to this script searchable.
  ## When the pattern matches the trailing component of $0, the prematch
  ## variable $` holds everything that precedes it (the leading directory
  ## part of the script path, possibly empty).
  push(@INC, "$`lib/") if ($0 =~ /([^(\/)]+)$/);
}
require "RSA.lib";
require "RSA2.cgi.lib";
use RSAT::util;
use RSAT::matrix;
use RSAT::MatrixReader;
use RSAT::SeqUtil;
use List::MoreUtils qw(uniq);

require "RSA.disco.lib";
require "footprint.lib.pl";
use Data::Dumper;
use File::Basename;
use File::Path;

################################################################
## Main package
package main;
{

  ################################################################
  ## Main program flow
  ##  1) initialise the global parameters and their default values
  ##  2) read and check the command-line arguments
  ##  3) define the output file names and pre-process the input motifs
  ##  4) for each individual motif, generate permuted motifs and compare them
  ##  5) draw the score distributions and report the execution

  ################################################################
  ## Initialise parameters
  local $start_time = &RSAT::util::StartScript();
  $program_version = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

  ## Input / output files
  %main::infile = ();
  %main::outfile = ();
  %main::dir = ();
  %main::param = ();
  @dirs = ();

  %main::matrix_file = ();
  %main::matrix_titles = ();

  $main::verbose = 0;
  $main::out = STDOUT;

  local @tab_to_convert = (); ## Tables to convert to html

  ## Input formats: only accept formats supporting multiple matrices
  local @supported_matrix_formats = qw(transfac tf tab cluster-buster cb infogibbs meme stamp uniprobe);
  local %supported_matrix_format = ();
  foreach my $format (@supported_matrix_formats) {
    $supported_matrix_format{$format} = 1;
  }
  local $supported_matrix_formats = join ",", @supported_matrix_formats;

  ## Counter used to check that option -matrix received its two values
  local $count_input_matrix_parameters = 0;

  ## Store the name of the individual motif files (input and permuted)
  local %individual_motif_info = ();

  ## Default number of permutations
  local $nb_permutation = 1000;


  ################################################################
  ## Ensure the access to the javascript libraries, which are required
  ## to display the results (logo trees, dynamic tables).
  ##
  ## If $include_js_lib is set to 1, the javascript libraries are
  ## included in the output directory, in order to avoid problems with
  ## the links. This solution enables to move the result
  ## (e.g. download a self-contained archive), but costs 500kb of disk
  ## space for each result, we should evaluate alternative solutions.
  ##
  ## The alternative is to point to the libraries on the RSAT server,
  ## but we faced problems on some servers, due to the path
  ## public_html/lib, which includes a soft link (lib ->
  ## ../perl-scripts/lib).
  local $include_js_lib = 1;


  ## Base on which we will build the URLs to the javascripts (D3 +
  ## JQuery libraries, required to display logo trees and tables).
  local $js_base;
  local $d3_base;
  local $d3_venn_base;
  local $jquery_base;
  local $datatable_base;
  local $datatable_css_base;

  ## Concatenate all the files for which a logo will be produced.
  ## This is done to call the system once.
  local $concat_motif_files = "";


  ## Metric parameters
  local @supported_metrics = qw(
                                 cor
                                 Ncor
                             );
  local $supported_metrics = join ",", @supported_metrics;
  local %supported_metrics = ();
  foreach my $met (@supported_metrics) {
    $supported_metrics{lc($met)} = 1;
  }

  ## Matrices
  local @all_matrices = ();

  ## Unrecognized arguments are passed to compare-matrices
  local @args_to_pass = ();
  local $args_to_pass = "";

  ## Use compare-matrices-quick
  local $quick_flag = 0;

  ## Heatmap color palette and classes
  local $heatmap_color_palette = "YlOrRd";
  local $heatmap_color_classes = 9;

  ## Permute input matrices
  local $random_flag = 0;

  ## Flag to detect that the user specified at least one input matrix file
  local $input_matrix = 0;

  ## Limit the number of input motifs
  local $max_matrices = 0;
  local $top_matrices = 0;
  local $skip_matrices = 0;

  ## To export in the results the input file
  local $clone_input_flag = 1;

  ## Lower threshold on column-wise information content: left-most and
  ## right-most columns having a lower IC are trimmed.
  $trim_threshold = 0;

  ## Supported fields to return
  local %return_fields = ();
  local @supported_return_fields = qw (align_consensus heatmap json newick root_matrices nb_clusters);
  local %supported_return_fields = ();
  foreach my $field (@supported_return_fields) {
    $supported_return_fields{$field} = 1;
  }
  local $supported_return_fields = join ",", @supported_return_fields;

  ################################################################
  ## Set default options
  local %param = ();
  local @matrix_compa_metrics = qw(cor Ncor Ncor1 Ncor2 NcorS logoDP logocor Nlogocor Icor NIcor cov dEucl NdEucl NsEucl SSD SW NSW match_rank zscores);
  $param{matrix_compa_metrics} = join(",", @matrix_compa_metrics);
  $param{matrix_compa_sort_field} = "Ncor"; ## sorting field
  $main::param{matrix_compa_score} = "Ncor";
  $param{archive_format} = "zip";
  $param{progressive_synthesis} = 1;
  $param{title} = "matrix-complexity results";

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  &RSAT::message::TimeWarn("Checking parameter values") if ($main::verbose >= 2);

  ################################################################
  ## Input file is mandatory
  unless ($input_matrix) {
    &RSAT::error::FatalError("Input should be specified either as a matrix file (option -matrix): -matrix matrix_title matrix_file");
  }

  ################################################################
  ## Non-recognized parameters are passed to compare-matrices
  if (scalar(@args_to_pass)) {
    $args_to_pass = join (" ", @args_to_pass);
    &RSAT::message::Info("Unrecognized arguments passed to matrix-clustering", $args_to_pass) if ($main::verbose >= 2);
  }

  ################################################################
  ## Check that the output prefix has been specified
  unless ($outfile{prefix}) {
    &RSAT::error::FatalError("You must define the output prefix (option -o).");
  }

  ################################################################
  ## Check output directories and define file names
  &set_output_file_names();


  ################################################################
  ## Pre-process the input motif file(s):
  ## 1) Restrict the number of motifs
  ## 2) Assign unique IDs to each motif
  &pre_process_motif_files();


  ################################################################
  ## Generate the matrix description table
  &Generate_matrix_description_table();

  ################################################################
  ## Produce the logos
  &Produce_logos();

  ## The permutation + comparison + logos will be stored in separated folder
  ## and are processed individually
  foreach my $motif_ID (sort keys %individual_motif_info) {

      ## Create folder for the current motif
      mkdir($individual_motif_info{$motif_ID}{"folder"});

      ################################################################
      ## Generate the random-permuted motifs
      &GenerateXPermutatedMotifs($motif_ID);

      ################################################################
      ## Compare the motifs
      &CompareMatricesQuick($motif_ID);
  }

  ################################################################
  ## Draw plots : Violin plot + Scatterplot + Histograms
  ##
  ## The terminating semicolon is required: without it, this call and the
  ## conditional &Verbose() call below are parsed as a single expression
  ## (bitwise AND of the two calls) guarded by the verbosity test, so the
  ## plots would only be drawn when the verbosity is >= 1.
  &DrawScoreDistributions();

  ################################################################
  ## Create an archive with all result files
  #&Archive(1);

  #######################################
  ## Delete temporal files and folders
  #&Delete_temporal_files();

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose >= 1);

  ################################################################
  ## Close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); print $main::out $exec_time if ($main::verbose >= 1);
  close $main::out if ($main::outfile{prefix});

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message 
sub PrintHelp {
    ## Render the POD embedded in this script with pod2text, then exit.
    ##
    ## The list form of system() hands $0 to pod2text as a single argument
    ## without going through a shell, so a script path containing spaces or
    ## shell metacharacters is displayed correctly.
    system("pod2text", "-c", $0);
    exit();
}

################################################################
## Display short help message
sub PrintOptions {
    ## Delegates to the full help message.
    PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
    ## Parse the command-line options from a copy of @ARGV and store their
    ## values in the package variables initialised by the main block.
    ## Options that are not recognized are collected in @args_to_pass.
    ## Invalid option values are reported with &RSAT::error::FatalError().
    my $arg;
    my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
    while (scalar(@arguments) >= 1) {
        $arg = shift (@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();



=pod

=item B<-matrix matrix_title input_matrix_file>

The input file contains a set of position-specific scoring
matrices.

Example: -matrix Oct_motifs Oct_motifs_peakmotifs.tf

The matrix_title will be concatenated to each motif ID in order to
create unique motif IDs. The collection label is displayed in the results.

This label is useful when two motifs for the same TF come from different
files and the user wants to know which collection each motif comes from.

The option can be used several times, but each matrix_title must be
unique.

B<Supported matrix formats>

Since the program takes several matrices as input, it only accepts
matrices in formats supporting several matrices per file (transfac,
tf, tab, cluster-buster, cb, infogibbs, meme, stamp, uniprobe).

For a description of these formats, see the help of I<convert-matrix>.

=cut
    } elsif ($arg eq "-matrix") {

        ## Count how many of the two expected values were provided
        $count_input_matrix_parameters = 0;
        my $matrix_title = shift(@arguments);

        if ($matrix_title) {
            $count_input_matrix_parameters++;
        }

        ## Substitute special characters which cannot be used inside a file name
        $matrix_title =~ s|\s|_|g;
        $matrix_title =~ s|/|_|g;
        $matrix_title =~ s|:|_|g;

        ## The title is used as hash key: a repeated title would silently
        ## replace the file previously associated with it, so it is refused.
        if ($matrix_title && exists($main::matrix_file{$matrix_title})) {
            &RSAT::error::FatalError($matrix_title, "repeated. The collection label of each file must be unique.");
        }

        $main::matrix_file{$matrix_title} = shift(@arguments);

        if ($main::matrix_file{$matrix_title}) {
            $count_input_matrix_parameters++;
        }

        push @main::matrix_titles, $matrix_title;

        $input_matrix = 1;

        ## A value starting with a dash indicates that the next option was
        ## consumed in place of the title or of the file name
        if ( ($count_input_matrix_parameters != 2) || ($matrix_title =~ /^-/) || ($main::matrix_file{$matrix_title} =~ /^-/) ) {
            &RSAT::error::FatalError("Input should be specified with matrix file (option -matrix) with 2 parameters: matrix_title matrix_file\n\tExample: -matrix Oct_motifs Oct_motifs_peakmotifs.tf");
        }

=pod

=item B<-matrix_format matrix_format>

Specify the input matrix format.


B<Supported matrix formats>

Since the program takes several matrices as input, it only accepts
matrices in formats supporting several matrices per file (transfac,
tf, tab, cluster-buster, cb, infogibbs, meme, stamp, uniprobe).

For a description of these formats, see the help of I<convert-matrix>.

=cut
    } elsif ($arg eq "-matrix_format") {
      $main::matrix_format = shift(@arguments);
      unless ($supported_matrix_format{$matrix_format}) {
        &RSAT::error::FatalError($matrix_format, "Invalid format for input matrices\tSupported: ".$main::supported_matrix_formats);
      }

=pod

=item	B<-title title>

Title displayed on top of the report page.

=cut
     } elsif ($arg eq "-title") {
      $main::param{title} = shift(@arguments);
      $main::param{title} =~ s/\s+/_/g;


=pod

=item	B<-perm nb_permutations>

Number of column randomly-permuted motifs generated for each input motif.

Default: 1000 permutations

=cut

     } elsif ($arg eq "-perm") {

      $nb_permutation = shift(@arguments);

      &RSAT::error::FatalError($nb_permutation, "Invalid value for option -perm: must be a natural number")
         unless (&IsNatural($nb_permutation));

=pod

=item	B<-o output_prefix>

Prefix for the output files.

Mandatory option: since the program I<cluster-matrices> returns a
list of output files (pairwise matrix comparisons, matrix clusters).

=cut
    } elsif ($arg eq "-o") {
      $main::outfile{prefix} = shift(@arguments);

=pod

=item B<-task tasks>

Specify one or several tasks to be run. If this option is not
specified, all the tasks are run.

Note that some tasks depend on others. This option should thus be
used with caution, by advanced users only.

Supported tasks: (all, comparison, clustering, report)

=over

=item B<all>

Execute all the parts of the program (default)

=item B<comparison>

Run the motif comparison step. The input set of motifs are compared
against themselves. The output is the pairwise comparison between the input motifs
and a description table showing the main features of each motif (name, id, consensus, width).

=item B<clustering>

Skip the matrix comparison step and only executes the clustering step.

Assumes the users already have the description table and comparison table
exported from the program I<compare-matrices>.

This option is ideal to save time once all comparisons between the input motifs have been done.

=back

=cut

       } elsif ($arg eq "-task") {
         ## NOTE(review): %supported_tasks and $supported_tasks are not
         ## initialised in the visible part of the main block; if they are
         ## not defined elsewhere, every task is rejected here -- TODO confirm.
         $arg = shift (@arguments);
         chomp($arg);
         my @tasks = split ",", $arg;
         %selected_tasks = ();
         foreach my $task (@tasks) {
           $task = lc($task);
           if ($supported_tasks{$task}) {
             $selected_tasks{$task} = 1;
           } else {
             &RSAT::error::FatalError(join("\t", $task, "Invalid tasks. Supported:", $supported_tasks));
           }
         }


=pod

=item	B<-max_matrices X>

This option specifies how many matrices can be clustered in the same
analysis. If there are more matrices than the specified number, the
program restricts the analyses to the first X matrices, and issues a
warning.

This parameter can be useful to prevent submission of excessive
datasets to the Web server, or for running quick tests before starting
the analysis of a large matrix collection.

=cut
    } elsif ($arg eq "-max_matrices") {
      $max_matrices = shift(@arguments);

      ## Same validation as for -top_matrices and -skip_matrices
      &RSAT::error::FatalError($max_matrices, "Invalid value for option -max_matrices: must be a natural number")
         unless (&IsNatural($max_matrices));


=pod

=item B<-top_matrices X>

Only analyze the first X motifs of the input file. This option is
convenient for quick testing before starting the full analysis.

If several motif files are specified, the selection of top motifs is
performed independently for each motif collection (the max number of
motifs will thus be X * the number of input files).

=cut
    } elsif ($arg eq "-top_matrices") {
      $top_matrices = shift(@arguments);

      &RSAT::error::FatalError($top_matrices, "Invalid value for option -top_matrices: must be a natural number")
         unless (&IsNatural($top_matrices));

=pod

=item B<-skip_matrices X>

Skip the first X motifs of the input file. This option is convenient
for testing the program on a subset of the motifs before starting the
full analysis.

If several motif files are specified, the option is applied to each
file independently.

=cut
    } elsif ($arg eq "-skip_matrices") {
      $skip_matrices = shift(@arguments);
      &RSAT::error::FatalError($skip_matrices, "Invalid value for option -skip_matrices: must be a natural number")
         unless (&IsNatural($skip_matrices));

=pod

=item B<-return return_fields>

List of fields to return.

Supported fields:

 heatmap,json,newick,root_matrices

=over

=item B<clone_input:> Copy input file.

When this field is selected, the input motif database is copied
and exported in the results folder.

NOTE: take into account the input file size.

=item B<heatmap:> Heatmap with similarities.

When this field is selected, exports a heatmap showing the
similarities, the clusters and the hierarchical tree of the
input motifs.

The heatmap is exported in JPEG and PDF format.

We recommend to use this option when the number of motifs is
lower than 300.

=item B<json:> Hierarchical tree in JSON format.

File format used for D3 library to visualize the logo forest in HTML.

The hierarchical tree in JSON format is always exported,
since it is required to display the logo tree with the d3 library.

=item B<newick:> Hierarchical tree in newick format.

When this field is specified, the hierarchical tree is converted
and exported in Newick format, a widely used text format to represent
phylogenetic trees.

=item B<root_matrices:> Return only the root motif of each cluster.

When this field is specified, matrix-clustering runs the minimal
analysis and return a text file with the root motifs of each
cluster.

This option is useful when the user wants to explore the data and
to avoid the computation of the visual elements.

=back

=cut
    } elsif ($arg eq "-return") {
      $arg = shift (@arguments);
      chomp($arg);
      my @fields_to_return = split ",", $arg;
      foreach my $field (@fields_to_return) {
        $field = lc($field);

        if ($supported_return_fields{$field}) {
          $return_fields{$field} = 1;

          ## NOTE(review): "clone_input" is not part of the supported return
          ## fields declared in the main block, so this test looks
          ## unreachable -- TODO confirm.
          if ($return_fields{clone_input}) {
              $clone_input_flag = 1;
          }

          if ($return_fields{align_consensus}) {
              $align_consensus = 1;
          }

          if ($return_fields{heatmap}) {
              $draw_heatmap = 1;
          }

          if ($return_fields{newick}) {
              $export_newick = 1;
          }

          if ($return_fields{root_matrices}) {
              $root_matrices_flag = 1;
              $clone_input_flag = 0;
          }

        } else {
          &RSAT::error::FatalError(join("\t", $field, "Invalid return field. Supported:", $supported_return_fields));
        }
      }

    ## Additional arguments are passed to compare-matrices
    } else {
        ## Quote the arguments containing spaces so that they remain a single
        ## word once joined in a command string
        if ($arg =~ /\s/) {
            push @args_to_pass, "'".$arg."'";
        } else {
            push @args_to_pass, $arg;
        }
    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
  ## Report the command line, the program version, the number of matrices
  ## and the input files, output files and directories on $main::out.

  ## Print one titled section listing the key/value pairs of a hash.
  ## Nothing is printed when the hash is empty.
  my $report_section = sub {
    my ($section_title, $entries) = @_;
    return unless (%{$entries});
    print $main::out "; ".$section_title."\n";
    foreach my $label (keys %{$entries}) {
      printf $main::out ";\t%-28s\t%s\n", $label, $entries->{$label};
    }
  };

  ## Command line and version
  print $main::out "; matrix-complexity ";
  &PrintArguments($main::out);
  printf $main::out "; %-22s\t%s\n", "Program version", $program_version;

  $report_section->("Input files", \%main::infile);

  printf $main::out  "; %-28s\t%s\n", "Number of matrices", scalar(@all_matrices);

  $report_section->("Output files", \%main::outfile);
  $report_section->("Directories", \%main::dir);
}


################################################################
## Set output file names + check output directories
sub set_output_file_names {
  ## Derive all the output file names from $main::outfile{prefix}, create
  ## the output directories, open the log and error streams, and define the
  ## paths to the javascript/CSS libraries.

  ## Create output dir if required
  my $basename;
  ($dir{output}, $basename) = &RSAT::util::SplitFileName($main::outfile{prefix});
  $dir{output} = "." if ($dir{output} eq "");
  &RSAT::util::CheckOutDir($dir{output});
  push @dirs, "output";

  ## Create output directories: <prefix>_tables, <prefix>_html, <prefix>_logos
  local @output_folders = qw( tables html logos);
  foreach my $f (@output_folders) {
      my $new_f = $main::outfile{prefix}."_".$f;
      &RSAT::util::CheckOutDir($new_f);
  }

  ################################################################
  ## Specify output file names and open output stream

  ## Log file should contain the trace of all commands
  $main::outfile{log} = $main::outfile{prefix}."_log.txt"; push @outfiles, "log";
  $main::out = &OpenOutputFile($main::outfile{log});

  ## Error file should contain only errors
  $main::outfile{err_log} = $main::outfile{prefix}."_errors.txt"; push @outfiles, "err_log";
  $main::err = &OpenOutputFile($main::outfile{err_log});

  ## Specific log file for R commands
  $main::outfile{Rlog} = $main::outfile{prefix}."_Rlog.txt"; push @outfiles, "Rlog";

  ## Converted matrices (required for compare-matrices-quick, and to
  ## restrict the number of matrices with option -max_matrices).
  $main::outfile{input_matrices} = $main::outfile{prefix}."_input_motifs.tf"; push @outfiles, "input_matrices";

  ## Pairwise comparisons between matrices
  $main::outfile{pairwise_compa} = $main::outfile{prefix}."_tables/pairwise_compa.tab"; push @outfiles, "pairwise_compa";
  $main::outfile{pairwise_compa_html} = $main::outfile{prefix}."_html/pairwise_compa.html";  push @outfiles, "pairwise_compa_html";

  ## Individual matrix descriptions
  $main::outfile{matrix_descriptions} = $main::outfile{prefix}."_tables/pairwise_compa_matrix_descriptions.tab"; push @outfiles, "matrix_descriptions"; &AddTabToConvert("matrix_descriptions");
  $main::outfile{matrix_descriptions_html} = $main::outfile{prefix}."_html/pairwise_compa_matrix_descriptions.html";  push @outfiles, "matrix_descriptions_html";

  ## Html report
  $main::outfile{html_report_template} = $ENV{RSAT}."/public_html/templates_html/matrix_complexity_template.html";
  $main::outfile{html_report} = $main::outfile{prefix}."_report.html";

  ## Define the base directories for the javascripts, which are
  ## required to display logo trees, animations and dynamic tables
  if ($include_js_lib) {
      ## Copy the libraries in the output directory
      &RSAT::message::Debug("Copying javascript libraries to output directory") if ($main::verbose >= 3);
      my $js_sync_cmd = "rsync -ruptl";
      $js_sync_cmd .= " ".$ENV{RSAT}."/perl-scripts/lib/js";
      $js_sync_cmd .= " ".$dir{output};
      &doit($js_sync_cmd, 0, 1, $verbose, 0, "", $main::out, $main::err);
      $js_base = $dir{output}."/js";
      &RSAT::message::Info("Javascript libraries copied to output directory", $js_base) if ($main::verbose >= 3);
  }  elsif (defined($ENV{RSA_OUTPUT_CONTEXT}) && ($ENV{RSA_OUTPUT_CONTEXT} eq "cgi")) {
      $js_base = $ENV{rsat_www}."/lib/js";
  } else {
      ## NOTE from JvH to Jaime: This solution does not allow to see the
      ## result on another computer than the RSAT server on which
      ## matrix-clustering ran.
      $js_base = $ENV{RSAT}."/perl-scripts/lib/js";
  }

  ## File relative to which the library paths are expressed.
  ## $main::outfile{summary} is not assigned in this subroutine; when it is
  ## undefined, the HTML report defined above is used instead.
  ## NOTE(review): presumably the report is the page loading these
  ## libraries -- confirm against the report generation code.
  my $referring_file = defined($main::outfile{summary}) ? $main::outfile{summary} : $main::outfile{html_report};
  $d3_base= &RSAT::util::RelativePath($referring_file, $js_base."/d3.v3.min.js");
  $d3_venn_base= &RSAT::util::RelativePath($referring_file, $js_base."/venn.js-master/venn.js");
  $jquery_base = &RSAT::util::RelativePath($referring_file, $js_base."/DataTables-1.10.4/media/js/jquery.js");
  $datatable_base = &RSAT::util::RelativePath($referring_file, $js_base."/DataTables-1.10.4/media/js/jquery.dataTables.min.js");
  $datatable_css_base = &RSAT::util::RelativePath($referring_file, $js_base."/DataTables-1.10.4/media/css/jquery.dataTables.min.css");

  ## Archive to facilitate transfer of all the results in a zip file
  $main::outfile{archive} = $main::outfile{prefix}."_archive.zip";

  &RSAT::message::Debug("Archive", $main::outfile{archive}) if ($main::verbose >= 3);

  ## Name of the HTML index file
  $main::outfile{html_index} = $main::outfile{prefix}."_html/index.html";
  push @outfiles, "html_index";
}


######################################################
## Pre-process the input motifs in order to create
## unique IDs and concatenate the input motifs from 
## different collections (file) in a single one. 
sub pre_process_motif_files{
  ## Pre-process the motif collections specified with option -matrix:
  ##   - load the matrices of each collection (stored in @all_matrices)
  ##   - convert each collection to transfac format (applying -top_matrices
  ##     and -skip_matrices if specified)
  ##   - prefix each motif ID with the collection name
  ##   - concatenate all the processed collections in a single file
  ##   - split this file in individual files, each with a single motif

  ## Count the number of motif sets
  my $motif_set_counter = 0;

  ## This variable (previously optional now mandatory to display the title in the trees)
  $display_collection_name_flag = 1;

  ## Count the number of input files
  $number_of_collections = scalar(keys(%main::matrix_file));

  ## Directories to store the pre-processed motif files and the results
  $main::dir{data} = $main::outfile{prefix}."_data";
  &RSAT::util::CheckOutDir($dir{data});

  $main::dir{results} = $main::outfile{prefix}."_results";
  &RSAT::util::CheckOutDir($dir{results});


  ## Assign the title, matrix format and matrix file path to a hash
  foreach my $k (keys %main::matrix_file){

      my $collection_name = $k;

      ## The collection labels must be unique or the program will die
      if (exists($motif_set_attributes{$collection_name}{collection_label})){
          &RSAT::error::FatalError($motif_set_attributes{$collection_name}{collection_label}, " repeated. The collection label of each file must be unique.");
      }

      $motif_set_attributes{$collection_name}{collection_label} = $collection_name;
      $motif_set_attributes{$collection_name}{matrix_format} = $matrix_format;
      $motif_set_attributes{$collection_name}{matrix_file_input} = $main::matrix_file{$collection_name};

      &RSAT::message::Info("Matrix file", $matrix_format, $collection_name, $main::matrix_file{$collection_name}) if ($main::verbose >= 0);

      $motif_collection_count{$collection_name} = 0;
      $motif_set_counter++;
  }

  &RSAT::message::Info("Read list of",scalar(keys(%motif_set_attributes)),
                       "matrix files") if ($main::verbose >= 2);

  ## Pre-process each motif collection
  foreach my $collection_name (keys(%motif_set_attributes)) {

      $motif_collection_count{$collection_name} = 0;

      ## Get the format and the file of the current collection
      my $matrix_format = $motif_set_attributes{$collection_name}{matrix_format};
      my $matrix_file = $motif_set_attributes{$collection_name}{matrix_file_input};

      ################################################################
      ## Collect all matrices
      @matrices = &RSAT::MatrixReader::readFromFile($matrix_file, $matrix_format);
      push(@all_matrices, @matrices);
      &RSAT::message::TimeWarn(scalar(@matrices), "Matrices loaded from file", $matrix_file) if ($main::verbose >= 2);

      ################################################################
      ## Check matrix number (required for verbose + if max matrices specified).
      if ($max_matrices > 0) {
          &RSAT::message::TimeWarn("Checking max number of input matrices (".$max_matrices.")") if ($main::verbose >= 2);
          &RSAT::message::TimeWarn(scalar(@matrices), "Matrices loaded from file", $matrix_file)
              if ($main::verbose >= 2);
          &RSAT::message::TimeWarn(scalar(@all_matrices), "Matrices loaded in total")
              if ($main::verbose >= 2);

          if (scalar(@all_matrices) > $max_matrices) {
              &RSAT::message::Warning("Input files contain ".scalar(@all_matrices)." matrices.",
                                      "The analysis will be limited to the ".$max_matrices." first matrices (option -max_matrices).");

              ## Array indices start at 0: the first $max_matrices matrices
              ## are those at indices 0 to $max_matrices - 1.
              @all_matrices = @all_matrices[0..($max_matrices - 1)];
          }
      }

      ## Names of the processed files of the current collection:
      ## "processed_1" is the conversion to transfac, "processed" the same
      ## motifs with the collection name prefixed to each matrix ID.
      $motif_set_attributes{$collection_name}{matrix_file_processed_1} = $main::dir{data}."/".$collection_name."_input_motifs_processed_1.tf";
      $motif_set_attributes{$collection_name}{matrix_file_processed} = $main::dir{data}."/".$collection_name."_input_motifs_processed.tf";

      ## Convert input matrices in transfac format
      &RSAT::message::TimeWarn("Converting input matrices") if ($main::verbose >= 2);
      my $convert_matrix_cmd = $SCRIPTS."/convert-matrix";
      $convert_matrix_cmd .= " -i ".$motif_set_attributes{$collection_name}{matrix_file_input};
      $convert_matrix_cmd .= " -from ".$matrix_format;
      $convert_matrix_cmd .= " -to transfac";
      $convert_matrix_cmd .= " -top ".$top_matrices if ($top_matrices > 0);
      $convert_matrix_cmd .= " -skip ".$skip_matrices if ($skip_matrices > 0);
      $convert_matrix_cmd .= " -o ".$motif_set_attributes{$collection_name}{matrix_file_processed_1};
      &doit($convert_matrix_cmd, 0, 1, $verbose, 0, "", $main::out, $main::err);

      ##############################################
      ## Index the original motif IDs (AC lines) by their order in the file
      if($ID_link_flag == 1){

          my $m_counter = 0;
          my ($motif_file) = &OpenInputFile($motif_set_attributes{$collection_name}{matrix_file_processed_1});
          while(<$motif_file>){

              next if (/^#/); ## Skip header line
              next if (/^;/); ## Skip comment lines
              next unless (/\S/); ## Skip empty lines
              next unless (/^AC\s+/);
              chomp();
              my($null, $AC_unique) = split(/\s+/, $_);
              $m_counter++;
              $ID_nb_hash{$AC_unique} = $m_counter;
          }
          close($motif_file);
      }

      ###############################
      ## Prefix the motif IDs with the collection name
      &RSAT::message::TimeWarn("Converting input matrices") if ($main::verbose >= 2);
      $convert_matrix_cmd = $SCRIPTS."/convert-matrix";
      $convert_matrix_cmd .= " -i ".$motif_set_attributes{$collection_name}{matrix_file_processed_1};
      $convert_matrix_cmd .= " -from tf";
      $convert_matrix_cmd .= " -to transfac";
      $convert_matrix_cmd .= " -prefix_id ".$collection_name;
      $convert_matrix_cmd .= " -o ".$motif_set_attributes{$collection_name}{matrix_file_processed};
      &doit($convert_matrix_cmd, 0, 1, $verbose, 0, "", $main::out, $main::err);

      ########################################
      ## Save the unique ID in a hash table
      ## Key = ID unique
      ## Value = title (motif collection)
      my ($processed_matrix_file) = &OpenInputFile($motif_set_attributes{$collection_name}{matrix_file_processed});
      $m_counter = 0;
      while(<$processed_matrix_file>){
          next if (/^#/); ## Skip header line
          next if (/^;/); ## Skip comment lines
          next unless (/\S/); ## Skip empty lines
          next unless (/^AC\s+/);
          chomp();
          my($null, $AC_unique) = split(/\s+/, $_);
          $motifs_ID_unique{$AC_unique} = $collection_name;
          $alignment_info{$AC_unique}{file_name} = $main::outfile{prefix}."_data/motif_".$AC_unique.".tf";

          if($ID_link_flag == 1){
              $m_counter++;
              $new_ID_nb_hash{$m_counter} = $AC_unique;
          }
      }
      close($processed_matrix_file);

      ##############################################
      ## Store the motif link in a hash: the link given for an original ID
      ## is associated to the prefixed ID found at the same position.
      if($ID_link_flag == 1){

          my ($link_table) = &OpenInputFile($main::infile{ID_link_table});
          while(<$link_table>){

              next if (/^#/); ## Skip header line
              next if (/^;/); ## Skip comment lines
              next unless (/\S/); ## Skip empty lines
              chomp();
              my($ID, $link) = split(/\s+/, $_);
              $OLD_ID_link_hash{$ID} = $link;
              $ID_link_hash{$new_ID_nb_hash{$ID_nb_hash{$ID}}} = $OLD_ID_link_hash{$ID};
          }
          close($link_table);
      }
  }

  ################################################################
  ## Concatenate all the processed input files in a single one.
  ## Rename the motif files.
  $main::outfile{motif_file} = $main::outfile{prefix}."_data/input_motifs_processed.tf";
  my $concatenate_processed_files_cmd = "cat ";
  foreach my $t (keys %motif_set_attributes){
      $concatenate_processed_files_cmd .=  $motif_set_attributes{$t}{matrix_file_processed}." ";
  }
  $concatenate_processed_files_cmd .= " > ".$main::outfile{motif_file};
  &doit($concatenate_processed_files_cmd, 0, 1, $verbose, 0, "", $main::out, $main::err);


  $main::outfile{input_matrices} = $main::outfile{motif_file};

  ################################################################
  ## Split all the processed motifs into single file
  ## each one containing a single motif
  &RSAT::message::TimeWarn("Split the input file in individual files each with a single motif") if ($main::verbose >= 2);
  my $split_matrix_cmd = $SCRIPTS."/convert-matrix";
  $split_matrix_cmd .= " -i ".$main::outfile{motif_file};
  $split_matrix_cmd .= " -split";
  $split_matrix_cmd .= " -from tf -to tf";
  $split_matrix_cmd .= " -o ".$main::outfile{prefix}."_data/motif";
  &doit($split_matrix_cmd, 0, 1, $verbose, 0, "", $main::out, $main::err);


}

################################################################
## Random permutation of PSSM columns
## Generates X randomly-permuted PSSMs
sub GenerateXPermutatedMotifs{

    ## Argument: the ID of the motif to permute. The input file, the
    ## output file and the number of permutations are read from the
    ## %individual_motif_info table.
    my ($ID) = @_;

    &RSAT::message::TimeWarn("Generating permuted motifs for $ID : ") if ($main::verbose >= 2);

    my $motif_query_file = $individual_motif_info{$ID}{"file"};
    my $motif_permuted_file = $individual_motif_info{$ID}{"permutations"};

    ## Assemble the convert-matrix command that writes the permuted
    ## versions of the query motif (option -perm) in a single file.
    my $randomize_motifs_cmd = join("",
				    $SCRIPTS."/convert-matrix",
				    " -i ", $motif_query_file,
				    " -from tf -to tf",
				    " -perm ", $individual_motif_info{$ID}{"nb_permutations"},
				    " -o ", $motif_permuted_file);
    &doit($randomize_motifs_cmd, 0, 1, $verbose, 0, "", $main::out, $main::err);

    &RSAT::message::Info("Randomly-permuted motifs generated - ID ", $ID, $motif_permuted_file) if ($main::verbose >= 3);
}

################################################################
## Compare one motif against its randomly-permuted versions
## (uses the program compare-matrices-quick).
sub CompareMatricesQuick {

    ## Argument: the ID of the query motif. The query file, the file
    ## with the permuted motifs and the output comparison table are
    ## read from the %individual_motif_info table.
    my ($ID) = @_;
    my $motif_query_file = $individual_motif_info{$ID}{"file"};
    my $motif_permuted_file = $individual_motif_info{$ID}{"permutations"};
    my $motif_comparison_file = $individual_motif_info{$ID}{"comparison"};

    &RSAT::message::TimeWarn("Motif comparison $ID motifs against permuted motifs: ") if ($main::verbose >= 2);

    ## Pairwise comparison between the query matrix (file1) and its
    ## permuted versions (file2). All the lower thresholds are set to
    ## their most permissive value (-1 for the correlations, 0 for the
    ## width).
    ## This is the faster C implemented version of compare-matrices.
    $matrix_compa_verbose = 0;
    my $BIN = $ENV{RSAT}."/bin";

    ## Run the pairwise matrix comparison
    &RSAT::message::TimeWarn("Pairwise matrix comparison. Quick version.") if ($main::verbose >= 3);
    my @cmd_parts = (
	$BIN."/compare-matrices-quick -v ".$matrix_compa_verbose,
	" -file1 ".$motif_query_file,
	" -file2 ".$motif_permuted_file,
	" -lth_ncor1 -1",
	" -lth_ncor2 -1",
	" -lth_ncor -1",
	" -lth_cor -1",
	" -lth_w 0",
	" -mode matches",
	" -o ".$motif_comparison_file,
	);
    my $cmd = join("", @cmd_parts);
    &RSAT::util::one_command($cmd, 1,"");
    &RSAT::message::TimeWarn("Matrix comparison table:", $ID, $motif_comparison_file) if ($main::verbose >= 2);

    return();
}

################################################################
## Index a table to convert to HTML
sub AddTabToConvert {
  ## Argument: a key of %main::outfile pointing to a tab-delimited file.
  my ($key) = @_;

  ## Derive the name of the HTML file from the name of the table:
  ## the suffix "_html" is appended, then a ".tab" or ".txt"
  ## extension followed by that suffix is collapsed into ".html".
  (my $html_file = $main::outfile{$key}."_html") =~ s/\.tab.html$/\.html/;
  $html_file =~ s/\.txt.html$/\.html/;

  ## Register the HTML file under the key "<key>_html"
  my $html_key = $key."_html";
  $main::outfile{$html_key} = $html_file;
  push @outfiles, $html_key;

  ## Add the key to the list to be converted
  push @tab_to_convert, $key;
}

################################################################
## Convert tab-delimited files in HTML
sub ConvertTabToHTML {
  ## Arguments: the list of %main::outfile keys of the tables to
  ## convert. For each key, the HTML file name is read from the
  ## entry "<key>_html" of %main::outfile.
  my @table_keys = @_;
  &RSAT::message::TimeWarn("Converting tab-delimited to HTML files") if ($main::verbose >= 2);
  foreach my $table_key (@table_keys) {
    my $tab_file = $main::outfile{$table_key};
    my $html_file = $main::outfile{$table_key."_html"};

    ## Run text-to-html on this table
    my $convert_cmd = join("",
			   $SCRIPTS."/text-to-html ",
			   " -i ", $tab_file,
			   " -o ", $html_file);
    &doit($convert_cmd, 0, $die_on_error, $verbose, 0, "", $main::out, $main::err);
  }
}


##############################################################
## When the option -top is used the analysis is restricted
## to the first X motifs. However, the input motif file is
## still split into individual files, and not all of them
## are used. This function deletes the files and folders
## registered in %to_delete and %to_delete_folder.
sub Delete_temporal_files {

  ## Remove each file registered in %to_delete
  ## (the hash values are the file paths).
  foreach my $file_path (values %to_delete) {
    unlink($file_path);
  }

  ## Remove each folder registered in %to_delete_folder,
  ## together with its content.
  foreach my $folder_path (values %to_delete_folder) {
    rmtree($folder_path);
  }

}

################################################################
## Generate a compressed archive with all the results
sub Archive {
  ## Build and run the shell command that packs the results in the
  ## archive file $main::outfile{archive}.
  ##
  ## Arguments:
  ##   $remove_first  if true, the previous version of the archive is
  ##                  deleted (rm -f) before the new one is created
  ##   $to_archive    file or folder to archive
  ##                  (default: $main::dir{output})
  ##
  ## The archive format is read from $main::param{archive_format}
  ## (zip, tar or tgz). Any other value triggers a fatal error.

  my ($remove_first, $to_archive) = @_;
  &RSAT::message::TimeWarn("\n; Archiving data and results") if ($main::verbose >= 2);

  ## The command has to be declared BEFORE the first piece is appended
  ## to it. It was previously declared after the "rm -f" prefix had
  ## been appended, so the prefix went to the package variable $cmd
  ## and the old archive was never removed.
  my $cmd = "";

  ## Delete previous version of the archive to avoid including the old archive in the new one
  $cmd .= "rm -f ".$main::outfile{archive}."; " if ($remove_first);

  ################################################################
  ## Define archiving parameters

  ## By default, archive the whole output directory
  $to_archive = $main::dir{output} unless $to_archive;

  ## Archive all data and results. The paths are expressed relative
  ## to the parent of the folder that contains the archive.
  my ($archive_dir, $archive) = &SplitFileName($main::outfile{archive});
  my ($archive_dir_dir, $archive_dir_base) = &SplitFileName($archive_dir);
  my $to_archive_rel_path = &RSAT::util::RelativePath($archive_dir_dir, $to_archive);
  my $archive_rel_path = &RSAT::util::RelativePath($archive_dir_dir, $main::outfile{archive});

  ## TO CHECK (Jacques)
  if ($main::verbose >= 3) {
    &RSAT::message::Debug("folder to archive", $to_archive);
  }

  ## Avoid crash of the program when run from the "$archive_dir_dir"
  if ($archive_dir_dir eq "") {
    $archive_dir_dir = ".";
    $to_archive_rel_path = $to_archive;
    $archive_rel_path = $main::outfile{archive};
  }

  if ($main::param{archive_format} eq "zip") {
    ## zip is run from the parent folder, and the archive itself is
    ## excluded from its own content
    $cmd .= "(cd ".$archive_dir_dir." ; ";
    $cmd .= " zip -ryq ".$archive_rel_path." ".$to_archive_rel_path;
    $cmd .= " -x ".$archive;
    $cmd .= ")";
  } elsif (($main::param{archive_format} eq "tar") ||
	   ($main::param{archive_format} eq "tgz")) {
    $cmd .= "tar -cpf ".$main::outfile{archive};
    $cmd .= " -z" if ($main::param{archive_format} eq "tgz");
    $cmd .= " -C ".$archive_dir_dir; ## Avoid including the whole path in the archive paths
    $cmd .= " --exclude ".$archive;
    $cmd .= " ".$to_archive_rel_path;
  } else {
    &RSAT::error::FatalError($main::param{archive_format}, "Invalid archive format. Supported: zip, tar, tgz.");
  }

  &one_command($cmd, 1);

  &RSAT::message::TimeWarn("Archive", $main::outfile{archive}) if ($main::verbose >= 2);
}

####################################################
## This function calls the program convert-matrix
## to generate the logos (PNG format) of the input matrices.
sub Produce_logos() {

    &RSAT::message::TimeWarn("Generating Logos") if ($main::verbose >= 2);

    ## Run convert-matrix on the processed input matrices, asking
    ## for logos without title, in PNG format, stored in the folder
    ## "<prefix>_logos/".
    my $logo_cmd = join("",
			$SCRIPTS."/convert-matrix -i ", $main::outfile{input_matrices},
			" -from tf -to tf -return logo -logo_no_title",
			" -logo_format png ",
			" -logo_dir ", $main::outfile{prefix}, "_logos/");
    &doit($logo_cmd, 0, $die_on_error, $verbose, 0, "", $main::out, $main::err);
}

################################################################
## Produce a table with motif information
sub Generate_matrix_description_table() {

    ################################################################
    ## Read the processed input matrices (stored in the package
    ## variable @matrices) and open the description table.
    @matrices = &RSAT::MatrixReader::readFromFile($main::outfile{input_matrices}, "tf");
    &RSAT::message::TimeWarn("Generating description table for the input matrices") if ($main::verbose >= 2);
    my $table_handle = &OpenOutputFile($main::outfile{matrix_descriptions});

    ## Print header
    my @column_names = ("#n",
			"id",
			"name",
			"width",
			"nb_permutations",
			"consensus",
			"rc_consensus",
			"IC",
			"File",
			"File_permuted",
			"Folder",
			"Comparison",
			"Logo",
			"Logo_RC");
    print $table_handle join("\t", @column_names)."\n";

    ## One row per matrix; the first column is the rank of the matrix
    ## in the input file.
    my $rank = 0;
    foreach my $matrix (@matrices) {
	$rank++;

	## Consensus on both strands
	$matrix->calcConsensus();
	my $consensus = $matrix->get_attribute("consensus.IUPAC");
	my $consensus_rc = $matrix->get_attribute("consensus.IUPAC.rc");

	## Calculates the IC of the matrix
	$matrix->calcInformation();
	$matrix->toString(col_width=>5,
			  decimals=>1,
			  type=>"information",
			  format=>"tf");
	my $ic = $matrix->get_attribute("total.information");
	my $rounded_ic = sprintf("%.2f", $ic);

	## Matrix ID (package variable) and matrix width
	$mid = $matrix->get_attribute("id");
	my $width = $matrix->get_attribute("ncol");

	## File containing this single matrix
	my $single_mat_file = $main::outfile{prefix}."_data/motif_".$mid.".tf";

	## Store the attributes of the motif in %individual_motif_info.
	## The number of permutations is the 4th power of the width.
	my $results_dir = $main::dir{results}."/".$mid;
	my $logo_prefix = $main::outfile{prefix}."_logos/".$mid;
	my $info = \%{ $individual_motif_info{$mid} };
	$info->{"name"} = $matrix->get_attribute("name");
	$info->{"ncol"} = $width;
	$info->{"nb_permutations"} = $width ** 4;
	$info->{"consensus"} = $consensus;
	$info->{"consensus_rc"} = $consensus_rc;
	$info->{"IC"} = $rounded_ic;
	$info->{"file"} = $single_mat_file;
	$info->{"permutations"} = $results_dir."/".$mid."_permuted_motifs.tf";
	$info->{"folder"} = $results_dir;
	$info->{"comparison"} = $results_dir."/".$mid."_motif_comparison_table.tab";
	$info->{"logo"} = $logo_prefix."_logo.png";
	$info->{"logo_rc"} = $logo_prefix."_logo_rc.png";

	## Paths written in the table, relative to the HTML report:
	## matrix file, permuted motifs, result folder, comparison
	## table, logo and reverse-complement logo.
	my $report_results_dir = $main::outfile{prefix}."_results/".$mid;
	my @report_paths = map {
	    scalar(&RSAT::util::RelativePath($main::outfile{html_report}, $_))
	} ($single_mat_file,
	   $report_results_dir."/".$mid."_permuted_motifs.tf",
	   $report_results_dir,
	   $report_results_dir."/".$mid."_motif_comparison_table.tab",
	   $logo_prefix."_logo.png",
	   $logo_prefix."_logo_rc.png");

	## Print the row
	print $table_handle join("\t",
				 $rank,
				 $matrix->get_attribute("id"),
				 $matrix->get_attribute("name"),
				 $matrix->get_attribute("ncol"),
				 $info->{"nb_permutations"},
				 $consensus,
				 $consensus_rc,
				 $rounded_ic,
				 @report_paths)."\n";
    }
    close $table_handle;
}

################################################################
## Call an R script that produces the plots
sub DrawScoreDistributions(){

    &RSAT::message::TimeWarn("Generating plots") if ($main::verbose >= 2);

    ##################################
    ### Identify the path of the R executable
    my $r_path = &RSAT::server::GetProgramPath("R");

    ## The R script must be readable, otherwise the program stops
    my $matrix_complexity_script  = $ENV{RSAT}."/R-scripts/matrix_complexity.R";
    unless (-r $matrix_complexity_script) {
	&RSAT::error::FatalError("Cannot read the motif complexity script", $matrix_complexity_script);
    }
    $r_verbosity = &RSAT::stats::max(($main::verbose-1), 0);

    ## The script is sent to R through a pipe; the input files and the
    ## prefix are passed as R assignments after --args.
    my $matrix_complexity_cmd = join("",
				     " cat ", $matrix_complexity_script,
				     " | ", $r_path,
				     " --slave --no-save --no-restore --no-environ --vanilla",
				     " --args \"",
				     " motif.attributes.table = '", $main::outfile{matrix_descriptions}, "'",
				     "; html.template.file = '", $main::outfile{html_report_template}, "'",
				     "; prefix = '", $main::outfile{prefix}, "'",
				     "; \"");

    ## Without R, only a warning is issued
    unless ($r_path) {
	&RSAT::message::Warning("Could not run matrix complexity because the program R is not available") if ($main::verbose >= 1);
	return();
    }

    &doit($matrix_complexity_cmd, 0, 1, $verbose, 0, "", $main::out, $main::err);
    return();

}
__END__
