#!/usr/bin/perl -w
############################################################
#
# $Id: compare-matrices,v 1.30 2009/11/05 01:05:04 amedina Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

compare-matrices

=head1 VERSION

$program_version

=head1 DESCRIPTION

Compare two or more position-specific scoring matrices (PSSM), and
return various similarity statistics.

=head1 AUTHORS

Jacques van Helden <Jacques.van.Helden@ulb.ac.be>
Gipsi Lima-Mendez <gipsi@bigre.ulb.ac.be>
Matthieu Defrance <defrance@bigre.ulb.ac.be>

=head1 CATEGORY

=over

=item sequences

=item pattern matching

=item PSSM

=back

=head1 USAGE

compare-matrices -file1 inputfile1 -file2 inputfile2 [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

The user has to specify exactly input files (options I<-file1> and
I<-file2>), each containing one or several PSSMs. Each matrix of file
one is compared with each matrix of file2.

Any PSSM format supported in RSAT (type I<convert-matrix -h> for a
description).

=head1 OUTPUT FORMAT

By default, the output format is a tab-delimited file with one row per
matrix comparison, and one column per statistics.

=head2 ALGORITHM

The program successively computs one or several (dis)similiraty
metrics between each matrix of the first input file and each matrix of
the secnd input file.

Since the matrices are not supposed to be in phase, for each pair of
matrix, the program tests all possible I<offset> (shift) values
between the two matrices.

=head1 (DIS)SIMLARITY METRICS

In the formula below, symbols are defined as follows

=over

=item I<m1, m2>

Two position-specific scoring matrices.

=item I<w1,w2>

Number of columns of matrices m1 and m2, respectively.

=item Row number I<r>

Number of rows in each matrix, which correspond to the number of
residues in the alphabet (A,C,G,T for DNA motifs).

=item Aligned length I<w>

Number of aligned columns between matrices m1 and m2 (depends on the
offset between the two matrices).

 w <= w1
 w <= w2

=item Total length I<W>

Total length of the alignent between matrices m1 and m2.

 W = w1 + w2 - w


=item Relative length

A measure of the mutual overlap between the aligned matrices.

Wr = w / W

This actually corresponds to the Jaccard coefficient (intersection /
union), applied to the alignment lengths.

=item I<s1, s2>

Number of sites in matrices m1 and m2, respectively.

=item I<n>

Number of cells in the aligned portion of the matrices.

 n = w * r

=item I<i>

Index of a row of the aligned PSSM (corresponds to a residue).

=item I<j>

Index of a column of the aligned PSSM (corresponds to an aligned
position).

=item I<f1{i,j}>

Frequency of residue I<r> in the I<jth> column of the aligned subset of the
first matrix (taking the offset into account).

=item I<f2{i,j}>

Frequency of residue r in the jth column of the aligned subset of the
second matrix (taking the offset into account).

=item I<f1m, f2m>

Mean frequency computed over all cells of matrices m1 and m2, respectively.

=back

=head2 Euclidian distance (dEucl)

 dEucl = 1/w * sqrt( SUM{i=1->r} SUM{j=1->w} (f1{i,j} - f2{i,j})^2)

Note that our Euclidian distance is relativized by the number of
aligned columns (I<w>). This differs from the definition provided in
Pape et al. (2008).

=head2 Relative Euclidian similarity (rsEucl)

A similarity metrics derived from the Euclidian distance.

 rsEucl = (Max(dEucl) - dEucl) / dEucl

where I<Max(dEucl)> is the maximal possible Euclidian distance for the
current pair of matrices. 

 Max(dEucl) = sqrt(2)

=head2 Kullback-Leibler distance (dKL)

As defined in Aerts et al. (2003).

 dKL = 1/(2w) * SUM{i=1->r} SUM{j=1->w} (
                   f1{i,j}*log(f1{i,j}/f2{i,j})
                   + f2{i,j}*log(f2{i,j}/f1{i,j}))


=head2 Covariance (cov)

 cov = 1/n * SUM{i=1->r} SUM{j=1->w} (f1{i,j} - f1m) * (f2{i,j} - f2m)

Beware : this covariance has nothing to do with the "natural
covariance" of Pape (which still need to be implemented here).  What
we compute here is simply the covariance between the counts in the
aligned cells of the respective matrices.

=head2 Coefficient of correlation (cor)

 v1 = 1/n * SUM{i=1->r} SUM{j=1->w} (f1{i,j} - f1m)^2
 v2 = 1/n * SUM{i=1->r} SUM{j=1->w} (f2{i,j} - f2m)^2
 cor = cov/ sqrt(v1*v2)

=head2 Normalized correlation (Ncor)

This is the coefficient of correlation, corrected for the relative
alignment length (I<Wr>).

 Ncor = cor * Wr

This correction is particularly important to avoid selecting spurious
alignments between short fragments of the flanks of the matrices
(e.g. single-column alignments). For this reasons, I<Ncor> generall
gives a better estimation of motif similarity than I<cor>, and we
recommend it as similarity score.

=head1 REFERENCES

Distances between PSSMs have been treated in many ways. The most
recent and relevant articles are cited hereafter.

=over

=item Pape, U.J., Rahman, S., and Vingron, M. (2008). Natural
similarity measures between position frequency matrices with an
application to clustering. Bioinformatics 24 (3) pp. 350-7.

=item Gupta et al. Quantifying similarity between motifs. Genome Biol
(2007) vol. 8 (2) pp. R24.

=item Aerts et al. Computational detection of cis -regulatory
modules. Bioinformatics (2003) vol. 19 Suppl 2 pp. ii5-14

=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
use RSAT::matrix;
use RSAT::MatrixReader;
use RSAT::stats;
use Data::Dumper;

################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  my $start_time = &AlphaDate();
  $program_version = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  %main::infile = ();
  %main::outfile = ();

  $main::verbose = 0;
#  $main::in = STDIN;
  $main::out = STDOUT;


  ## Return fields
  my %supported_return_fields = (
				 "pos"=>1, ## start and end positions of the alignment relative to the matrices
				 "dEucl"=>1, ## Euclidian distance
#				 "rsEucl"=>1, ## Euclidian distance
				 "dKL"=>1, ## Kullback-Leibler distance
				 "cor"=>1, ## Coefficient of correlation
				 "cov"=>1, ## Covariance
				 "consensus"=>1,
				 "rank"=>1,
				);
  my %return_field = %supported_return_fields; ## Temporary

  ## Sort field
  local $sort_field = "offset";

  $main::show_matrix_name=0;

  ## Threshold parameters
  local %lth = ();		# lower threshold values
  local %uth = ();		# upper threshold values
  @supported_thresholds = qw (
			      offset
			      w
			      Wr
			      dEucl
			      rsEucl
			      dKL
			      cov
			      cor
			      Ncor
			      rank
			     );
  $supported_thresholds = join ",", @supported_thresholds;
  %supported_threshold = ();
  foreach my $thr (@supported_thresholds) {
    $supported_threshold{lc($thr)} = 1;
  }


  ## Matrices
  local @matrices1 = ();
  local @matrices2 = ();

  ## input formats
  %supported_input_format = %RSAT::MatrixReader::supported_input_format;
  $supported_input_formats = join ",", sort keys %supported_input_format;
  local $input_format1 = "";
  local $input_format2 = "";
  local $strands = "DR";

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values
#  unless ((($infile{file1}) && ($infile{file2})) ||
#	  (($infile{file1}) && ($infile{mlist2})) ||
#	  (($infile{mlist1}) && ($infile{file2})) ||
#	  (($infile{mlist1}) && ($infile{mlist2}))) {
  unless ((($infile{file1}) || ($infile{mlist1})) &&
	  (($infile{file2}) || ($infile{mlist2}))) {
    &RSAT::error::FatalError("You must define two input files, as one combination among options (-file1 or -mfile1) and (-file2 or -mlist2)");
  }

  unless (($input_format1) && ($input_format2)) {
    &RSAT::error::FatalError("You must define two input format(s) (option -format, or options -format1 and -format2)");
  }

  ################################################################
  ## Treat dependencies between fields to compute

  ## All return fields should be computed
  my %calc_field = %return_field;

  ## All fields having thresholds must be computed
  foreach my $field (keys %lth) {
    $calc_field{$field} = 1;
  }
  foreach my $field (keys %uth) {
    $calc_field{$field} = 1;
  }

  ## The sorting field must be computed
  $calc_field{$sort_field} = 1;

  ## Some fields must be computed as intermediate results for other fields
  $calc_field{cov} = 1 if ($calc_field{cor});
  $calc_field{dEucl} = 1 if ($calc_field{rsEucl});

  ################################################################
  ## Open output stream
  $main::out = &OpenOutputFile($main::outfile{output});

  ################################################################
  ## Read input
  my @matrices1 = ();
  my @matrices2 = ();
  my @matrix_files1 = ();
  my @matrix_files2 = ();
  if ($infile{file1}){
    @matrices1 = &RSAT::MatrixReader::readFromFile($infile{file1}, $input_format1);
    &RSAT::message::TimeWarn(scalar(@matrices1), "Matrices loaded from file 1", $infile{file1})
      if ($main::verbose >= 2);
  } elsif ($infile{mlist1}){
    @matrices1 = &RSAT::MatrixReader::readMatrixFileList(&OpenInputFile($infile{mlist1}), $input_format1);
  }

  if ($infile{file2}){
    @matrices2 = &RSAT::MatrixReader::readFromFile($infile{file2}, $input_format2);
    &RSAT::message::TimeWarn(scalar(@matrices2), "Matrices loaded from file 2", $infile{file2})
      if ($main::verbose >= 2);
  } elsif($infile{mlist2}){
    @matrices2 = &RSAT::MatrixReader::readMatrixFileList(&OpenInputFile($infile{mlist2}), $input_format2);
  }

  ################################################################
  ## Reverse complements

  ## TEMPORARY: if the comparison is DR, te input file 1 is read
  ## twice: the first reading is converted to reverse complement, and
  ## the second reading used for direct "strand". This should be
  ## changed because it means that the R and D comparisons are
  ## considered as separate comparisons, and the score ranking applies
  ## to them separately rather than altogether.
  if ($strands =~ /R/) {
    &RSAT::message::Info("Computing reverse complement for the matrices of the second file.") if ($main::verbose >= 0);
    foreach my $matrix (@matrices2) {
      $matrix->reverse_complement();
      $matrix->set_attribute("direction", "R");
    }
    if ($strands eq "DR") {
      if ($infile{file2}){
	push @matrices2, &RSAT::MatrixReader::readFromFile($infile{file2}, $input_format2);
	&RSAT::message::TimeWarn(scalar(@matrices2), "Matrices loaded from file 2", $infile{file2})
	  if ($main::verbose >= 2);
      } elsif ($infile{mlist2}){
	push @matrices2, &RSAT::MatrixReader::readMatrixFileList(&OpenInputFile($infile{mlist2}), $input_format2);
      }
    }
  }
  my $count_m=0;
  foreach my $matrix (@matrices1, @matrices2) {
      $count_m ++;
      ## Calculate the consensus for each matrix
      $matrix->calcConsensus() if ($return_field{consensus});
      
      ################################################################
      ## Compute the number of sites as the maximal sum of counts
      ## per column. Note that some matrices might have been built
      ## from sites of varying lengths, so that the col sum is not
      ## always the same for all the columns
      my @counts = $matrix->getMatrix();
      my @counts_per_col = &RSAT::matrix::col_sum($matrix->get_attribute("nrow"), $matrix->get_attribute("ncol"), @counts);
      my $nsites = &RSAT::stats::max(@counts_per_col);
      $matrix->set_attribute("nsites", $nsites);
      my $matrix_name = $matrix->get_attribute("name");
      unless (defined($matrix_name)){
	  $matrix->set_attribute("name","matrix1.".$count_m);
	  $matrix_name=$matrix->get_attribute("name");
      } 
  }
  
  &RSAT::message::TimeWarn("Starting comparison between",
			   scalar(@matrices1) , "matrices from file 1 and ",
			   scalar(@matrices2) , "matrices from file 2.",
      ) if ($main::verbose >= 2);
  
  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose);
  
  ################################################################
  ## Print the header
  my @header = ();
  push (@header, "compa");
  push (@header, "m1", "m2");
  push (@header, "offset", "direct", "w", "Wr");
  push (@header, "start1", "end1", "start2", "end2") if ($return_field{pos});
  push (@header, "dEucl") if ($return_field{dEucl});
  push (@header, "rsEucl") if ($return_field{rsEucl});
  push (@header, "dKL") if ($return_field{dKL});
  push (@header, "cov") if ($return_field{cov});
  push (@header, "cor", "Ncor") if ($return_field{cor});
  push (@header, "consensus1", "consensus2") if ($return_field{consensus});
  push (@header, "rank") if ($return_field{rank});
  
  ## Print a description of the output fields
  if ($main::verbose >= 1) {
      my %description = ();
      $description{compa} = "number of comparison";
      $description{m1} = "matrix number in file 1";
      $description{m2} = "matrix number in file 2";
      $description{offset} = "shift of the second matrix relative to the first matrix (negative:left; positive: right)";
      $description{direct} = "direction of matrix 2 relative to matrix 1 (D=direct;  R=reverse)";
      $description{w} = "Alignment length (number of overlapping columns between matrix 1 and matrix 2, as a function of the offset)";
      $description{W} = "Total alignment length (overlapping + non-overlapping columns). W = w1 + w2 - w";
      $description{Wr} = "Relative alignment length (overlap divided by the toal alignemnt length). Wr = w/W";
      $description{start1} = "Start position of the alignment in matrix 1";
      $description{end1} = "End position of the alignment in matrix 1";
      $description{start2} = "Start position of the alignment in matrix 2";
      $description{end2} = "End position of the alignment in matrix 2";
      $description{dEucl} = "Euclidian distance";
      $description{rsEucl} = "relative Euclidian similarity. rsEucl = (max(dEucl) - dEucl)/max(dEucl)";
      $description{dKL} = "Kullback-Leibler distance";
      $description{cov} = "Covariance";
      $description{cor} = "Coefficient of correlation";
      $description{Ncor} = "Aligned fraction corrected correlation. Ncor = cor * Wr";
      $description{consensus1} = "IUPAC consensus of matrix 1";
      $description{consensus2} = "IUPAC consensus of matrix 2";
      $description{rank} = "Rank fo the alignment for the current pair of matrices";
      print $out "; Column content\n";
      my $c = 0;
      foreach my $field (@header) {
	  $c++;
	  printf $out ";\t%d\t%-13s\t%s\n", $c, $field, $description{$field};
      }
  }
  print $out "#", join("\t", @header), "\n";
  
  ################################################################
  ## compare the matrices and print the result
  my $m1 = 0;
  my $compa = 0;
  foreach my $matrix1 (@matrices1) {
      $m1++;
      &RSAT::message::TimeWarn("File 1", "Analyzing matrix", $m1."/".scalar(@matrices1))  if ($main::verbose >= 3);
      
      my $ncol1 = $matrix1->get_attribute("ncol");
      my $nrow1 = $matrix1->get_attribute("nrow");
      my @crude_freq1 = $matrix1->getCrudeFrequencies();
      my @corrected_freq1 = $matrix1->getFrequencies();
      my $matrix1_name = $matrix1 ->get_attribute("name");
      my $m2 = 0;
      
      foreach my $matrix2 (@matrices2) {
	  $m2++;
	  &RSAT::message::TimeWarn("Comparing matrices", 
				   "file1", $m1."/".scalar(@matrices1),
				   "file2", $m2."/".scalar(@matrices2),
	      )
	      if ($main::verbose >= 4);
	  my $matrix2_name = $matrix2-> get_attribute("name");
	  my $ncol2 = $matrix2->get_attribute("ncol");
	  my $nrow2 = $matrix2->get_attribute("nrow");
	  
	  ## Check that both matrices have the same number of rows
	  unless ($nrow2 == $nrow1) {
	      &RSAT::error::FatalError("Input matrices must have the same number of residues rows)", $nrow1, $nrow2);
	  }
	  my @crude_freq2 = $matrix2->getCrudeFrequencies();
	  
	  ## Compute offsets for sliding one matrix along the other one
	  my $offset_min = 1 - $ncol2;
#      my $offset_max = &RSAT::stats::min($ncol1, $ncol2) -1;
	  my $offset_max = $ncol1 - 1;
#      &RSAT::message::Debug("offset range", $offset_min, $offset_max) if ($main::verbose >= 10);
	  
	  
	  ## Prepare output variables
	  my %cov = (); ## Covariances as a function of the offset
	  my %cor = (); ## Correlations
	  my %Ncor = (); ## Correlations
	  my %dEucl = (); ## Euclidian distances
	  my %rsEucl = (); ## Euclidian distances
	  my %dKL = (); ## Kullback-Leibler distances
	  my %out_line = (); ## Output line for each offset value
	  my @output_offset = (); ## Offsets passing the thresholds
	  my $direction = $matrix2->get_attribute("direction") || "D";
	  
	  ## Slide matrix2 along matrix1
	  foreach my $offset ($offset_min..$offset_max) {
	      $compa++;
	      next unless (&CheckThresholds("offset", $offset));
	      
	      ## Compute aligned matrix positions
	      
	      my $end1 = &RSAT::stats::min($ncol1, $ncol2+$offset);
	      my $start1 = &RSAT::stats::max(1, $offset+1, $end1-$ncol2+1);
	      my $w = $end1-$start1+1;
	      my $total_len = $ncol1 + $ncol2 - $w;
	      my $Wr = $w / $total_len;
	      next unless (&CheckThresholds("w", $w));
	      my $start2 = &RSAT::stats::max(1, 1- $offset);
	      my $end2 = &RSAT::stats::min($start2+$w-1, $ncol2);
	      
	      ## Highlight alignment in consensus
	      $consensus1 = join ("",
				  "."x($start1-1),
				  substr($matrix1->get_attribute("consensus.IUPAC"), $start1-1, $w),
				  "."x($ncol1-$end1)
		  );
	      $consensus2 = join ("",
				  "."x($start2-1),
				  substr($matrix2->get_attribute("consensus.IUPAC"), $start2-1, $w),
				  "."x($ncol2-$end2)
		  );
	      
	      
	      ## Compute the Euclidian distance between the matrices
	      my $dEucl = 0;
	      my $rsEucl = 0;
	      my $dKL = 0;
	      my $cov = 0;
	      my $cor = 0;
	      my $Ncor = 0;
	      my $sum_f1 = 0;
	      my $sum_f2 = 0;
	      my $sum_f1f2 = 0;
	      my $sum_sq_f1 = 0;
	      my $sum_sq_f2 = 0;
	      foreach my $pos (1..$w) {
		  my $col1 = $start1+$pos-1;
		  my $col2 = $start2+$pos-1;
		  for my $r (0..($nrow1-1)) {
		      my $f1 = $crude_freq1[$col1-1][$r];
		      my $f2 = $crude_freq2[$col2-1][$r];
		      
		      ## Update sums for correlations
		      if (($calc_field{cov}) || ($calc_field{cor})){
			  $sum_f1 += $f1;
			  $sum_f2 += $f2;
			  $sum_sq_f1 += $f1**2;
			  $sum_sq_f2 += $f2**2;
			  $sum_f1f2 += $f1*$f2;
		      }
		      
		      ## Update sum for Euclidian distance
		      if ($calc_field{dEucl}) {
			  $dEucl +=  ($f1 - $f2)**2;
		      }
		      
		      ## Update sum for Kullback-Leibler distance
		      if ($calc_field{dKL}) {
			  next if ($dKL eq "Inf");
			  
			  if ($f1*$f2 > 0) {
			      $dKL += $f1*log($f1/$f2) + $f2*log($f2/$f1);
			      
			  } elsif (($f1 > 0) || ($f2 > 0)) {
			      ## The KL distance is infinite if one of the values is 0 and the other not !!!
			      $dKL = "Inf";
			      &RSAT::message::Warning("The Kullback-Leibler distance is infinite ") if ($main::verbose >= 3);
			  }
		      }
		      
#	    &RSAT::message::Debug("offset=".$offset, "pos=".$pos, "row=".$r, "f1=".$f1, "f2=".$f2, 
#				  "sum_f1=".$sum_f1,
#				  "dKL=", $dKL,
#				  "dEucl=", $dEucl,
#				 )
#		  if ($main::verbose >= 10);
		      
		  }
	      }
	      
	      ## Compute Euclidian distance
	      if ($calc_field{dEucl}) {
		  $dEucl = sqrt($dEucl);
		  $dEucl /= $w if ($w > 0); ## Relativize Euclidian distance with respect to the number of aligned columns
		  next unless (&CheckThresholds("dEucl", $dEucl)); $dEucl{$offset} = $dEucl;
	      }

	      ## Compute Euclidian similarity
	      if ($calc_field{rsEucl}) {
		  my $maxEucl = sqrt(2);
		  $rsEucl = ($maxEucl - $dEucl) / $maxEucl;
	      }

	      ## Compute Kullback-Leibler distance
	      if ($calc_field{dKL}) {
		  unless ($dKL eq "Inf") {
		      $dKL /= 2;
		      $dKL /= $w if ($w > 0); ## Relativize KL distance with respect to the number of aligned columns
		      next unless (&CheckThresholds("dKL", $dKL)); $dKL{$offset} = $dKL;
		      $dKL = sprintf("%.5f", $dKL);
		  }
	      }

	      ## Compute coefficient of correlation
	      my $n = 0; ## Number of matrix cells in the alignment
	      my $v1 = 0; ## Variance of the first matrix
	      my $v2 = 0; ## Variance of the second matrix
	      if (($calc_field{cov}) || ($calc_field{cor})){
		  $n = $nrow1*$w;
		  $v1 = $sum_sq_f1/$n - ($sum_f1/$n)**2;
		  $v2 = $sum_sq_f2/$n - ($sum_f2/$n)**2;
		  $cov = $sum_f1f2/$n - $sum_f1*$sum_f2/($n**2); ## Warning : this covariance has nothing to do with the "natural covariance" of Pape
		  if ($cov == 0) {
		      $cor = 0;
		  } else {
		      $cor = $cov/sqrt($v1*$v2);
		  }
		  $Ncor = $cor * $Wr;
#  	  &RSAT::message::Debug("n=".$n,
# 				"sum_f1=".$sum_f1,
# 				"sum_f2=".$sum_f2,
# 				"sum_sq_f1=".$sum_sq_f1,
# 				"sum_sq_f2=".$sum_sq_f2,
# 				"v1=".$v1,
# 				"v2=".$v2,
# 				"cov=".$cov,
# 				"cor=".$cor,
# 				"Ncor=".$Ncor,
# 			       ) if ($main::verbose >= 10);
	      }
	      next unless (&CheckThresholds("cov", $cov));
	      next unless (&CheckThresholds("cor", $cor));
	next unless (&CheckThresholds("Ncor", $Ncor));
	      
	      ## Index values for sorting
	      $cov{$offset} = $cov;
	      $cor{$offset} = $cor;
	      $Ncor{$offset} = $Ncor;
	      
	      ## Generate the output line
#	my @out_fields = ($compa,$matrix1->get_attribute("name"), $matrix2->get_attribute("name"));
	      my @out_fields = ($compa);
	      if ($main::show_matrix_name){
		  push (@out_fields,$matrix1_name, $matrix2_name);
	      }
	      else{
		  push (@out_fields, $m1, $m2);
	      }
	      push (@out_fields, $offset, $direction, $w, sprintf("%.4f", $Wr));
	      push (@out_fields, $start1, $end1) if ($return_field{pos});
	      push (@out_fields, $start2, $end2) if ($return_field{pos});
	      push (@out_fields, sprintf("%.5f", $dEucl)) if ($return_field{dEucl});
	      push (@out_fields, sprintf("%.5f", $rsEucl)) if ($return_field{rsEucl});
	      push (@out_fields, $dKL) if ($return_field{dKL});
	      push (@out_fields, sprintf("%.4f", $cov)) if ($return_field{cov});
	      if ($return_field{cor}) {
		  push (@out_fields, sprintf("%.4f", $cor));
		  push (@out_fields, sprintf("%.4f", $Ncor));
	      }
	      push (@out_fields,
		    $consensus1,
		    $consensus2) if ($return_field{consensus});
	      push @output_offset, $offset;
	      $out_line{$offset} =  join("\t", @out_fields);
	  }
	  
	  ## Sort the results
	  my @sorted_offset;
	  
	  ## Euclidian distance is sorted in ascending order
	  if ($sort_field eq "dEucl") {
	      @sorted_offset = sort {$dEucl{$a} <=> $dEucl{$b}} @output_offset;
	      
	  } elsif ($sort_field eq "rsEucl") {
	      @sorted_offset = sort {$dEucl{$b} <=> $dEucl{$a}} @output_offset;
	      
	      ## Kullback-Leibler distance is sorted in ascending order
	  } elsif ($sort_field eq "dKL") {
	      @sorted_offset = sort {$dKL{$a} <=> $dKL{$b}} @output_offset;
	      
	      ## covariance is sorted in descending order
	  } elsif ($sort_field eq "cov") {
	      @sorted_offset = sort {$cov{$b} <=> $cov{$a}} @output_offset;
	      
	      ## Correlation is sorted in descending order
	  } elsif ($sort_field eq "cor") {
	      @sorted_offset = sort {$cor{$b} <=> $cor{$a}} @output_offset;
	      
	      ## Correlation is sorted in descending order
	  } elsif ($sort_field eq "Ncor") {
	      @sorted_offset = sort {$Ncor{$b} <=> $Ncor{$a}} @output_offset;
	      
	  } else {
	      @sorted_offset = @output_offset;
	  }
	  
	  ## specific treatment for the rank
	  my %rank = ();
	  if ($calc_field{rank}) {
	      my $rank = 0;
	      foreach my $offset (@sorted_offset) {
		  $rank++;
		  $rank{$offset} = $rank;
	      }
	      if (defined($uth{rank})) {
		  @sorted_offset = @sorted_offset[0..($uth{rank}-1)];
	      }
	      if (defined($lth{rank})) {
		  @sorted_offset = @sorted_offset[($lth{rank}-1)..$#sorted_offset];
	      }
	  }
	  
	  ## Print the output
	  foreach my $offset (@sorted_offset) {
	      print $out  $out_line{$offset};
	      print $out "\t", $rank{$offset} if ($return_field{rank});
	      print $out "\n";
	  }
      }
  }
  
  &RSAT::message::TimeWarn("Matrix comparisons done") if ($main::verbose >= 2);
  
  ################################################################
  ## Finish verbose
  if ($main::verbose >= 1) {
    my $done_time = &AlphaDate();
    print $main::out "; Job started $start_time\n";
    print $main::out "; Job done    $done_time\n";
  }


  ################################################################
  ## Close output stream
  close $main::out if ($main::outfile{output});


  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message 
sub PrintHelp {
  system "pod2text -c $0";
  exit()
}

################################################################
## Display short help message
sub PrintOptions {
  &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

    ## Verbosity
=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
      if ($arg eq "-v") {
	if (&IsNatural($arguments[0])) {
	  $main::verbose = shift(@arguments);
	} else {
	  $main::verbose = 1;
	}

	## Help message
=pod

=item B<-h>

Display full help message

=cut
      } elsif ($arg eq "-h") {
	&PrintHelp();

	## List of options
=pod

=item B<-help>

Same as -h

=cut
      } elsif ($arg eq "-help") {
	&PrintOptions();

	## Input files
=pod

=item B<-file1 matrix_file1>

The first input file containing one or several matrices.

=cut
      } elsif ($arg eq "-file1") {
	$main::infile{file1} = shift(@arguments);

=pod

=item B<-file2 matrix_file2>

The second input file containing one or several matrices.

=cut
      } elsif ($arg eq "-file2") {
	$main::infile{file2} = shift(@arguments);

=pod

=item B<-mlist1 matrix list>

The fisrt input file contaning a list of matrix files (given as paths)

=cut
      } elsif ($arg eq "-mlist1") {
	$main::infile{mlist1} = shift(@arguments);

=pod

=item B<-mlist2 matrix list>

The second input file contaning a list of matrix files (given as paths)
The reverse complement is computed for this set of matrices.
=cut
      } elsif ($arg eq "-mlist2") {
	$main::infile{mlist2} = shift(@arguments);


	### Input format 
=pod

=item B<-format matrix_format>

Specify the matrix format for both input files (alternatively, see
options -format1 and -format2).

=cut


      } elsif ($arg eq "-format") {
	$input_format1 = shift(@arguments);
	$input_format2 = $input_format1;
	unless ($supported_input_format{$input_format1}) {
	  &RSAT::error::FatalError("$input_format1\tInvalid format for input matrices\tSupported: ".$supported_input_formats);
	}

	### Input format for file1 only
=pod

=item B<-format1 matrix_format1>

Specify the matrix format for the first input file only (requires
-format2).

=cut
      } elsif ($arg eq "-format1") {
	$input_format1 = shift(@arguments);
	unless ($supported_input_format{$input_format1}) {
	  &RSAT::error::FatalError("$input_format1\tInvalid format for input matrices\tSupported: ".$supported_input_formats);
	}

	### Input format for file2 only
=pod

=item B<-format2 matrix_format2>

Specify the matrix format for the second input file only (requires
-format1).

=cut
      } elsif ($arg eq "-format2") {
	$input_format2 = shift(@arguments);
	unless ($supported_input_format{$input_format2}) {
	  &RSAT::error::FatalError("$input_format2\tInvalid format for input matrices\tSupported: ".$supported_input_formats);
	}

	## Output file
=pod

=item	B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
      } elsif ($arg eq "-o") {
	$main::outfile{output} = shift(@arguments);

=pod

=item B<-DR>

Perform comparisons on both direct (D) and reverse complementary (R)
matrices.

This is the default option for matrix orientations.

=cut
	} elsif ($arg eq "-DR") {
	    $main::strands = "DR";


=pod

=item B<-D>

Perform comparisons on the direct (D) orientation only (the reverse
complementary matrices are not taken into consideration).

By default, comparisons are performed in both orientations.

=cut
	} elsif ($arg eq "-D") {
	    $main::strands = "D";

=pod

=item B<-R>

Perform comparisons on the reverse complementary (R) orientation only
(all matrices of the second matrix file are converted to the reverse
complementary matrix, and their original orientation is NOT taken into
consideration).

This option is useful to answer very particular questions, for example
to detect reverse complementary palindromic motifs (e.g., a motif with
consensus tCAGswwsGTGa is reverse complementary palindromic). When a
motif is reverse complementary palindromic, the matrix is correlated
to its own reverse complement.

I<Remark about a frequent misconception of biological palindromes>

Reverse complementary palindroms are frequent in DNA signals
(e.g. transcription factor binding sites, restriction sites, ...)
because they correspond to a rotational symmetry in the 3D
structure. Such symmetrical motifs are often characteristic of sites
recognized by homodimeric complexes.

By contrast, simple string-based palindromes (e.g. CAGTTGAC) no
absolutely not correspond to any symmetry on the biochemical point of
view, because the 3D structure of the corresponding double helix is
not symmetrical. The apparent symetry is an artifact of the string
representation, but the corresponding molecule has neither rotational
nor translational symmetry.

DNA signals can either be symmetrical (reverse complementary
palindromes, tandem repeats) or asymmetrical.

=cut
	} elsif ($arg eq "-R") {
	    $main::strands = "R";
       

        ## Matrix ID
=pod

=item B<-matrix_id #>
By default (0) an authomatic matrix ID is generated and printed in the out field. 
To use the file name as matrix ID set this option to 1.
   -matrix_id 1
=cut
      } elsif ($arg eq "-matrix_id") {
	$main::show_matrix_name = shift(@arguments);


	## Sort field
=pod

=item	B<-sort sort_field>

Field to sort the results.

Supported: dEucl, rsEucl, dKL, cov, cor, Ncor.

=cut
      } elsif ($arg eq "-sort") {
	$main::sort_field = shift(@arguments);

	## Thresholds
=pod

=item	B<-lth param lower_threshold>

=item	B<-uth param upper_threshold>

Threshold on some parameter (-lth: lower, -uth: upper threshold). 

Supported threshold fields : rank, dEucl, cor, cov, ali_len, offset

=cut

	    ### Lower threshold
	} elsif ($arg eq "-lth") {
	    my $thr_field = lc(shift(@arguments));
	    my $thr_value =  shift(@arguments);
	    unless ($supported_threshold{$thr_field}) {
		&RSAT::error::FatalError("Invalid threshold field $thr_field. Supported: $supported_thresholds");
	    }
	    &RSAT::error::FatalError($thr_value, "Invalid value for a lower threshold. Should be a real number. ") 
	      unless (&RSAT::util::IsReal($thr_value));
	    $lth{$thr_field} = $thr_value;

	    ### Upper threshold
	} elsif ($arg eq "-uth") {
	    my $thr_field = lc(shift(@arguments));
	    my $thr_value = shift(@arguments);
	    unless ($supported_threshold{$thr_field}) {
		&RSAT::error::FatalError("Invalid threshold field $thr_field. Supported: $supported_thresholds");
	    }
	    &RSAT::error::FatalError($thr_value, "Invalid value for an upper threshold. Should be a real number. ") 
	      unless (&RSAT::util::IsReal($thr_value));
	    $uth{$thr_field} = $thr_value;


      } else {
	&FatalError(join("\t", "Invalid option", $arg));

      }
  }


=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
    print $main::out "; compare-matrices ";
    &PrintArguments($main::out);
    printf $main::out "; %-22s\t%s\n", "Program version", $program_version;
    if (defined(%main::infile)) {
	print $main::out "; Input files\n";
	while (my ($key,$value) = each %main::infile) {
	  printf $main::out ";\t%-6s\t%s\n", $key, $value;
	}
    }
    if (defined(%main::outfile)) {
	print $main::out "; Output files\n";
	while (my ($key,$value) = each %main::outfile) {
	  printf $main::out ";\t%-13s\t%s\n", $key, $value;
	}
    }

    print $main::out "; Matrices\n";
    print $main::out join ("\t", ";", "file", "matrix", "ncol", "nsites", "consensus.IUPAC"),  "\n";

    ## Matrices in the first file
    my $m1 = 0;
    foreach my $matrix (@matrices1) {
      $m1++;
      print $main::out join ("\t", ";", "file1", $m1,
			     $matrix->get_attribute("ncol"),
			     $matrix->get_attribute("nsites"),
			     $matrix->get_attribute("consensus.IUPAC")), "\n";
    }

    ## Matrices in the second file
    my $m2 = 0;
    foreach my $matrix (@matrices2) {
      $m2++;
      print $main::out join ("\t", ";", "file2", $m2,
			     $matrix->get_attribute("ncol"),
			     $matrix->get_attribute("nsites"),
			     $matrix->get_attribute("consensus.IUPAC")),  "\n";
    }

    print $main::out "; Thresholds\n";
}


################################################################
#### check threshold on some parameter
sub CheckThresholds {
  my ($key, $value) = @_;
  $key = lc($key); ## Ensure case-insensitivity

  &RSAT::message::Debug("Checking threshold", $key, $value) if ($main::verbose >= 4);

  ## Lower threshold
  if (defined($lth{$key})) {
    if ($value < $lth{$key}) {
      return (0);
    }
    &RSAT::message::Debug("Passed lower threshold", $key, $value, "lth=".$lth{$key}) if ($main::verbose >= 5);
  }

  ## Upper threshold
  if (defined($uth{$key})) {
    if ($value > $uth{$key}) {
      return (0);
    }
    &RSAT::message::Debug("Passed upper threshold", $key, $value, "uth=".$uth{$key}) if ($main::verbose >= 5);
  }


  return(1);
}


__END__

=pod

=head1 SEE ALSO

=over

=item B<convert-matrix>

=item B<matrix-scan>

=back

=head1 WISH LIST

=over

=item B<Additional metrics>

=over

=item Mutual information

We should check if this fixes the problems of 0 values that we have
with the KL distance.

=item The "natural covariance", as proposed by Pape et al. (2008).

This metrics measures the covariance between hits of two matrices
above a given threshold for each of them.

=item chi2 P-value (for the sake of comparison).

Note that a condition of applicability of the chi2 P-value is that the
expected value should be >= 5 for each cell of the matrix. This
condition is usually not fulfilled for the PSSM we use for motif
scanning.

=back

=item B<-strand D|R|DS>

Perform comparison with the direct (D), reverse complementary (R) or
both orientations of the second matrix, relative to the first one.

=item B<-table field>

Export a table with one row per matrix of the file 1, one column per
matrix of file 2, and where each cell indicates the value of the
selected field.

=back

=cut
