#!/usr/bin/env perl
############################################################
#
# $Id: merge-matrices,v 1.48 2013/10/03 17:24:24 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

merge-matrices

=head1 VERSION

$program_version

=head1 DESCRIPTION

This programs takes as input a list of position-specific scoring
matrices, and merges them by applying a user-selected operator on the
aligned cell of all matrices.

Supported operators (option -calc): sum, mean, diff.

The diff operator returns the result of the subtraction between the
first and the second matrices (if the input file contains more than
two matrices, a warning message is issued).

=back

=head1 AUTHORS

Jaime Castro <jcastro@lcg.unam.mx>

Jacques van Helden <Jacques.van-Helden\@univ-amu.fr>

=head1 CATEGORY

=over

=item PSSM

=back

=head1 USAGE

merge-matrices [-v #]  [-i inputfile] [-in_format input_format] \
  [-o outputfile] [-out_format output_format]
  [-calc sum|mean|diff]

=head1 INPUT/OUTPUT FORMAT

I<merge-matrices> supports the same input and output formats as
I<convert-matrix>.

=head2 Constraint on matrix widths

The merging operations assume that all input matrices have the same
width. 

=head1 SEE ALSO

=head2 convert-matrix

=head2 matrix-clustering

The program I<matrix-clustering> uses I<merge-matrices> to generate
the merged matrices and consensuses at each intermediate branch of the
matrix tree.

=head1 WISH LIST

=over

=item B<-merged_attr key value>

Impose the value of specific attribtues of the merged matrix. 

This option is useful to specify meaningful identifiers or names
depending on the context of the analysis.

=item B<-consensus_name>

Calculate the consensus and use it as name for the merged matrix.

=item B<decimals>

JvH should Fix a bug with the decimals: currently, they only apply to frequencies, not to counts.

=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";

use RSAT::matrix;
use RSAT::MatrixReader;


################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.48 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  our %infile = ();
  our %outfile = ();
  $outfile{id} = "merged";
  $outfile{name} = "merged";
  $outfile{version} = "none";


  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;


  ## Operator
  $operator = "sum";
  local %supported_operator = ("mean"=>1, "sum"=>1, "diff"=>1);
  local $supported_operators = join ",", sort keys %supported_operator;

  ## Input formats
#  local %supported_input_format = %RSAT::MatrixReader::supported_input_format;
  local %supported_input_format = &RSAT::MatrixReader::ListInputMatrixFormats();
  local $supported_input_formats = join ",", sort keys %supported_input_format;

  ## Output formats
  local %supported_output_format = %RSAT::matrix::supported_output_format;
  local $supported_output_formats = join ",", sort keys %supported_output_format;

  ## Initialize some attributes (can be over-written with the option -merged_attr)
  local %merged_attributes = ();

  ## Parameters for the &doit() command
  our $dry = 0;
  our $die_on_error = 1;
  our $job_prefix = "matrix-from-patterns";
  our $batch = 0;
  
  ## Parameters for rescan-matrix
  our $seq = "";
  our $iterations = 1;
  our $scan_parameters = "";


  ################################################################
  ## Read argument values
  &ReadArguments();

  $merged_attributes{"id"} = $outfile{id};
  $merged_attributes{"AC"} = $outfile{name};
  $merged_attributes{"version"} = $outfile{name};

  ################################################################
  ## Check argument values

  ## Two redundant attribtue names, maintained for compatibility
  ## between different matrix formats
  unless (defined($merged_attributes{"accession"})) {
    $merged_attributes{"accession"} =  $merged_attributes{"AC"};
  }
  unless (defined($merged_attributes{"identifier"})) {
    $merged_attributes{"identifier"} =  $merged_attributes{"id"};
  }

  ## Option -seq requires an output file name
  if (($infile{sequences}) && (!$outfile{output})) {
    &RSAT::error::FatalError("Option -seq requires to specify an output file (option -o).");
  }

  ################################################################
  ## Read input
  my @matrices = &RSAT::MatrixReader::readFromFile($infile{input}, $input_format);


  ## Check the number of input matrices
  local $nb_matrices = scalar(@matrices);
  if ($nb_matrices < 2) {
    &RSAT::error::FatalError("Input file must contain at least two matrices.");
  } elsif ($main::verbose >= 2) {
    &RSAT::message::Info("Input file contains", $nb_matrices, "matrices");
  }

  ## Check that all input matrices have the same width

  ## First matrix width
  $matrix_width = $matrices[0]->ncol();
  @alphabet = $matrices[0]->get_attribute("alphabet");
  $alphabet_size = $matrices[0]->nrow();
  if ($main::verbose >= 2) {
    &RSAT::message::Info("Width of the first matrix", $matrix_width);
  }

  ## Check that all subsequent matrices have the same with as the first one
  for my $i (1..$#matrices) {
    my $matrix = $matrices[$i]; 
    my $width = $matrix->ncol();
    unless ($width == $matrix_width) {
      my $id = $matrix->get_attribute("id");
      &RSAT::error::FatalError("Matrix", $id, "contains", $width, 
			       "columns.\nAll matrices should have the same width as the first matrix (".$matrix_width.").");
    }
  }



  ################################################################
  ## Execute the command
  my $merged_matrix = new RSAT::matrix();
  foreach my $key (keys %merged_attributes) {
    my $value = $merged_attributes{$key};
    $merged_matrix->force_attribute($key, $value);
  }

  $merged_matrix->setAlphabet_lc(@alphabet);

  
  @merged_counts = ();
  if ($operator eq "diff") {
      if (scalar(@matrices) != 2) {
	  &RSAT::error::FatalError("Matrix difference is only valid with input files containing exactly 2 matrices.");
      }
        
      ## First count matrix
      my $matrix1 = $matrices[0];
      my @counts1 = $matrix1->getMatrix();
      my $ncol1 = $matrix1->ncol();
      my $nrow1 = $matrix1->nrow();
  
      ## Second count matrix
      my $matrix2 = $matrices[1];
      my @counts2 = $matrix2->getMatrix();
      my $ncol2 = $matrix2->ncol();
      my $nrow2 = $matrix2->nrow();

      ## Check dimensions
      if ($ncol1 != $ncol2) {
	  &RSAT::error::FatalError("The numbers of columns differs between matrix 1 (${ncol1}) and matrix 2 (${ncol2}).");
      }    
      if ($nrow1 != $nrow2) {
	  &RSAT::error::FatalError("The numbers of rows differs between matrix 1 (${nrow1}) and matrix 2 (${nrow2}).");
      }

      ## Compute the difference
      for my $c (0..($ncol1-1)) {
	  for my $r (0..($nrow1-1)) {
#	    &RSAT::message::Debug("Merging", "c=".$c, "r=".$r, 
#				  "counts1=".$counts1[$c][$r]) if ($main::verbose >= 10);
	    $merged_counts[$c][$r] += $counts2[$c][$r] - $counts1[$c][$r];
	  }
      }
      
#       print $matrix1->to_tab();
#       print $matrix2->to_tab();
# #      print $merged_counts->to_tab();
      
#       die "HELLO\n", join(";", join("\n", @counts1)), "\n";
  } else {
      
      ## Set divider to 1 for matrix sums, and to the nb of matrices for
      ## the mean.
      my $divider = 1;
      if ($operator eq "mean") {
	  $divider *= $nb_matrices;
      }
      
      foreach my $matrix (@matrices) {
	  ## count matrix
	  my @counts = $matrix->getMatrix();
	  my $ncol = $matrix->ncol();
	  my $nrow = $matrix->nrow();
	  
	  for my $c (0..($ncol-1)) {
	      for my $r (0..($nrow-1)) {
		  $merged_counts[$c][$r] += $counts[$c][$r]/$divider;
	      }
	  }
      }
  }

  $merged_matrix->setMatrix($alphabet_size, $matrix_width, @merged_counts);
  $merged_matrix->calcConsensus();  
  my $consensus = $merged_matrix->get_attribute("consensus.IUPAC");
  $merged_matrix->force_attribute("description", $consensus);
#  $merged_matrix->force_attribute("ac", "Nunca las cosas se parecen tanto");
  #$merged_matrix->set_attribute("version", "Nunca las cosas se parecen tanto");


  
  ################################################################
  ## Output

  if ($infile{sequences}) {

    ################################################################
    ## Define the prefix based on the output file
    $prefix{output} = $outfile{output};
    $prefix{output} =~ s|\.\w+$||;
    $prefix{output} .= "_rescanned";
#    $outfile{rescanned_matrices} = $prefix{output}.".".$output_format;
#    $outfile{rescanned_sites} = $prefix{output}.".".$output_format;

    $outfile{merged_counts} = $prefix{output}."_merged_counts.".$output_format;

    ################################################################
    ## Open output stream in a separate log file for the verbosity
    $out = &OpenOutputFile($outfile{merged_counts});
  } else {
    ################################################################
    ## Open output stream
    $out = &OpenOutputFile($outfile{output});
  }
  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose >= 1);
  
  ## Export the merged matrix
  print $out $merged_matrix->toString(type=>"counts",format=>$output_format, "decimals"=>2);

  &close_merged_counts();

  ################################################################
  ## If a sequence file has been specified, scan them with the merged
  ## matrix in order to build a secondary matrix.
  if ($infile{sequences}) {
    
    ################################################################
    ## Print verbose
    &Verbose() if ($main::verbose >= 1);

    my $rescan_cmd = &RSAT::server::GetProgramPath("rescan-matrix");
    $rescan_cmd .= " -v 0";
    $rescan_cmd .= " -seq ".$infile{sequences};
    $rescan_cmd .= " -seq_format fasta";
    $rescan_cmd .= " -m ".$outfile{merged_counts};
    $rescan_cmd .= " -matrix_format ".$output_format;
    $rescan_cmd .= " -iterations ".$iterations;
    if ($scan_parameters) {
      $rescan_cmd .= " -scan_param '".$scan_parameters."'";
    }
    $rescan_cmd .= " -o ".$outfile{output};
    &doit($rescan_cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix, $log, $err);

  }

  ## Exit
  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Close output file and quit
sub close_merged_counts {

  ## Report execution time
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
  print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified

  ## Close output file
  if ($outfile{output}) {
    close $main::out;
    &RSAT::message::TimeWarn("Output file", $outfile{output}) if ($main::verbose >= 2);
  }

}


################################################################
## Display full help message 
sub PrintHelp {
  system "pod2text -c $0";
  exit()
}

################################################################
## Display short help message
sub PrintOptions {
  &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);


=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();


=pod

=item B<-i inputfile>

If no input file is specified, the standard input is used.  This
allows to use the command within a pipe.

=cut
    } elsif ($arg eq "-i") {
      $main::infile{input} = shift(@arguments);

=pod

=item B<-in_format input_formaat>

Input format. See I<convert-matrix> for a list of supported input
formats.

=cut

    } elsif ($arg eq "-in_format") {
      $input_format = lc(shift(@arguments));
      unless ($supported_input_format{$input_format}) {
	&RSAT::error::FatalError($input_format, "Invalid input format for matrix", "Supported: ".$supported_input_formats);
      }

=pod

=item	B<-id matrix_ID>

Specify the ID of the output matrix.

=cut
    } elsif ($arg eq "-id") {
      $outfile{id} = shift(@arguments);


=item	B<-name matrix_name>

Specify the name of the output matrix.

=cut
    } elsif ($arg eq "-name") {
      $outfile{name} = shift(@arguments);

=pod

=item B<-calc merging_stat>

Specify the operator used to merge matrices. 

Supported: 

=over

=item I<mean> (default)

Each cell of the output matrix contains the mean of the values found
in the corresponding cell of the input matrices.

=item I<sum>

Each cell of the output matrix contains the sum of the values found
in the corresponding cell of the input matrices.

=item I<diff>

Each cell of the output matrix contains the difference between the two
input matrices. B<Beware>: this option ssumes that the input file
contains only two matrices.

=back

=cut

    } elsif ($arg eq "-calc") {
      $operator = lc(shift(@arguments));
      unless ($supported_operator{$operator}) {
	&RSAT::error::FatalError($operator, "Invalid operator for option -calc. ", "Supported: ".$supported_operators);
      }


=pod
=item B<-seq sequence_file>

If specified, the merged matrix (computed with sum or mean) and the
sequences are passed to I<rescan-matrix> in order to build a secondary
matrix from binding sites foundin the input sequence.

A typical application is the merging of matrices obtained by different
motif discovery algorithms in ChIP-seq peak sequences. This enables to
base the merged matrix on the actual (predicted) sites from the
sequence rather than summing counts which are liely to result from
largely overlapping colections of sites. .


=cut

    } elsif ($arg eq "-seq") {
      $main::infile{sequences} = shift(@arguments);

=item B<-iterations #>

Maximal number of iterations for rescan-matrix (useful only if
combined with -seq option).

=cut
    } elsif ($arg eq "-iterations") {
	$main::iterations = shift(@arguments);
	if (!&IsNatural($main::iterations) || ($main::iterations < 1)) {
	    &RSAT::error::FatalError($main::iteration, "Invalid value for option -iterations. Should be a Natural number >= 1ti. ");
	}

=item B<-scan_param scanning_parameters>

Parameters to scan the sites. The argument is passed to rescan-matrix.

=cut
    } elsif ($arg eq "-scan_param") {
	$main::scan_parameters = shift(@arguments);

=pod

=item	B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
    } elsif ($arg eq "-o") {
      $outfile{output} = shift(@arguments);

=pod

=item B<-out_format output_formaat>

Output format. See I<convert-matrix> for a list of supportd output formats.

=cut

    } elsif ($arg eq "-out_format") {
      $output_format = lc(shift(@arguments));
      unless ($supported_output_format{$output_format}) {
	&RSAT::error::FatalError("$output_format\tInvalid output format for matrix\tSupported: $supported_output_formats");
      }

    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
  print $out "; merge-matrices ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;
  if (%main::infile) {
    print $out "; Input files\n";
    while (my ($key,$value) = each %main::infile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  printf $out "%-22s\t%s\n", "; Input format", $input_format;

  if (%main::outfile) {
    print $out "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  printf $out "%-22s\t%s\n", "; Output format", $output_format;

  printf $out "; %-22s\t%s\n", "Merging operator", $main::operator;
  printf $out "; %-22s\t%s\n", "Input matrices", $main::nb_matrices;
  printf $out "; %-22s\t%s\n", "Matrix width", $main::matrix_width;
}


__END__
