#!/usr/bin/env perl
############################################################
#
# $Id: compare-rsnps,v 1.48 2013/10/03 17:24:24 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

compare-reg-var

=head1 VERSION

$program_version

=head1 DESCRIPTION

Compare two or more files containing regulatory variations (predicted
or annotated).

Regulatory variations can be obtained from a variety of sources:

=over

=item I<variation-scan>

RSAT program that predicts regulatory variations by scanning variation sequences with
position-specific scoring matrices.

L<http://www.rsat.eu/>

=item I<is-rsnp>

A Web site allowing to predict regulatory variations or to retrieve pre-computed
predictions.

L<http://bioinformatics.research.nicta.com.au/software/is-rsnp/>

=item I<haploreg>

A web site combining annotations and predictions of regulatory variations,
as well as integration of variations with regulatory information (ChIP-seq
for transcription factors, histone modifications, ...).

L<http://www.broadinstitute.org/mammals/haploreg/haploreg.php>

=back

=head1 AUTHORS

=over

=item Jacques.van-Helden\@univ-amu.fr

=item Yvon Mbouamboua

=back

=head1 CATEGORY

=over

=item variations

=back

=head1 USAGE

compare-reg-var [-i inputfile] [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

=head1 OUTPUT FORMAT

The output format is a tab-delimited file, with one row per regulatory
variation.

A regulatory variation is defined by 2 mandatory fields:

=over

=item I<variation ID>

The identifier of the variation shown or predicted to affect regulation.

=item I<TFBM>

Identifier of the transcription factor binding motif (TFBM) affected
by the variation.

=back

=head1 SEE ALSO

=over

=item B<variation-scan>

The program I<compare-rsnps> takes as input the results of
I<variation-scan>.

=back


=head1 WISH LIST

=over

=item B<wish 1>

=item B<wish 2>

=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
require RSAT::variation;


################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.48 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  @names = ();
  @files = ();
  our %infile = (); ## Index file paths by name
  our %outfile = ();

  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;

  our %matrix_names = ();
  our %regvar = ();
  our $null = "<NA>"; ## Printout for null values (regulatory variations not found in a file)

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values
  &RSAT::message::TimeWarn("Checking parameter values") if ($main::verbose >= 2);

  if (scalar(@files) < 2) {
    &RSAT::error::FatalError("The comparison requires at least two input files (option -file).");
  }

  ################################################################
  ## Open output stream
  $out = &OpenOutputFile($outfile{output});

  ################################################################
  ## Read input
  &RSAT::message::TimeWarn("Reading", scalar(@files), "variation files") if ($main::verbose >= 2);
  for my $i (1..scalar(@files)) {
    my $name = $names[$i-1];
    my $file = $files[$i-1];
    &RSAT::message::TimeWarn() if ($main::verbose >= 3);
    ($main::in) = &OpenInputFile($file);
    while (<$main::in>) {
      next unless (/\S/); ## Skip empty rows
      next if (/^;/); ## Skip comment rows
      next if (/^#/); ## Skip header rows
      chomp();
      my @fields = split("\t", $_);
      my $matrix_id = $fields[0];
      my $matrix_name = $fields[1];
      my $var_id = $fields[2];
      my $best_pval = $fields[8];
      my $worst_pval = $fields[9];
      my $pval_ratio = $fields[10];

      ## Index matrix names
      if ($matrix_name) {
	$matrix_names{lc($matrix_id)} = $matrix_name;
	&RSAT::message::Warning("Indexing matrix name", $matrix_id, $matrix_name) if ($main::verbose >= 0);
      } else {
	&RSA::message::Warning("Missing name for matrix", $matrix_id);
      }

      ## Index the variations for this file name.  We should check if
      ## the 3 index keys are not problematic for memory when reading
      ## very big files.
      $regvar{$var_id}{$matrix_id}{$name} = $pval_ratio;
    }
    close $main::in if ($main::infile{input});
  }

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose >= 1);

  ################################################################
  ## Compare the variations
  &RSAT::message::TimeWarn("Comparing variations") if ($main::verbose >= 2);

  ################################################################
  ## Insert here output printing
  &RSAT::message::TimeWarn("Printing the output") if ($main::verbose >= 2);

  ## Print the header
  print $out join("\t", 
		  "#variation_id",
		  "matrix_id",
		  "matrix_name",
		  @names,
      ), "\n";
      

  ## Print the comparison
  foreach my $var_id (sort keys %regvar) {
    foreach my $matrix_id (sort keys %{$regvar{$var_id}}) {
      my $matrix_name = $matrix_id;
      if (defined($matrix_names{lc($matrix_id)})) {
	$matrix_name = $matrix_names{lc($matrix_id)};
      }

      print $out join("\t", 
		      $matrix_id,
		      $var_id,
		      $matrix_name
	  );
      foreach my $name (@names) {
	my $score = $null;
	if (defined($regvar{$var_id}{$matrix_id}{$name})) {
	  $score = $regvar{$var_id}{$matrix_id}{$name};
	}
	print $out "\t", $score;
      }
      print $out "\n";
    }
  }

  ################################################################
  ## Report execution time and close output stream
  &close_and_quit();
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Close output file and quit
sub close_and_quit {

  ## Report execution time
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
  print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified

  ## Close output file
  if ($outfile{output}) {
    close $main::out;
    &RSAT::message::TimeWarn("Output file", $outfile{output}) if ($main::verbose >= 2);
  }

  ## CLOSE OTHER FILES HERE IF REQUIRED

  exit(0);
}


################################################################
## Display full help message 
sub PrintHelp {
  system "pod2text -c $0";
  exit()
}

################################################################
## Display short help message
sub PrintOptions {
  &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);


=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();


=pod

=item B<-file name inputfile>

Input name and file.

The input file should contain regulatory variations in varscan format.
Regulatory variations from other sources (e.g. is-srnp, HaploReg) can
be converted to varscan format with the tool I<convert-variations>.

The "name" argument indicates the column header associated with this
input file in the output table.

This argument can be used two or more time on the same command line to
specify several input files. At least two input files must be
provided.

=cut
    } elsif ($arg eq "-file") {
      my $name = shift(@arguments);
      $name =~ s/[\s\[\]\(\)\:\;\,\/\\]/_/g;
      push @names, $name;
      my $file = shift(@arguments);
      push @files, $file;
      if (defined($infile{lc($name)})) {
	&RSAT::error::FatalError($name, "The same name cannot be used twice.");
      }
      $infile{lc($name)} = $file; ## Index the file


=pod

=item	B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
    } elsif ($arg eq "-o") {
      $outfile{output} = shift(@arguments);

    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
  print $out "; compare-reg-var ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;
  if (%main::infile) {
    print $out "; Input files\n";
    while (my ($key,$value) = each %main::infile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  if (%main::outfile) {
    print $out "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
}


__END__
