#!/usr/bin/env perl

############################################################
#
# $Id: retrieve-matrix,v 1.0 2017/10/19 14:27:53 thnguyen $
#
# Time-stamp: <2003-08-05 10:24:33 jvanheld>
#
############################################################

=pod

=head1 NAME

retrieve-matrix

=head1 DESCRIPTION

Retrieve a subset of matrices from a collection of position-specific
scoring matrices.

The program takes as input a file containing a collection of matrices
in TRANSFAC format, and a list of query identifiers (matrix accession
or name).  

The program returns the selected matrices in TRANSFAC format.

=head1 AUTHORS

=over

=item Thi-Thuy-Nga NGUYEN <thnguyen@biologie.ens.fr>

=item Morgane Thomas-Chollier <mthomas@biologie.ens.fr>

=item Jacques van Helden <Jacques.van-Helden@france-bioinformatique.fr>

=back

=head1 CATEGORY

util
matrix

=head1 USAGE

retrieve-matrix -i collection_file -id identifier(s) [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

The collection of matrices must be in TRANSFAC format. The program
convert-matrix can be used to obtain this format.


=head1 OUTPUT FORMAT

Matrices are exported in TRANSFAC format. The format can then be
changed with the program convert-matrix.

=head1 EXAMPLES

retrieve-matrix  -i ${RSAT}/public_html/motif_databases/JASPAR/Jaspar_2018/nonredundant/JASPAR2018_CORE_vertebrates_non-redundant_pfms_transfac.tf -id MA1115_1,SOX2

This command will return two matrices corresponding respectively to
the transcription factors Pou5f1 (synonym for Oct4) and Sox2 from
the collection RSAT non-redundant vertebrates (2017).



=cut

## Locate the RSAT libraries relative to this script.
## The pattern matches the trailing run of characters of the script path
## ($0) that contains no '/', '(' or ')', i.e. normally the script file
## name. The pre-match variable $` then holds everything before that name
## (the script directory, or an empty string when $0 has no directory
## part), and "<that prefix>lib/" is appended to the module search path.
if ($0 =~ /([^(\/)]+)$/) {
push (@INC, "$`lib/");
}
## Load the RSAT libraries (resolved through @INC).
require "RSA.lib";
require "RSA.seq.lib";

#use experimental 'smartmatch';

## Record the script start time and declare the command-line parameters.
## The parameters are file-scoped lexicals filled by ReadArguments().
local $start_time = &RSAT::util::StartScript();

my $outputfile = '';
my $collection_file = '';
my $id = '';
my $id_file = '';

&ReadArguments();

################################################################################
############################### check parameters ###############################
################################################################################

if ($verbose >= 2) {
    &RSAT::message::TimeWarn("Checking parameters");
}

## The matrix collection is mandatory
unless ($collection_file) {
    &RSAT::error::FatalError("You must specify the path to a collection file (-i)");
}

## At least one source of query identifiers is required
if (!$id && !$id_file) {
    &RSAT::error::FatalError("You must specify either a list of identifiers (-id), or the path to a file of identifiers list (-id_file)");
}


################################################################
## Read the list of query IDs.
##
## Queries come either from the command line (-id, comma-separated) or
## from a file (-id_file, one query per line). When both are given, the
## command-line list is used and the file is ignored.
my @ids = ();
if ($id ne "") {
  &RSAT::message::TimeWarn("Parsing query IDs from command line argument") if ($verbose >= 2);
  @ids = split(",", $id);
} elsif ($id_file ne "") {
  &RSAT::message::TimeWarn("Reading query IDs from input file", $id_file) if ($verbose >= 2);
  open(my $id_fh, "<", $id_file) or die "Cannot open file $id_file: $!";
  while (my $row = <$id_fh>) {
    chomp $row;
    push @ids, $row;
  }
  close($id_fh);
}

## Clean the queries: strip leading/trailing whitespace (this also removes
## the carriage return of DOS-formatted files) and discard empty queries,
## which could otherwise never match or would be indexed under an empty key.
my @clean_ids = ();
foreach my $query (@ids) {
  $query =~ s/^\s+//;
  $query =~ s/\s+$//;
  push(@clean_ids, $query) if ($query ne "");
}
@ids = @clean_ids;
&RSAT::message::Info("Retrieving matrices for", scalar(@ids), "selected IDs") if ($verbose >= 2);

## Index IDs in a hash table to enable their identification
## Note: we convert IDs to lowercases to make the search case-insensitive
my %ids = ();
foreach my $query (@ids) {
    $ids{lc($query)} = 1;
}

################################################################
## Read the motif collection and select matching matrices.
##
## The collection is scanned line by line. An "AC" line opens a new
## matrix and a "//" line closes it. A matrix is selected as soon as the
## value of its AC or ID line matches one of the queries indexed in %ids
## (keys are lowercase, so the comparison is case-insensitive).
## Selected matrices are accumulated in $result_matrices and their
## accessions in @found_matrices (one entry per matrix).
open(my $fh, "<", $collection_file)
  or die "Cannot open file $collection_file: $!";
my $result_matrices = "";
my $current_matrix = "";
my $current_ac = "";
my $to_print = 0;
my @found_matrices = ();

&RSAT::message::TimeWarn("Reading matrix file and selecting matrices") if ($verbose >= 2);
$collection_size = 0;
while (my $row = <$fh>) {
  chomp $row;

  ## Extract the field value of AC and ID lines from the current line
  ## (and not from a previously parsed one)
  my $is_ac = ($row =~ /^AC\s+/) ? 1 : 0;
  my $is_id = ($row =~ /^ID\s+/) ? 1 : 0;
  my $value = "";
  if ($is_ac || $is_id) {
    my @fields = split(/\s+/, $row);
    $value = $fields[1] if (defined($fields[1]));
  }

  if ($is_ac) {
    ## Accession key starts a new matrix
    $current_matrix = $row."\n";
    $current_ac = $value;
    $to_print = 0;
    $collection_size++;

  } elsif ($row =~ /^\/\//) {
    ## End of matrix
    if ($to_print) {
      $result_matrices .= $current_matrix."//\n";
    }
    ## Reset the state so that lines found after the terminator (e.g.
    ## trailing blank lines) are never attached to a selected matrix
    $current_matrix = "";
    $to_print = 0;

  } else {
    $current_matrix .= $row."\n";
  }

  ## Check if the accession or identifier are part of the selection
  if (($value ne "") && $ids{lc($value)}) {
    &RSAT::message::Info("Found matrix", $current_ac, $value) if ($verbose >= 3);
    ## Register the matrix only once, even if both its AC and ID match
    push(@found_matrices, $current_ac) unless ($to_print);
    $to_print = 1;
  }
}
close($fh);

## The last matrix of the file may lack its "//" terminator: export it
## with all its lines and add the terminator
if ($to_print && ($current_matrix ne "")) {
  $result_matrices .= $current_matrix."//\n";
}


################################################################
## Export the selected matrices
##
## The output stream is opened from $outputfile, the optional verbose
## header is written first, then the accumulated matrices.
if ($verbose >= 2) {
  &RSAT::message::TimeWarn("Printing", scalar(@found_matrices), "matrices");
}
$out = &OpenOutputFile($outputfile);
if ($verbose) {
  &Verbose();
}
print {$out} $result_matrices;

close($out);

exit(0);

################################################################
#### subroutine definitions
################################################################

################################################################
## Display full help message
##
## Renders the POD documentation embedded in this script with pod2text,
## then terminates the script.
sub PrintHelp {
    ## List form of system(): the script path is handed to pod2text as a
    ## single argument, without going through a shell, so paths containing
    ## spaces or shell metacharacters are handled correctly.
    system("pod2text", "-c", $0);
    exit(0);
}

################################################################
## Display short help message
##
## Prints the list of options through the "more" pager, then terminates
## the script. If the pipe to the pager cannot be opened, the message is
## printed directly on the standard output.
sub PrintShortHelp {
    my $help;
    unless (open($help, "|-", "more")) {
        $help = \*STDOUT;
    }
    print {$help} <<End_short_help;
retrieve-matrix options
--------------------
-i		collection file path
-id		matrix identifier(s), can be a list of accessions separated with commas
-id_file    matrix identifier(s) file which contains a list of accessions (one per line)
-o		outputfile.
End_short_help
    close($help);
    exit(0);
}


################################################################
## Parse the command-line arguments
##
## Walks through @ARGV by position. Whenever an element is a known option
## flag, the element that follows it is taken as its value and stored in
## the corresponding script variable ($outputfile, $collection_file, $id,
## $id_file, $verbose). The -id option is cumulative: successive values
## are concatenated with commas. The POD paragraphs interleaved with the
## code document each option.
sub ReadArguments {
    foreach my $pos (0..$#ARGV) {
        my $flag = $ARGV[$pos];
        my $next = $ARGV[$pos + 1];

=pod

=head1 OPTIONS

=over 4

=item B<-o>

the path to the output file

=cut
        ## -o: output file name
        if ($flag eq "-o") {
            $outputfile = $next;

=pod

=item B<-i collection file>

the path to the collection file which contains the matrices.

=cut
            ## -i: input collection file
        } elsif ($flag eq "-i") {
            $collection_file = $next;

=pod

=item B<-h>

Display full help message

=cut
            ## -h: full help
        } elsif ($flag eq "-h") {
            &PrintHelp();
=pod

=item B<-help>

Display concise help message

=cut
        } elsif ($flag eq "-help") {
            &PrintShortHelp();
=pod

=item B<-id identifiers>

Query matrix identifier(s). Multiple identifiers can be provided
separated by commas (without space). The argument can also be used
iteratively in the command line to specify several query IDs.


Queries are case-insensitive, and can be specified by either the
matrix identifier (the "AC" field in Transfac format, e.g. MA1115_1)
or its name (the "ID" field in Transfac format, e.g. SOX2).


=cut
            ## -id: query identifiers, appended to those already collected
        } elsif ($flag eq "-id") {
            $id .= "," if ($id ne "");
            $id .= $next;

=pod

=item B<-id_file identifier_file>

the path to a file containing queries (matrix accessions or
identifiers). Each line should contain a single query.

=cut

            ## -id_file: file containing the query identifiers
        } elsif ($flag eq "-id_file") {
            $id_file = $next;


=pod

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=back

=cut
            ## -v: verbosity level, set to 1 when the value that follows
            ## is not accepted by IsNatural()
        } elsif ($flag eq "-v") {
            $verbose = &IsNatural($next) ? $next : 1;
        }
    }
}


################################################################
## Verbose header
##
## Writes a commented summary of the run on the output stream $out: the
## command-line arguments (via PrintArguments), the collection and its
## size, the queries, the number of matrices found and, when defined, the
## output file.
sub Verbose {
    my $line_format = "; %-14s\t%s\n";

    print {$out} "; retrieve-matrix";
    &PrintArguments($out);

    ## Collect the (label, value) pairs in reporting order
    my @report = (
        ["Collection", $collection_file],
        ["Number of matrices in collection", $collection_size],
    );
    if ($id) {
        push(@report, ["query identifiers", $id]);
    } elsif ($id_file) {
        push(@report, ["query file", $id_file]);
    }
    push(@report, ["number of queries", scalar(@ids)]);
    push(@report, ["number of matrices found", scalar(@found_matrices)]);
    if ($outputfile) {
        push(@report, ["output file", $outputfile]);
    }

    foreach my $entry (@report) {
        printf {$out} $line_format, $entry->[0], $entry->[1];
    }
}

__END__

=pod

=head1 SEE ALSO

=over

=item convert-matrix

=back

=cut
