#!/usr/bin/perl -w
############################################################
#
# $Id: matrix-from-patterns,v 1.17 2009/11/05 00:32:07 jvanheld Exp $
#
# Time-stamp: <2003-07-04 12:48:55 jvanheld>
#
############################################################

## use strict;

=pod

=head1 NAME

matrix-from-patterns

=head1 DESCRIPTION

Extract one or several position-specific scoring matrices (PSSM) from
a set of sequences, using as seeds a set of patterns (the output of
oligo-analysis or dyad-analysis).

The program proceeds in two steps:

1) A B<significance matrix> is built by assembling the patterns (with
I<pattern assembly>), and converting each assembly to a
position-specific scoring matrix (the conversion is performed with
I<convert-matrix>).  This significance matrix contains one row per
residue, one column per position of the pattern assembly, and each
cell indicates the maximal significance value observed for that
residue at that position in the assembly.

2) The significance matrix is then used to scan the input sequences
(with I<matrix-scan>), and collect all sites above a given threshold
of P-value. A I<count matrix> is built from these sites.

Advantage of this two-step process: classically, position-specific
scoring matrices are built from an alignment of sites (e.g. binding
sites for a transcription factor).  A significance matrix already
gives a good indication of the motif, but it does not always reflect
the real sites present in the sequence, because it is built by
assembling overlapping oligonucleotides (or dyads), irrespective of
the fact that these patterns are found together or not in the input
sequence. The program matrix-from-patterns solves this problem by using
the assembled patterns as seeds to perform a matrix-based scanning of
the input sequences, and collect the most likely instances of the
motif (putative sites). These sites are then used to build a count
matrix, reflecting the absolute residue frequencies at each position
of the collected sites.

Weakness of this approach: the scanning step can be time-consuming
when the input sequences are large (e.g. for whole-genome motif
detection).

=head1 AUTHORS

jvanheld@bigre.ulb.ac.be

=head1 CATEGORY

=over

=item sequences

=item pattern discovery

=back

=head1 USAGE

matrix-from-patterns -seq sequence_file [-pl pattern_file | -asmb assembly_file] \
   [-o output_prefix] [-v #]

=head1 INPUT FORMAT

=over

=item B<Sequence file>

=item B<Pattern file>

The pattern file must be in the same format as the output from
pattern-assembly.

=back

=head1 OUTPUT FORMAT

The output file contains the matrix or matrices extracted from the
sequences. The supported output formats are the same as for
convert-matrix.

=cut


## Extend the module search path at compile time, before the library is
## loaded by the "require" statement that follows this block.
##
## The regular expression matches the trailing part of $0 (the path used
## to invoke the script) made of characters that are neither a slash nor
## a parenthesis, i.e. the file name of the script. After a successful
## match, $` holds everything that precedes this file name (the directory
## prefix, possibly empty), and the "lib/" directory located under that
## prefix is appended to @INC.
BEGIN {
    if ($0 =~ /([^(\/)]+)$/) {
	push (@INC, "$`lib/");
    }
}
require "RSA.lib";



################################################################
## Main package
package main;
{

    ################################################################
    ## Default values of the parameters
    ##
    ## Apart from the start time, all the parameters are package
    ## variables: they are read and modified by the subroutines defined
    ## below (&ReadArguments(), &Verbose()), and this script does not run
    ## under "use strict".

    ## Generic parameters
    my $start_time = &AlphaDate();
    local %infile = ();
    local %outfile = ();
    local $verbose = 0;
    local $strands = "-2str";

    ## Parameters transmitted to pattern-assembly
    local $asmb_maxfl = 1;
    local $asmb_subst = 1;
    local $max_asmb_size = 50;
    local $max_asmb_nb = 5;
    local $asmb_maxpat = 100;

    ## Parameters transmitted to matrix-scan
    local $seq_format = "fasta";
    local %uth = (Pval => 1e-4);
    local $scan_parameters = "";

    ## Parameters transmitted to each call of &doit()
    $dry = 0;
    $die_on_error = 1;
    $job_prefix = "matrix-from-patterns";
    $batch = 0;

    ################################################################
    ## Parse the command line
    &ReadArguments();

    ################################################################
    ## Validate the arguments

    ## An upper threshold (-uth) on Pval MUST be defined for matrix-scan:
    ## without it, all sites could be taken, resulting in a meaningless
    ## matrix. The default threshold is only added when the user did not
    ## specify one.
    if ($scan_parameters !~ /-uth Pval/) {
	$scan_parameters = join(" ", " -uth Pval", $uth{Pval}, $scan_parameters);
    }

    ## Mandatory output prefix
    if (!defined($outfile{output})) {
	&RSAT::error::FatalError("You must specify the output prefix (option -o)");
    }

    ## Mandatory sequence file
    if (!defined($infile{sequence})) {
	&RSAT::error::FatalError("You must specify the sequence file (option -seq)");
    }

    ## The seeds are mandatory as well, in the form of either a pattern
    ## file or an assembly file
    if (!defined($infile{patterns}) && !defined($infile{assembly})) {
	&RSAT::error::FatalError("You must give as input either a pattern file (option -pl) or an assembly file (option -asmb)");
    }

    ################################################################
    ## Derive the names of the output files from the output prefix. When
    ## the input is already an assembly, that file is used as is instead
    ## of being produced by this script.
    my $prefix = $outfile{output};
    $outfile{log} = $prefix."_log.txt";
    $outfile{assembly} = $infile{assembly} ? $infile{assembly} : $prefix.".asmb";
    $outfile{sig_matrices} = $prefix."_sig_matrices.txt";
    $outfile{sig_sites} = $prefix."_sig_sites.ft";
    $outfile{count_matrices} = $prefix."_count_matrices.txt";

    ################################################################
    ## Open the log file and report the parameters
    local $log = &OpenOutputFile($outfile{log});
    if ($main::verbose) {
	&Verbose();
    }

    ################################################################
    ## Step 1: patterns -> assemblies (skipped when the input is already
    ## an assembly file)
    if ($infile{patterns}) {
	if ($main::verbose >= 2) {
	    &RSAT::message::TimeWarn("Assembling the patterns");
	}
	my $assembly_cmd = join(" ",
				"${SCRIPTS}/pattern-assembly -v 1",
				"-i", $infile{patterns},
				$strands,
				"-maxfl", $asmb_maxfl,
				"-subst", $asmb_subst,
				"-maxpat", $asmb_maxpat,
				"-max_asmb_size", $max_asmb_size,
				"-max_asmb_nb", $max_asmb_nb,
				"-o", $outfile{assembly});
	print $log "; patterns -> assemblies\n", $assembly_cmd, "\n";
	&doit($assembly_cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix);
    }

    ################################################################
    ## Step 2: assemblies -> significance matrices
    if ($main::verbose >= 2) {
	&RSAT::message::TimeWarn("Converting assemblies to significance matrices");
    }
    my $convert_cmd = join(" ",
			   "${SCRIPTS}/convert-matrix -v 1",
			   "-i", $outfile{assembly},
			   "-from assembly",
			   "-return counts,parameters",
			   "-top", $max_asmb_nb,
			   "-o", $outfile{sig_matrices});
    print $log "; assemblies -> significance matrices\n", $convert_cmd, "\n";
    &doit($convert_cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix);

    ################################################################
    ## Step 3: significance matrices + sequences -> sites. The command
    ## is only built and logged here; it is executed together with the
    ## last step.
    if ($main::verbose >= 2) {
	&RSAT::message::TimeWarn("Collecting sites with significance matrices");
    }
    my $scan_cmd = join(" ",
			"${SCRIPTS}/matrix-scan -v 1",
			"-i", $infile{sequence},
			"-seq_format", $seq_format,
			"-m", $outfile{sig_matrices},
			"-n skip",
			"-matrix_format tab",
			$strands,
			"-return sites,pval",
			$scan_parameters,
			"-o", $outfile{sig_sites});
    print $log "; significance matrices + sequences -> sites\n", $scan_cmd, "\n";

    ################################################################
    ## Step 4: sites -> count matrices. The command starts with a
    ## semicolon because it is appended to the scanning command, both
    ## being submitted in a single call to &doit().
    if ($main::verbose >= 2) {
	&RSAT::message::TimeWarn("Converting sites to count matrices");
    }
    my @convert2_args = ("; ${SCRIPTS}/convert-matrix -v 1",
			 "-i", $outfile{sig_sites},
			 "-from feature -to tab",
			 "-return counts,parameters");
    ## Sorting option, currently disabled:
    ##   -sort desc information.per.column
    if ($return_fields{sites}) {
	push(@convert2_args, "-return sites");
    }
    if ($return_fields{logo}) {
	push(@convert2_args, "-return logo");
    }
    push(@convert2_args, "-o", $outfile{count_matrices});
    my $convert2_cmd = join(" ", @convert2_args);
    print $log "; sites -> count matrices\n", $convert2_cmd, "\n";
    &doit($scan_cmd.$convert2_cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix);

#     ################################################################
#     ## convert sites to fasta format
#     ## THIS DOES NOT WORK BECAUSE ALL THE SITES FOR ALL MATRICES ARE IN THE SAME FILE
#     my $site_cmd = "";
#     if ($return_fields{sites}) {
#       &RSAT::message::TimeWarn("Converting sites to fasta") if ($main::verbose >= 2);
#       $site_cmd = "convert-features -i ".$outfile{sig_sites};
#       $site_cmd .= " -from ft -to fasta";
#       $site_cmd .= " -o ".$outfile{sig_sites}.".fasta";
#     } else {
#       &RSAT::message::TimeWarn("Deleting site file") if ($main::verbose >= 2);
#       $site_cmd = "rm -f ".$outfile{sig_sites};
#     }
#     &doit($site_cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix);

    ################################################################
    ## Report the start and end times in the log file
    if ($main::verbose >= 1) {
	my $done_time = &AlphaDate();
	print $log "; Job started $start_time\n";
	print $log "; Job done    $done_time\n";
    }

    ################################################################
    ## Close the log file
    if ($outfile{output}) {
	close $log;
    }

    if ($main::verbose >= 2) {
	&RSAT::message::Info("matrix-from-pattern log file", $outfile{log});
    }

    exit(0);
}

################################################################
################### subroutine definition ######################
################################################################


################################################################
#### display full help message 
sub PrintHelp {
    ## Display the POD documentation embedded in this script, formatted
    ## by pod2text, then terminate the program.
    ##
    ## The list form of system() runs pod2text without any shell, so that
    ## the script path ($0) is passed as a single argument even when it
    ## contains spaces or shell metacharacters.
    system("pod2text", "-c", $0);
    exit();
}

################################################################
#### display short help message
sub PrintOptions {
    ## There is no separate short help for this script: the request is
    ## delegated to the full help message.
    PrintHelp();
}

################################################################
#### Read arguments 
sub ReadArguments {
    ## Parse the command-line arguments and store their values in the
    ## package variables initialised by the main block ($main::verbose,
    ## %infile, %outfile, $seq_format, $strands, $max_asmb_nb,
    ## %return_fields and $scan_parameters).
    ##
    ## The subroutine takes no parameter, and its return value is not
    ## meaningful. A fatal error is raised when -pl and -asmb are both
    ## specified, or when -scan_param is not followed by a value.
    ##
    ## The POD sections interleaved with the code document each option
    ## next to the statement that handles it.
    my $arg;

    ## Work on a copy of @ARGV: the original list must stay intact,
    ## because it is needed to report the command line in &Verbose()
    my @arguments = @ARGV;

    while (scalar(@arguments) >= 1) {
        $arg = shift(@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut

        ## Verbosity
        if ($arg eq "-v") {
            ## The level is optional ("-v" alone means level 1). The next
            ## argument is only inspected when there is one, in order to
            ## avoid passing an undefined value to &IsNatural().
            if ((scalar(@arguments) >= 1) && (&IsNatural($arguments[0]))) {
                $main::verbose = shift(@arguments);
            } else {
                $main::verbose = 1;
            }

=pod

=item B<-h>

Display full help message

=cut

        ## Help message
        } elsif ($arg eq "-h") {
            &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut

        ## List of options
        } elsif ($arg eq "-help") {
            &PrintOptions();

=pod

=item B<-seq sequence_file>

Sequence file (mandatory). The sequences are scanned with the
significance matrices in order to collect the sites.

=cut

        ## Sequence file
        } elsif ($arg eq "-seq") {
            $infile{sequence} = shift(@arguments);

=pod

=item B<-format sequence_format>

Sequence format. This value is passed to matrix-scan (option
-seq_format). Default: fasta.

=cut

        ## Sequence format
        } elsif ($arg eq "-format") {
            $seq_format = shift(@arguments);

=pod

=item B<-pl pattern_file>

Pattern file. This file contains a list of patterns. Typically, this
file should be the output of the programs oligo-analysis or
dyad-analysis.

=cut

        ## Pattern file
        } elsif ($arg eq "-pl") {
            &RSAT::error::FatalError("Options -asmb and -pl are mutually exclusive")
                if (defined($infile{assembly}));
            $infile{patterns} = shift(@arguments);

=pod

=item B<-max_asmb_nb>

This parameter is passed to pattern-assembly, to indicate the maximal
number of assemblies to return. Default: 5.

Since each assembly is then converted into a PSSM by
matrix-from-patterns, this option also indicates the maximal number of
matrices to return.

Note that when matrix-from-patterns is fed with pre-assembled patterns
(option -asmb instead of -pl), the option -max_asmb_nb is not working.

=cut

        ## Max number of assemblies
        } elsif ($arg eq "-max_asmb_nb") {
#            &RSAT::error::FatalError("Options -asmb and -max_asmb_nb are mutually exclusive")
#                if (defined($infile{assembly}));
            $max_asmb_nb = shift(@arguments);

=pod

=item B<-asmb assembly_file>

Assembly file. As an alternative to the pattern file, the patterns can
be provided in the form of the result of pattern-assembly.

=cut

        ## Assembly file
        } elsif ($arg eq "-asmb") {
            &RSAT::error::FatalError("Options -asmb and -pl are mutually exclusive")
                if (defined($infile{patterns}));
            $infile{assembly} = shift(@arguments);

=pod

=item B<-1str|-2str>

Strands

=over

=item B<-1str> use a single strand to build the motifs

=item B<-2str> use both strands to build the motifs

=back

=cut

        ## Strands
        } elsif ($arg eq "-1str") {
            $strands = "-1str";
        } elsif ($arg eq "-2str") {
            $strands = "-2str";

=pod

=item	B<-o output_prefix>

The program exports several files, whose name is specified by the
output prefix, followed by an extension.

=cut

        ## Output prefix
        } elsif ($arg eq "-o") {
            $outfile{output} = shift(@arguments);

=pod

=item	B<-sites>

Export the sites used to build the count matrix. These sites can be
used for example to draw a sequence logo.

The sites are exported in fasta format, after each count matrix.

=cut

        ## Return sites
        } elsif ($arg eq "-sites") {
            $return_fields{sites} = 1;

=pod

=item	B<-logo>

Export the sequence logos representing the count matrix.

=cut

        ## Return logos
        } elsif ($arg eq "-logo") {
            $return_fields{logo} = 1;

=pod

=item B<-scan_param>

The next argument is passed to matrix-scan (this will raise an error
if these arguments are not supported).

Example:

  -scan_param '-uth Pval 1e-3 -uth rank 40'

will only return the 40 top ranking sites, with a maximal Pvalue of
1e-3.

Any other parameter supported by matrix-scan can be passed in the same
way. The option can be used iteratively on a command line to add up
various parameters. Example:

  -scan_param '-uth Pval 1e-3' -scan_param '-uth rank 40' [...]

If no parameter is specified, the upper threshold on Pval is set to
1e-4 by default, without limit on the number of sites.

=cut

        ## Parameters for matrix-scan: only the value following the
        ## option is transmitted. Without this branch, the option name
        ## "-scan_param" itself would reach the default branch below and
        ## be sent to matrix-scan.
        } elsif ($arg eq "-scan_param") {
            &RSAT::error::FatalError("Option -scan_param must be followed by the parameters to pass to matrix-scan")
                unless (scalar(@arguments) >= 1);
            $scan_parameters .= " ".shift(@arguments);

        ## Backward compatibility: any argument that is not recognised
        ## above is transmitted as such to matrix-scan.
        } else {
            $scan_parameters .= " ".$arg;

        }
    }


=pod

=back

=cut

}

################################################################
#### verbose message
sub Verbose {
    ## Write the header of the log file: the command line (printed by
    ## &PrintArguments()), followed by the list of input files and the
    ## list of output files, one "key <tab> value" line per file.
    ##
    ## The subroutine takes no parameter: it prints to the $log handle
    ## and reads the %infile and %outfile package variables.
    print $log "; matrix-from-patterns ";
    &PrintArguments($log);

    ## A hash is tested for content by evaluating it in boolean context.
    ## The former construct defined(%hash) is a fatal compilation error
    ## since Perl 5.22.
    if (%infile) {
	print $log "; Input files\n";
	while (my ($key,$value) = each %infile) {
	    print $log ";\t$key\t$value\n";
	}
    }
    if (%outfile) {
	print $log "; Output files\n";
	while (my ($key,$value) = each %outfile) {
	    print $log ";\t$key\t$value\n";
	}
    }
}


__END__

=pod

=head1 SEE ALSO

=head2 I<pattern-assembly>

=head2 I<convert-matrix>

=head2 I<matrix-scan>


=cut
