#!/usr/bin/perl -w
############################################################
#
# $Id: footprint-discovery,v 1.40 2010/09/09 01:24:23 amedina Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

footprint-discovery

=head1 VERSION

$program_version

=head1 DESCRIPTION

Detect phylogenetic footprints by applying dyad-analysis in promoters
of a set of orthologous genes.

Adapted from the procedure described in Janky & van Helden (2008).

=head1 AUTHORS

=over

=item Rekin's Janky <rekins@bigre.ulb.ac.be>

=item Jacques van Helden <jacques.van.helden@ulb.ac.be>

=back

=head1 CATEGORY

Sequences

Pattern discovery

=head1 USAGE

footprint-discovery [-i inputfile] -o [output_prefix] \
     -taxon \
     -q query_gene [-q query_gene2 ...] \
     [-v #] [...]


=head2 EXAMPLES

=head2 single-gene footprint discovery

Discover conserved motifs in the promoters of the orthologs of lexA in
Enterobacteriales.

 footprint-discovery  -v 1 -index -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter -index \
		-to_matrix -bg_model taxfreq -q lexA

=head2 footprint discovery for all genes of a genome

Iterate footprint discovery for each gene separately.

 footprint-discovery  -v 1 -index -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter -index \
		-to_matrix -bg_model taxfreq -all_genes -sep_genes


=head1 INPUT FORMAT

The program takes as input a taxon of interest + one or several query
genes.

=head1 OUTPUT FORMAT

The output consists in a set of files, containing the results of the
different steps of the analysis.

=over

=item [prefix]_log.txt

Log file listing the analysis parameters + output file names;

=item [prefix]_query_genes.tab

List of query genes (one or several genes can be entered)

=item [prefix]_ortho_bbh.tab

List of orthologous genes

=item [prefix]_ortho_seq.fasta

Promoter sequences of the orthologous genes

=item [prefix]_ortho_seq_purged.fasta

Purged promoter sequences (for pattern discovery)
=item [prefix]_ortho_filter_dyads.tab

Dyads found in the query genes (for dyad filtering)

=item [prefix]_ortho_dyads.tab

Significant dyads found in the promoters of orthologous genes
(the footprints)

=item [prefix]_ortho_dyads.asmb

Assembled dyads

=item [prefix]_ortho_dyads.png

Feature-map

=item NOTE : 'ortho' is replaced by 'leaders' in the filename prefix with option -infer_operons

=back

=head1 REFERENCES

Janky, R. and van Helden, J. Evaluation of phylogenetic footprint
discovery for the prediction of bacterial cis-regulatory elements
(2008). BMC Bioinformatics 2008, 9:37 [Pubmed 18215291].

=pod

=head1 SEE ALSO

=over

=item get-orthologs

=item dyad-analysis

=back

=head1 WISH LIST


The following options are not yet implemented, but this should be done
soon.

=over

=item B<-taxa>

Specify a file containing a list of taxa, each of which will be
analyzed separately. The results are stored in a separate folder for
each taxon. The folder name is defined automatically.

=item B<-all_taxa>

Automatically analyze all the taxa, and store each result in a
separate folder (the folder name is defined automatically).


=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
require "footprint.lib.pl";



################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
#  %supported_organism = ();
  $program_version = do { my @r = (q$Revision: 1.40 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  %main::infile = ();
  %main::outfile = ();

  local $skip = 0;
  local $last = 0;

  ## Dyad-analysis parameters
  local $dyad_return_fields = "occ,proba,rank";	## Default return fields for dyad-analysis
  local @dyad_return_fields = (); ## User-specified return fields for dyad-analysis

  local $create_index = 0; ## Create an HTML file with index to the other result files
  local %index_list = (); ## List of index files (there can be several indexes with the option -sep_genes)
  local $verbose = 0;		## Verbosity
  local $noov = "-noov";     ## Treatment of self-overlapping patterns
  local $filter = 1; ## Only report dyads present in the query promoter
  local $bg_model = "taxfreq"; ## Background model for pattern discovery (Supported:taxfreq or monads)
  local $to_matrix = 0;		## Convert assembled dyads into PSSM
  local %lth = (occ=>1,occ_sig=>0); ## Lower thresholds
  local %uth = (rank=>50);	    ## Upper thresholds
  local $strands = "-2str"; ## Strands for pattern discovery and pattern matching

  ## Add the supported tasks specific for this program (other tasks
  ## have been defined in the library)
  $supported_task{filter_dyads} = 1;
  $supported_task{dyads} = 1;
  $supported_task{map} = 1;
  $supported_task{index} = 1;
#  $supported_tasks = join (",", keys %supported_task);

  ## Parameters for promoter retrieval
  local $taxon;			## Reference taxon
  local $organism_name;		## Query organism
  local @query_genes = ();	## list of query genes
  local $infer_operons = 0;     ## Infer operon leader genes
  local $dist_thr = 55;	        ## Distance threshold for operon inference
  local $sep_genes = 0;	        ## Analyze each gene separately
  local $promoter = "ortho";    ## Alternatively: leader when the option "infer operons" is active
#  local $all_genes = 0;         ## Analyze all the genes of the query organism
#  local $max_genes = undef;     ## Analyze a restricted number of genes

#  local $supported_organism;


  local $map_format = "png";	## Image format

  ## Job management options
  local $job_prefix = "fpdisco";
  local $die_on_error = 1;
  local $batch = 0;
  local $dry = 0;

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## Check parameters for footprint analysis
  &CheckFootprintParameters();

  ################################################################
  ## Analyze query genes separately or altogether
  if ($sep_genes) {
    my $g = 0;
    my $gene_nb = scalar(@query_genes);
    foreach my $current_gene (@query_genes) {
      $g++;
      &RSAT::message::TimeWarn("Analyzing gene", $g."/".$gene_nb, $current_gene) if ($main::verbose >= 1);
      next if (($skip > 0) && ($g <= $skip));
      last if (($last > 0) && ($g >= $last));
      &RunFootprintDisco($current_gene);
    }
  } else {
    &RSAT::message::TimeWarn("Analyzing a group of ",scalar(@query_genes)," genes", join(";", @query_genes)) if ($main::verbose >= 1);
    &RunFootprintDisco(@query_genes);
  }

  ################################################################
  ## Report the index files
  if ($create_index) {
    if ($main::verbose >= 2) {
      print ("; Index files\n");
      foreach my $key (sort keys %index_list) {
	print join ("\t", ";", $key, $index_list{$key}), "\n";
      }
    }
  }
  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Run footprint discovery flow chart for one or several query genes
sub RunFootprintDisco {
  local (@current_query_genes) = @_;

  local $start_time = &RSAT::util::StartScript();
  local $batch_cmd = "";
  local $out = "";
  local $genes = "";

  ################################################################
  ## Initialize output directory + output files
  local $query_prefix = &GetQueryPrefix();
  local $outfile{prefix} = &GetOutfilePrefix();
#  $outfile{prefix} .= "_".$bg_model;
  &InitOutput();

  ## Output files for dyad-analysis
  $outfile{filter_dyads} = $outfile{prefix}."_filter_dyads.tab";
  $outfile{dyad_prefix} = $outfile{prefix}."_".$promoter."_dyads_3nt_sp0-20".$strands.$noov."_".$bg_model."_sig".$lth{occ_sig};
  $outfile{dyads} = $outfile{dyad_prefix}.".tab";
  $outfile{dyads_html} = $outfile{dyad_prefix}.".html";
  $outfile{asmb} = $outfile{dyad_prefix}.".asmb";
  $outfile{map} = $outfile{dyad_prefix}.".".${map_format};
  if ($to_matrix) {
    $outfile{pssm_prefix} = $outfile{dyad_prefix}."_pssm";
    $outfile{sig_PSSM} = $outfile{pssm_prefix}."_sig_matrices.txt";
    $outfile{count_PSSM} = $outfile{pssm_prefix}."_count_matrices.txt";
  }

  ## Report all outfile names
  if ($main::verbose >= 5) {
    foreach my $key (sort keys %outfile) {
      &RSAT::message::Debug("outfile", $key, "'".$outfile{$key}."'");
    }
  }

  ################################################################
  ## Open file to the HTML index
  &OpenIndex() if ($create_index);

  ################################################################
  ## Parameters for dyad-analysis

  ## Background model file for dyad-analysis
  if ($bg_model eq "taxfreq") {
    $infile{bg_model} = &ExpectedFreqFile($taxon,
					  3,
					  # spacing ? 0-20
					  "upstream-noorf",
					  str=>$strands,
					  noov=>$noov,
					  type=>"dyad",
					  warn=>1,
					  taxon=>1);
  }


  ################################################################
  ## Return fields for dyad-analysis
  if (scalar(@dyad_return_fields) > 0) {
    $dyad_return_fields = join ",", @dyad_return_fields;
  }

  ################################################################
  ## Print query genes in the gene file
  foreach my $gene (@current_query_genes) {
    print $genes $gene, "\t", $organism_name, "\n";
  }
  &IndexOneFile("genes", $outfile{genes}) if ($create_index);

  ################################################################
  ## Print verbose
  &Verbose() if ($verbose);

  ## Predict operon leader genes of the query genes
  &InferQueryOperons() if ($infer_operons);

  ## Dyad filter tasks
  unless ($nofilter) {
    ## Retrieve promoters of the query organism
    &RetrieveQueryPromoters();

    ## Detect all dyads in promoters of query genes for dyad filtering
    &ComputeFilterDyads();
  }

  ################################################################
  ## Identify ortholog genes
  &GetOrthologs();

  ################################################################
  ## Predict operon leader genes for the orthologous genes
  &InferOrthoOperons() if ($infer_operons);

  ################################################################
  ## Retrieve sequences from orthologs
  &RetrieveOrthoSeq();

  ################################################################
  ## Purge sequences
  &PurgeOrthoSeq();

  ################################################################
  ## Discover over-represented dyads in promoters of orthologous genes
  &RSAT::message::TimeWarn("Running dyad-analysis", $outfile{dyads}) if ($verbose >= 2);
  &CheckDependency("dyads", "purged");
  $cmd = "$SCRIPTS/dyad-analysis -v 1 ";
  $cmd .= " -return ".$dyad_return_fields;
  if ($bg_model eq "taxfreq") {
    $cmd .= " -expfreq ".$infile{bg_model} ;
  } elsif ($bg_model eq "monads") {
    $cmd .= " -bg monads" ;
  }
  if (!$nofilter) {
    $cmd .= " -accept ".$outfile{filter_dyads};
  }
  foreach my $field (sort keys %lth) {
    $cmd .= " -lth ".$field." ".$lth{$field};
  }
  foreach my $field (sort keys %uth) {
    $cmd .= " -uth ".$field." ".$uth{$field};
  }
  $cmd .= " -i ".$outfile{purged};
  $cmd .= " -l 3 -sp 0-20";
  $cmd .= " ".$strands;
  $cmd .= " ".$noov;
  $cmd .= " -sort";
  $cmd .= " -o ".$outfile{dyads};
  &one_command($cmd) if ($task{dyads});
  &IndexOneFile("dyads", $outfile{dyads}) if ($create_index);

  ## Generate a HTML table with the discovered dyads
  &CheckDependency("dyads_html", "dyads");
  $cmd = "$SCRIPTS/text-to-html";
  $cmd .= " -i ".$outfile{dyads};
  $cmd .= " -font variable -chunk 1000";
  $cmd .= " -o ".$outfile{dyads_html};
  &one_command($cmd) if ($task{dyads});
  &IndexOneFile("dyads [htm]", $outfile{dyads_html}) if ($create_index);

  ################################################################
  ## Assemble significant dyads
  &RSAT::message::TimeWarn("Assembling significant dyads", $outfile{asmb}) if ($verbose >= 2);
  &CheckDependency("asmb", "dyads");
  $cmd = "$SCRIPTS/pattern-assembly -v 1";
  $cmd .= " -i ".$outfile{dyads};
  $cmd .= " -subst 0 -maxfl 1 -toppat 50";
  $cmd .= " -o ".$outfile{asmb};
  &one_command($cmd) if ($task{dyads});
  &IndexOneFile("assembled dyads", $outfile{asmb}) if ($create_index);

  ################################################################
  ## Generate a feature-map of the discovered dyads
  my $map_title = join "; ", $taxon, $organism_name, $query_prefix;
  &CheckDependency("map", "dyads");
  &CheckDependency("map", "seq");
  &RSAT::message::TimeWarn("Generating feature map", $outfile{map}) if ($verbose >= 2);
  $cmd = "$SCRIPTS/dna-pattern -return limits,sites -origin -0 -N 4";
  $cmd .= " -pl ".$outfile{dyads};
  $cmd .= " -format fasta -i ".$outfile{seq};
  $cmd .= " | $SCRIPTS/convert-features -from dnapat -to ft";
  $cmd .= " | $SCRIPTS/feature-map";
  $cmd .= " -title '".$map_title."'";
  $cmd .= " -format ".$map_format;
  $cmd .= " -scalebar -legend";
  $cmd .= " -scorethick";
  $cmd .= " -o ".$outfile{map};
  &one_command($cmd) if ($task{map});
  &IndexOneFile("feature map", $outfile{map}, image=>1) if ($create_index);

  ################################################################
  ## Convert assembled dyads into a position-specific scoring matrix
  if ($to_matrix) {
    &RSAT::message::TimeWarn("Converting assembled dyads into PSSM", $outfile{pssm}) if ($verbose >= 2);
    &CheckDependency("pssm", "asmb");
    &CheckDependency("pssm", "seq");
    $cmd = "$SCRIPTS/matrix-from-patterns -v 1";
    $cmd .= " -seq ".$outfile{seq};
    $cmd .= " -asmb ".$outfile{asmb};
    $cmd .= " -format fasta";
    $cmd .= " -uth Pval 0.00025";
    $cmd .= " -bginput -markov 0";
    $cmd .= " ".$strands;
    $cmd .= " -logo";
    $cmd .= " -o ".$outfile{pssm_prefix};
    &one_command($cmd);
    &IndexOneFile("significance matrix (PSSM)", $outfile{sig_PSSM}) if ($create_index);
    &IndexOneFile("count matrix (PSSM)", $outfile{count_PSSM}) if ($create_index);
  }

  ################################################################
  ## Send the command to a batch queue (e.g. PC cluster)
  if ($batch) {
    &doit($batch_cmd, $dry, $die_on_error, $verbose, 1, $job_prefix);
  }

  ################################################################
  ## Close output streams
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  print $main::out $exec_time if ($main::verbose >= 1);
  close $out if ($outfile{log});
  close $genes if ($outfile{genes});

  ## Terminate the index file
  if ($create_index) {
    print $index "</table>\n";
    print $index "</blockquote>";
    print $index "<hr size=2 color='#000088'>";
    print $index "</body>";
    print $index "</html>";
    close $index;
  }

}


################################################################
## Add one file to the index file
sub IndexOneFile {
  my ($name, $file, %args) = @_;
  $short_file = `basename $file`;
  print $index "<tr valign=top>\n";
  print $index "<td>", $name, "</td>\n<td><a href=".$short_file.">".$short_file."</a></td>\n";
  if ($args{image}) {
    #    print $index "<td><a href=".$short_file."><img width=100 src=".$short_file."></a></td>\n";
    print $index "</tr><tr><td colspan=\"2\">(Click on image below)</td></tr><tr><td colspan=\"2\"><a href=".$short_file."><img width=\"100%\" src=".$short_file."></a></td>\n";
  }
  print $index ("</tr>\n\n");
}

################################################################
## Display full help message 
sub PrintHelp {
#    system "pod2text -c $0";
    system "cat $0 $ENV{RSAT}/perl-scripts/lib/footprint.lib.pl | pod2text -c";
    exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
    &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {

=pod

=head1 OPTIONS

=cut
  local $arg;
  local @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

    if (&ReadFootprintOptions()) {
      next;

      ### Lower threshold

=pod

=over 4

=item B<-lth field value>

Lower threshold for dyad-analysis.

See the manual of dyad-analysis for a description of the fields on
which a threshold can be imposed.

=cut
    } elsif ($arg eq "-lth") {
      my $thr_field = shift (@arguments);
      my $thr_value =  shift (@arguments);
      $main::lth{$thr_field} = $thr_value;

      ### Upper threshold

=pod

=item B<-uth field value>

Upper threshold for dyad-analysis.

See the manual of dyad-analysis for a description of the fields on
which a threshold can be imposed.

=cut
    } elsif ($arg eq "-uth") {
      my $thr_field = shift (@arguments);
      my $thr_value =  shift (@arguments);
      $main::uth{$thr_field} = $thr_value;

=pod

=item B<-return dyad_return_fields>

Return fields for dyad-analysis.  This argument is passed to
dyad-analysis for the discovery of dyads in promoters of orthologous
genes.

Multiple-fields can be entered either by calling this argument
iterativelyk or by entering multiple fields separated by commas.

Type I<dyad-analysis -help> to obtain the list of supported return
fields.

=cut
    } elsif ($arg eq "-return") {
      push @dyad_return_fields, shift (@arguments);

=pod

=item B<-to_matrix>

Convert assembled patterns into position-specific scoring matrices
(PSSM).  Caution ! This conversion can take time if the sequence set
is large and if there are many assemblies.

=cut
    } elsif ($arg eq "-to_matrix") {
      $main::to_matrix = 1;


	    ## Background model

=pod

=item B<-bg_model taxfreq|monads>

Allow the user to choose among alternative background model (see Janky
& van Helden, 2008).

=over

=item I<taxfreq>

Taxon-wide background model, computed by counting dyad frequencies in
all the promoters of all the genes of the reference taxon.

=item I<monad>

Expected dyad frequencies are the product of monad frequencies
observed in the input sequences.

=back

=cut
    } elsif ($arg eq "-bg_model") {
      $main::bg_model = shift(@arguments);

	  ## Dyads filtering

=pod

=item B<-filter>

Only accept dyads found in the promoter of the query gene, in the
query organism. (option selected by default)

=cut
    } elsif ($arg eq "-filter") {
      $main::filter = 1;

=pod

=item B<-no_filter>

Accept all dyads, even if they are not found in the promoter of the
query gene, in the query organism. (will cancel -filter option if selected)

=cut

    } elsif ($arg eq "-no_filter") {
      $main::nofilter = 1;


=pod

=item B<-dist_thr value> 

Specify here the intergenic distance threshold in base pairs. Pair of adjacent genes with intergenic distance equal or less than this value are predicted to be within operon. (default : 55)

=cut
    } elsif ($arg eq "-dist_thr") {
      $main::dist_thr = shift(@arguments);

    } else {
      &FatalError(join("\t", "Invalid option", $arg));
    }
  }

}

################################################################
## Verbose message
sub Verbose {
  my $header_verbose = "";
  $header_verbose .=  "; footprint-discovery ";
  $header_verbose .= &PrintArguments();
  $header_verbose .= "\n";
  $header_verbose .= sprintf "; %-22s\t%s\n", "Program version", $program_version;
  $header_verbose .= sprintf "; %-22s\t%s", "Working directory", `pwd`;
  $header_verbose .= sprintf "; %-22s\t%s\n", "Query organism", $organism_name;
  $header_verbose .= sprintf "; %-22s\t%s\n", "Reference taxon", $taxon;
  $header_verbose .= sprintf "; %s\n", "Promoters from predicted operon leader genes" if ($infer_operons);
  if (scalar(@current_query_genes) == 1) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query gene", $current_query_genes[0];
  } elsif (scalar(@current_query_genes) <= 10) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query genes", join "; ", @current_query_genes;
  } else {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query genes", scalar(@current_query_genes);
  }
  $header_verbose .= sprintf "; %-22s\t%s\n", "Background model", $bg_model;
  if ($filter) {
      $header_verbose .= sprintf "; %s\n", "Dyad filtering ON";
  }  else {
      $header_verbose .= sprintf "; %s\n", "No dyad filtering";
  }

  if (defined(%main::infile)) {
    print $main::out "; Input files\n";
    while (my ($key,$value) = each %main::infile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  if (defined(%main::outfile)) {
    print $main::out "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  print $out $header_verbose;


  if ($main::create_index) {
    $header_verbose =~ s/^; //;
    $header_verbose =~ s/\n; /\n/gm;
    print $index "<pre>";
    print $index $header_verbose;
    print $index "</pre>";
  }
}


__END__


