#!/usr/bin/perl -w
############################################################
#
# $Id: footprint-scan,v 1.54 2011/02/23 15:59:37 amedina Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

footprint-scan

=head1 DESCRIPTION

Scan promoters of orthologous genes with one or several
position-specific scoring matrices (PSSM) in order to detect motifs
showing a higher number of hits than expected by chance
(over-represented motifs).

=head1 AUTHORS

=over

=item Jacques van Helden <Jacques.van.Helden@ulb.ac.be>

=item Alejandra Medina-Rivera  <amedina@lcg.unam.mx>

=back

=head1 CATEGORY

=over

=item comparative genomics

=back

=head1 USAGE

footprint-scan [-m matrix_inputfile] [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

=head2 Query gene(s)

The analysis can be performed either on a single gene, or several
genes separately (option -sep_genes), or on a group of genes
altogether.

Query genes can be entered on the command line (option -q) or in a
text file (option -genes). Alternatively, the option -all_genes will
run the analysis on all the genes of a genome.

=head2 Position-specific scoring matrices (PSSMs)

I<footprint-scan> requires a collection of (at least one)
position-specific scoring matrices (PSSM). 

All the formats supported by I<matrix-scan> can be used to enter the
matrices. However, we recommend using the TRANSFAC format, which
supports multiple matrices (we usually want to scan promoters with a
full collection of matrices), and associates an identifier with each
matrix (e.g. the name of the transcription factor).

=head3 Example of TRANSFAC format

The following example shows a text file describing two matrices,
representing the binding motifs annotated in RegulonDB for AgaR and
AraC, respectively. Motifs must be separated by a line containing a
double slash (//).

The complete file can be downloaded from RegulonDB
(http://regulondb.ccg.unam.mx/).

 AC  ECK12_ECK120012515_AgaR.24
 XX
 ID  ECK12_ECK120012515_AgaR.24
 XX
 P0       A     T     C     G
 1        5     0     1     5
 2        6     1     4     0
 3        4     0     5     2
 4        5     4     0     2
 5        4     6     0     1
 6        1     5     3     2
 7        0     2     8     1
 8        4     1     1     5
 9        4     5     1     1
 10       3     8     0     0
 11       5     6     0     0
 12       1     8     1     1
 13       2     0     4     5
 14       4     5     2     0
 15       3     8     0     0
 16       3     8     0     0
 17       0     2     9     0
 18       0     2     2     7
 19       3     7     1     0
 20       4     7     0     0
 21       3     8     0     0
 22       3     4     0     4
 23       3     4     0     4
 24       3     4     3     1
 25       3     8     0     0
 XX
 //
 AC  ECK12_ECK120012316_AraC.18
 XX
 ID  ECK12_ECK120012316_AraC.18
 XX
 P0       A     T     C     G
 1        0    10     0     3
 2        7     4     1     1
 3        0     6     5     2
 4        2     2     3     6
 5        0     0     6     7
 6        9     0     0     4
 7        0     2     9     2
 8        2     7     3     1
 9        9     3     0     1
 10       7     4     0     2
 11       4     8     0     1
 12       3     3     5     2
 13       2    10     0     1
 14       2     7     1     3
 15       6     1     6     0
 16       0    11     2     0
 17       1     0     3     9
 18       1     5     5     2
 19       5     2     0     6
 XX
 //


=head1 OUTPUT FORMAT

The result comprises several files for the orthologs, upstream
sequences, matrix-scan results, feature-maps. By default, a directory
is created for each query gene, with a name indicating the parameters:

 footprints/[taxon]/[Organism]/[gene]

Alternatively, the output folder can be specified manually with the
option I<-o>.

=head1 EXAMPLES OF UTILIZATION

=head2 Detecting trans-acting factors for single gene, with a collection of known motifs

Let us assume that we have a collection of PSSMs annotated for a given
organism (e.g. the matrices for all the I<Escherichia coli>
transcription factors annotated in RegulonDB). We would like to scan
the promoters of orthologs of a given gene, in order to predict the
transcription factors that might be involved in its regulation. The
program will count the hits for each matrix, and report those showing
a significant enrichment in the promoters of its orthologs.

In this example, we use a slightly higher verbosity than usual (-v
2) in order to keep track of the progress of the analysis. This also
reports the commands that are executed, and allows us to examine all
their parameters.

 footprint-scan -v 2  -org Escherichia_coli_K12 \
    -taxon Enterobacteriales -q sodA -q lexA -q araC \
    -bgfile ${RSAT}/public_html/data/taxon_frequencies/Enterobacteriales/dyads_3nt_sp0-20_upstream-noorf_Enterobacteriales-noov-1str.freq.gz \
    -m RegulonDB_matrices_transfac_format.txt \
    -matrix_format transfac \
    -matrix_suffix RegulonDB \
    -sep_genes

 footprint-scan -v 2  -org Escherichia_coli_K12 \
    -taxon Enterobacteriales -q sodA  \
    -bgfile ${RSAT}/public_html/data/taxon_frequencies/Enterobacteriales/dyads_3nt_sp0-20_upstream-noorf_Enterobacteriales-noov-1str.freq.gz \
    -m RegulonDB_matrices.tab \
    -matrix_format tab \
    -matrix_suffix RegulonDB \
    -sep_genes


=head2 Detecting all putative target genes for a given transcription factor

Given a PSSM, we would like to detect new putative binding sites for a
given transcription factor. The usual approach would be to retrieve
all the upstream sequences of the organism of interest and then
search for high-scoring sites with matrix-scan. However, a high score
in a single sequence does not mean that the site is a real binding site.

Sequences with a functional relevance tend to be conserved across
some branches of the phylogeny. We thus expect functionally relevant
binding sites to be conserved in a group of closely related orthologous
sequences. footprint-scan can search for putative binding sites in
the whole set of upstream regions of an organism while evaluating whether
the detected binding sites are conserved (over-represented) in the
respective orthologous sequences.

 footprint-scan -v 2  -org Escherichia_coli_K12 \
    -taxon Enterobacteriales -all_genes \
    -bgfile ${RSAT}/public_html/data/taxon_frequencies/Enterobacteriales/dyads_3nt_sp0-20_upstream-noorf_Enterobacteriales-noov-1str.freq.gz \
    -m MetJ_Regulon_matrix.tab \
    -matrix_format tab \
    -matrix_suffix RegulonDB \
    -sep_genes

=head1 SEE ALSO

=head2 footprint-discovery

The difference between I<footprint-scan> and I<footprint-discovery>
is that I<footprint-scan> requires prior knowledge of the motifs (in
the form of position-specific matrices), whereas
I<footprint-discovery> performs I<ab initio> motif discovery.

=head1 WISH LIST

=head2 Revise the manual

The manual is still very incomplete, Jacques van Helden needs to
revise and complete it.

=head2 Support as Web services

On the basis of the existing Web service for footprint-discovery.

=head2 Web interface

Alejandra Medina-Rivera will implement the Web interface.  It would be
more convenient to program the Web page after the Web services, in
order to benefit from the support of Web services (including the
token). To be checked with Morgane Thomas-Chollier & Olivier Sand.

=head2 Tutorial

It would be worth preparing a tutorial (or a chapter in Methods in
Molecular Biology) to explain in detail the interpretation of the
result.

The tutorial could cover the 3 interfaces (command-line, Web services
and Web form).

=head2 Motif co-occurrences

After having detected the motifs in the different sequences, analyze
their co-occurrences in order to report the factors having sites in
the same sequences (putatively interacting factors). Actually, this
option should be implemented in matrix-scan rather than
footprint-scan, because it applies to any type of analysis.

=cut


## Append the "lib/" directory located beside this script to @INC, so
## that the require statements below can find the library files there.
BEGIN {
    ## The pattern matches the last path component of $0 (a run of
    ## characters other than "/", "(" and ")"). The pre-match variable
    ## ($`) then holds everything before it, i.e. the directory prefix
    ## of the script path (empty when $0 contains no slash).
    push(@INC, "$`lib/") if ($0 =~ m{([^(/)]+)$});
}
require "RSA.lib";
require "footprint.lib.pl";

################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  ##
  ## Variables declared with "local" are package variables of main::
  ## that receive a temporary (dynamically scoped) value: they remain
  ## visible from the subroutines called within this block.

  ## Start time and program version (digits of the RCS revision tag)
  local $start_time = &RSAT::util::StartScript();
  $program_version = do { my @r = (q$Revision: 1.54 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  ## Verbosity and output stream
  $main::verbose = 0;
  #$main::in = STDIN;
  $main::out = STDOUT;

  ## Error handling
  local $no_die = 1;            ## Don't die when a problem occurs
  local $die_on_error = 1;

  ## Range of query genes treated in the per-gene loop
  local $skip = 0;
  local $last = 0;
  local $skip_gene = 0;

  ## Default size of the sliding window, chosen for bacterial genomes
  $main::window_size = 50;

  ## Parameters for promoter retrieval
  local $taxon;                 ## Reference taxon
  local $organism_name;         ## Query organism
  local @query_genes = ();      ## List of query genes
  local $infer_operons = 0;     ## Infer operon leader genes
  $main::sep_genes = 0;         ## Analyze each gene separately
  $main::crer = 0;              ## ON THE WISH LIST: when set to one, look for CRER over-representation
  local %top_sig_row = ();

  ## Matrix scanning parameters
  local @matrix_files = ();
  local $matrix_format = "tab";
  #local $matrix_dir="data/matrices";
  #local $matrix_file = $matrix_dir."/PHO4_matrix.tab";
  local $strands = "-2str";     ## Strands for pattern discovery and pattern matching
  local $pseudo = 1;
  $main::bg_pseudo = 0.05;
  local $markov = 1;
  #$bg_format="oligo";

  ## Formats of the graphical outputs
  local $map_format = "png";
  local $plot_format = "png";

  ## Additional options (empty by default) for the different tasks
  local $occ_sig_opt = "";
  local $occ_sig_graph_opt = "";
  local $scan_opt = "";
  local $map_opt = "";

  ## Purging and filtering
  $main::no_purge = 0;
  $main::filter = 0;
  $main::filter_pval = 0;
  #  $main::footprint_scan=1;

  ## Local definition of threshold parameters passed to matrix-scan,
  ## used to check if the options passed to matrix-scan are correct
  local %lth = ();              # lower threshold values
  local %uth = ();              # upper threshold values
  @supported_thresholds = qw(
    score normw pval ln_pval sig
    proba_m proba_b
    rank rank_pm
    occ occ_cum inv_cum exp_occ occ_pval occ_eval occ_sig occ_sig_rank
    crer_size crer_sites crer_sig crer_pval
  );
  my $supported_thresholds = join(",", @supported_thresholds);

  ## Index the supported thresholds for a fast validity check
  %main::supported_threshold = map { $_ => 1 } @supported_thresholds;

  ## Same indexing for the thresholds supported for the graphs.
  ## NOTE(review): @supported_graph_thresholds is not filled in this
  ## block; presumably it is defined in a required library - to be
  ## confirmed.
  my $supported_graph_thresholds = join(",", @supported_graph_thresholds);
  %main::supported_graph_threshold = map { $_ => 1 } @supported_graph_thresholds;

  ## Informative lines of the occurrence significance graph:
  ## by default, these lines are not drawn
  $main::draw_info_lines = 0;

  ## Supported tasks (added to those of footprint.lib.pl)
  @supported_tasks = qw(
    all operons query_seq orthologs ortho_seq purge
    occ_sig filter_scan occ_sig_graph scan map synthesis
  );
  $supported_tasks = join(",", @supported_tasks);
  %supported_task = map { $_ => 1 } @supported_tasks;

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## (The organism name checks that stood here are disabled.)

  ## Check parameters for footprint analysis
  &CheckFootprintParameters();

  ################################################################
  ## Check parameters for matrix-scan, and collect them in the string
  ## $ms_parameters (options appended one after the other).
  local $ms_parameters = " -v 1";

  ## The background method is only set by the options -bginput and
  ## -window: use an empty string when it is undefined, to avoid
  ## comparing an undefined value (warning under -w).
  my $bg_source = defined($bg_method) ? $bg_method : "";

  if (defined($main::infile{bg})) {
      ## Background model read from a file
      $ms_parameters .= " -bgfile ".$main::infile{bg};
      if (defined($main::bg_format)) {
          $ms_parameters .= " -bg_format ".$main::bg_format;
      }

  } elsif ($bg_source eq "input") {
      ## Background model estimated from the input sequences
      $ms_parameters .= " -bginput ";
      if (defined($markov)) {
          $ms_parameters .= " -markov ".$markov;
      } else {
          &RSAT::error::FatalError("The option -bginput requires to specify a Markov model (option -markov)");
      }

  } elsif ($bg_source eq "window") {
      ## Background model estimated from a sliding window
      $ms_parameters .= " -window ". $main::window_size ." " ;
      if (defined($markov)) {
          $ms_parameters .= " -markov ".$markov;
      } else {
          ## The message names the option actually used in this branch
          &RSAT::error::FatalError("The option -window requires to specify a Markov model (option -markov)");
      }

  } else {
      &RSAT::error::FatalError("You must define a background model using either -bgfile , -window or -bginput ");
  }

  ## Matrix file and format: at least one existing matrix file is required
  &RSAT::error::FatalError("You must specify a matrix file (option -m)") unless (scalar(@matrix_files) > 0);
  foreach my $file (@matrix_files) {
      &RSAT::error::FatalError("Matrix file $file does not exist.Matrix file is mandatory.")  unless (-e $file ) ;
      $ms_parameters .= " -m ".$file;
  }
  &RSAT::error::FatalError("You must specify the matrix format (option -matrix_format)") unless (defined($matrix_format));
  $ms_parameters .= " -matrix_format ".$matrix_format;

  ## Matrix suffix (mandatory), completed when operons are inferred
  &RSAT::error::FatalError("You must specify a matrix suffix (option -matrix_suffix)") unless (defined($matrix_suffix));
  $matrix_suffix .= "_infer-operons" if ($infer_operons);

  ## Sequence format
  $ms_parameters .= " -seq_format fasta";

  ## Filter: default P-value threshold when -filter is used without
  ## -filter_pval; -filter_pval and the filter task have prerequisites.
  if ($main::filter) {
      $main::filter_pval = 1e-4 unless ($main::filter_pval);
  }
  &RSAT::error::FatalError("You must use the option -filter in order to set -filter_pval") if ($main::filter_pval && !$main::filter);
  &RSAT::error::FatalError("You must specify a bgfile for the filter task") if ($main::filter && !$main::filter_bgfile);

  ## Thresholds (lower, then upper)
  foreach my $key (keys(%lth)) {
      $ms_parameters .= " -lth ".$key." ".$lth{$key};
  }
  foreach my $key (keys(%uth)) {
      $ms_parameters .= " -uth ".$key." ".$uth{$key};
  }

  ## Strands
  $ms_parameters .= " ".$strands;

  ## Origin for the positions
  $ms_parameters .= " -origin end";

  ## Pseudo-count for the matrix
  $ms_parameters .= " -pseudo ".$pseudo;

  ## Pseudo-frequency for the background model
  $ms_parameters .= " -bg_pseudo ".$main::bg_pseudo;

  ################################################################
  ## If -tf is used analyze this gene independently
  ##
  ## The transcription factor gene is passed to RunFootprintScan with
  ## the filter disabled. An error raised during this analysis is either
  ## reported (when $no_die is set) or converted into a fatal error. The
  ## program stops if the TF ortholog file is empty or missing.
  if ($main::tf) {
      &RSAT::message::TimeWarn("Analyzing TF", $main::tf) if ($main::verbose >= 1);

      ## The current gene is localised before the eval block (rather than
      ## inside it), so that it is still set when the error is reported.
      local $current_gene = $main::tf;
      eval {
          ## NOTE(review): the filter flag is set to 1 after a successful
          ## scan whatever its initial value, and stays at 0 if the scan
          ## dies - to be confirmed.
          $main::filter = 0;
          &RunFootprintScan($current_gene);
          $main::filter = 1;
      };
      if ($@) {
          if ($no_die) {
              ## Catch the error and report it in the log file
              print $out join("\t", "ERROR", $current_gene, $@);
              &RSAT::message::Warning("ERROR", $current_gene, $@);
          } else {
              &RSAT::error::FatalError($@);
          }
      }
      &RSAT::error::FatalError("There were no organisms with an ortholog for the specified TF") unless (-s $main::tf_ortho_file );
  }

  ################################################################
  ## Analyze query genes separately or altogether
  ##
  ## With -sep_genes, RunFootprintScan is called once per query gene and
  ## errors are caught gene per gene; if the synthesis task is active, a
  ## synthetic table (tab-delimited + HTML index) is then written with
  ## one row per gene. Without -sep_genes, all the query genes are passed
  ## together to RunFootprintScan.
  if ($main::sep_genes) {

      ## Open the synthesis files
      if ($task{synthesis}) {
          ($synthesis_index, $synthesis_table) = &OpenSynthesisFilesScan;
          &RSAT::message::Info("Synthesis files: "," Index ". $outfile{synthesis_html} . " Table  ". $outfile{synthesis_tab} );
      }

      ## Run the analysis for each gene in turn
      my $g = 0;
      my $gene_nb = scalar(@query_genes);
      local $current_gene;
      foreach $current_gene (@query_genes) {
          $g++;

          ## Skip the first genes of the list (option value $skip)
          if (($skip > 0) && ($g <= $skip)) {
              &RSAT::message::Info("Skipping gene", $g."/".$gene_nb, $current_gene) if ($main::verbose >= 2);
              next;
          }

          ## Stop when reaching gene number $last.
          ## NOTE(review): the gene number $last itself is not analyzed
          ## (test is >=) - confirm that this is the intended meaning.
          if (($last > 0) && ($g >= $last)) {
              &RSAT::message::Info("Last gene", $g."/".$gene_nb, $current_gene) if ($main::verbose >= 2);
              last;
          }

          &RSAT::message::TimeWarn("Analyzing gene", $g."/".$gene_nb, $current_gene) if ($main::verbose >= 1);
          eval {
              &RunFootprintScan($current_gene);
          };
          if ($@) {
              if ($no_die) {
                  ## Catch the error and report it in the log file
                  print $out join("\t", "ERROR", $current_gene, $@);
                  &RSAT::message::Warning("ERROR", $current_gene, $@);
              } else {
                  &RSAT::error::FatalError($@);
              }
          }
      }

      ## Create the synthetic table
      if ($task{synthesis}) {

          ## Directory of the synthesis table, used to write relative links
          my $syn_dir = `dirname $outfile{synthesis_tab}`;
          chomp($syn_dir);

          ## Convert a file path into a link relative to the synthesis
          ## directory. The directory is quoted (\Q..\E) because it is a
          ## literal path, not a regular expression.
          my $relative_link = sub {
              my ($path) = @_;
              $path =~ s|\Q${syn_dir}\E/||g;
              return $path;
          };

          my $img_hight = 120;              ## Height of the thumbnails in the HTML index
          my $html_sep = "</td>\n<td>";     ## Separator between two HTML cells
          my $rank = 0;                     ## Rank of the gene in the sorted table
          my $occ_sig_header = "";          ## Set to 1 once the header has been printed

          ## One row per gene, sorted by decreasing value of %top_sig
          ## (presumably the top significance, filled during the scan).
          foreach my $gene (sort {$top_sig{$b} <=> $top_sig{$a}} keys %top_sig) {
              $rank++;

              ## Collect information about the gene (comment lines discarded)
              my $gene_info = `(add-gene-info -i $gene_list{$gene} -org $organism_name -info name,descr,upstr_neighb_name | grep -v "^;")`;
              chomp($gene_info);

              ## Suppress the name of the query organism (formerly
              ## hard-coded as Escherichia_coli_K12). An empty name is
              ## skipped, since an empty pattern would re-use the last
              ## successfully matched pattern.
              if (defined($organism_name) && ($organism_name ne "")) {
                  $gene_info =~ s/\Q$organism_name\E//;
              }

              ## Split the fields and discard the first one
              my @gene_info = split(/\t+/, $gene_info);
              shift @gene_info;

              ## Gene information for the table, and for the HTML index
              my $gene_info_line = join("\t", @gene_info);
              my $gene_info_html_line = "<td>".join("</td>\n<td>", @gene_info)."</td>";

              ##############
              ## Table
              ## Print the header of the synthesis files (only once)
              unless ($occ_sig_header) {
                  &HeaderSynthesisFilesScan($synthesis_index, $synthesis_table, $occ_sig_file{$gene});
                  $occ_sig_header = 1;
              }

              ## Print the row of the tab-delimited table
              print $synthesis_table join("\t", $gene, $top_sig_row{$gene}, $rank, $gene_info_line, $occ_sig_file{$gene}), "\n";

              ##############
              ## HTML index
              print $synthesis_index "<tr>\n";
              my $sig_line = $top_sig_row{$gene};
              $sig_line =~ s/\t+/$html_sep/g;
              print $synthesis_index "<td>".$gene.$html_sep.$sig_line.$html_sep.$rank."</td>";

              ## Link to the occurrence significance table
              if ((defined($occ_sig_file{$gene})) && (-e $occ_sig_file{$gene})) {
                  my $file = $relative_link->($occ_sig_file{$gene});
                  print $synthesis_index "<td> <a href='".$file."'>[sig]</a> </td>\n";
              }

              ## Thumbnail + link to the occurrence significance graph.
              ## A space separates the src and height attributes.
              if ((defined($occ_sig_graph_file{$gene})) && (-e $occ_sig_graph_file{$gene})) {
                  my $file = $relative_link->($occ_sig_graph_file{$gene});
                  print $synthesis_index "<td> <a href='".$file."'><img  src='".$file."' height='".$img_hight."'></a> </p>";
                  print $synthesis_index " <a href='".$file."'>[sigplot]</a> </td>\n";
              }

              ## Thumbnail + link to the occurrence frequency graph
              if ((defined($occ_freq_graph_file{$gene})) && (-e $occ_freq_graph_file{$gene})) {
                  my $file = $relative_link->($occ_freq_graph_file{$gene});
                  print $synthesis_index "<td> <a href='".$file."'><img  src='".$file."' height='".$img_hight."'></a> </p>";
                  print $synthesis_index " <a href='".$file."'>[freqplot]</a> </td>\n";
              }

              ## Link to the feature map
              if ((defined($map_file{$gene})) && (-e $map_file{$gene})) {
                  my $file = $relative_link->($map_file{$gene});
                  print $synthesis_index "<td> <a href='".$file."'>[map]</a></td>\n";
              }

              ## Gene information, then end of the row
              print $synthesis_index $gene_info_html_line;
              print $synthesis_index "</tr>\n";
          }
      }

  } else {
      &RunFootprintScan(@query_genes);
  }

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose);

  ################################################################
  ## Report the execution time and close the output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  print $main::out $exec_time if ($main::verbose >= 1);
  close $main::out if ($main::outfile{output});

  ################################################################
  ## Write the footer of the HTML synthesis index and close it.
  ##
  ## In this block the index is only opened when the genes are analyzed
  ## separately with the synthesis task. In all other cases the handle
  ## is undefined, and printing to it would abort the program before
  ## the normal exit.
  if (defined($synthesis_index)) {
      print $synthesis_index "</table>\n";
      print $synthesis_index "<p><pre>", $exec_time, "</pre></p>\n";
      print $synthesis_index "</body>\n";
      print $synthesis_index "</html>\n";
      close($synthesis_index);
  }

  ################################################################
  ## Report the index files
  if ($main::verbose >= 2) {
      print("; Index files\n");
      foreach my $key (sort keys %index_list) {
          print join("\t", ";", $key, $index_list{$key}), "\n";
      }
  }

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message
##
## This script ($0) and the footprint library found under $ENV{RSAT} are
## concatenated with cat and piped to pod2text; the program then exits.
sub PrintHelp {
    my $footprint_lib = "$ENV{RSAT}/perl-scripts/lib/footprint.lib.pl";
    my $help_command = "cat $0 ${footprint_lib} | pod2text -c";
    system($help_command);
    exit();
}

################################################################
## Display short help message
##
## No specific short message is defined: the call is simply forwarded
## to PrintHelp, which prints the full help and exits.
sub PrintOptions {
    PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
    local $arg;
    local @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
    while (scalar(@arguments) >= 1) {
	$arg = shift (@arguments);

=pod

=head1 OPTIONS

=cut

	## Read the options that are common to all footprint-detection programs
	if (&ReadFootprintOptions()) {
	    next;
	    
=pod
    
=over 4

=item B<-m matrix_file>

Matrix file. This argument is mandatory.

This argument can be used iteratively to scan the sequence with
multiple matrices.

=cut
		
	} elsif ($arg eq "-m") {
	    push @matrix_files, shift(@arguments);


=pod

=item B<-matrix_format matrix_format>

Matrix format. Default is tab. This argument is mandatory.

=cut

	} elsif ($arg eq "-matrix_format") {
	    $main::matrix_format = lc(shift(@arguments));
	    
=pod

=item B<-matrix_suffix matrix_suffix>

Matrix suffix. This argument is mandatory.

The matrix suffix indicates the nature of the matrix file. For
example, if your matrix file contains a single matrix for a
transcription factor (say LexA), you can indicate it with

-matrix_suffix LexA

whereas if your matrix files contains all the matrices from the
RegulonDB database, you can specify

-matrix_suffix RegulonDB

The matrix suffix will be concatenated to the output prefix, in order
to maintain separate output files for distinct analyses performed on
the same promoter sequences. For example, if you run successively the
analysis with the matrix LexA, and then with the matrix CRP, you don't
want to loose the results of the first scanning when running the
second scanning.

=cut
	} elsif ($arg eq "-matrix_suffix") {
	    $main::matrix_suffix = shift(@arguments);
=pod

=item B<-tf transcription_factor>

Most matrices are derived from specific TFBSs, so they represent the preferential sequence where a TF binds.
This option will search for all the genomes in the given taxon where there is an ortholog for the specified TF.
Orthologs for the query genes will only be retrieved if the organism has an ortholog for the TF.

=cut
	} elsif ($arg eq "-tf") {
	    $main::tf = shift(@arguments);
=pod

=item B<-pseudo #>

Pseudo-count for the matrix (default: 1). See matrix-scan for details.

=cut
	} elsif ($arg eq "-pseudo") {
	    $main::pseudo = shift(@arguments);
	    &RSAT::error::FatalError(join("\t", $main::pseudo,
					  "Invalid value for pseudo count. Must be a real value."))
		unless (&IsReal($main::pseudo));

=pod
    
=item B<-bgfile background_file>

Background model file.

=cut
	} elsif ($arg eq "-bgfile") {
	    $main::infile{bg} = shift(@arguments);
=pod
    
=item B<-bg_format background_format>

Format of background model file.
For supported formats see: convert-background-model -h

=cut
	} elsif ($arg eq "-bg_format") {
	    $main::bg_format = shift(@arguments);

=pod

=item B<-bginput>

Calculate background model from the input sequence set.

=cut
	} elsif ($arg eq "-bginput") {
	    &RSAT::error::FatalError("Options -bgfile and -bginput  are mutually exclusive") if (defined($infile{bg}));
	    $main::bg_method = "input";
	    &RSAT::error::FatalError("The options -bginput and -bgfile are mutually exclusive.")
		if defined($main::infile{bg});

=pod

=item B<-markov>

Order of the markov chain for the background model. 

This option is incompatible with the option -bgfile. 

=cut
	} elsif ($arg eq "-markov") {
	    $main::markov = shift(@arguments);
	    &RSAT::error::FatalError("Markov order must be a natural number.") 
		unless &RSAT::util::IsNatural($main::markov);
	    &RSAT::error::FatalError("The options -markov and -bgfile are mutually exclusive.") 
		if defined($infile{bg});
=pod


=item B<-window>

Size of the sliding window for the background model calculation.
When this option is specified, the matrix pseudo-count is equally
distributed.

The background model is calculated locally at each step of the scan,
by computing transition frequencies from a sliding window centred
around the considered segment. The model is thus updated at each
scanned position. This model is called "adaptive". Note that the
sliding window must be large enough to train the local Markov model.
The required sequence length increases exponentially with the Markov
order. This option is thus usually suitable for low order models
only (-markov 0 to 1).

This option is incompatible with the option -bgfile. 

=cut
	} elsif ($arg eq "-window") { 
            &RSAT::error::FatalError("Options -bgfile and -window  are mutually exclusive") if (defined($infile{bg})); 
            &RSAT::error::FatalError("Options -bginput and -window  are mutually exclusive") if ($main::bg_method eq "input");
	    $main::bg_method = "window"; 
            $main::window_size = shift(@arguments);
	    &RSAT::error::FatalError("Window size must be a natural number.") unless &RSAT::util::IsNatural($main::window_size);
	    &RSAT::error::FatalError("The options -window and -bgfile are mutually exclusive.") if (defined($infile{bg})); 

=pod

=item B<-bg_pseudo #>

Pseudo frequency for the background model. Value must be a real
between 0 and 1.

If this option is not specified, the pseudo-frequency value depends on
the background calculation.

For -bginput and -window, the pseudo frequency is automatically
calculated from the length (L) of the sequence following this formula:

sqrt(L)/(L+sqrt(L))

For -bgfile, default value is 0.05. 

In other cases, if the length (L) of the training sequence is known
(e.g. all promoters for the considered organism), the value can be set
manually by using the option -bg_pseudo. In such case, the background
pseudo-frequency might be set, as suggested by Thijs et al., to the
following value:

sqrt(L)/(L+sqrt(L))


=cut
	} elsif ($arg eq "-bg_pseudo") {
	    $main::bg_pseudo = shift(@arguments);
	    &RSAT::error::FatalError(join("\t", $main::bg_pseudo,
					  "Invalid format for bg_pseudo, should be a Real number between 0 and 1."))
		unless ((&IsReal($main::bg_pseudo)) && (0 <= $main::bg_pseudo) && ($main::bg_pseudo <= 1));

	  
=pod

=item B<-filter>

Filter TF-interactions that are not present on the query organism.
The option -filter_pval can be used to set the threshold for the detected sites.


=cut
	} elsif ($arg eq "-filter") {
	    $main::filter = 1;
=pod

=item B<-filter_bgfile>

Background model file used to scan the query sequences for the filtering.



=cut
	} elsif ($arg eq "-filter_bgfile") {
    	    $main::filter_bgfile= shift(@arguments);

	  
=pod

=item B<-filter_pval>

Set the threshold to filter out TF-interactions that are not present on the query organism.



=cut
	} elsif ($arg eq "-filter_pval") {
    	    $main::filter_pval= shift(@arguments);

	     &RSAT::error::FatalError($main::filter_pval, "Invalid value for filter_pval. Should be a real number. ") 
	unless (&RSAT::util::IsReal($main::filter_pval));
=pod

=item B<-occ_sig_opt>    

Additional options passed to matrix-scan for the test of
over-representation of matrix hits.  

Supported threshold fields for the matches : score pval eval sig normw
proba_M proba_B rank crer_sites crer_size

Supported threshold fields for score distributions: occ occ_sum
inv_cum exp_occ occ_pval occ_eval occ_sig occ_sig_rank

Examples:
 To return only the "best" score for each gene
-occ_sig_opt '-uth rank 1'

To analyze the distribution only above a weight threshold of 7.
 -occ_sig_opt '-lth score 7'

To analyze the distribution for sites having a P-value threshold of
1e-3.
-occ_sig_opt '-uth pval 1e-3'

Note: the argument passed to matrix-scan is delimited by single
quotes, and can thus not contain any quote.

=cut
    
    } elsif ($arg eq "-occ_sig_opt") {
    my $new_occ_sig_opt = shift(@arguments);
    $occ_sig_opt .= " ".$new_occ_sig_opt;
    my $th_field = (split(/ +/,$new_occ_sig_opt ))[1];
    &RSAT::error::FatalError("Field +$th_field+ is not a supported threshold field for matrix-scan.")  unless ($main::supported_threshold{$th_field}) ;

=pod

=item B<-info_lines>

Set of informative lines for the occurrence significance graph, this includes:

 '-hline violet 0 ' line showing the significance zero line.
 '-vline violet 0 ' line showing the score zero value 
 '-vline red sig_max ' the sig_max (score with the maximal
 significance) value is extracted from the occurrence significance
 table taken only for those score values greater than zero

=cut
	} elsif ($arg eq "-info_lines") {
    $main::draw_info_lines=1;
    &RSAT::message::Info("Informative lines for ocurrence significance graph will be drawn ");

=pod

=item B<-occ_sig_graph_opt>

Additional options passed to XYgraph for drawing the
occurrence significance graph.

Note: the argument passed to XYgraph is delimited by single quotes,
and can thus not contain any quote.

=cut

    } elsif ($arg eq "-occ_sig_graph_opt") {
    my $new_occ_sig_graph_opt = shift(@arguments);
    $occ_sig_graph_opt .= " ".$new_occ_sig_graph_opt;
    my $th_graph_field = (split(/ +/, $new_occ_sig_graph_opt ))[0];
    &RSAT::error::FatalError(join", " ,$th_graph_field, "is not a supported threshold field for XYgraph.")   unless (  $main::supported_graph_threshold{$th_graph_field} );

=pod

=item B<-plot_format>

Format for the occurrence plots (occurrence frequencies, occurrence significance).
Supported: all formats supported by the program XYgraph

=cut

    } elsif ($arg eq "-plot_format") {
    $main::plot_format = shift(@arguments);

=pod

=item B<-scan_opt>

Additional options passed to matrix-scan for site detection and
feature-map drawing.

Examples:

Scan sequences with an upper threshold of 0.001 on pval.
-scan_opt '-uth pval 0.001'

Note: the argument passed to matrix-scan is delimited by single
quotes, and can thus not contain any quote.

=cut

    } elsif ($arg eq "-scan_opt") {
    my $new_scan_opt = shift(@arguments);
    $scan_opt .= " ".$new_scan_opt;

=pod

=item B<-map_opt>

Additional options passed to feature-map for feature-map drawing.

Examples:

Change the thickness of the maps
-map_opt '-mapthick 12'

Write the weight score above each site (also activate the auto
adjustment of map thickness to ensure there is enough space for drawing the
labels).  
-map_opt  '-label score -mapthick auto'

Note: the argument passed to feature-map is delimited by single
quotes, and can thus not contain any quote.

=cut

  } elsif ($arg eq "-map_opt") {
    my $new_map_opt = shift(@arguments);

$map_opt .= " ".$new_map_opt;

=pod 

=item Wish B<-crer>

Return Cis-Regulatory elements Enriched-Regions (CRER).

            Calculate the statistical significance of the number of hits in
            windows of variable sizes. The number of hits is the sum of
            matches above a predefined threshold set on hits p-values, for
            all matrices and on both strands (if -2str). The maximum size
            for a CRER is defined by the option -crer_max.

            The prior probability to find an instance of the motif is the
            same for all matrices, and corresponds to the chosen pval
            threshold. Within a region of maximal CRER size, subwindows are
            defined between each pair of hits, and the observed number of matches in
            a subwindow is the sum of hits above the threshold. The
            significance of the observed number of matches in a subwindow is
            estimated by calculating a P-value using the binomial
            distribution (Aerts et al., 2003).



=cut
	} elsif ($arg eq "-crer") {

	 
	  $main::crer=1 ;

	     &RSAT::error::FatalError($main::filter_pval, "Invalid value for filter_pval. Should be a real number. ") 
	unless (&RSAT::util::IsReal($main::filter_pval));

=pod

=item Wish  B<-lth_crer_size>

Minimal CRER size in bps

=cut
	} elsif ($arg eq "-lth_crer_size") {
    	    $main::lth_crer_size= shift(@arguments);

	     &RSAT::error::FatalError($main::lth_crer_size, "Invalid value for filter_pval. Should be a real number. ") 
	unless (&RSAT::util::IsReal($main::lth_crer_size));


=pod
 
=item Wish  B<-crer_pval>

Pval cutoff for selecting CRERs

=cut
	} elsif ($arg eq "-crer_pval") {
    	    $main::crer_pval= shift(@arguments);

	     &RSAT::error::FatalError($main::crer_pval, "Invalid value for filter_pval. Should be a real number. ") 
	unless (&RSAT::util::IsReal($main::crer_pval));


=pod

=item B<-uth_crer_size>

Maximal CRER size in bps

=cut
	} elsif ($arg eq "-uth_crer_size") {
    	    $main::uth_crer_size= shift(@arguments);

	     &RSAT::error::FatalError($main::uth_crer_size, "Invalid value for filter_pval. Should be a real number. ") 
	unless (&RSAT::util::IsReal($main::uth_crer_size));




} else {
    &FatalError(join("\t", "Invalid option", $arg));
}
    }


}

################################################################
## Report the run parameters on the main output stream
## ($main::out): the command-line arguments (via PrintArguments), the
## program version, then the contents of %main::infile and
## %main::outfile, one "key <TAB> value" line per entry. A file
## section is omitted altogether when its hash is empty.
sub Verbose {
    my $report = $main::out;

    print $report "; footprint-scan ";
    &PrintArguments($report);
    printf $report "; %-22s\t%s\n", "Program version", $program_version;

    ## Input and output files share the same layout, so both lists are
    ## printed by a single loop over (header line, hash reference) pairs.
    my @file_sections = (
	["; Input files\n", \%main::infile],
	["; Output files\n", \%main::outfile],
    );
    foreach my $section (@file_sections) {
	my ($header, $files) = @{$section};
	next unless (%{$files});
	print $report $header;
	while (my ($type, $file) = each %{$files}) {
	    printf $report ";\t%-13s\t%s\n", $type, $file;
	}
    }
}



################################################################
## Run the footprint-scan flow chart for one or several query genes.
##
## Arguments:
##   @_   names of the query genes to be analysed together.
##
## Steps, in order: initialise the output files and the HTML index,
## write the query genes to the gene file, optionally filter on the
## query organism, collect the orthologs, optionally infer operons,
## retrieve and (unless $main::no_purge) purge the sequences, compute
## the occurrence significance and its graph, scan the sequences and
## draw the map, close the streams and, if $batch is set, pass
## $batch_cmd to doit().
##
## The working variables are declared with "local" rather than "my".
## The helper routines are called without arguments, so they
## presumably read and update these dynamically scoped variables
## (TODO confirm in each helper) -- do not turn them into lexicals.
##
## Returns nothing. When filtering is active and $main::skip_gene is
## set, the routine returns before the ortholog analysis.
sub RunFootprintScan {
  local (@current_query_genes) = @_;

  local $start_time = &RSAT::util::StartScript();
  local $batch_cmd = "";
  ## $out and $genes are used as output handles below; they are
  ## presumably opened by InitQueryOutput() -- TODO confirm.
  local $out = "";
  local $genes = "";


  ################################################################
  ## Initialize output directory + output files
  local ($outfile_prefix, $query_prefix)= &InitQueryOutput();

  ## Output files for matrix-scan
  $outfile{filter_scan} = $outfile{prefix}."_filter_scan.tab";
  $outfile{matrix_prefix} = $outfile{prefix}."_".$matrix_suffix;
  $outfile{occ_sig} = $outfile{matrix_prefix}."_occ_sig.tab";
  $outfile{occ_freq_graph} = $outfile{matrix_prefix}."_occ_freq.".$plot_format;
  $outfile{occ_sig_graph} = $outfile{matrix_prefix}."_occ_sig.".$plot_format;
  $outfile{sites} = $outfile{matrix_prefix}."_sites.tab";
  $outfile{map} = $outfile{matrix_prefix}."_sites.".$map_format;

  ################################################################
  ## Open file to the HTML index
  &OpenIndex("footprint-scan");

  ################################################################
  ## Print query genes in the gene file (one "gene <TAB> organism"
  ## line per query gene)
  foreach my $gene (@current_query_genes) {
    print $genes $gene, "\t", $organism_name, "\n";
  }
  &IndexOneFile("genes", $outfile{genes});

  ## Retrieve promoters of the query organism
  if ($main::filter) {
    &RetrieveQueryPromoters();
    &ComputeFilterScan($matrix_format,@matrix_files);
  }

  ################################################################
  ## Print verbose
  &Verbose() if ($verbose);

  ## Stop here when the filter is active and $main::skip_gene is set.
  if ($main::filter && $main::skip_gene) {
    &RSAT::message::Info("No genes remained after filtering  ", $outfile{genes} ) if ($main::verbose >= 0);
    return;
  }

  ################################################################
  ## Identify ortholog genes
  &GetOrthologs();

  ## only for the $tf option
  if ($main::tf && !($main::tf_ortho_file) ) {
    $main::tf_ortho_file=$outfile{prefix}."_ortho_bbh_tf.tab";

    ## Drop the lines of the ortholog table that contain "#" or ";"
    ## and keep the second tab-separated column.
    $org_column =qq@grep -v "#" $outfile{orthologs} |grep -v ";"| cut -f2 >$main::tf_ortho_file @;

    ## system() returns 0 on success. Report a failure instead of
    ## ignoring it silently, but do not abort the analysis. Note that
    ## the status of a shell pipeline is that of its last stage.
    my $org_column_status = system ($org_column);
    if ($org_column_status != 0) {
      warn("; WARNING\tfootprint-scan: command returned status ".$org_column_status.": ".$org_column."\n");
    }
    &RSAT::message::Info("List of organisms with an ortholog for the corresponding TF ", "$main::tf_ortho_file ")
      if ($main::verbose>=1);
  }

  ################################################################
  ## Predict operon leader genes for the orthologous genes
  &InferOrthoOperons() if ($infer_operons);

  ################################################################
  ## Retrieve sequences from orthologs
  &RetrieveOrthoSeq();

  ################################################################
  ## Purge sequences
  &PurgeOrthoSeq() unless  ($main::no_purge) ;

  ################################################################
  ## Scan the promoters of one group of orthologous genes with one
  ## position-specific scoring matrix

  ## Estimate over-representation in hits for all threshold values
  &OccurrenceSig();
  &OccurrenceSigGraph();
  &GetTopSig() if (($task{synthesis}) && !($batch));

  ## Scan sequences to detect hits
  &OrthoScan();
  &OrthoMap();

  ################################################################
  ## Finish verbose
  if ($verbose >= 1) {
    my $done_time = &AlphaDate();
    print $out "; Job started $start_time\n";
    print $out "; Job done    $done_time\n";
  }

  ################################################################
  ## Close output stream
  close $out if ($outfile{log});
  close $genes if ($outfile{genes});

  ## Terminate the HTML index page
  print $index "</table>\n";
  print $index "</blockquote>";
  print $index "<hr size=2 color='#000088'>";
  print $index "</body>";
  print $index "</html>";
  close $index;

  ################################################################
  ## Send the command to a batch queue (e.g. PC cluster)
  if ($batch) {
    &doit($batch_cmd, $dry, $die_on_error, $verbose, 1, $job_prefix);
  }

}

__END__
