#!/usr/bin/perl -w
############################################################
#
# $Id: footprint-discovery,v 1.67 2011/10/28 06:15:47 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

footprint-discovery

=head1 VERSION

$program_version

=head1 DESCRIPTION

Detect phylogenetic footprints by applying dyad-analysis in promoters
of a set of orthologous genes.

Adapted from the procedure described in Janky & van Helden (2008).

=head1 AUTHORS

=over

=item Rekin's Janky <rekins@bigre.ulb.ac.be>

=item Jacques van Helden <jacques.van.helden@ulb.ac.be>

=back

=head1 CATEGORY

Sequences

Motif discovery

=head1 USAGE

footprint-discovery [-i inputfile] -o [output_prefix] \
     -taxon \
     -q query_gene [-q query_gene2 ...] \
     [-v #] [...]


=head1 EXAMPLES

=head2 Single-gene footprint discovery

Discover conserved motifs in the promoters of the orthologs of lexA in
Enterobacteriales.

 footprint-discovery  -v 1 -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter  \
		-to_matrix -bg_model taxfreq -q lexA

=head2 Analysis of a few genes

Discover conserved motifs in the promoters of the orthologs of lexA in
Enterobacteriales.

 footprint-discovery  -v 1 -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter  \
		-to_matrix -bg_model taxfreq \
                -sep_genes -q lexA -q recA -q uvrA

Note the option -sep_genes indicating that the genes have to be
analyzed separately rather than grouped.

The genes can also be specified in a file with the option -genes.

=head2 Footprint discovery applied iteratively to each gene of a genome

Iterate footprint discovery for each gene separately.

 footprint-discovery  -v 1 -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter \
		-to_matrix -bg_model taxfreq -all_genes -sep_genes


=head1 INPUT FORMAT

The program takes as input a taxon of interest + one or several query
genes.

=head1 OUTPUT FORMAT

The output consists in a set of files, containing the results of the
different steps of the analysis.

=over

=item [prefix]_log.txt

Log file listing the analysis parameters + output file names;

=item [prefix]_query_genes.tab

List of query genes (one or several genes can be entered)

=item [prefix]_ortho_bbh.tab

List of orthologous genes

=item [prefix]_ortho_seq.fasta

Promoter sequences of the orthologous genes

=item [prefix]_ortho_seq_purged.fasta

Purged promoter sequences (for motif discovery)
=item [prefix]_ortho_filter_dyads.tab

Dyads found in the query genes (for dyad filtering)

=item [prefix]_ortho_dyads.tab

Significant dyads found in the promoters of orthologous genes
(the footprints)

=item [prefix]_ortho_dyads.asmb

Assembled dyads

=item [prefix]_ortho_dyads.png

Feature-map

=item NOTE : 'ortho' is replaced by 'leaders' in the filename prefix with option -infer_operons

=back

=head1 REFERENCES

Janky, R. and van Helden, J. Evaluation of phylogenetic footprint
discovery for the prediction of bacterial cis-regulatory elements
(2008). BMC Bioinformatics 2008, 9:37 [Pubmed 18215291].

=pod

=head1 SEE ALSO

=over

=item get-orthologs

=item dyad-analysis

=item footprint-scan

=back

=head1 WISH LIST


The following options are not yet implemented, but this should be done
soon.

=over


=item B<-rand>

When the option I<-rand> is activated, the motif discovery is applied
to random selections of promoters rather than promoters of orthologs.

This option serves to perform negative controls in orde to estimate
empirically the rate of false prediction and check its correspondence
with the theoretical estimation of the significance.

The random selections are done by passing the option I<-rand> to the
program I<get-orthologs>.

=item B<-taxa>

Specify a file containing a list of taxa, each of which will be
analyzed separately. The results are stored in a separate folder for
each taxon. The folder name is defined automatically.

=item B<-all_taxa>

Automatically analyze all the taxa, and store each result in a
separate folder (the folder name is defined automatically).


=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
require "footprint.lib.pl";



################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
#  %supported_organism = ();
  $program_version = do { my @r = (q$Revision: 1.67 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  local $skip = 0;
  local $last = 0;

  ## Dyad-analysis parameters
  local $dyad_return_fields = "occ,proba,rank";	## Default return fields for dyad-analysis
  local @dyad_return_fields = (); ## User-specified return fields for dyad-analysis

  local %index_list = (); ## List of index files (there can be several indexes with the option -sep_genes)
  local $verbose = 0;		## Verbosity
  local $noov = "-noov";     ## Treatment of self-overlapping patterns
  local $filter = 1; ## Only report dyads present in the query promoter
  local $bg_model = "taxfreq"; ## Background model for motif discovery (Supported:taxfreq or monads)
  local $bgfile = 0;
  local $to_matrix = 0;		## Convert assembled dyads into PSSM
  local %lth = (occ=>1,occ_sig=>0); ## Lower thresholds
  local %uth = (rank=>50);	    ## Upper thresholds
  local $strands = "-2str"; ## Strands for motif discovery and pattern matching
  local $start_time = &RSAT::util::StartScript();
  $main::orthologs_list_file=0;
  ## Supported tasks
  @supported_tasks = qw(
			all
			bg_model
			operons
			regulons
			query_seq
			filter_dyads
			orthologs
			ortho_seq
			purge
			dyads
			map
			network
			index
		       );
  $supported_tasks = join (",", @supported_tasks);
  %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }


  ## Supported bg_models
  @supported_bg_models = qw(
			    monads
			    taxfreq
			    org_list
			    file
			   );
  $supported_bg_models = join (",", @supported_bg_models);
  %supported_bg_model = ();
  foreach my $bg_model (@supported_bg_models) {
    $supported_bg_model{$bg_model} = 1;
  }


  ## Parameters for promoter retrieval
  local $taxon;			## Reference taxon
  local $organism_name;		## Query organism
  local @query_genes = ();	## list of query genes
  local $infer_operons = 0;     ## Infer operon leader genes
  local $dist_thr = 55;	        ## Distance threshold for operon inference
  $main::sep_genes = 0;	        ## Analyze each gene separately
  local $promoter = "ortho";    ## Alternatively: leader when the option "infer operons" is active
#  local $all_genes = 0;         ## Analyze all the genes of the query organism
#  local $max_genes = undef;     ## Analyze a restricted number of genes
#  local $supported_organism;

  local $map_format = "png";	## Image format

  ## Job management options
  local $job_prefix = "fpdisco";
  local $die_on_error = 1;
  local $batch = 0;
  local $dry = 0;


  ## File index
  local %main_index = ();

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## Check parameters for footprint analysis
  &CheckFootprintParameters();

  local $main_index = &OpenMainIndex();

  ################################################################
  ## Parameters for dyad-analysis

  ## Return fields for dyad-analysis
  if (scalar(@dyad_return_fields) > 0) {
    $dyad_return_fields = join ",", @dyad_return_fields;
  }

  ##Bg model for orthologs_list is monad
  $bg_model = "monad" if  $main::orthologs_list_file ;


  ## Background model file for dyad-analysis
  
  if ($bg_model eq "file") {
    &RSAT::error::FatalError("The option '-bg_model file' requires to specify a file with the option '-bgfile'")
      unless ($bgfile);
      $infile{bg_model} = $bgfile;
  } elsif ($bg_model eq "org_list") {

    ## Check that the list of organisms has been specified
    &RSAT::error::FatalError("The option '-bg_model org_list' requires to specify a list of organisms with the option '-org_list org_list_file'")
      unless ($orglist_file);

    ## If the option -org_list has been specified and the bg_model is
    ## taxfreq, compute the list-specific frequencies with the program
    ## taxon-frequencies -org_list org_list_file.
    &RSAT::message::TimeWarn("Computing background model for organism list") if ($main::verbose >= 1);

    ## Store the background model in the parent directory of the query sub-directories
    $dir{bg_model} = join("/",$main::dir{output_root}, "bg_models", "org_list");
    &RSAT::util::CheckOutDir($dir{bg_model});

    ## Background model file
    $infile{bg_model} = $dir{bg_model}."/";
    $infile{bg_model} .= "dyads_3nt_sp0-20_upstream-noorf_org_list".$noov.$strands.".freq";
    if ($task{bg_model}) {
      my $cmd = "taxon-frequencies -v 1 -type dyad  -ml 3";
      $cmd .= " -org_list ".$orglist_file;
      $cmd .= " > ".$infile{bg_model};
      &doit($cmd);
    }

  } elsif ($bg_model eq "taxfreq") {
    ## Check that the taxon has been specified
    &RSAT::error::FatalError("The option '-bg_model taxfreq' requires to specify a taxon with the option '-taxon'")
      unless ($taxon);

    ## Identify the server-installed taxon-specific background model file
    $infile{bg_model} = &ExpectedFreqFile($taxon,
					  3,
					  # spacing ? 0-20
					  "upstream-noorf",
					  str=>$strands,
					  noov=>$noov,
					  type=>"dyad",
					  warn=>1,
					  taxon=>1);

    ## If the required the taxon frequencies are not pre-installed,
    ## compute them before porocessing and store them in the result
    ## directory.
    unless ((-e $infile{bg_model}) ||
	    (-e ($infile{bg_model}.".gz"))) {


      ## Store the background model in the parent directory of the query sub-directories
      $dir{bg_model} = join("/",$main::dir{output_root}, "bg_models", $taxon);
      &RSAT::util::CheckOutDir($dir{bg_model});

      &RSAT::message::TimeWarn("Computing taxon-wise background model (not installed on the server)" ) if ($main::verbose >= 1);
      my ($local_bg_file) = &ShortFileName($infile{bg_model});
      $infile{bg_model} = $dir{bg_model}."/".$local_bg_file;
      if ($task{bg_model}) {
	my $cmd = "taxon-frequencies -v 1 -type dyad  -ml 3";
	$cmd .= " -taxon ".$taxon;
	$cmd .= " > ".$infile{bg_model};
	&doit($cmd);
      }
      &RSAT::message::Info("Taxon-wise background model", $infile{bg_model}) if ($main::verbose >= 1);

    } else {
      &RSAT::error::FatalError($bg_model, "Invalid background model. Supported: ", $supported_bg_models) unless ($supported_bg_model{$bg_model});
    }
    &RSAT::message::TimeWarn("Background model file", $infile{bg_model}) if ($main::verbose >= 3);
  }



  ################################################################
  ## Analyze query genes separately or altogether
  local $q = 0;
  if ($main::sep_genes) {
    local $query_nb = scalar(@query_genes);
    foreach our $current_gene (@query_genes) {
      $q++;
     #  if (($skip > 0) && ($q <= $skip)) {
# 	  &RSAT::message::Info("Skipping gene", $q."/".$query_nb, $current_gene) if ($main::verbose >= 1);
# 	  next;
#       }
      &RSAT::message::TimeWarn("Analyzing gene", $q."/".$query_nb, $current_gene) if ($main::verbose >= 1);
      &RunFootprintDisco($current_gene);
      # if (($last > 0) && ($q >= $last)) {
# 	  &RSAT::message::Info("Stopped after gene", $q."/".$query_nb, $current_gene) if ($main::verbose >= 1);
# 	  last;
#       }
    }
  } else {
      $q = 1;
      $query_nb = 1;
      &RSAT::message::TimeWarn("Analyzing a group of ",scalar(@query_genes)," genes", join(";", @query_genes)) if ($main::verbose >= 1);
      &RunFootprintDisco(@query_genes);
  }

  ################################################################
  ## Generate co-regulation network by comparing gene-wise footprints
  &CoregulationNetwork() if ($task{network});

  ################################################################
  ## Give a warning to report the gene-specific and main index files
  if ($main::verbose >= 2) {
    print ("; Index files\n");
    my $i = 0;
    foreach my $key (sort keys %index_list) {
      $i++;
      print join ("\t", ";", $key, $index_list{$key}), "\n";
      if ($i > 10) {
	print join ("\t", "; ... skipping other gene-wise index files"), "\n";
	last;
      }
    }
    print join ("\t", ";", "Main Table", $outfile{main_index}), "\n";
  }


  ## Close the main index stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  print $main_index "</table>\n";
  print $main_index "<h2>File index</h2>\n";
  print $main_index "<p><table>\n";
  foreach my $key (@to_index) {
    print $main_index "<tr>\n";
    print $main_index "<td>".$key."</td>\n";
    my $link = &LinkOneFile($outfile{main_index}, $outfile{$key});
    print $main_index "<td>".$link."</td>\n";
    print $main_index "</tr>\n";
  }
  print $main_index "</table></p>\n";

  print $main_index "<p><pre>", $exec_time, "</pre></p>\n";
  print $main_index "</body>\n";
  print $main_index "</html>\n";
  close ($main_index);

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

sub DyadSuffix {
  my $dyad_suffix = "_".$promoter."_dyads_3nt_sp0-20".$strands.$noov."_".$bg_model."_sig".$lth{occ_sig};
  return($dyad_suffix);
}

################################################################
## Run footprint discovery flow chart for one or several query genes
sub RunFootprintDisco {
  local (@current_query_genes) = @_;

  local $query_start_time = &RSAT::util::AlphaDate();
  local $batch_cmd = "";
  local $out = "";
  local $genes = "";

  ################################################################
  ## Initialize output directory + output files

  local ($outfile_prefix, $query_prefix) = &InitQueryOutput();

  ## Output files for dyad-analysis
  $outfile{filter_dyads} = $outfile{prefix}."_filter_dyads.tab";

  my $dyad_suffix = &DyadSuffix();
  $outfile{dyad_prefix} = $outfile{prefix}.$dyad_suffix;
  $outfile{dyads} = $outfile{dyad_prefix}.".tab";
  $outfile{dyads_html} = $outfile{dyad_prefix}.".html";
  $outfile{asmb} = $outfile{dyad_prefix}.".asmb";
  $outfile{map} = $outfile{dyad_prefix}.".".${map_format};
  if ($to_matrix) {
    $outfile{pssm_prefix} = $outfile{dyad_prefix}."_pssm";
    $outfile{sig_PSSM} = $outfile{pssm_prefix}."_sig_matrices.tf";
    $outfile{count_PSSM} = $outfile{pssm_prefix}."_count_matrices.txt";
  }


  ################################################################
  ## Index files for the main index
  $main_index{$query_prefix}{dyads} = $outfile{dyads};

  ## Report all outfile names
  if ($main::verbose >= 5) {
    foreach my $key (sort keys %outfile) {
      &RSAT::message::Debug("outfile", $key, "'".$outfile{$key}."'");
    }
  }

  ################################################################
  ## Open the file to store the HTML index
  &OpenQueryReport("footprint-discovery");

  ################################################################
  ## Print query genes in the gene file
  foreach my $gene (@current_query_genes) {
    print $genes $gene, "\t", $organism_name, "\n";
  }
  &IndexOneFile("genes", $outfile{genes});

  ################################################################
  ## Print verbose
  &Verbose() if ($verbose);

  ## Predict operon leader genes of the query genes
  &InferQueryOperons() if ($infer_operons);

  ## Dyad filter tasks
  unless ($nofilter) {
    ## Retrieve promoters of the query organism
    &RetrieveQueryPromoters();

    ## Detect all dyads in promoters of query genes for dyad filtering
    &ComputeFilterDyads();
  }

  ################################################################
  ## Identify ortholog genes
  &GetOrthologs();

  ################################################################
  ## Predict operon leader genes for the orthologous genes
  &InferOrthoOperons() if ($infer_operons);

  ################################################################
  ## Retrieve sequences from orthologs
  &RetrieveOrthoSeq();

  ################################################################
  ## Purge sequences
  &PurgeOrthoSeq();

  ################################################################
  ## Discover over-represented dyads in promoters of orthologous genes
  &DyadAnalysis();

  ################################################################
  ## Generate a feature-map of the discovered dyads
  if ($task{map}) {
    my $map_title = join "; ", $taxon, $organism_name, $outfile_prefix;
    &CheckDependency("map", "dyads");
    &CheckDependency("map", "seq");
    &RSAT::message::TimeWarn("Generating feature map", $outfile{map}) if ($verbose >= 2);
    $cmd = "$SCRIPTS/dna-pattern -return limits,sites -origin -0 -N 4";
    $cmd .= " -pl ".$outfile{dyads};
    $cmd .= " -format fasta -i ".$outfile{seq};
    $cmd .= " | $SCRIPTS/convert-features -from dnapat -to ft";
    $cmd .= " | $SCRIPTS/feature-map";
    $cmd .= " -title '".$map_title."'";
    $cmd .= " -format ".$map_format;
    $cmd .= " -scalebar -legend";
    $cmd .= " -scorethick";
    $cmd .= " -o ".$outfile{map};
    &one_command($cmd);
  }
  &IndexOneFile("feature map", $outfile{map}, image=>1);


  ################################################################
  ## Send the command to a batch queue (e.g. PC cluster)
  if ($batch) {
    my $jobnb = 0;
    if (($q-1) % 500 == 0) {
      $jobnb = &get_job_nb();
    }
    while ($jobnb > 2000) {
      &RSAT::message::TimeWarn("Already $jobnb jobs on queue... waiting 30 seconds before submitting new jobs") if ($main::verbose >= 2);
      sleep(60);
      $jobnb = &get_job_nb();
    }
    &doit($batch_cmd, $dry, $die_on_error, $verbose, 1, $job_prefix);
  }

  ################################################################
  ## Close query-specific output streams
  my $query_exec_time = &RSAT::util::ReportExecutionTime($query_start_time);
  print $out $query_exec_time if ($main::verbose >= 1);
  close $out if ($outfile{log});
  close $genes if ($outfile{genes});

  ## Close the index file
  print $index "</table>\n";
  print $index "</blockquote>";
  print $index "<hr size=2 color='#000088'>";
  print $index "</body>";
  print $index "</html>";
  close $index;

  ################################################################
  ## Update the main index
  print $main_index "<tr>\n";
  print $main_index "<td>", $q, "</td>\n";
  ## Link to the query-specific index file
#  print $main_index "<td><a href='", &RSAT::util::RelativePath($outfile{main_index}, $outfile{index}), "'>", $query_prefix, "</a></td>\n";
  print $main_index "<td>", &LinkOneFile($outfile{main_index}, $outfile{index}, $query_prefix), "</td>\n";

  ## Report the significant dyads
  my $top_id = "";
  my $top_sig = "NA";
  my $dyad_nb = 0;
  local %header_col = ();

  if (-e $outfile{dyads}) {
    &RSAT::message::Info("Indexing dyads", $query_prefix, $outfile{dyads}) if ($main::verbose >= 4);
    my ($dyads) = &OpenInputFile($outfile{dyads});

    while (<$dyads>) {
      next if (/^;/); ## Skip comments
      next unless (/\S/); ## Skip empty lines
      chomp;


      ## Read the header to idenfify occ_sig column
      if (/^#/) {
	s/^#//;
	my @header_fields = split "\t";
	for my $h (0..$#header_fields) {
	  $header_fields[$h] = &RSAT::util::trim($header_fields[$h]);
	  $header_col{$header_fields[$h]} = $h;
	  #	&RSAT::message::Debug("header", $h, "'".$header_fields[$h]."'", $header_col{$header_fields[$h]}) if ($main::verbose >= 10);
	}
	next;
      }

      $dyad_nb++;

      ## Report top significance
      if ($dyad_nb == 1) {
	my @fields = split "\t";
	$top_id = $fields[$header_col{"identifier"}];
	$top_sig = $fields[$header_col{"occ_sig"}];
	#      &RSAT::message::Debug("top_id", $top_id, "top_sig", $top_sig, ) if ($main::verbose >= 10);
      }
    }
    close $dyads;
  }

  print $main_index "<td><tt>", $top_id, "</tt></td>\n";
  print $main_index "<td>", $top_sig, "</td>\n";
  print $main_index "<td>", $dyad_nb, "<td>";

  print $main_index "</tr>\n";
}


################################################################
## Display full help message 
sub PrintHelp {
#    system "pod2text -c $0";
    system "cat $0 $ENV{RSAT}/perl-scripts/lib/footprint.lib.pl | pod2text -c";
    exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
    &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {

=pod

=head1 OPTIONS

=cut
  local $arg;
  local @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

    if (&ReadFootprintOptions()) {
      next;

      ### Lower threshold

=pod

=over 4

=item B<-lth field value>

Lower threshold for dyad-analysis.

See the manual of dyad-analysis for a description of the fields on
which a threshold can be imposed.

=cut
    } elsif ($arg eq "-lth") {
      my $thr_field = shift (@arguments);
      my $thr_value =  shift (@arguments);
      $main::lth{$thr_field} = $thr_value;

      ### Upper threshold

=pod

=item B<-uth field value>

Upper threshold for dyad-analysis.

See the manual of dyad-analysis for a description of the fields on
which a threshold can be imposed.

=cut
    } elsif ($arg eq "-uth") {
      my $thr_field = shift (@arguments);
      my $thr_value =  shift (@arguments);
      $main::uth{$thr_field} = $thr_value;

=pod

=item B<-return dyad_return_fields>

Return fields for dyad-analysis.  This argument is passed to
dyad-analysis for the discovery of dyads in promoters of orthologous
genes.

Multiple-fields can be entered either by calling this argument
iterativelyk or by entering multiple fields separated by commas.

Type I<dyad-analysis -help> to obtain the list of supported return
fields.

=cut
    } elsif ($arg eq "-return") {
      push @dyad_return_fields, shift (@arguments);

=pod

=item B<-to_matrix>

Convert assembled patterns into position-specific scoring matrices
(PSSM).  Caution ! This conversion can take time if the sequence set
is large and if there are many assemblies.

=cut
    } elsif ($arg eq "-to_matrix") {
      $main::to_matrix = 1;


	    ## Background model
=pod

=item B<-bg_model taxfreq|org_list|monads|file>

Allow the user to choose among alternative background model (see Janky
& van Helden, 2008).

Supported background model types:

=over


=item I<monads>

Expected dyad frequencies are estimated by taking the product of the
monad frequencies observed in the input sequence set. Example:

   F_exp(CAGn{10}GTA) = F_obs(CAG) * F_obs(GTA)

=item I<taxfreq>

Only valid in combination with the option -taxon.

Expected dyad frequencies are computed by summing the frequencies of
all dyads in the non-coding upstream sequences of all genes for all
the organisms of the reference taxon.

=item I<org_list>

Only valid in combination with the option -org_list.

Expected dyad frequencies are computed by summing the frequencies of
all dyads in the non-coding upstream sequences of all genes for each
organism of user-specified list.

=item I<file>

Only valid in combination with the option -bgfile.

Precises that the background model that will be used for dyad-analysis will be a file given as argument (with the option -bgfile, see below)

=back

=cut
    } elsif ($arg eq "-bg_model") {
      $main::bg_model = shift(@arguments);

	  ## Background model file

=pod

=item B<-bgfile>

File containing the word frequencies to be used as the background model for dyad-analysis.
This option must be used in combination with the option -bg_model file

=cut
    } elsif ($arg eq "-bgfile") {
      $main::bgfile = shift(@arguments);

	  ## Dyad filtering

=pod

=item B<-filter>

Only accept dyads found in the promoter of the query gene, in the
query organism. (option selected by default)

=cut
    } elsif ($arg eq "-filter") {
      $main::filter = 1;

=pod

=item B<-no_filter>

Accept all dyads, even if they are not found in the promoter of the
query gene, in the query organism. (will cancel -filter option if selected)

=cut

    } elsif ($arg eq "-no_filter") {
      $main::nofilter = 1;

    } else {
      &FatalError(join("\t", "Invalid option", $arg));
    }
  }

}

################################################################
## Verbose message
sub Verbose {
  my $header_verbose = "";
  $header_verbose .=  "; footprint-discovery ";
  $header_verbose .= &PrintArguments();
  $header_verbose .= "\n";
  $header_verbose .= sprintf "; %-22s\t%s\n", "Program version", $program_version;
  $header_verbose .= sprintf "; %-22s\t%s", "Working directory", `pwd`;
  $header_verbose .= sprintf "; %-22s\t%s\n", "Query organism", $organism_name;
  if($main::orglist_file) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Organism list", $orglist_file;
  } else {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Reference taxon", $taxon;
  }
  $header_verbose .= sprintf "; %s\n", "Promoters from predicted operon leader genes" if ($infer_operons);
  if (scalar(@current_query_genes) == 1) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query gene", $current_query_genes[0];
  } elsif (scalar(@current_query_genes) <= 10) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query genes", join "; ", @current_query_genes;
  } else {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query genes", scalar(@current_query_genes);
  }
  $header_verbose .= sprintf "; %-22s\t%s\n", "Background model", $bg_model;
  if ($filter) {
      $header_verbose .= sprintf "; %s\n", "Dyad filtering ON";
  }  else {
      $header_verbose .= sprintf "; %s\n", "No dyad filtering";
  }

  if (%main::infile) {
    print $out "; Input files\n";
    while (my ($key,$value) = each %main::infile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  if (%main::outfile) {
    print $out "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  print $out $header_verbose;


  $header_verbose =~ s/^; //;
  $header_verbose =~ s/\n; /\n/gm;
  print $index "<pre>";
  print $index $header_verbose;
  print $index "</pre>";
}


################################################################
## Run dyad-analysis
sub DyadAnalysis {
  if ($task{dyads}) {
    &RSAT::message::TimeWarn("Running dyad-analysis", $outfile{dyads}) if ($verbose >= 2);
    &CheckDependency("dyads", "purged");
    $cmd = "$SCRIPTS/dyad-analysis -v 1 ";
    $cmd .= " -return ".$dyad_return_fields;
    ## Backgroun model
    if ($bg_model eq "monads") {
      $cmd .= " -bg monads" ;
    } elsif ($bg_model eq "taxfreq" || $bg_model eq "file") {
      $cmd .= " -expfreq ".$infile{bg_model} ;
    }
    ## dyad filtering (only accept dyads found in the promoter of the query gene(s)
    if (!$nofilter) {
      $cmd .= " -accept ".$outfile{filter_dyads};
    }
    foreach my $field (sort keys %lth) {
    $cmd .= " -lth ".$field." ".$lth{$field};
  }
    foreach my $field (sort keys %uth) {
      $cmd .= " -uth ".$field." ".$uth{$field};
    }
    $cmd .= " -i ".$outfile{purged};
    $cmd .= " -l 3 -sp 0-20";
    $cmd .= " ".$strands;
    $cmd .= " ".$noov;
    $cmd .= " -sort";
    $cmd .= " -o ".$outfile{dyads};
    &one_command($cmd);

    ## Generate a HTML table with the discovered dyads
    &CheckDependency("dyads_html", "dyads");
    $cmd = "$SCRIPTS/text-to-html";
    $cmd .= " -i ".$outfile{dyads};
    $cmd .= " -font variable -chunk 1000";
    $cmd .= " -o ".$outfile{dyads_html};
    &one_command($cmd);

    ################################################################
    ## Assemble significant dyads
    &RSAT::message::TimeWarn("Assembling significant dyads", $outfile{asmb}) if ($verbose >= 2);
    &CheckDependency("asmb", "dyads");
    $cmd = "$SCRIPTS/pattern-assembly -v 1";
    $cmd .= " -i ".$outfile{dyads};
    $cmd .= " -subst 0 -maxfl 1 -toppat 50";
    $cmd .= " -o ".$outfile{asmb};
    &one_command($cmd);

    ################################################################
    ## Convert assembled dyads into a position-specific scoring matrix
    if ($to_matrix) {
      &RSAT::message::TimeWarn("Converting assembled dyads into PSSM", $outfile{pssm}) if ($verbose >= 2);
      &CheckDependency("pssm", "asmb");
      &CheckDependency("pssm", "seq");
      $cmd = "$SCRIPTS/matrix-from-patterns -v 1";
      $cmd .= " -seq ".$outfile{seq};
      $cmd .= " -asmb ".$outfile{asmb};
      $cmd .= " -format fasta";
      $cmd .= " -uth Pval 0.00025";
      $cmd .= " -bginput -markov 0";
      $cmd .= " ".$strands;
      $cmd .= " -logo";
      $cmd .= " -o ".$outfile{pssm_prefix};
      &one_command($cmd);
      &IndexOneFile("significance matrix (PSSM)", $outfile{sig_PSSM});
      &IndexOneFile("count matrix (PSSM)", $outfile{count_PSSM});
    }
  }
  &IndexOneFile("dyads", $outfile{dyads});
  &IndexOneFile("dyads [htm]", $outfile{dyads_html});
  &IndexOneFile("assembled dyads", $outfile{asmb});
}

################################################################
## Generate co-regulation network by comparing gene-wise footprints
sub CoregulationNetwork {
  my $main_prefix = &MainPrefix();
  my $dyad_suffix = &DyadSuffix();

  ## Create a directory for the inferred co-reglation network
  $dir{network} = $dir{output_root}."/network";
  &RSAT::util::CheckOutDir($dir{network});
  &RSAT::message::TimeWarn("Network directory", $dir{network}) if ($main::verbose >= 2);

  ################################################################
  ## Write a list of dyad files
  $outfile{dyad_file_list} .= $dir{network}."/".$main_prefix."_dyad_files.txt";
  my $out= &OpenOutputFile($outfile{dyad_file_list});
  foreach my $query_prefix (sort keys %main_index) {
    print $out $main_index{$query_prefix}{dyads}."\n" if (-e $main_index{$query_prefix}{dyads});
  }
  &RSAT::message::TimeWarn("Dyad file list", $outfile{dyad_file_list}) if ($main::verbose >= 2);
  push @to_index, "dyad_file_list";

  ################################################################
  ## Generate a dyad significance profiles.  Instead of a table with 1
  ## row per gene and 1 column per dyad, we generate a file in format
  ## "class": first column indicates the dyad, second column the gene,
  ## third column the score. We add two columns with primary gene name
  ## and ID, because the query gene list may contain synonyms.
  $outfile{dyad_sig_classes} .= $dir{network}."/".$main_prefix."_dyad_sig_classes.tab";
  my $compare_scores_cmd = "compare-scores -v 0";
  $compare_scores_cmd .= " -filelist ".$outfile{dyad_file_list};
  $compare_scores_cmd .= " -sc 8 -null .";
  $compare_scores_cmd .= " -basename";
  $compare_scores_cmd .= " -suppress ".$main_prefix;
  $compare_scores_cmd .= " -suppress ".$dyad_suffix.".tab";
  $compare_scores_cmd .= " -suppress _".$organism_name."_".$taxon;
  $compare_scores_cmd .= " -format classes";
  $compare_scores_cmd .= " | add-gene-info";
  $compare_scores_cmd .= " -col 2";
  $compare_scores_cmd .= " -org ".$organism_name;
  $compare_scores_cmd .= " -info name,id";
  $compare_scores_cmd .= " -o ".$outfile{dyad_sig_classes};
  &doit($compare_scores_cmd);
  &RSAT::message::TimeWarn("Dyad sig classes", $outfile{dyad_sig_classes}) if ($main::verbose >= 2);
  push @to_index, "dyad_sig_classes";

  ################################################################
  ## Build the network by comparing dyads score profiles between each
  ## pair of genes
  $outfile{gene_pairs} .= $dir{network}."/".$main_prefix."_gene_pairs.tab";
  my $compare_classes_cmd = 'awk \'{print $1"\t"$4"\t"$3}\' '.$outfile{dyad_sig_classes};
  $compare_classes_cmd .=  " | compare-classes -v 1";
  $compare_classes_cmd .= " -i /dev/stdin";
  $compare_classes_cmd .= " -triangle -distinct";
  $compare_classes_cmd .= " -return rank,occ,freq,proba,jac_sim,dotprod,entropy";
  $compare_classes_cmd .= " -sort dotprod";
  $compare_classes_cmd .= " -sc 3";
  $compare_classes_cmd .= " -lth dotprod 2 -lth QR 2 -lth Q 2 -lth R 2";
  $compare_classes_cmd .= " -o ".$outfile{gene_pairs};
  &doit($compare_classes_cmd);
  &RSAT::message::TimeWarn("Gene pairs", $outfile{gene_pairs}) if ($main::verbose >= 2);
  push @to_index, "gene_pairs";

  ################################################################
  ## Add columns with gene names and IDs
  $outfile{gene_pairs_names} .= $dir{network}."/".$main_prefix."_gene_pairs_names.tab";
  my $cc_gene_info_cmd = "add-gene-info";
  $cc_gene_info_cmd .= " -i ".$outfile{gene_pairs};
  $cc_gene_info_cmd .= " -col 1 -col 2";
  $cc_gene_info_cmd .= " -org ".$organism_name;
  $cc_gene_info_cmd .= " -info name,id";
  $cc_gene_info_cmd .= " -o ".$outfile{gene_pairs_names};
  &doit($cc_gene_info_cmd);
  &RSAT::message::TimeWarn("Gene pairs (with names)", $outfile{gene_pairs_names}) if ($main::verbose >= 2);
  push @to_index, "gene_pairs_names";

  ################################################################
  ## Convert the compare-classes result into a graph

  ## Identify score column
  my $score = "DPbits";
  my $score_column = 21;
  my ($in) = &OpenInputFile($outfile{gene_pairs});
  while (<$in>) {
    if (/^;\t(\d+)\t${score}\s+/) {
      $score_column = $1;
    }
  }
  close $in;

  ## Run the graph conversion
  $outfile{gene_pairs_gml} .= $dir{network}."/".$main_prefix."_gene_pairs_".$score.".gml";
  my $convert_graph_cmd = "convert-graph";
  $convert_graph_cmd .= " -scol 1 -tcol 2";
  $convert_graph_cmd .= " -wcol ".$score_column;
  $convert_graph_cmd .= " -from tab -to gml";
  $convert_graph_cmd .= " -ewidth -ecolors fire";
  $convert_graph_cmd .= " -i ".$outfile{gene_pairs};
  $convert_graph_cmd .= " -o ".$outfile{gene_pairs_gml};
  &doit($convert_graph_cmd);
  &RSAT::message::TimeWarn("Gene pair graph (GML)", $outfile{gene_pairs_gml}) if ($main::verbose >= 2);
  push @to_index, "gene_pairs_gml";


  ################################################################
  ## Get the neighborhood of each gene in the co-regulation network
  $outfile{gene_neighbours} .= $dir{network}."/".$main_prefix."_gene_neighbours.tab";
  my $gn_cmd = "graph-neighbours -v 1";
  $gn_cmd .= " -self";
#  $gn_cmd .= " -seed $gene";
  $gn_cmd .= " -all";
  $gn_cmd .= " -in_format gml";
  $gn_cmd .= " -i ".$outfile{gene_pairs_gml};
  $gn_cmd .= " -o ".$outfile{gene_neighbours};
  &doit($gn_cmd);
  &RSAT::message::TimeWarn("Gene neighbours", $outfile{gene_neighbours}) if ($main::verbose >= 2);
  push @to_index, "gene_neighbours";

}

#######################################################################
## Returns the number of jobs running currently for the current user

sub get_job_nb {
      my $batchuser = `whoami`;
      chomp $batchuser;
      my $jobnb = `qstat -u $batchuser | wc -l`;
      chomp $jobnb;
      $jobnb -= 2;
      &RSAT::message::TimeWarn("$jobsnb are in the queue for user $batchuser") if ($main::verbose >= 4);
      return $jobnb;
}
__END__


