#!/usr/bin/perl -w
############################################################
#
# $Id: footprint-discovery,v 1.57 2011/02/10 14:43:10 rsat Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

footprint-discovery

=head1 VERSION

$program_version

=head1 DESCRIPTION

Detect phylogenetic footprints by applying dyad-analysis in promoters
of a set of orthologous genes.

Adapted from the procedure described in Janky & van Helden (2008).

=head1 AUTHORS

=over

=item Rekin's Janky <rekins@bigre.ulb.ac.be>

=item Jacques van Helden <jacques.van.helden@ulb.ac.be>

=back

=head1 CATEGORY

Sequences

Pattern discovery

=head1 USAGE

footprint-discovery [-i inputfile] -o [output_prefix] \
     -taxon \
     -q query_gene [-q query_gene2 ...] \
     [-v #] [...]


=head1 EXAMPLES

=head2 Single-gene footprint discovery

Discover conserved motifs in the promoters of the orthologs of lexA in
Enterobacteriales.

 footprint-discovery  -v 1 -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter  \
		-to_matrix -bg_model taxfreq -q lexA

=head2 Analysis of a few genes

Discover conserved motifs in the promoters of the orthologs of lexA in
Enterobacteriales.

 footprint-discovery  -v 1 -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter  \
		-to_matrix -bg_model taxfreq \
                -sep_genes -q lexA -q recA -q uvrA

Note the option -sep_genes indicating that the genes have to be
analyzed separately rather than grouped.

The genes can also be specified in a file with the option -genes.

=head2 Footprint discovery applied iteratively to each gene of a genome

Iterate footprint discovery for each gene separately.

 footprint-discovery  -v 1 -org Escherichia_coli_K12 -taxon Enterobacteriales \
		-lth occ 1 -lth occ_sig 0 -uth rank 50 \
		-return occ,proba,rank -filter \
		-to_matrix -bg_model taxfreq -all_genes -sep_genes


=head1 INPUT FORMAT

The program takes as input a taxon of interest + one or several query
genes.

=head1 OUTPUT FORMAT

The output consists in a set of files, containing the results of the
different steps of the analysis.

=over

=item [prefix]_log.txt

Log file listing the analysis parameters + output file names;

=item [prefix]_query_genes.tab

List of query genes (one or several genes can be entered)

=item [prefix]_ortho_bbh.tab

List of orthologous genes

=item [prefix]_ortho_seq.fasta

Promoter sequences of the orthologous genes

=item [prefix]_ortho_seq_purged.fasta

Purged promoter sequences (for pattern discovery)
=item [prefix]_ortho_filter_dyads.tab

Dyads found in the query genes (for dyad filtering)

=item [prefix]_ortho_dyads.tab

Significant dyads found in the promoters of orthologous genes
(the footprints)

=item [prefix]_ortho_dyads.asmb

Assembled dyads

=item [prefix]_ortho_dyads.png

Feature-map

=item NOTE : 'ortho' is replaced by 'leaders' in the filename prefix with option -infer_operons

=back

=head1 REFERENCES

Janky, R. and van Helden, J. Evaluation of phylogenetic footprint
discovery for the prediction of bacterial cis-regulatory elements
(2008). BMC Bioinformatics 2008, 9:37 [Pubmed 18215291].

=pod

=head1 SEE ALSO

=over

=item get-orthologs

=item dyad-analysis

=item footprint-scan

=back

=head1 WISH LIST


The following options are not yet implemented, but this should be done
soon.

=over

=item B<-taxa>

Specify a file containing a list of taxa, each of which will be
analyzed separately. The results are stored in a separate folder for
each taxon. The folder name is defined automatically.

=item B<-all_taxa>

Automatically analyze all the taxa, and store each result in a
separate folder (the folder name is defined automatically).


=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
require "footprint.lib.pl";



################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
#  %supported_organism = ();
  $program_version = do { my @r = (q$Revision: 1.57 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  local $skip = 0;
  local $last = 0;

  ## Dyad-analysis parameters
  local $dyad_return_fields = "occ,proba,rank";	## Default return fields for dyad-analysis
  local @dyad_return_fields = (); ## User-specified return fields for dyad-analysis

  local %index_list = (); ## List of index files (there can be several indexes with the option -sep_genes)
  local $verbose = 0;		## Verbosity
  local $noov = "-noov";     ## Treatment of self-overlapping patterns
  local $filter = 1; ## Only report dyads present in the query promoter
  local $bg_model = "taxfreq"; ## Background model for pattern discovery (Supported:taxfreq or monads)
  local $to_matrix = 0;		## Convert assembled dyads into PSSM
  local %lth = (occ=>1,occ_sig=>0); ## Lower thresholds
  local %uth = (rank=>50);	    ## Upper thresholds
  local $strands = "-2str"; ## Strands for pattern discovery and pattern matching
  local $start_time = &RSAT::util::StartScript();
  $main::orthologs_list_file=0;
  ## Supported tasks
  @supported_tasks = qw(
			all
			bg_model
			operons
			query_seq
			filter_dyads
			orthologs
			ortho_seq
			purge
			dyads
			map
			network
			index
		       );
  $supported_tasks = join (",", @supported_tasks);
  %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }


  ## Supported bg_models
  @supported_bg_models = qw(
			    monads
			    taxfreq
			    org_list
			   );
  $supported_bg_models = join (",", @supported_bg_models);
  %supported_bg_model = ();
  foreach my $bg_model (@supported_bg_models) {
    $supported_bg_model{$bg_model} = 1;
  }


  ## Parameters for promoter retrieval
  local $taxon;			## Reference taxon
  local $organism_name;		## Query organism
  local @query_genes = ();	## list of query genes
  local $infer_operons = 0;     ## Infer operon leader genes
  local $dist_thr = 55;	        ## Distance threshold for operon inference
  $main::sep_genes = 0;	        ## Analyze each gene separately
  local $promoter = "ortho";    ## Alternatively: leader when the option "infer operons" is active
#  local $all_genes = 0;         ## Analyze all the genes of the query organism
#  local $max_genes = undef;     ## Analyze a restricted number of genes
#  local $supported_organism;

  local $map_format = "png";	## Image format

  ## Job management options
  local $job_prefix = "fpdisco";
  local $die_on_error = 1;
  local $batch = 0;
  local $dry = 0;


  ## File index
  local %main_index = ();

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## Check parameters for footprint analysis
  &CheckFootprintParameters();

  local $main_index = &OpenMainIndex();

  ################################################################
  ## Parameters for dyad-analysis

  ## Return fields for dyad-analysis
  if (scalar(@dyad_return_fields) > 0) {
    $dyad_return_fields = join ",", @dyad_return_fields;
  }

  ##Bg model for orthologs_list is monad
  $bg_model = "monad" if  $main::orthologs_list_file ;

  ## Background model file for dyad-analysis
  if ($bg_model eq "org_list") {

    ## Check that the list of organisms has been specified
    &RSAT::error::FatalError("The option '-bg_model org_list' requires to specify a list of organisms with the option '-org_list org_list_file'")
      unless ($orglist_file);

    ## If the option -org_list has been specified and the bg_model is
    ## taxfreq, compute the list-specific frequencies with the program
    ## taxon-frequencies -org_list org_list_file.
    &RSAT::message::TimeWarn("Computing background model for organism list") if ($main::verbose >= 1);

    ## Store the background model in the parent directory of the query sub-directories
    $dir{bg_model} = join("/",$main::dir{output_root}, "bg_models", "org_list");
    &RSAT::util::CheckOutDir($dir{bg_model});

    ## Background model file
    $infile{bg_model} = $dir{bg_model}."/";
    $infile{bg_model} .= "dyads_3nt_sp0-20_upstream-noorf_org_list".$noov.$strands.".freq";
    if ($task{bg_model}) {
      my $cmd = "taxon-frequencies -v 1 -type dyad  -ml 3";
      $cmd .= " -org_list ".$orglist_file;
      $cmd .= " > ".$infile{bg_model};
      &doit($cmd);
    }

  } elsif ($bg_model eq "taxfreq") {
    ## Check that the taxon has been specified
    &RSAT::error::FatalError("The option '-bg_model taxfreq' requires to specify a taxon with the option '-taxon'")
      unless ($taxon);

    ## Identify the server-installed taxon-specific background model file
    $infile{bg_model} = &ExpectedFreqFile($taxon,
					  3,
					  # spacing ? 0-20
					  "upstream-noorf",
					  str=>$strands,
					  noov=>$noov,
					  type=>"dyad",
					  warn=>1,
					  taxon=>1);

    ## If the required the taxon frequencies are not pre-installed,
    ## compute them before porocessing and store them in the result
    ## directory.
    unless ((-e $infile{bg_model}) ||
	    (-e ($infile{bg_model}.".gz"))) {

      ## Store the background model in the parent directory of the query sub-directories
      $dir{bg_model} = join("/",$main::dir{output_root}, "bg_models", $taxon);
      &RSAT::util::CheckOutDir($dir{bg_model});

      &RSAT::message::Warning("Computing taxon-wise background model (not installed on the server)" ) if ($main::verbose >= 1);
      my ($local_bg_file) = &ShortFileName($infile{bg_model});
      $infile{bg_model} = $dir{bg_model}."/".$local_bg_file;
      if ($task{bg_model}) {
	my $cmd = "taxon-frequencies -v 1 -type dyad  -ml 3";
	$cmd .= " -taxon ".$taxon;
	$cmd .= " > ".$infile{bg_model};
	&doit($cmd);
      }
      &RSAT::message::Info("Taxon-wise background model", $infile{bg_model}) if ($main::verbose >= 1);

    } else {
      &RSAT::error::FatalError($bg_model, "Invalid background model. Supported: ", $supported_bg_models) unless ($supported_bg_model{$bg_model});
    }
    &RSAT::message::Info("Background model file", $infile{bg_model}) if ($main::verbose >= 3);
  }

  ################################################################
  ## Analyze query genes separately or altogether
  local $q = 0;
  if ($main::sep_genes) {
    local $query_nb = scalar(@query_genes);
    foreach my $current_gene (@query_genes) {
      $q++;
      if (($skip > 0) && ($q <= $skip)) {
	  &RSAT::message::Info("Skipping gene", $q."/".$query_nb, $current_gene) if ($main::verbose >= 1);
	  next;
      }
      &RSAT::message::TimeWarn("Analyzing gene", $q."/".$query_nb, $current_gene) if ($main::verbose >= 1);
      &RunFootprintDisco($current_gene);
      if (($last > 0) && ($q >= $last)) {
	  &RSAT::message::Info("Stopped after gene", $q."/".$query_nb, $current_gene) if ($main::verbose >= 1);
	  last;
      }
    }
  } else {
      $q = 1;
      $query_nb = 1;
      &RSAT::message::TimeWarn("Analyzing a group of ",scalar(@query_genes)," genes", join(";", @query_genes)) if ($main::verbose >= 1);
      &RunFootprintDisco(@query_genes);
  }

  ################################################################
  ## Generate co-regulation network by comparing gene-wise footprints
  &CoregulationNetwork() if ($task{network});

  ################################################################
  ## Give a warning to report the gene-specific and main index files
  if ($main::verbose >= 2) {
    print ("; Index files\n");
    my $i = 0;
    foreach my $key (sort keys %index_list) {
      $i++;
      print join ("\t", ";", $key, $index_list{$key}), "\n";
      if ($i > 10) {
	print join ("\t", "; ... skipping other gene-wise index files"), "\n";
	last;
      }
    }
    print join ("\t", ";", "Main Table", $outfile{main_index}), "\n";
  }


  ## Close the main index stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  print $main_index "</table>\n";


  print $main_index "<h2>File index</h2>\n";
  print $main_index "<p><table>\n";
  foreach my $key (@to_index) {
    print $main_index "<tr>\n";
    print $main_index "<td>".$key."</td>\n";
    my $link = &LinkOneFile($outfile{main_index}, $outfile{$key});
    print $main_index "<td>".$link."</td>\n";
    print $main_index "</tr>\n";
  }
  print $main_index "</table></p>\n";

  print $main_index "<p><pre>", $exec_time, "</pre></p>\n";
  print $main_index "</body>\n";
  print $main_index "</html>\n";
  close ($main_index);

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

sub DyadSuffix {
  my $dyad_suffix = "_".$promoter."_dyads_3nt_sp0-20".$strands.$noov."_".$bg_model."_sig".$lth{occ_sig};
  return($dyad_suffix);
}

################################################################
## Run footprint discovery flow chart for one or several query genes
sub RunFootprintDisco {
  local (@current_query_genes) = @_;

  local $query_start_time = &RSAT::util::AlphaDate();
  local $batch_cmd = "";
  local $out = "";
  local $genes = "";

  ################################################################
  ## Initialize output directory + output files

  local ($outfile_prefix, $query_prefix) = &InitQueryOutput();

  ## Output files for dyad-analysis
  $outfile{filter_dyads} = $outfile{prefix}."_filter_dyads.tab";

  my $dyad_suffix = &DyadSuffix();
  $outfile{dyad_prefix} = $outfile{prefix}.$dyad_suffix;
  $outfile{dyads} = $outfile{dyad_prefix}.".tab";
  $outfile{dyads_html} = $outfile{dyad_prefix}.".html";
  $outfile{asmb} = $outfile{dyad_prefix}.".asmb";
  $outfile{map} = $outfile{dyad_prefix}.".".${map_format};
  if ($to_matrix) {
    $outfile{pssm_prefix} = $outfile{dyad_prefix}."_pssm";
    $outfile{sig_PSSM} = $outfile{pssm_prefix}."_sig_matrices.txt";
    $outfile{count_PSSM} = $outfile{pssm_prefix}."_count_matrices.txt";
  }


  ################################################################
  ## Index files for the main index
  $main_index{$query_prefix}{dyads} = $outfile{dyads};

  ## Report all outfile names
  if ($main::verbose >= 5) {
    foreach my $key (sort keys %outfile) {
      &RSAT::message::Debug("outfile", $key, "'".$outfile{$key}."'");
    }
  }

  ################################################################
  ## Open the file to store the HTML index
  &OpenIndex("footprint-discovery");

  ################################################################
  ## Print query genes in the gene file
  foreach my $gene (@current_query_genes) {
    print $genes $gene, "\t", $organism_name, "\n";
  }
  &IndexOneFile("genes", $outfile{genes});

  ################################################################
  ## Print verbose
  &Verbose() if ($verbose);

  ## Predict operon leader genes of the query genes
  &InferQueryOperons() if ($infer_operons);

  ## Dyad filter tasks
  unless ($nofilter) {
    ## Retrieve promoters of the query organism
    &RetrieveQueryPromoters();

    ## Detect all dyads in promoters of query genes for dyad filtering
    &ComputeFilterDyads();
  }

  ################################################################
  ## Identify ortholog genes
  &GetOrthologs();

  ################################################################
  ## Predict operon leader genes for the orthologous genes
  &InferOrthoOperons() if ($infer_operons);

  ################################################################
  ## Retrieve sequences from orthologs
  &RetrieveOrthoSeq();

  ################################################################
  ## Purge sequences
  &PurgeOrthoSeq();

  ################################################################
  ## Discover over-represented dyads in promoters of orthologous genes
  &DyadAnalysis();

  ################################################################
  ## Generate a feature-map of the discovered dyads
  if ($task{map}) {
    my $map_title = join "; ", $taxon, $organism_name, $outfile_prefix;
    &CheckDependency("map", "dyads");
    &CheckDependency("map", "seq");
    &RSAT::message::TimeWarn("Generating feature map", $outfile{map}) if ($verbose >= 2);
    $cmd = "$SCRIPTS/dna-pattern -return limits,sites -origin -0 -N 4";
    $cmd .= " -pl ".$outfile{dyads};
    $cmd .= " -format fasta -i ".$outfile{seq};
    $cmd .= " | $SCRIPTS/convert-features -from dnapat -to ft";
    $cmd .= " | $SCRIPTS/feature-map";
    $cmd .= " -title '".$map_title."'";
    $cmd .= " -format ".$map_format;
    $cmd .= " -scalebar -legend";
    $cmd .= " -scorethick";
    $cmd .= " -o ".$outfile{map};
    &one_command($cmd);
  }
  &IndexOneFile("feature map", $outfile{map}, image=>1);


  ################################################################
  ## Send the command to a batch queue (e.g. PC cluster)
  if ($batch) {
    &doit($batch_cmd, $dry, $die_on_error, $verbose, 1, $job_prefix);
  }

  ################################################################
  ## Close query-specific output streams
  my $query_exec_time = &RSAT::util::ReportExecutionTime($query_start_time);
  print $main::out $query_exec_time if ($main::verbose >= 1);
  close $out if ($outfile{log});
  close $genes if ($outfile{genes});

  ## Close the index file
  print $index "</table>\n";
  print $index "</blockquote>";
  print $index "<hr size=2 color='#000088'>";
  print $index "</body>";
  print $index "</html>";
  close $index;

  ################################################################
  ## Update the main index
  print $main_index "<tr>\n";
  print $main_index "<td>", $q, "</td>\n";
  ## Link to the query-specific index file
#  print $main_index "<td><a href='", &RSAT::util::RelativePath($outfile{main_index}, $outfile{index}), "'>", $query_prefix, "</a></td>\n";
  print $main_index "<td>", &LinkOneFile($outfile{main_index}, $outfile{index}, $query_prefix), "</td>\n";

  ## Report the significant dyads
  my $top_id = "";
  my $top_sig = "NA";
  my $dyad_nb = 0;
  local %header_col = ();

  if (-e $outfile{dyads}) {
    &RSAT::message::Info("Indexing dyads", $query_prefix, $outfile{dyads}) if ($main::verbose >= 4);
    my ($dyads) = &OpenInputFile($outfile{dyads});

    while (<$dyads>) {
      next if (/^;/); ## Skip comments
      next unless (/\S/); ## Skip empty lines
      chomp;


      ## Read the header to idenfify occ_sig column
      if (/^#/) {
	s/^#//;
	my @header_fields = split "\t";
	for my $h (0..$#header_fields) {
	  $header_fields[$h] = &RSAT::util::trim($header_fields[$h]);
	  $header_col{$header_fields[$h]} = $h;
	  #	&RSAT::message::Debug("header", $h, "'".$header_fields[$h]."'", $header_col{$header_fields[$h]}) if ($main::verbose >= 10);
	}
	next;
      }

      $dyad_nb++;

      ## Report top significance
      if ($dyad_nb == 1) {
	my @fields = split "\t";
	$top_id = $fields[$header_col{"identifier"}];
	$top_sig = $fields[$header_col{"occ_sig"}];
	#      &RSAT::message::Debug("top_id", $top_id, "top_sig", $top_sig, ) if ($main::verbose >= 10);
      }
    }
    close $dyads;
  }

  print $main_index "<td><tt>", $top_id, "</tt></td>\n";
  print $main_index "<td>", $top_sig, "</td>\n";
  print $main_index "<td>", $dyad_nb, "<td>";

  print $main_index "</tr>\n";
}


################################################################
## Display full help message 
sub PrintHelp {
#    system "pod2text -c $0";
    system "cat $0 $ENV{RSAT}/perl-scripts/lib/footprint.lib.pl | pod2text -c";
    exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
    &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {

=pod

=head1 OPTIONS

=cut
  local $arg;
  local @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

    if (&ReadFootprintOptions()) {
      next;

      ### Lower threshold

=pod

=over 4

=item B<-lth field value>

Lower threshold for dyad-analysis.

See the manual of dyad-analysis for a description of the fields on
which a threshold can be imposed.

=cut
    } elsif ($arg eq "-lth") {
      my $thr_field = shift (@arguments);
      my $thr_value =  shift (@arguments);
      $main::lth{$thr_field} = $thr_value;

      ### Upper threshold

=pod

=item B<-uth field value>

Upper threshold for dyad-analysis.

See the manual of dyad-analysis for a description of the fields on
which a threshold can be imposed.

=cut
    } elsif ($arg eq "-uth") {
      my $thr_field = shift (@arguments);
      my $thr_value =  shift (@arguments);
      $main::uth{$thr_field} = $thr_value;

=pod

=item B<-return dyad_return_fields>

Return fields for dyad-analysis.  This argument is passed to
dyad-analysis for the discovery of dyads in promoters of orthologous
genes.

Multiple-fields can be entered either by calling this argument
iterativelyk or by entering multiple fields separated by commas.

Type I<dyad-analysis -help> to obtain the list of supported return
fields.

=cut
    } elsif ($arg eq "-return") {
      push @dyad_return_fields, shift (@arguments);

=pod

=item B<-to_matrix>

Convert assembled patterns into position-specific scoring matrices
(PSSM).  Caution ! This conversion can take time if the sequence set
is large and if there are many assemblies.

=cut
    } elsif ($arg eq "-to_matrix") {
      $main::to_matrix = 1;


	    ## Background model
=pod

=item B<-bg_model taxfreq|org_list|monads>

Allow the user to choose among alternative background model (see Janky
& van Helden, 2008).

Supported background model types:

=over


=item I<monads>

Expected dyad frequencies are estimated by taking the product of the
monad frequencies observed in the input sequence set. Example:

   F_exp(CAGn{10}GTA) = F_obs(CAG) * F_obs(GTA)

=item I<taxfreq>

Only valid in combination with the option -taxon.

Expected dyad frequencies are computed by summing the frequencies of
all dyads in the non-coding upstream sequences of all genes for all
the organisms of the reference taxon.

=item I<org_list>

Only valid in combination with the option -org_list.

Expected dyad frequencies are computed by summing the frequencies of
all dyads in the non-coding upstream sequences of all genes for each
organism of user-specified list.

=back

=cut
    } elsif ($arg eq "-bg_model") {
      $main::bg_model = shift(@arguments);

	  ## Dyads filtering

=pod

=item B<-filter>

Only accept dyads found in the promoter of the query gene, in the
query organism. (option selected by default)

=cut
    } elsif ($arg eq "-filter") {
      $main::filter = 1;

=pod

=item B<-no_filter>

Accept all dyads, even if they are not found in the promoter of the
query gene, in the query organism. (will cancel -filter option if selected)

=cut

    } elsif ($arg eq "-no_filter") {
      $main::nofilter = 1;


=pod

=item B<-dist_thr value> 

Specify here the intergenic distance threshold in base pairs. Pair of adjacent genes with intergenic distance equal or less than this value are predicted to be within operon. (default : 55)

=cut
    } elsif ($arg eq "-dist_thr") {
      $main::dist_thr = shift(@arguments);

    } else {
      &FatalError(join("\t", "Invalid option", $arg));
    }
  }

}

################################################################
## Verbose message
sub Verbose {
  my $header_verbose = "";
  $header_verbose .=  "; footprint-discovery ";
  $header_verbose .= &PrintArguments();
  $header_verbose .= "\n";
  $header_verbose .= sprintf "; %-22s\t%s\n", "Program version", $program_version;
  $header_verbose .= sprintf "; %-22s\t%s", "Working directory", `pwd`;
  $header_verbose .= sprintf "; %-22s\t%s\n", "Query organism", $organism_name;
  if($main::orglist_file) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Organism list", $orglist_file;
  } else {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Reference taxon", $taxon;
  }
  $header_verbose .= sprintf "; %s\n", "Promoters from predicted operon leader genes" if ($infer_operons);
  if (scalar(@current_query_genes) == 1) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query gene", $current_query_genes[0];
  } elsif (scalar(@current_query_genes) <= 10) {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query genes", join "; ", @current_query_genes;
  } else {
    $header_verbose .= sprintf "; %-22s\t%s\n", "Query genes", scalar(@current_query_genes);
  }
  $header_verbose .= sprintf "; %-22s\t%s\n", "Background model", $bg_model;
  if ($filter) {
      $header_verbose .= sprintf "; %s\n", "Dyad filtering ON";
  }  else {
      $header_verbose .= sprintf "; %s\n", "No dyad filtering";
  }

  if (%main::infile) {
    print $main::out "; Input files\n";
    while (my ($key,$value) = each %main::infile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  if (%main::outfile) {
    print $main::out "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  print $out $header_verbose;


  $header_verbose =~ s/^; //;
  $header_verbose =~ s/\n; /\n/gm;
  print $index "<pre>";
  print $index $header_verbose;
  print $index "</pre>";
}


################################################################
## Run dyad-analysis
sub DyadAnalysis {
  if ($task{dyads}) {
    &RSAT::message::TimeWarn("Running dyad-analysis", $outfile{dyads}) if ($verbose >= 2);
    &CheckDependency("dyads", "purged");
    $cmd = "$SCRIPTS/dyad-analysis -v 1 ";
    $cmd .= " -return ".$dyad_return_fields;
    ## Backgroun model
    if ($bg_model eq "monads") {
      $cmd .= " -bg monads" ;
    } elsif ($bg_model eq "taxfreq") {
      $cmd .= " -expfreq ".$infile{bg_model} ;
    }
    ## dyad filtering (only accept dyads found in the promoter of the query gene(s)
    if (!$nofilter) {
      $cmd .= " -accept ".$outfile{filter_dyads};
    }
    foreach my $field (sort keys %lth) {
    $cmd .= " -lth ".$field." ".$lth{$field};
  }
    foreach my $field (sort keys %uth) {
      $cmd .= " -uth ".$field." ".$uth{$field};
    }
    $cmd .= " -i ".$outfile{purged};
    $cmd .= " -l 3 -sp 0-20";
    $cmd .= " ".$strands;
    $cmd .= " ".$noov;
    $cmd .= " -sort";
    $cmd .= " -o ".$outfile{dyads};
    &one_command($cmd);

    ## Generate a HTML table with the discovered dyads
    &CheckDependency("dyads_html", "dyads");
    $cmd = "$SCRIPTS/text-to-html";
    $cmd .= " -i ".$outfile{dyads};
    $cmd .= " -font variable -chunk 1000";
    $cmd .= " -o ".$outfile{dyads_html};
    &one_command($cmd);

    ################################################################
    ## Assemble significant dyads
    &RSAT::message::TimeWarn("Assembling significant dyads", $outfile{asmb}) if ($verbose >= 2);
    &CheckDependency("asmb", "dyads");
    $cmd = "$SCRIPTS/pattern-assembly -v 1";
    $cmd .= " -i ".$outfile{dyads};
    $cmd .= " -subst 0 -maxfl 1 -toppat 50";
    $cmd .= " -o ".$outfile{asmb};
    &one_command($cmd);

    ################################################################
    ## Convert assembled dyads into a position-specific scoring matrix
    if ($to_matrix) {
      &RSAT::message::TimeWarn("Converting assembled dyads into PSSM", $outfile{pssm}) if ($verbose >= 2);
      &CheckDependency("pssm", "asmb");
      &CheckDependency("pssm", "seq");
      $cmd = "$SCRIPTS/matrix-from-patterns -v 1";
      $cmd .= " -seq ".$outfile{seq};
      $cmd .= " -asmb ".$outfile{asmb};
      $cmd .= " -format fasta";
      $cmd .= " -uth Pval 0.00025";
      $cmd .= " -bginput -markov 0";
      $cmd .= " ".$strands;
      $cmd .= " -logo";
      $cmd .= " -o ".$outfile{pssm_prefix};
      &one_command($cmd);
      &IndexOneFile("significance matrix (PSSM)", $outfile{sig_PSSM});
      &IndexOneFile("count matrix (PSSM)", $outfile{count_PSSM});
    }
  }
  &IndexOneFile("dyads", $outfile{dyads});
  &IndexOneFile("dyads [htm]", $outfile{dyads_html});
  &IndexOneFile("assembled dyads", $outfile{asmb});
}

################################################################
## Generate co-regulation network by comparing gene-wise footprints
sub CoregulationNetwork {
  my $main_prefix = &MainPrefix();
  my $dyad_suffix = &DyadSuffix();

  ## Create a directory for the inferred co-reglation network
  $dir{network} = $dir{output_root}."/network";
  &RSAT::util::CheckOutDir($dir{network});
  &RSAT::message::TimeWarn("Network directory", $dir{network}) if ($main::verbose >= 2);

  ################################################################
  ## Write a list of dyad files
  $outfile{dyad_file_list} .= $dir{network}."/".$main_prefix."_dyad_files.txt";
  my $out= &OpenOutputFile($outfile{dyad_file_list});
  foreach my $query_prefix (sort keys %main_index) {
    print $out $main_index{$query_prefix}{dyads}."\n" if (-e $main_index{$query_prefix}{dyads});
  }
  &RSAT::message::TimeWarn("Dyad file list", $outfile{dyad_file_list}) if ($main::verbose >= 2);
  push @to_index, "dyad_file_list";

  ################################################################
  ## Generate a dyad significance profiles.  Instead of a table with 1
  ## row per gene and 1 column per dyad, we generate a file in format
  ## "class": first column indicates the dyad, second column the gene,
  ## third column the score. We add two columns with primary gene name
  ## and ID, because the query gene list may contain synonyms.
  $outfile{dyad_sig_classes} .= $dir{network}."/".$main_prefix."_dyad_sig_classes.tab";
  my $compare_scores_cmd = "compare-scores -v 0";
  $compare_scores_cmd .= " -filelist ".$outfile{dyad_file_list};
  $compare_scores_cmd .= " -sc 8 -null .";
  $compare_scores_cmd .= " -basename";
  $compare_scores_cmd .= " -suppress ".$main_prefix;
  $compare_scores_cmd .= " -suppress ".$dyad_suffix.".tab";
  $compare_scores_cmd .= " -suppress _".$organism_name."_".$taxon;
  $compare_scores_cmd .= " -format classes";
  $compare_scores_cmd .= " | add-gene-info";
  $compare_scores_cmd .= " -col 2";
  $compare_scores_cmd .= " -org ".$organism_name;
  $compare_scores_cmd .= " -info name,id";
  $compare_scores_cmd .= " -o ".$outfile{dyad_sig_classes};
  &doit($compare_scores_cmd);
  &RSAT::message::TimeWarn("Dyad sig classes", $outfile{dyad_sig_classes}) if ($main::verbose >= 2);
  push @to_index, "dyad_sig_classes";

  ################################################################
  ## Build the network by comparing dyads score profiles between each
  ## pair of genes
  $outfile{gene_pairs} .= $dir{network}."/".$main_prefix."_gene_pairs.tab";
  my $compare_classes_cmd = 'awk \'{print $1"\t"$4"\t"$3}\' '.$outfile{dyad_sig_classes};
  $compare_classes_cmd .=  " | compare-classes -v 1";
  $compare_classes_cmd .= " -i /dev/stdin";
  $compare_classes_cmd .= " -triangle -distinct";
  $compare_classes_cmd .= " -return rank,occ,freq,proba,jac_sim,dotprod,entropy";
  $compare_classes_cmd .= " -sort dotprod";
  $compare_classes_cmd .= " -sc 3";
  $compare_classes_cmd .= " -lth dotprod 2 -lth QR 2 -lth Q 2 -lth R 2";
  $compare_classes_cmd .= " -o ".$outfile{gene_pairs};
  &doit($compare_classes_cmd);
  &RSAT::message::TimeWarn("Gene pairs", $outfile{gene_pairs}) if ($main::verbose >= 2);
  push @to_index, "gene_pairs";

  ################################################################
  ## Add columns with gene names and IDs
  $outfile{gene_pairs_names} .= $dir{network}."/".$main_prefix."_gene_pairs_names.tab";
  my $cc_gene_info_cmd = "add-gene-info";
  $cc_gene_info_cmd .= " -i ".$outfile{gene_pairs};
  $cc_gene_info_cmd .= " -col 1 -col 2";
  $cc_gene_info_cmd .= " -org ".$organism_name;
  $cc_gene_info_cmd .= " -info name,id";
  $cc_gene_info_cmd .= " -o ".$outfile{gene_pairs_names};
  &doit($cc_gene_info_cmd);
  &RSAT::message::TimeWarn("Gene pairs (with names)", $outfile{gene_pairs_names}) if ($main::verbose >= 2);
  push @to_index, "gene_pairs_names";

  ################################################################
  ## Convert the compare-classes result into a graph

  ## Identify score column
  my $score = "DPbits";
  my $score_column = 21;
  my ($in) = &OpenInputFile($outfile{gene_pairs});
  while (<$in>) {
    if (/^;\t(\d+)\t${score}\s+/) {
      $score_column = $1;
    }
  }
  close $in;

  ## Run the graph conversion
  $outfile{gene_pairs_gml} .= $dir{network}."/".$main_prefix."_gene_pairs_".$score.".gml";
  my $convert_graph_cmd = "convert-graph";
  $convert_graph_cmd .= " -scol 1 -tcol 2";
  $convert_graph_cmd .= " -wcol ".$score_column;
  $convert_graph_cmd .= " -from tab -to gml";
  $convert_graph_cmd .= " -ewidth -ecolors fire";
  $convert_graph_cmd .= " -i ".$outfile{gene_pairs};
  $convert_graph_cmd .= " -o ".$outfile{gene_pairs_gml};
  &doit($convert_graph_cmd);
  &RSAT::message::TimeWarn("Gene pair graph (GML)", $outfile{gene_pairs_gml}) if ($main::verbose >= 2);
  push @to_index, "gene_pairs_gml";


  ################################################################
  ## Get the neighborhood of each gene in the co-regulation network
  $outfile{gene_neighbours} .= $dir{network}."/".$main_prefix."_gene_neighbours.tab";
  my $gn_cmd = "graph-neighbours -v 1";
  $gn_cmd .= " -self";
#  $gn_cmd .= " -seed $gene";
  $gn_cmd .= " -all";
  $gn_cmd .= " -in_format gml";
  $gn_cmd .= " -i ".$outfile{gene_pairs_gml};
  $gn_cmd .= " -o ".$outfile{gene_neighbours};
  &doit($gn_cmd);
  &RSAT::message::TimeWarn("Gene neighbours", $outfile{gene_neighbours}) if ($main::verbose >= 2);
  push @to_index, "gene_neighbours";

}


__END__


