#!/usr/bin/perl -w

############################################################
#
# $Id: peak-motifs,v 1.83 2010/10/20 00:57:55 jvanheld Exp $
#
############################################################


## use strict;

=pod

=head1 NAME

peak-motifs

=head1 VERSION

$program_version

=head1 DESCRIPTION

Pipeline for discovering motifs from ChIP-seq (or ChIP-chip, or
ChIP-PET) peak sequences.

=head1 AUTHORS

=over

=item Jacques van Helden <Jacques.van.Helden@ulb.ac.be>

Conception and implementation of the work flow.

=item Morgane Thomas-Chollier <thomas-c@molgen.mpg.de>

Conception of the work flow + implementation of Web interface.

=item Matthieu Defrance <defrance@ccg.unam.mx>

Implementation of the efficient algorithms used in the work flow
(I<local-word-analysis>, I<count-words>, I<matrix-scan-quick>).

=item Olivier Sand <oly@bigre.ulb.ac.be> for the Web services

Web services.

=item Carl Herrmann <carl.herrmann@univmed.fr> and Denis Thieffry
<thieffry@tagc.univ-mrs.fr>

Analysis of the case studies. Definition of optimal conditions of
utilization. Motif comparisons and clustering.

=back


=head1 CATEGORY

Pattern discovery

=head1 USAGE

peak-motifs [-i inputfile] [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

The program takes as input either one (test) or two sequence files
(test versus control).

All input sequences formats supported by convert-sequences are
supported.

=head1 OUTPUT FORMAT

The pipeline runs a series of programs, each generating one or several
result files. An HTML index is generated in order to synthesize the
results and give access to the individual result files.

The index file is formed from the output directory (option -outdir)
and the file prefix (option -prefix).

  [output_dir]/[prefix]_synthesis.html

=cut


## Append the "lib/" folder located next to this script to the module
## search path, before any library is loaded.
BEGIN {
    ## The pattern matches the last component of the script path, so
    ## the text preceding the match is the script directory (with its
    ## trailing slash), or an empty string when $0 is a bare file name.
    if ($0 =~ /([^(\/)]+)$/) {
	my $script_dir = $`;
	push @INC, $script_dir."lib/";
    }
}
require "RSA.lib";
require "RSA.disco.lib";
require "footprint.lib.pl";
use RSAT::util;


################################################################
## Main package
##
## Top-level work flow of the script: initialize the global variables,
## read and check the arguments, define the output file names, then
## run each requested task (keys of %task) in a fixed order, and
## finally report the execution time.
package main;
{

  ################################################################
  ## Initialize parameters
  local $start_time = &RSAT::util::StartScript();

  local $die_on_error = 1;

  ## Check that the RSAT paths of the programs required for the script are specified
  $PYTHON =  $ENV{RSAT}."/python_scripts" unless ($PYTHON);
  $SCRIPTS = $ENV{RSAT}."/perl-scripts" unless ($SCRIPTS);
  $BIN = $ENV{RSAT}."/bin" unless ($BIN);

  ## Program version, derived from the revision keyword below
  $program_version = do { my @r = (q$Revision: 1.83 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  %main::param = ();
  @main::param_list = ();

  %main::dir = ();
  %main::infile = ();
  %main::outfile = ();

  $main::verbose = 0;
  $main::out = STDOUT;

  ## Sequence types (can be either 'test' or 'test' + 'ctrl')
  @seq_types = ();

  ## Pattern types (oligos, positions, local-word-analysis,... with the oligo length suffix)
  @pattern_types = ();
  @patterns_to_merge = ();
  @timelog_keys = ();


  ################################################################
  ## Supported tasks
  @supported_tasks = ("all",
		      "purge",
		      "seqlen",
		      "composition",
		      "ref_motifs",

		      ## Pattern discovery algorithms
		      "oligos",
		      "dyads",
		      "positions",
		      "local_words",
		      "oligo_diff",
		      "merge_words",

		      "meme_bg",
		      "meme",

		      "merge_motifs",
#		      "cluster_motifs",
		      "motifs_vs_ref",
		      "motifs_vs_db",
#		      "motif_compa",

		      "scan",

		      "timelog",
		      "synthesis",
		      "clean_seq",
		     );

  ## Tasks that are not supported yet (kept here as a reminder, not
  ## used in the visible part of the script)
  my @future_tasks = qw(
			to_bed
		       );
  $supported_tasks = join ",", @supported_tasks;
  %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }
  %task = ();			## List of tasks to be executed

  @motif_databases = ();
  @motif_db_format = ();

  local $meme_options = "";
  local $meme_suffix = "";

  ################################################################
  ## Set default parameters
  &DefaultParameters();

  ################################################################
  ## Read argument values
  &ReadArguments();

  &CheckArguments();

  &SetOutFileNames();

  ################################################################
  ## Open output stream (the log file)
  $main::out = &OpenOutputFile($main::outfile{log});

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose);

  ## Sequence-level tasks, run separately for each sequence type
  foreach my $seq_type (@seq_types) {
    &PurgeSeq($seq_type) if ($task{purge});
    &SequenceLengths($seq_type) if ($task{seqlen});
    &CompositionProfiles($seq_type) if ($task{composition});
  }


  ## Tasks requiring control sequences. Note that the background
  ## models are computed as soon as a control file is specified,
  ## irrespective of the selected tasks.
  if ($main::infile{ctrl_seq}) {
    &ComputeBgModels($main::infile{ctrl_seq});
    &OligoDiff() if ($task{oligo_diff});
  }

  if (defined($main::infile{ref_motifs})) {
    &RefMotif() if ($task{ref_motifs});
  }

  ## Pattern discovery
  &OligoAnalysis() if ($task{oligos});
  &DyadAnalysis() if ($task{dyads});
  &PositionAnalysis() if ($task{positions});
  &LocalWords() if ($task{local_words});
  &MergeWords() if ($task{merge_words});

  &CalcMemeBackground() if ($task{meme_bg});
  &RunMEME() if ($task{meme});

  &MergeMotifs() if ($task{merge_motifs});

#  &ClusterMotifs() if ($task{cluster_motifs});

  ## Motif comparisons are only run when the reference motifs /
  ## motif databases have been specified
  &MotifsVersusReference() if ((defined($main::infile{ref_motifs}))
			       && ($task{motifs_vs_ref}));

  &MotifsVersusDatabase() if ((scalar(@motif_databases) > 0)
			      && ($task{motifs_vs_db}));

  &ScanSequences() if ($task{scan});

  &TimeLog() if ($task{timelog});

  &Synthesis() if ($task{synthesis});

  &CleanSequences() if ($task{clean_seq});

  if ($main::verbose >= 1) {
    &TimeWarn("Log file", $main::outfile{log});
  }

  ################################################################
  ## Close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $main::out $exec_time if ($main::verbose >= 1);	## only report exec time if verbosity is specified

  ## The stream was opened from $main::outfile{log}: the same key is
  ## used to decide whether it has to be closed.
  close $main::out if ($main::outfile{log});
  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message: format the POD documentation of this
## script with pod2text, then exit.
sub PrintHelp {
  ## List form of system(): the script path is passed as a single
  ## argument without going through the shell, so that a path
  ## containing spaces or shell meta-characters is handled properly.
  system("pod2text", "-c", $0);
  exit()
}

################################################################
## Display short help message. There is no separate short message:
## the call is simply forwarded to PrintHelp().
sub PrintOptions {
  PrintHelp();
}


################################################################
## Set parameter values for all the programs.
##
## Fills %main::param with the default values. The names of some of
## the parameters are also appended to @main::param_list. The names
## of the default MEME options are appended to @MEME_options.
##
## Takes no argument and returns no meaningful value.
sub DefaultParameters {
  &RSAT::message::TimeWarn("\n; Setting default parameter values") if ($main::verbose >= 1);

  ## Formats
  $main::param{seq_format} = "fasta"; push(@main::param_list, "seq_format");
  ## The image format is assigned and registered only once (a former
  ## "pdf" default was immediately overwritten by "png").
  $main::param{img_format} = "png"; push(@main::param_list, "img_format");

  ## Motif formats
  $main::param{ref_motifs_format} = "transfac"; push(@main::param_list, "ref_motifs_format");

  ## Purge-seq
  $main::param{purge_match_length} = 30; push(@main::param_list, "purge_match_length");
  $main::param{purge_mismatches} = 0; push(@main::param_list, "purge_mismatches");

  ################################################################
  ## Residue  profiles (position-analysis)
  $main::param{profiles_ci} = 20; push(@main::param_list, "profiles_ci");
  $main::param{profiles_max_graphs} = 20; push(@main::param_list, "profiles_max_graphs");
  $main::param{profiles_strand} = "-1str"; push(@main::param_list, "profiles_strand");
  @main::profiles_oligo_lengths = (1,2);
  $main::param{profiles_oligo_lengths} = join(',',@profiles_oligo_lengths); push(@main::param_list, "profiles_oligo_lengths");

  ## Beware: computational profiles are always computed with the
  ## option -ovlp (count all occurrences), to avoid weird effects:
  ## with the -noov mode, the transition tables are unbalanced even
  ## when the input sequences are generated with equiprobable
  ## residues. This is consistent with the fact that those profiles
  ## further serve to estimate the probability of larger words, which
  ## may include repeated residues. Using the option -noov here would
  ## induce an under-estimation of the expected frequency for words
  ## containing repeated residues.
  $main::param{profiles_noov} = "-ovlp"; push(@main::param_list, "profiles_noov");

  ################################################################
  ## Pattern discovery options
  $main::param{strand} = "-2str"; push(@main::param_list, "strand");
  $main::param{disco_noov} = "-noov"; push(@main::param_list, "disco_noov");
  $main::param{oligo_min_len} = 6; push(@main::param_list, "oligo_min_len");
  $main::param{oligo_max_len} = 7; push(@main::param_list, "oligo_max_len");
  $main::param{patterns_max_rank} = 25; push(@main::param_list, "patterns_max_rank");

  ## Thresholds for oligo-analysis, dyad-analysis and oligo-diff
  $main::param{min_ratio} = 1; push(@main::param_list, "min_ratio");
  $main::param{min_zscore} = 6; push(@main::param_list, "min_zscore");
  $main::param{min_sig} = 0; push(@main::param_list, "min_sig");

  ## oligo-analysis
  ## Negative Markov orders are relative to the oligo length (they are
  ## converted in SetOutFileNames).
  $main::param{oligo_min_mkv} = -2; push(@main::param_list, "oligo_min_mkv");
  $main::param{oligo_max_mkv} = -2; push(@main::param_list, "oligo_max_mkv");

  ## dyad-analysis

  ## position-analysis
  $main::param{positions_ci} = 50; ## Class interval
  $main::param{positions_min_occ} = 1; ## Min number of occurrences
  $main::param{positions_max_graphs} = 20; push(@main::param_list, "positions_max_graphs");

  ## local-word-analysis
#  $main::param{local_words_heuristic} = "slices"; ## Heuristic
  $main::param{local_words_window} = 50; ## Class interval
  $main::param{local_words_windowgroup} = 50; ## Class interval for centered windows of increasing sizes


  ## matrix-from-patterns
  $main::param{asmb_toppat} = $main::param{patterns_max_rank}; push(@main::param_list, "asmb_toppat");
  $main::param{matrix_nmotifs} = 3; push(@main::param_list, "matrix_nmotifs");

  ## Matrix comparisons
  $main::param{matrix_compa_min_w} = 5;
  $main::param{matrix_compa_min_cor} = 0.75;
  $main::param{matrix_compa_min_Ncor} = 0.5;

  ## matrix-scan-quick
  $main::param{scan_min_score} = 7.5;
  $main::param{scan_strands} = "-2str";

  ## Default MEME options
  $main::param{meme_text} = ""; push @MEME_options, "text";	## Output format = text
  $main::param{meme_dna} = ""; push @MEME_options, "dna"; ## Sequence type == dna
  $main::param{meme_mod} = "anr"; push @MEME_options, "mod"; ## Accept any number of occurrences per sequence
  $main::param{meme_minw} = 12; push @MEME_options, "minw"; ## Minimal motif width
  $main::param{meme_maxw} = 12; push @MEME_options, "maxw"; ## Maximal motif width
  $main::param{meme_nmotifs} = $main::param{matrix_nmotifs}; push @MEME_options, "nmotifs"; ## Number of motifs
  $main::param{meme_evt} = 1; push @MEME_options, "evt"; ## upper threshold on E-value
  $main::param{meme_maxsize} = 10000000; push @MEME_options, "maxsize"; ## maximum size for the sequence set

  ## Options for MEME background model
  $main::param{bg_meme_markov} = 3; ## Markov order for MEME

}

################################################################
## Build the path of one output file.
##
## Arguments:
##   $subdir         sub-directory of the output directory; when not
##                   empty, it is passed to RSAT::util::CheckOutDir
##   $extension      string appended as is to the file name (the
##                   leading dot, if any, must be included)
##   @name_elements  elements joined with "_" after the file prefix
##
## Returns [output dir]/[subdir]/[prefix]_[elements][extension]
sub OutFileName {
  my ($subdir, $extension, @name_elements) = @_;

  ## Target directory: the output directory or one of its sub-directories
  my $target_dir = $main::dir{output};
  if ($subdir) {
    $target_dir = $main::dir{output}."/".$subdir;
    &RSAT::util::CheckOutDir($target_dir);
  }

  ## File name: prefix + name elements, followed by the extension
  my $base_name = join("_", $main::param{prefix}, @name_elements);
  return($target_dir."/".$base_name.$extension);
}

################################################################
## Set output file names.
##
## Fills %main::outfile with the path of each result file, using
## OutFileName() and the parameters found in %main::param. The sub also
## - sets $seqfile{$seq_type} to the input sequence file of each
##   sequence type;
## - appends the pattern types to @pattern_types and @patterns_to_merge;
## - appends keys to @timelog_keys;
## - sets $main::param{scan_markov_order} and $main::param{scan_bg_file};
## - calls CalcMemeOptions(), which computes the suffix of the MEME
##   output files.
##
## Takes no argument and returns no meaningful value.
sub SetOutFileNames {

  foreach my $seq_type (@seq_types) {

    ## Suffixes describing the sequence selection (truncation, top peaks)
    my @seq_suffix = ();
    if (defined($main::param{max_seq_len})) {
      push @seq_suffix, "maxlen".$main::param{max_seq_len};
    }
    if (defined($main::param{top_peaks})) {
      push @seq_suffix, "top".$main::param{top_peaks};
    }

    ## Converted and purged sequences
    $main::outfile{$seq_type."_converted"} = &OutFileName("data/sequences",
							  ".fasta",
							  $seq_type,
							  @seq_suffix);
    $main::outfile{$seq_type."_purged"} = &OutFileName("data/sequences",
						       ".fasta",
						       $seq_type,
						       @seq_suffix,
						       "purged",
						       "ml".$main::param{purge_match_length},
						       "mis".$main::param{purge_mismatches},
						      );

    ## Input sequence file for this sequence type
    $seqfile{$seq_type} = $main::infile{$seq_type."_seq"};

    ## Sequence lengths
    $main::outfile{$seq_type."_seqlen"} = &OutFileName("data/sequences", ".tab", $seq_type."_seqlen");
    $main::outfile{$seq_type."_seqlen_distrib"} = &OutFileName("data/sequences", ".tab", $seq_type."_seqlen_distrib");
    $main::outfile{$seq_type."_seqlen_distrib_graph"} = &OutFileName("data/sequences", ".".$main::param{img_format}, $seq_type."_seqlen_distrib");

    ## Compositional Profiles
    for my $ol (@profiles_oligo_lengths) {

      ## Common prefix of the %main::outfile keys for this oligo length
      my $key_prefix = $ol."nt_".$seq_type;

      ################################################################
      ## Profiles of oligo frequencies as a function of the position
      ##
      $main::outfile{$key_prefix."_profiles"} = &OutFileName("results/composition", ".tab", $seq_type."_profiles".$main::param{profiles_strand}.$main::param{profiles_noov},
							     $ol."nt","ci".$main::param{profiles_ci});
      push @timelog_keys, $key_prefix."_profiles";

      ## oligo frequencies in the sequence set
      $main::outfile{$key_prefix."_freq"} = &OutFileName("results/composition", ".tab", $seq_type."_freq"."-1str".$main::param{profiles_noov},$ol."nt");
      push @timelog_keys, $key_prefix."_freq";
      $main::outfile{$key_prefix."_transit"} = &OutFileName("results/composition", ".tab", $seq_type."_transitions"."-1str".$main::param{profiles_noov},$ol."nt");
      $main::outfile{$key_prefix."_inclusive"} = &OutFileName("results/composition", ".txt", $seq_type."_inclusive"."-1str".$main::param{profiles_noov},$ol."nt");
      $main::outfile{$key_prefix."_heatmap"} = &OutFileName("results/composition", ".".$main::param{img_format}, $seq_type."_heatmap"."-1str".$main::param{profiles_noov},$ol."nt");

      ## HTML index to the individual oligonucleotide profiles
      $main::outfile{$key_prefix."_profiles_index"} = $main::outfile{$key_prefix."_profiles"};
      $main::outfile{$key_prefix."_profiles_index"} =~ s/\.tab$//;
      $main::outfile{$key_prefix."_profiles_index"} .= "_graph_index.html";

      ## Graph prefix for positional profiles
      $main::outfile{$key_prefix."_profiles_graph"} = $main::outfile{$key_prefix."_profiles"};
      $main::outfile{$key_prefix."_profiles_graph"} =~ s/\.tab$//;
      $main::outfile{$key_prefix."_profiles_graph"} .= ".".$main::param{img_format};
    }
  }



  ################################################################
  ## Background model for matrix scanning: "inclusive" file computed
  ## from the test sequences with oligos of length (Markov order + 1)
  $main::param{scan_markov_order} = 1;
  my $scan_bg_ol = $main::param{scan_markov_order}+1;
  $main::param{scan_bg_file} = $main::outfile{$scan_bg_ol."nt_test_inclusive"};
  &RSAT::message::Info("Background model file",
		       "order=".$main::param{scan_markov_order},
		       "bg_ol=".$scan_bg_ol,
		       "key=".$scan_bg_ol."nt_test_inclusive",
		       "file=".$main::param{scan_bg_file}
		      ) if ($main::verbose >= 2);


  ################################################################
  ## Pattern discovery results
  my $pattern_type = "";
  if ($main::infile{ctrl_seq}) {
    for my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {
      ## oligo-diff
      $pattern_type = 'oligo_diff_'.$len.'nt';
      $main::outfile{$pattern_type} = &OutFileName("results/".$pattern_type, ".tab", "oligo_diff".$main::param{strand}.$main::param{disco_noov},$len."nt");
      push @pattern_types, $pattern_type;
      push @patterns_to_merge, $pattern_type;
    }
  } else {
    for my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {

      ## NOTE(review): this test is nested in the "else" branch of the
      ## same condition, so its first branch can never be executed
      ## here. The logic is left untouched - to be confirmed by the
      ## authors.
      if ($main::infile{ctrl_seq}) {
	################################################################
	## Background models estimated from the control sequences
	## oligo-analysis
	$pattern_type = 'oligos_'.$len.'nt_vs_ctrl';
	$main::outfile{$pattern_type} = &OutFileName("results/".$pattern_type, ".tab", "oligos".$main::param{strand}.$main::param{disco_noov},
						     $len."nt", "vs_ctrl");
	push @pattern_types, $pattern_type;
	push @patterns_to_merge, $pattern_type;

      } else {
	################################################################
	## Background models estimated from the test sequences
	## themselves (lower order Markov chain)
	for my $markov ($main::param{oligo_min_mkv}..$main::param{oligo_max_mkv}) {
	  ## Convert negative markov orders relative to the oligo length
	  $markov += $len if ($markov < 0);

	  ## oligo-analysis
	  $pattern_type = 'oligos_'.$len.'nt_mkv'.$markov;
	  $main::outfile{$pattern_type} = &OutFileName("results/".$pattern_type, ".tab", "oligos".$main::param{strand}.$main::param{disco_noov},
						       $len."nt", "mkv".$markov);
	  push @pattern_types, $pattern_type;
	  push @patterns_to_merge, $pattern_type;


	  ## oligo-analysis with all the oligos (no threshold) + a two-tails test (in order to detect under-represented patterns)
	  $main::outfile{$pattern_type."_all"} = &OutFileName("results/".$pattern_type."_all_2tails", ".tab", "oligos".$main::param{strand}.$main::param{disco_noov}."_all_2tails",
							      $len."nt", "vs_ctrl");


	  ## local-word-analysis
	  $pattern_type = 'local_words_'.$len.'nt_mkv'.$markov;
	  $main::outfile{$pattern_type} = &OutFileName("results/".$pattern_type, ".tab", "local_words".$main::param{strand}.$main::param{disco_noov},
						       $len."nt", "windgroup".$main::param{local_words_windowgroup},
						       #						     $main::param{local_words_heuristic},
						       "mkv".$markov);
	  push @pattern_types, $pattern_type;
	  push @patterns_to_merge, $pattern_type;
	}
      }

      ## position-analysis
      $pattern_type = 'positions_'.$len.'nt';
      $main::outfile{$pattern_type} = &OutFileName("results/".$pattern_type, ".tab", "positions".$main::param{strand}.$main::param{disco_noov},
						   $len."nt", "ci".$main::param{positions_ci});
      push @pattern_types, $pattern_type;
      push @patterns_to_merge, $pattern_type;
    }

    ## dyad-analysis
    $pattern_type = "dyads";
    $main::outfile{$pattern_type} = &OutFileName("results/".$pattern_type, ".tab", "dyads".$main::param{strand}.$main::param{disco_noov},
						 "3nt_sp0-20_bg_monads");

    ### TEMPORARY BUG
    ## NOTE(review): the dyad file name defined above is immediately
    ## replaced by a file located in the results/positions_7nt
    ## directory. This override was flagged as temporary by the
    ## authors and is kept unchanged.
    $main::outfile{$pattern_type} = &OutFileName("results/positions_7nt", ".tab", "dyads".$main::param{strand}.$main::param{disco_noov},
						 "3nt_sp0-20_bg_monads");
    push @pattern_types, $pattern_type;
#    push @patterns_to_merge, $pattern_type;
  }

  ## Output files for MEME
  $main::outfile{meme_bg_file} = &OutFileName("results/meme", ".txt", "meme_bg_mkv".$main::param{bg_meme_markov});
  &CalcMemeOptions(); ## this also computes the suffix of the meme output file
  $main::outfile{meme} = &OutFileName("results/meme", ".txt", "meme_".$meme_suffix);
  $main::outfile{meme_tf} = &OutFileName("results/meme", ".tf", "meme_".$meme_suffix);
  $main::outfile{meme_logos} = &OutFileName("results/meme", "", "meme_".$meme_suffix."_logos");

  ## Comparison between significance of the discovered patterns
  $pattern_type = "merged_words";
  $main::outfile{merged_words} = &OutFileName("results/".$pattern_type, ".tab", "merged_words");
  $main::outfile{merged_words_html} = &OutFileName("results/".$pattern_type, ".html", "merged_words");
  $main::outfile{merged_words_heatmap} = &OutFileName("results/".$pattern_type, ".png", "merged_words_heatmap");
  push @pattern_types, $pattern_type;

  &RSAT::message::Info("Pattern types", join (",", @pattern_types)) if ($main::verbose >= 1);
  &RSAT::message::Info("Sequence types", join (",", @seq_types)) if ($main::verbose >= 1);

  ## Conversion from patterns to matrices + logos. All the file names
  ## derive from the pattern file, whose ".tab" extension is replaced
  ## by "_pssm".
  foreach my $pattern_type (@pattern_types) {
    push @timelog_keys, $pattern_type;
    $main::outfile{$pattern_type.'_pssm'} = $main::outfile{$pattern_type};
    $main::outfile{$pattern_type.'_pssm'} =~ s/\.tab$//;
    $main::outfile{$pattern_type.'_pssm'} .= "_pssm";
    $main::outfile{$pattern_type.'_2pssm'} = $main::outfile{$pattern_type.'_pssm'}."_log.txt"; push @timelog_keys, $pattern_type.'_2pssm';
    $main::outfile{$pattern_type.'_asmb'} = $main::outfile{$pattern_type.'_pssm'}.".asmb";
    $main::outfile{$pattern_type.'_pssm_sig'} = $main::outfile{$pattern_type.'_pssm'}."_sig_matrices.txt";
    $main::outfile{$pattern_type.'_pssm_gibbs'} = $main::outfile{$pattern_type.'_pssm'}."_gibbs_matrices.txt";
    $main::outfile{$pattern_type.'_pssm_counts'} = $main::outfile{$pattern_type.'_pssm'}."_count_matrices.txt";
    $main::outfile{$pattern_type.'_pssm_tf'} = $main::outfile{$pattern_type.'_pssm'}."_count_matrices.tf";
    $main::outfile{$pattern_type.'_pssm_sites'} = $main::outfile{$pattern_type.'_pssm'}."_sites.tab";
    $main::outfile{$pattern_type.'_pssm_site_distrib'} = $main::outfile{$pattern_type.'_pssm'}."_site_distrib.tab";
    ## The dot is required between the file name and the image format
    ## (as for the logo files below)
    $main::outfile{$pattern_type.'_pssm_site_distrib_graph'} = $main::outfile{$pattern_type.'_pssm'}."_site_distrib.".$main::param{img_format};
    foreach my $logo_nb (1..$main::param{matrix_nmotifs}) {
      $main::outfile{$pattern_type.'_pssm_logo'.$logo_nb} =
	$main::outfile{$pattern_type.'_pssm'}."_count_matrices_logo_m".$logo_nb.".".$main::param{img_format};
      $main::outfile{$pattern_type.'_pssm_logo_rc'.$logo_nb} =
	$main::outfile{$pattern_type.'_pssm'}."_count_matrices_logo_m".$logo_nb."_rc.".$main::param{img_format};
    }

  }


  ## Clustering of the discovered motifs + comparison with reference motif
  $main::outfile{motifs_discovered} = &OutFileName("results/discovered_motifs", ".tf", "motifs_discovered");
  $main::outfile{motifs_disco_compa} = &OutFileName("results/discovered_motifs", ".tab", "motifs_disco_compa");
  $main::outfile{motifs_disco_compa_html} = &OutFileName("results/discovered_motifs", ".html", "motifs_disco_compa");
  push @timelog_keys, "motifs_disco_compa";
  $main::outfile{motifs_disco_compa_gml} = &OutFileName("results/discovered_motifs", ".gml", "motifs_disco_compa");
  $main::outfile{motifs_disco_compa_png} = &OutFileName("results/discovered_motifs", ".png", "motifs_disco_compa");
  $main::outfile{motifs_disco_clusters_mcl} = &OutFileName("results/discovered_motifs", ".mcl", "motifs_disco_clusters");
  $main::outfile{motifs_disco_clusters_tab} = &OutFileName("results/discovered_motifs", ".tab", "motifs_disco_clusters");
  $main::outfile{motifs_disco_clusters_graph} = &OutFileName("results/discovered_motifs", ".tab", "motifs_disco_clusters_graph");
  $main::outfile{motifs_disco_clusters_graph_gml} = &OutFileName("results/discovered_motifs", ".gml", "motifs_disco_clusters_graph");
  $main::outfile{motifs_disco_clusters_graph_png} = &OutFileName("results/discovered_motifs", ".png", "motifs_disco_clusters_graph");
  $main::outfile{motifs_disco_compa_cluster_intra_degree} = &OutFileName("results/discovered_motifs", ".tab", "motifs_disco_compa_cluster_intra_degree");
  #    $main::outfile{motifs_disco_ref} = &OutFileName("results/discovered_motifs", ".tf", "motifs_disco_ref");


  if (defined($main::infile{ref_motifs})) {
    ## Motif(s) considered as reference for the testing set
    $main::outfile{"ref_motifs"} = &OutFileName("data/ref_motifs", ".".$main::param{ref_motifs_format}, "ref_motifs");
    $main::outfile{"ref_motifs_transfac"} = &OutFileName("data/ref_motifs", ".tf", "ref_motifs");
    $main::outfile{"ref_motifs_tab"} = &OutFileName("data/ref_motifs", ".tab", "ref_motifs_tab");
    $main::outfile{"ref_motifs_logo"} = &OutFileName("data/ref_motifs", "", "ref_motifs_logo");
    ## OutFileName() appends the extension as is: the dot must be included
    $main::outfile{"ref_motifs_enriched"} = &OutFileName("data/ref_motifs", ".tab", "ref_motifs_enriched");

    ## Comparison between discovered motifs and reference motif
    $main::outfile{"motifs_vs_ref_prefix"} = &OutFileName("results/discovered_vs_ref", "", "motifs_vs_ref");
    $main::outfile{"motifs_vs_ref"} = &OutFileName("results/discovered_vs_ref", ".tab", "motifs_vs_ref");
    $main::outfile{"motifs_vs_ref_html"} = &OutFileName("results/discovered_vs_ref", ".html", "motifs_vs_ref");
    push @timelog_keys, "motifs_vs_ref";
    $main::outfile{"motifs_vs_ref_gml"} = &OutFileName("results/discovered_vs_ref", ".gml", "motifs_vs_ref");
    $main::outfile{"motifs_vs_ref_png"} = &OutFileName("results/discovered_vs_ref", ".png", "motifs_vs_ref");
    $main::outfile{"motifs_vs_ref_alignments_1ton"} = &OutFileName("results/discovered_vs_ref", ".tab", "motifs_vs_ref_alignments_1ton");
    $main::outfile{"motifs_vs_ref_alignments_1ton_html"} = &OutFileName("results/discovered_vs_ref", ".html", "motifs_vs_ref_alignments_1ton");
  }

  ## Comparison between discovered motifs and database(s)
  if (scalar(@motif_databases) > 0) {
    foreach my $db_name (@motif_databases) {
      $main::outfile{"motifs_vs_db_".$db_name."_prefix"} = &OutFileName("results/discovered_vs_db", "", "motifs_vs_db_".$db_name);
      $main::outfile{"motifs_vs_db_".$db_name} = &OutFileName("results/discovered_vs_db", ".tab", "motifs_vs_db_".$db_name);
      $main::outfile{"motifs_vs_db_".$db_name."_html"} = &OutFileName("results/discovered_vs_db", ".html", "motifs_vs_db_".$db_name);
      push @timelog_keys, "motifs_vs_db_".$db_name;
      $main::outfile{"motifs_vs_db_".$db_name."_gml"} = &OutFileName("results/discovered_vs_db", ".gml", "motifs_vs_db_".$db_name);
      $main::outfile{"motifs_vs_db_".$db_name."_png"} = &OutFileName("results/discovered_vs_db", ".png", "motifs_vs_db_".$db_name);
      $main::outfile{"motifs_vs_db_".$db_name."_alignments_1ton"} = &OutFileName("results/discovered_vs_db", ".tab", "motifs_vs_db_".$db_name."_alignments_1ton");
      $main::outfile{"motifs_vs_db_".$db_name."_alignments_1ton_html"} = &OutFileName("results/discovered_vs_db", ".html", "motifs_vs_db_".$db_name."_alignments_1ton");
    }
  }

}

################################################################
## Convert, truncate, select top and purge sequences.
##
## Argument: $seq_type, the sequence type, used as key for %seqfile
## and as prefix for the keys of %main::outfile.
##
## Two commands are run with one_command():
##   1) convert-seq (optionally piped to sub-sequence), which writes
##      the "converted" file;
##   2) purge-sequence, which reads the "converted" file and writes
##      the "purged" file.
sub PurgeSeq {
  my ($seq_type) = @_;

  my $seq_file = $seqfile{$seq_type};
  my $converted_file = $main::outfile{$seq_type."_converted"};

  ## Step 1: conversion to fasta with masking of the non-DNA
  ## characters, followed if requested by the selection of the top
  ## peaks and by the truncation to the maximal length.
  &RSAT::message::TimeWarn("Converting sequences", $seq_type, $seq_file) if ($main::verbose >= 1);
  my $cmd = join(" ",
		 $SCRIPTS."/convert-seq",
		 "-i", $seq_file,
		 "-from", $main::param{seq_format},
		 "-to fasta",
		 "-mask non-dna");

  ## Optional selection of the top peaks
  if (defined($main::param{top_peaks})) {
    &RSAT::message::Info("\tSelecting top peaks", $main::param{top_peaks}) if ($main::verbose >= 2);
    $cmd .= " -top ".$main::param{top_peaks};
  }

  ## Optional truncation: the converted sequences are piped to
  ## sub-sequence, with limits defined relative to the sequence center
  if (defined($main::param{max_seq_len})) {
    my $from = -round($main::param{max_seq_len}/2);
    my $to = $from + $main::param{max_seq_len} -1;
    &RSAT::message::Info("\tTruncating to ", $main::param{max_seq_len}." bp max") if ($main::verbose >= 2);
    $cmd .= join("",
		 "| ", $SCRIPTS, "/sub-sequence",
		 " -origin center",
		 " -from ", $from,
		 " -to ", $to);
  }

  ## The output option applies to the last program of the pipe
  $cmd .= " -o ".$converted_file;
  &one_command($cmd, 1);

  ## Step 2: purge the converted sequences (mask redundant fragments)
  &RSAT::message::TimeWarn("Purging sequences", $seq_type, $converted_file) if ($main::verbose >= 1);
  $cmd = join(" ",
	      "$SCRIPTS/purge-sequence -dna",
	      "-i", $converted_file,
	      "-ml", $main::param{purge_match_length},
	      "-mis", $main::param{purge_mismatches},
	      "-o", $main::outfile{$seq_type."_purged"});
  &one_command($cmd, 1);
}


################################################################
## Compute sequence lengths.
##
## Argument: $seq_type, the sequence type, used as prefix for the keys
## of %main::outfile.
##
## Two commands are run with one_command():
##   1) sequence-lengths on the purged sequences, whose second column
##      is then passed to classfreq to compute the length distribution;
##   2) XYgraph, to draw the length distribution.
sub SequenceLengths {
  my ($seq_type) = @_;
  &RSAT::message::TimeWarn("Computing sequence lengths", $seq_type) if ($main::verbose >= 1);

  my $seqlen_file = $main::outfile{$seq_type.'_seqlen'};
  my $distrib_file = $main::outfile{$seq_type.'_seqlen_distrib'};

  ## Sequence lengths + distribution of lengths
  my $cmd = join(" ",
		 $SCRIPTS."/sequence-lengths",
		 "-i", $main::outfile{$seq_type."_purged"},
		 "-o", $seqlen_file,
		 ";",
		 "cut -f 2", $seqlen_file,
		 "|",
		 $SCRIPTS."/classfreq -v 1 -ci", $main::param{profiles_ci},
		 "-o", $distrib_file);
  &one_command($cmd, 1);

  ## Graph title, completed with the user-specified title if any
  my $title = $main::param{title}
    ? "Sequence lengths; ".$main::param{title}
      : "Sequence lengths";

  ## Graph of the length distribution
  $cmd = join(" ",
	      $SCRIPTS."/XYgraph -lines -pointsize 0 -legend",
	      "-format", $main::param{img_format},
	      "-title '".$title."'",
	      "-ysize 200 -ycol 4 -yleg1 'Number of peaks'",
	      "-xsize 800 -xcol 3 -xleg1 'Peak length'",
	      "-xmin 0 -ymin 0",
	      "-i", $distrib_file,
	      "-o", $main::outfile{$seq_type.'_seqlen_distrib_graph'});
  &one_command($cmd, 1);
}

################################################################
## Run position-analysis to compute composition profiles (residues,
## dinucleotides) anw count-words to compute background models for
## sequence scanning.
sub CompositionProfiles {
  ## Usage: &CompositionProfiles($seq_type)
  ##
  ## $seq_type is used as prefix/suffix of the keys of %main::outfile
  ## (e.g. $seq_type."_purged") and is displayed in the graph title
  ## (presumably "test" or "ctrl" -- to be confirmed against the caller).
  ##
  ## For each oligo length of @profiles_oligo_lengths, this sub
  ##  1) counts the oligos, converts the counts into background models
  ##     (inclusive and transition formats) and draws a heatmap of the
  ##     transition table;
  ##  2) runs position-analysis and draws an XY graph of the resulting
  ##     composition profiles.
  my ($seq_type) = @_;

  ################################################################
  ## Compute nucleotide and dinucleotide frequencies
  &RSAT::message::TimeWarn("Computing nucleotide and dinucleotide frequencies") if ($main::verbose >= 2);
  for my $ol (@profiles_oligo_lengths) {

    ## Compute background models of order 0 (Bernoulli) and 1 (Markov)
    ## from the input sequence.
    ##
    ## The command is declared as a lexical variable: it was previously
    ## assigned without "my", which silently overwrote the package
    ## variable $cmd.
    my $cmd = $BIN."/count-words -v 1";
    $cmd .= " -i ".$main::outfile{$seq_type."_purged"};
    $cmd .= " -l ".$ol;
    $cmd .= " -1str";
    $cmd .= " ".$main::param{profiles_noov} if ($main::param{profiles_noov} eq "-noov"); ## The option -ovlp is not supported by count-words
    $cmd .= " > ".$main::outfile{$ol."nt_".$seq_type."_freq"};

    ## Convert background model in INCLUSIVE format for matrix-scan-quick
    $cmd .= "; ".$SCRIPTS."/convert-background-model -from oligos -to inclusive ";
    $cmd .= " -i ".$main::outfile{$ol."nt_".$seq_type."_freq"};
    $cmd .= " -o ".$main::outfile{$ol."nt_".$seq_type."_inclusive"};

    ## Convert background model to transition table and draw the heatmap of transition probabilities
    $cmd .= "; ".$SCRIPTS."/convert-background-model -from oligos -to transitions ";
    $cmd .= " -i ".$main::outfile{$ol."nt_".$seq_type."_freq"};
    $cmd .= " -o ".$main::outfile{$ol."nt_".$seq_type."_transit"};
    $cmd .= " ; cut -f 1-5,7 ".$main::outfile{$ol."nt_".$seq_type."_transit"};
    $cmd .= " | ".$SCRIPTS."/draw-heatmap -min 0 -max 1  -out_format png -col_width 50";
    $cmd .= " -o ".$main::outfile{$ol."nt_".$seq_type."_heatmap"};
    &one_command($cmd, 1);
  }

  ################################################################
  ## Compute position profiles for each oligo length
  &RSAT::message::TimeWarn("Computing composition profiles") if ($main::verbose >= 2);
  for my $ol (@profiles_oligo_lengths) {

    ## Compute positional profiles
    &RSAT::message::TimeWarn("Computing composition profiles", $ol."nt") if ($main::verbose >= 1);
    my $cmd =$SCRIPTS."/position-analysis -v 1";
    $cmd .= " -i ".$main::outfile{$seq_type."_purged"};
    $cmd .= " -format fasta";
    $cmd .= " -sort ";
    $cmd .= " -return chi,sig,distrib,graphs,rank";
    $cmd .= " -max_graphs ".$main::param{profiles_max_graphs} if ($main::param{profiles_max_graphs}  > 0);
    $cmd .= " ".$main::param{profiles_strand};
    $cmd .= " ".$main::param{profiles_noov};
    $cmd .= " -seqtype dna";
    $cmd .= " -l ".$ol;
    $cmd .= " -ci ".$main::param{profiles_ci};
    $cmd .= " -img_format ".$main::param{img_format};
    $cmd .= " -title '".$main::param{title}."'";
    $cmd .= " -origin center ";
    $cmd .= " -o ".$main::outfile{$ol."nt_".$seq_type."_profiles"};
    &one_command($cmd, 1);

    ## Draw the XY graph with composition profiles. The palette file
    ## depends on the oligo length and on the strand option.
    my $color_file = $ENV{RSAT}."/perl-scripts/lib/color_palettes/".$ol."nt".$main::param{profiles_strand}."_colors.tab";

    ## One Y column per possible oligo (4^length), after the X column
    my $col_nb = 4**$ol + 1;

    ## Discard comment lines, transpose the table and only retain the
    ## header row (id) and the rows starting with a position (integer)
    $cmd = 'grep -v ";" '.$main::outfile{$ol."nt_".$seq_type."_profiles"}.' | sort | '.$SCRIPTS.'/transpose-table | grep -P \'(^id)|(^\-?\d+)\'';
    $cmd .= " | ".$SCRIPTS."/XYgraph -xcol 1 -ycol 2-".$col_nb;
    $cmd .= " -format ".$main::param{img_format};
    $cmd .= " -lines -pointsize 0 -legend -header";
    if (-e $color_file) {
      $cmd .= " -colors ".$color_file;
    } else {
      &RSAT::message::Warning("Cannot find residue color specification file", $color_file);
    }
    #    $cmd .= " -symbols "; ## THIS OPTION IS NOT WORKING ANYMORE : THE LEGEND DISPLAYS SYMBOLS BUT NOT THE GRAPH. THIS HAS TO BE FIXED

    ## NOTE(review): the title starts with "... profiles : ; ", the
    ## stray separator is kept as is to leave the output unchanged.
    my $title = $ol."nt composition profiles : ";
    $title .= "; ".$seq_type." sequence";
    $title .= "; ".$main::param{title} if ($main::param{title});
    $cmd .= " -title '".$title."'";
    $cmd .= " -xleg1 'Position' -xsize 800";
    $cmd .= " -yleg1 'Occurrences' -ysize 300  -ymin 0";
    $cmd .= " -o ".$main::outfile{$ol."nt_".$seq_type."_profiles_graph"};
    &one_command($cmd, 1);
  }
}


################################################################
## Run oligo-analysis on the test set
sub OligoAnalysis {
  ## Run oligo-analysis on the purged test sequences, for each
  ## combination of oligo length (oligo_min_len..oligo_max_len) and
  ## Markov order (oligo_min_mkv..oligo_max_mkv), then build matrices
  ## from the significant oligos with &MatrixFromPatterns().
  my ($min_len, $max_len) = ($main::param{oligo_min_len}, $main::param{oligo_max_len});
  my ($min_mkv, $max_mkv) = ($main::param{oligo_min_mkv}, $main::param{oligo_max_mkv});

  foreach my $oligo_len ($min_len..$max_len) {
    foreach my $requested_order ($min_mkv..$max_mkv) {

      ## Negative Markov orders are relative to the oligo length
      my $order = $requested_order;
      if ($order < 0) {
	$order += $oligo_len;
      }

      my $pattern_type = "oligos_".$oligo_len."nt_mkv".$order;
      if ($main::verbose >= 1) {
	&RSAT::message::TimeWarn("Running oligo-analysis", $oligo_len."nt", "markov=".$order);
      }

      ## Assemble the oligo-analysis command from its successive options
      my $oligo_cmd = join("",
			   $SCRIPTS."/oligo-analysis -v 1",
			   " -quick",
			   " -i ".$main::outfile{"test_purged"},
			   " -format fasta",
			   " -sort -lth ratio ".$main::param{min_ratio},
			   " -sort -lth occ_sig ".$main::param{min_sig},
			   " -uth rank ".$main::param{patterns_max_rank},
			   " -return occ,proba,rank",
			   " ".$main::param{strand},
			   " ".$main::param{disco_noov},
			   " -seqtype dna",
			   " -l ".$oligo_len,
			   " -markov ".$order,
			   " -pseudo 0.01",
			   " -o ".$main::outfile{$pattern_type},
			  );
      &one_command($oligo_cmd, 1);

      ## Convert the discovered oligos into position-specific scoring matrices
      &MatrixFromPatterns($main::outfile{$pattern_type},
			  $main::outfile{$pattern_type.'_pssm'},
			  $pattern_type,
			  0);
    }
  }
}

################################################################
## Run oligo-diff to compare the test set to the control set
sub OligoDiff {
  ## Run oligo-diff for each oligo length between oligo_min_len and
  ## oligo_max_len, in order to detect oligos over-represented in the
  ## test set relative to the control set, then build matrices from
  ## the selected oligos.
  my @oligo_lengths = ($main::param{oligo_min_len}..$main::param{oligo_max_len});

  foreach my $oligo_len (@oligo_lengths) {
    my $pattern_type = "oligo_diff_".$oligo_len."nt";
    if ($main::verbose >= 1) {
      &RSAT::message::TimeWarn("Running oligo-diff", $oligo_len."nt");
    }

    my @cmd_parts = ($SCRIPTS."/oligo-diff -v 2",
		     " -test ".$main::outfile{"test_purged"},
		     " -ctrl ".$main::outfile{"ctrl_purged"},
		     " -nopurge", ## Both sequence sets have already been purged
		     " -l ".$oligo_len,
		     " ".$main::param{strand},
		     " ".$main::param{disco_noov},
		     " -side test",
		     " -lth ratio ".$main::param{min_ratio},
		     " -lth occ_sig ".$main::param{min_sig},
		     " -uth rank ".$main::param{patterns_max_rank},
		     " -o ".$main::outfile{$pattern_type},
		    );
    &one_command(join("", @cmd_parts), 1);

    ## Convert the discovered oligos into position-specific scoring matrices
    &MatrixFromPatterns($main::outfile{$pattern_type},
			$main::outfile{$pattern_type.'_pssm'},
			$pattern_type,
			0);
  }
}


################################################################
## Run dyad-analysis on the test set
sub DyadAnalysis {
  ## Run dyad-analysis (pairs of trinucleotides, spacings 0 to 20) on
  ## the purged test sequences, then build matrices from the
  ## significant dyads.
  if ($main::verbose >= 1) {
    &RSAT::message::TimeWarn("Running dyad-analysis");
  }

  my $dyad_file = $main::outfile{dyads};

  my @cmd_parts = ($SCRIPTS."/dyad-analysis -v 1",
		   " -i ".$main::outfile{"test_purged"},
		   " -quick",
		   " -format fasta",
		   " -sort -lth ratio ".$main::param{min_ratio},
		   " -sort -lth occ_sig ".$main::param{min_sig},
		   " -uth rank ".$main::param{patterns_max_rank}." -return occ,proba,ratio,zscore,rank",
		   " ".$main::param{strand},
		   " ".$main::param{disco_noov},
		   " -seqtype dna",
		   " -l 3 -sp 0-20 ",
		   " -bg monads",
		   " -pseudo 0.01",
		   ## Zero spacings (n{0}) are removed so that dyads can be merged with oligos
		   " | perl -pe 's/n\\{0\\}//g'",
		   " >".$dyad_file,
		  );
  &one_command(join("", @cmd_parts), 1);

  ## Convert the discovered dyads into position-specific scoring matrices
  &MatrixFromPatterns($dyad_file, $main::outfile{dyads_pssm}, "dyads", 0);
}

################################################################
## Run position-analysis to discover oligonucleotides with positional
## biases
sub PositionAnalysis {
  ## Run position-analysis for each oligo length between oligo_min_len
  ## and oligo_max_len, in order to detect oligos with a biased
  ## positional distribution, then build matrices from these oligos.
  my @oligo_lengths = ($main::param{oligo_min_len}..$main::param{oligo_max_len});

  foreach my $oligo_len (@oligo_lengths) {
    my $pattern_type = "positions_".$oligo_len."nt";
    if ($main::verbose >= 1) {
      &RSAT::message::TimeWarn("\n Running position-analysis", $oligo_len."nt");
    }

    my @cmd_parts = ($SCRIPTS."/position-analysis -v 2",
		     " -i ".$main::outfile{"test_purged"},
		     " -format fasta",
		     " -sort ",
		     " -return chi,sig,distrib,graphs,rank",
		     " -max_graphs ".$main::param{patterns_max_rank},
		     " ".$main::param{strand},
		     " ".$main::param{disco_noov},
		     " -seqtype dna",
		     " -l ".$oligo_len,
		     " -ci ".$main::param{positions_ci},
		     " -lth_occ ".$main::param{positions_min_occ},
		     " -lth_sig ".$main::param{min_sig},
		     " -uth_rank ".$main::param{patterns_max_rank},
		     " -img_format ".$main::param{img_format},
		     " -title '".$main::param{title}."'",
		     " -origin center ",
		    );

    ## A second -max_graphs option is appended when a specific limit
    ## has been defined for position-analysis
    if ($main::param{positions_max_graphs}  > 0) {
      push @cmd_parts, " -max_graphs ".$main::param{positions_max_graphs};
    }
    push @cmd_parts, " -o ".$main::outfile{$pattern_type};
    &one_command(join("", @cmd_parts), 1);

    ## Convert the discovered oligos into position-specific scoring matrices
    &MatrixFromPatterns($main::outfile{$pattern_type},
			$main::outfile{$pattern_type."_pssm"},
			$pattern_type,
			0);
  }
}


################################################################
## Run local-word-analysis on the test set
sub LocalWords {
  ## Run local-word-analysis on the purged test sequences, for each
  ## combination of oligo length (oligo_min_len..oligo_max_len) and
  ## Markov order (oligo_min_mkv..oligo_max_mkv), then build matrices
  ## from the selected words with &MatrixFromPatterns().
  ##
  ## The Markov order is currently only used to name the output files:
  ## the --markov option is disabled in the command below.

#  my $center_pos = &RSAT::util::round($main::param{local_words_window}/2);
  my $center_pos = 0;
  for my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {
    for my $markov ($main::param{oligo_min_mkv}..$main::param{oligo_max_mkv}) {
      ## Convert negative markov orders relative to the oligo length
      $markov += $len if ($markov < 0);
      my $pattern_type = 'local_words_'.$len.'nt'.'_mkv'.$markov;
      &RSAT::message::TimeWarn("Running local-word-analysis", $len."nt") if ($main::verbose >= 1);
      my $cmd = $PYTHON."/local-word-analysis -v 3";
      $cmd .= " -i ".$main::outfile{"test_purged"};
      #  $cmd .= " -format fasta";
      $cmd .= " --min=occ_sig ".$main::param{min_sig};
      $cmd .= " --sort=-occ_sig";
      $cmd .= " --max=rank ".$main::param{patterns_max_rank};
      $cmd .= " --max=w_rank ".1; ## Only return the most significant window for each word
      # " -return occ,proba,rank";
      if ($main::param{strand} eq "-1str") {
	$cmd .= " +";
      } else {
	$cmd .= " +-";
      }

      ## The parameter disco_noov is passed as such to the other
      ## pattern discovery programs, so it holds the option with its
      ## leading dash. The previous test (eq "ovlp") could thus never
      ## succeed. Both spellings are accepted for backward
      ## compatibility.
      $cmd .= " --overlap" if ($main::param{disco_noov} =~ /^-?ovlp$/);
      $cmd .= " --center=".$center_pos;
      $cmd .= " --windowgroup=".$main::param{local_words_windowgroup};
#      $cmd .= " --window=".$main::param{local_words_window};
#      $cmd .= " --heuristic=".$main::param{local_words_heuristic};
      #  $cmd .= " -seqtype dna";
      $cmd .= " -l ".$len;
#      $cmd .= " --markov ".$markov;
      #  $cmd .= " -pseudo 0.01";
      $cmd .= " > ".$main::outfile{$pattern_type};
      &one_command($cmd, 1);

      ## Convert the discovered words into position-specific scoring matrices
      &MatrixFromPatterns($main::outfile{$pattern_type}, $main::outfile{$pattern_type.'_pssm'}, $pattern_type, 0);
    }
  }
}


################################################################
## Concatenate the options for MEME and compute the suffix
sub CalcMemeOptions {
  ## Fill two global variables:
  ##   $meme_options  string of arguments appended to the MEME command
  ##   $meme_suffix   suffix built from the strand and option values
  ## Both must remain global, since they are read outside of this sub.

  ## The input file comes first in the arguments
  $meme_options = "";
  $meme_options .= " ".$main::outfile{"test_purged"};

  ## The suffix starts with the strand option
  $meme_suffix = "";
  $meme_suffix .= $main::param{strand};

  ## Both strands are analyzed unless the single-strand option is active
  if ($main::param{strand} ne "-1str") {
    $meme_options .= " -revcomp";
  }

  ## Background model file, if one has been defined
  my $bg_file = $main::outfile{meme_bg_file};
  if ($bg_file) {
    $meme_options .= " -bfile ".$bg_file;
  }

  ## Append each MEME-specific option with its value. All options are
  ## also reflected in the suffix, except the background file.
  foreach my $meme_opt (@MEME_options) {
    my $value = $main::param{"meme_".$meme_opt};
    $meme_options .= " -".$meme_opt." ".$value;
    next if ($meme_opt eq "bfile");
    $meme_suffix .= "_".$meme_opt.$value;
  }
}

################################################################
## Compute background for MEME
sub CalcMemeBackground {
  ## Compute the background model file for MEME by piping the purged
  ## test sequences into fasta-get-markov, with the Markov order
  ## given by the parameter bg_meme_markov. The result is written to
  ## $main::outfile{meme_bg_file}.

  ## $die_on_error was not declared in this sub, so an undefined
  ## package variable was passed to GetProgramPath. It is now declared
  ## locally with the same (false) value as in RunMEME.
  my $die_on_error = 0;

  my $fasta_get_markov_cmd = "cat ".$main::outfile{"test_purged"}." | ";
  $fasta_get_markov_cmd .= &RSAT::server::GetProgramPath("fasta-get-markov", $die_on_error);
  $fasta_get_markov_cmd .= " -m ".$main::param{bg_meme_markov};
  $fasta_get_markov_cmd .= " > ".$main::outfile{meme_bg_file};
  &one_command($fasta_get_markov_cmd);
}


################################################################
## Run MEME
sub RunMEME {
  ## Run MEME with the options collected in the global variable
  ## $meme_options, then convert the MEME output into TRANSFAC-formatted
  ## matrices with sequence logos.

  ## Locate the MEME executable (no fatal error if it is not found)
  my $die_on_error = 0;
  my $meme_path = &RSAT::server::GetProgramPath("meme", $die_on_error);

  ## Short sequences are not filtered out before running MEME: the
  ## conversion with convert-seq -skip_short was deliberately disabled.

  ## Motif discovery
  my $meme_run = join("",
		      "",
		      $meme_path,
		      " ".$meme_options,
		      " > ".$main::outfile{meme},
		     );
  &one_command($meme_run);

  ## Conversion of the discovered matrices + logo drawing
  my $conversion = join("",
			$SCRIPTS."/convert-matrix -i ".$main::outfile{meme},
			" -from meme",
			" -to transfac",
			" -return counts,parameters,logo",
			" -logo_file ".$main::outfile{meme_logos},
			" -o ".$main::outfile{meme_tf},
		       );
  &one_command($conversion);
}

################################################################
## Convert reference motif and generate logos
sub RefMotif {
  ## Import the reference motif(s) in the output directory, export
  ## them in TRANSFAC format and in tab-delimited format, and draw the
  ## sequence logos.
  my $ref_file = $main::infile{ref_motifs};
  my $ref_format = $main::param{ref_motifs_format};
  my $transfac_file = $main::outfile{ref_motifs_transfac};

  ## Step 1: copy the reference motif file in the output directory
  my $copy_cmd = join("",
		      "rsync -ruptL ",
		      " ".$ref_file,
		      " ".$main::outfile{ref_motifs},
		     );
  &one_command($copy_cmd, 1);

  ## Step 2: obtain a TRANSFAC-formatted version of the motifs. This
  ## format holds information about the motif name, ID, etc. If the
  ## input is already in TRANSFAC format, the file is simply copied.
  my $transfac_cmd;
  my $lc_format = lc($ref_format);
  if (($lc_format eq "transfac") || ($lc_format eq "tf")) {
    $transfac_cmd = join("",
			 "cp ",
			 " ".$ref_file,
			 " ".$transfac_file,
			);
  } else {
    $transfac_cmd = join("",
			 $SCRIPTS."/convert-matrix -v 0",
			 " -i ".$ref_file,
			 " -from ".$ref_format,
			 " -to transfac -decimals 1",
			 " -return counts,consensus,parameters",
			 " -o ".$transfac_file,
			);
  }
  &one_command($transfac_cmd, 1);

  ## Step 3: export tab-delimited matrices and sequence logos
  my $tab_cmd = join("",
		     $SCRIPTS."/convert-matrix -v 0",
		     " -i ".$ref_file,
		     " -from ".$ref_format,
		     " -to tab",
		     " -return counts,logo",
		     " -logo_file ".$main::outfile{ref_motifs_logo},
		     " -o ".$main::outfile{ref_motifs_tab},
		    );
  &one_command($tab_cmd, 1);
}


################################################################
## Compare the significance of words (oligos, dyads) discovered by
## the different approaches
sub MergeWords {
  ## Build a table comparing the significance of the words discovered
  ## by the different approaches listed in @patterns_to_merge.
  ##
  ## The result files are merged with compare-scores, row statistics
  ## are added with row-stats, and the merged table is exported as
  ## text, HTML and heatmap. Matrices are finally built from the
  ## merged words.
  my $cmd = $SCRIPTS."/compare-scores -v 1 ";
  my $file_nb = 0;
  foreach my $pattern_type (@patterns_to_merge) {
    &RSAT::message::Debug("Merging words of type", $pattern_type) if ($main::verbose >= 5);
    next if ($pattern_type eq "merged_words"); ## avoid recycling the merged words from a previous run in the compilation

    my $pattern_file = $main::outfile{$pattern_type};
    unless (-e $pattern_file) {
      &RSAT::message::Warning("Missing pattern file", $pattern_type, $pattern_file);
      next;
    }

    ## Identify the significance column, which depends on the program
    ## that produced the file. This is done BEFORE adding the file to
    ## the command: files of unknown type were previously added as
    ## input (-i) without any score column before being "skipped".
    my $sig_col;
    if ($pattern_type =~ /oligos_/) {
      $sig_col = 8;
    } elsif ($pattern_type =~ /oligo_diff/) {
      $sig_col = 11;
    } elsif ($pattern_type =~ /dyads/) {
      $sig_col = 8;
    } elsif ($pattern_type =~ /local_words/) {
      $sig_col = 9;
    } elsif ($pattern_type =~ /positions/) {
      $sig_col = 9;
    } else {
      &RSAT::message::Warning("Unknown score column for pattern type", $pattern_type);
      next;
    }

    ## Add the file to the comparison, with its own score column. The
    ## counter is only incremented for files actually passed to
    ## compare-scores, so that -sc# matches the rank of the input file.
    $file_nb++;
    $cmd .= " -i ".$pattern_file;
    $cmd .= " -sc".$file_nb." ".$sig_col;

    ## Shorten the column headers by suppressing the result directory
    $cmd .= " -suppress ".$main::dir{output}."/"."results/".$pattern_type."/";
  }
  $cmd .= " -ic 1";
  $cmd .= " -lc";
  $cmd .= " -null .";
  $cmd .= " -suppress ".$main::param{prefix}."_";
  ## NOTE(review): inside a double-quoted string, "\." is reduced to
  ## ".", so the value actually passed to -suppress is '.tab'.
  $cmd .= " -suppress '\.tab'";
  $cmd .= " | ".$SCRIPTS."/row-stats -after 1 -sort ";
  $cmd .= " -o ".$main::outfile{merged_words};
  &one_command($cmd, 1);

  ## Generate a HTML format of the table (convenient for sorting the
  ## words according to different columns)
  $cmd = $SCRIPTS."/text-to-html -i ".$main::outfile{merged_words};
  $cmd .= " -o ".$main::outfile{merged_words_html};
  &one_command($cmd, 1);

  ## Draw a heat map of the word significance table
  $cmd = $SCRIPTS."/draw-heatmap -min 0 -max 10  -out_format png";
  $cmd .= " -col_width 40 -rownames -gradient fire";
  $cmd .= " -row_height 16";
  $cmd .= " -i ".$main::outfile{merged_words};
  $cmd .= " -o ".$main::outfile{merged_words_heatmap};
  &one_command($cmd, 1);

  ## Build position-specific scoring matrices from the merged words,
  ## using column 4 of the merged table as score column
  my $pattern_type = "merged_words";
  &MatrixFromPatterns($main::outfile{$pattern_type}, $main::outfile{$pattern_type.'_pssm'}, $pattern_type, 4);

  &RSAT::message::TimeWarn("Word comparison table", $main::outfile{merged_words}) if ($main::verbose >= 2);

}


################################################################
## Merge all discovered motifs in a single file
sub MergeMotifs {
  ## Concatenate the TRANSFAC-formatted matrices of all the pattern
  ## types (@pattern_types) into the file
  ## $main::outfile{motifs_discovered}.

  ## Remove previous versions of the merged motif file
  my $cmd = "rm -f ".$main::outfile{motifs_discovered};

  ## Initialize the motif file with the reference motif if provided
#  if (defined($main::infile{ref_motifs})) {
#    $cmd = "; cp -f";
#    $cmd .= " ".$main::outfile{ref_motifs_transfac};
#    $cmd .= " ".$main::outfile{motifs_discovered};
#  }

  ################################################################
  ## Concatenate all discovered motifs (matrices) in a single file.
  ## Use TRANSFAC format because it allows to associate a name to each
  ## matrix.
  foreach my $pattern_type (@pattern_types) {
    my $matrix_file = $main::outfile{$pattern_type.'_pssm_tf'};

    unless (-e $matrix_file) {
      ## TEMPORARY: ensure conversion for data sets of previous versions.
      ##
      ## This command is executed on its own. It previously started
      ## with " ; ", which the shell rejects as a syntax error (empty
      ## command before the separator), so the conversion never ran.
      my $convert_cmd = $SCRIPTS."/convert-matrix";
      $convert_cmd .= " -i ".$main::outfile{$pattern_type.'_pssm_counts'};
      $convert_cmd .= " -from tab -to transfac -return counts,consensus";
      $convert_cmd .= " -prefix $pattern_type";
      $convert_cmd .= " -o ".$matrix_file;
      &one_command($convert_cmd, 1);
    }

    ## The existence of the file is tested again, since it may have
    ## been created by the conversion above
    if (-e $matrix_file) {
      $cmd .= "; cat ".$matrix_file." >> ".$main::outfile{motifs_discovered};
    } else {
      &RSAT::message::Warning("Missing matrix file", $pattern_type, $matrix_file);
    }
  }
  &one_command($cmd, 1);

  &RSAT::message::TimeWarn("Merged discovered motifs", $main::outfile{motifs_discovered}) if ($main::verbose >= 2);
}

################################################################
## Compare each discovered motifs to each other, and with the
## reference motif, identify clusters of similar motifs, and build
## consensus motifs.
sub ClusterMotifs {
  ## Compare all the discovered matrices to each other
  ## (compare-matrices), export the comparison as a graph (GML + PNG),
  ## partition this graph into clusters with MCL, and compute the
  ## intra-cluster degree of each motif.
  ##
  ## Requires the environment variable mcl_dir (directory of the mcl
  ## executable); a fatal error is issued if it is not defined.

  ## Comparison between discovered matrices
  my $cmd = $SCRIPTS."/compare-matrices -v 1 -mode matches";
  $cmd .= " -format transfac -file ".$main::outfile{motifs_discovered};
#  $cmd .= " -format2 transfac -file2 ".$main::outfile{motifs_discovered};;
#  $cmd .= " -format2 transfac -file2 ".$main::outfile{ref_motifs_transfac};
  $cmd .= " -DR -distinct";
  $cmd .= " -sort cor";
  $cmd .= " -uth rank 1"; ## Only report the best matching shift between a pair of matrices
  $cmd .= " -lth w ".$main::param{matrix_compa_min_w}; ## Min number of aligned columns
  $cmd .= " -lth cor ".$main::param{matrix_compa_min_cor}; ## Min correlation
  $cmd .= " -lth Ncor ".$main::param{matrix_compa_min_Ncor}; ## Min normalized correlation
  $cmd .= " -return matrix_name,strand,offset,all_metrics,width,consensus";
  $cmd .= " -o ".$main::outfile{motifs_disco_compa};

  ## HTML version of the comparison table. text-to-html is called with
  ## its full path, as in the other subs of this script, rather than
  ## relying on the PATH of the user.
  $cmd .= "; ".$SCRIPTS."/text-to-html -i ".$main::outfile{motifs_disco_compa};
  $cmd .= " -o ".$main::outfile{motifs_disco_compa_html};
  &one_command($cmd, 1);

  ## Generate a GML graph with the matrix comparison result (can be opened with CytoScape or Yed)
  $cmd = $SCRIPTS."/convert-graph -i ".$main::outfile{motifs_disco_compa};
  $cmd .= " -ewidth -ecolors fire";
  $cmd .= " -layout spring";
  $cmd .= " -from tab -to gml -scol 1 -tcol 2 -wcol 3";
  $cmd .= " -o ".$main::outfile{motifs_disco_compa_gml};
  &one_command($cmd, 1);

  ## Generate a figure of the motif comparison graph
  $cmd = $SCRIPTS."/display-graph";
  $cmd .= " -in_format gml -i ".$main::outfile{motifs_disco_compa_gml};
  $cmd .= " -ewidth";
  $cmd .= " -layout none";
  $cmd .= " -out_format png -o ".$main::outfile{motifs_disco_compa_png};
  &one_command($cmd, 1);

  ## Use MCL to partition the motif graph into clusters
  my $mcl_dir = $ENV{mcl_dir};
  unless ($mcl_dir) {
    &RSAT::error::FatalError("Motif comparison requires to install MCL and indicate its path in the file $ENV{RSAT}/RSAT_config.props");
  }

  ## Comment lines (starting with ';') are discarded before running
  ## MCL; the clusters are then converted to a tab-delimited file
  $cmd = "grep -v '^;' ".$main::outfile{motifs_disco_compa}.">".$main::outfile{motifs_disco_compa}.".mcl";
  $cmd .= "; ".$mcl_dir."/mcl ".$main::outfile{motifs_disco_compa}.".mcl";
  $cmd .= " -I 1.8 --abc -V all ";
  $cmd .= " -o ".$main::outfile{motifs_disco_clusters_mcl};
  $cmd .= " ; ${SCRIPTS}/convert-classes -i ".$main::outfile{motifs_disco_clusters_mcl};
  $cmd .= " -from mcl -to tab ";
  $cmd .= " -o ".$main::outfile{motifs_disco_clusters_tab};
  &one_command($cmd, 1);

  ## Split the motif graph into clusters as defined by MCL, and
  ## compute the intra-cluster degree (k) and weighted degree (wk) of
  ## each node
  $cmd = $SCRIPTS."/graph-get-clusters -i ".$main::outfile{motifs_disco_compa};
  $cmd .= " -in_format tab -scol 1 -tcol 2 -wcol 4 -return clusters ";
  $cmd .= " -clusters ".$main::outfile{motifs_disco_clusters_tab};
  $cmd .= " -out_format tab -o ".$main::outfile{motifs_disco_clusters_graph};

  ## Identify graph components and count the intra-component degree of
  ## each node. Most connected nodes will serve as seeds for motif
  ## clustering.
  $cmd .= " ; ".$SCRIPTS."/graph-connex-components -v 1";
  $cmd .= " -i ".$main::outfile{motifs_disco_clusters_graph};
  $cmd .= " -wcol 3";
  $cmd .= " -o ".$main::outfile{motifs_disco_compa_cluster_intra_degree};
  &one_command($cmd, 1);

  ## Generate a GML graph with the intra-cluster graph (can be opened with CytoScape or Yed)
  $cmd = $SCRIPTS."/convert-graph -i ".$main::outfile{motifs_disco_clusters_graph};
  $cmd .= " -ewidth -ecolors fire";
  $cmd .= " -layout spring";
  $cmd .= " -from tab -to gml -scol 1 -tcol 2 -wcol 3";
  $cmd .= " -o ".$main::outfile{motifs_disco_clusters_graph_gml};
  &one_command($cmd, 1);

  ## Generate a figure of the intra-cluster graph
  $cmd = $SCRIPTS."/display-graph";
  $cmd .= " -in_format gml -i ".$main::outfile{motifs_disco_clusters_graph_gml};
  $cmd .= " -ewidth";
  $cmd .= " -layout none";
  $cmd .= " -out_format png -o ".$main::outfile{motifs_disco_clusters_graph_png};
  &one_command($cmd, 1);



}


################################################################
## Compare discovered motifs to the reference motif
sub MotifsVersusReference {
  ## Compare the reference motif(s) (file1) with the discovered motifs
  ## (file2) using compare-matrices, then export the result as HTML
  ## table, GML graph and PNG figure.
  &RSAT::message::TimeWarn("Comparing discovered motifs with reference motif") if ($main::verbose >= 1);

  ## Comparison between the reference motif and the discovered matrices
  my $cmd = $SCRIPTS."/compare-matrices -v 2 -mode matches";
  $cmd .= " -format1 transfac -file1 ".$main::outfile{ref_motifs_transfac};
  $cmd .= " -format2 transfac -file2 ".$main::outfile{motifs_discovered};
  $cmd .= " -DR";
  $cmd .= " -sort Ncor";
  $cmd .= " -uth rank 1"; ## Only report the best matching shift between a pair of matrices
  $cmd .= " -lth w ".$main::param{matrix_compa_min_w}; ## Min number of aligned columns
  $cmd .= " -lth cor ".$main::param{matrix_compa_min_cor}; ## Min correlation
  $cmd .= " -lth Ncor ".$main::param{matrix_compa_min_Ncor}; ## Min normalized correlation
  $cmd .= " -return matrix_name,strand,offset,all_metrics,width,consensus,alignments_1ton";
  ## NOTE(review): the output is specified with the key
  ## motifs_vs_ref_prefix whereas the next commands read
  ## motifs_vs_ref -- presumably compare-matrices derives the latter
  ## from the prefix; to be confirmed.
  $cmd .= " -o ".$main::outfile{"motifs_vs_ref_prefix"};

  ## HTML version of the comparison table. text-to-html is called with
  ## its full path, as in the other subs of this script, rather than
  ## relying on the PATH of the user.
  $cmd .= "; ".$SCRIPTS."/text-to-html -i ".$main::outfile{"motifs_vs_ref"};
  $cmd .= " -o ".$main::outfile{"motifs_vs_ref_html"};
  &one_command($cmd, 1);

  &RSAT::message::TimeWarn("Dicovered versus reference motif", $main::outfile{motifs_vs_ref}) if ($main::verbose >= 2);

  ## Generate a GML graph with the matrix comparison result (can be opened with CytoScape or Yed)
  $cmd = $SCRIPTS."/convert-graph -i ".$main::outfile{"motifs_vs_ref"};
  $cmd .= " -ewidth -ecolors fire";
  $cmd .= " -layout spring";
  $cmd .= " -from tab -to gml -scol 1 -tcol 2 -wcol 3";
  $cmd .= " -o ".$main::outfile{"motifs_vs_ref_gml"};
  &one_command($cmd, 1);

  ## Generate a figure of the motif comparison graph
  $cmd = $SCRIPTS."/display-graph";
  $cmd .= " -in_format gml -i ".$main::outfile{"motifs_vs_ref_gml"};
  $cmd .= " -ewidth";
  $cmd .= " -layout none";
  $cmd .= " -out_format png -o ".$main::outfile{"motifs_vs_ref_png"};
  &one_command($cmd, 1);

}

################################################################
## Compare discovered motifs to the motif database
sub MotifsVersusDatabase {
  ## For each motif database of @motif_databases, compare the
  ## discovered motifs (file1) with the database motifs (file2) using
  ## compare-matrices, then export the result as HTML table, GML graph
  ## and PNG figure. All the output files are indexed in
  ## %main::outfile by keys starting with "motifs_vs_db_".$db_name.
  foreach my $db_name (@motif_databases) {
    &RSAT::message::TimeWarn("Comparing discovered motifs with database", $db_name) if ($main::verbose >= 1);

    ## Comparison between discovered matrices and the motif database
    my $cmd = $SCRIPTS."/compare-matrices -v 2 -mode matches";
    $cmd .= " -format1 transfac -file1 ".$main::outfile{motifs_discovered};
    $cmd .= " -format2 ".$main::motif_db_format{$db_name};
    $cmd .= " -file2 ".$main::infile{"motif_db_".$db_name};
    $cmd .= " -DR";
    $cmd .= " -sort cor";
    $cmd .= " -uth rank 1"; ## Only report the best matching shift between a pair of matrices
    $cmd .= " -lth w ".$main::param{matrix_compa_min_w}; ## Min number of aligned columns
    $cmd .= " -lth cor ".$main::param{matrix_compa_min_cor}; ## Min correlation
    $cmd .= " -lth Ncor ".$main::param{matrix_compa_min_Ncor}; ## Min normalized correlation
    $cmd .= " -return matrix_name,strand,offset,all_metrics,width,consensus,alignments_1ton";
    $cmd .= " -o ".$main::outfile{"motifs_vs_db_".$db_name};

    ## HTML version of the comparison table. text-to-html is called
    ## with its full path, as in the other subs of this script, rather
    ## than relying on the PATH of the user.
    $cmd .= " ; ".$SCRIPTS."/text-to-html -i ".$main::outfile{"motifs_vs_db_".$db_name};
    $cmd .= " -o ".$main::outfile{"motifs_vs_db_".$db_name."_html"};
    &one_command($cmd, 1);

    ## Report the database-specific result file. The message
    ## previously used the key "motifs_vs_db" (without database name),
    ## which differs from the key used for all the files above.
    &RSAT::message::TimeWarn("Dicovered versus DB", $main::outfile{"motifs_vs_db_".$db_name}) if ($main::verbose >= 2);

    ## Generate a GML graph with the matrix comparison result (can be opened with CytoScape or Yed)
    $cmd = $SCRIPTS."/convert-graph -i ".$main::outfile{"motifs_vs_db_".$db_name};
    $cmd .= " -ewidth -ecolors fire";
    $cmd .= " -layout spring";
    $cmd .= " -from tab -to gml -scol 1 -tcol 2 -wcol 3";
    $cmd .= " -o ".$main::outfile{"motifs_vs_db_".$db_name."_gml"};
    &one_command($cmd, 1);

    ## Generate a figure of the motif comparison graph
    $cmd = $SCRIPTS."/display-graph";
    $cmd .= " -in_format gml -i ".$main::outfile{"motifs_vs_db_".$db_name."_gml"};
    $cmd .= " -ewidth";
    $cmd .= " -layout none";
    $cmd .= " -out_format png -o ".$main::outfile{"motifs_vs_db_".$db_name."_png"};
    &one_command($cmd, 1);
  }
}

################################################################
## Convert Word assemblies into PSSMs
sub MatrixFromPatterns {
  ## Usage:
  ##   &MatrixFromPatterns($pattern_file, $pssm_file, $pattern_type, $score_column)
  ##
  ## Run matrix-from-patterns to build matrices from the patterns
  ## found in $pattern_file, using the test sequences.
  ##
  ##   $pattern_file  file containing the patterns (option -pl)
  ##   $pssm_file     output (option -o)
  ##   $pattern_type  used as prefix (option -prefix); substitutions
  ##                  are disabled for the type "dyads"
  ##   $score_column  score column (option -sc), only passed if > 0
  my ($pattern_file, $pssm_file, $pattern_type, $score_column) = @_;

  if ($verbose >= 3) {
    &RSAT::message::TimeWarn("Matrix from patterns", $pattern_type);
  }

  ## Mandatory options
  my @cmd_parts = ($SCRIPTS."/matrix-from-patterns -v 1 ",
		   " -seq ".$infile{test_seq},
		   " -pl ".$pattern_file,
		   " -bgfile ".$main::param{scan_bg_file},
		   " -toppat ".$main::param{asmb_toppat},
		   " -max_asmb_nb ".$main::param{matrix_nmotifs},
		  );

  ## Optional score column
  if ($score_column > 0) {
    push @cmd_parts, " -sc ".$score_column;
  }

  ## Number of substitutions: none for dyads, one for the other pattern types
  my $substitutions = ($pattern_type eq "dyads") ? 0 : 1;
  push @cmd_parts, " -subst ".$substitutions;

  push @cmd_parts, (" -prefix ".$pattern_type,
		    " -flanks 2",
		    " -collect_method matrix-scan-quick",
		    " -logo",
		    " -o ".$pssm_file,
		   );

  &one_command(join("", @cmd_parts), 1);
}


################################################################
## Scan peak sequences with the discovered motif
##
## BEWARE: THIS IS NOT YET WORKING, BECAUSE matrix-scan-quick ONLY
## USES THE FIRST MATRIX OF EACH FILE.
sub ScanSequences {
  ## For each pattern type: scan the test sequences with the matrices
  ## (matrix-scan-quick), compute the distribution of the site
  ## positions (classfreq) and draw this distribution (XYgraph).
  ##
  ## The minimal score (scan_min_score) is forced to 7.5.
  $main::param{scan_min_score} = 7.5;

  if ($main::verbose >= 1) {
    &RSAT::message::TimeWarn("Scanning sequences");
  }

  foreach my $pattern_type (@pattern_types) {
    ## Files involved in the analysis of this pattern type
    my $matrix_file = $main::outfile{$pattern_type.'_pssm_counts'};
    my $sites_file = $main::outfile{$pattern_type.'_pssm_sites'};
    my $distrib_file = $main::outfile{$pattern_type.'_pssm_site_distrib'};
    my $graph_file = $main::outfile{$pattern_type.'_pssm_site_distrib_graph'};

    ## Predict sites in the test sequences
    my $scan_cmd = join("",
			$BIN."/matrix-scan-quick -v 1",
			" -i ".$infile{test_seq},
			" -m ".$matrix_file,
			" -bgfile ".$main::param{scan_bg_file},
			" ".$main::param{scan_strands},
			" -origin center -return sites",
			" -t ".$main::param{scan_min_score},
			" >".$sites_file,
		       );
    &one_command($scan_cmd, 1);

    ## Positional distribution of the sites: the position of a site is
    ## the mean of the columns 5 and 6 of the site file
    my $distrib_cmd = join("",
			   "awk '{print (\$6+\$5)/2}'",
			   " ".$sites_file,
			   " | ".$SCRIPTS."/classfreq -v 1",
			   " -ci ".$main::param{profiles_ci},
			   " -o ".$distrib_file,
			  );
    &one_command($distrib_cmd, 1);

    ## Graph of the predicted site positions
    my $graph_cmd = join("",
			 $SCRIPTS."/XYgraph",
			 " -format ".$main::param{img_format},
			 " -i ".$distrib_file,
			 " -lines -xcol 3 -ycol 4",
			 " -ysize 200 -ycol 4 -yleg1 'Number of sites'",
			 " -xsize 800 -xcol 3 -xleg1 'Sequence position relative to peak center'",
			 " -title1 'Predicted sites : ".$pattern_type."'",
			 " -o ".$graph_file,
			);
    &one_command($graph_cmd, 1);
  }
}


################################################################
## Generate a file summarizing the time spent in the different tasks
sub TimeLog {
  ## Generate a tab-delimited file ($main::outfile{timelog}) with one
  ## row per key of @timelog_keys, indicating the start time, done
  ## time, elapsed time and number of seconds found in the comment
  ## lines of the corresponding result file. A HTML version of the
  ## table is generated with text-to-html.
  ##
  ## Fields that cannot be found or computed are reported as "NA".

  ## Time::Local is a core module. It is loaded here to keep the
  ## dependency local to this sub.
  require Time::Local;

  my $timelog = &OpenOutputFile($main::outfile{timelog});

  my $prefix = "NA";
  if (defined($main::param{prefix})) {
    $prefix = $main::param{prefix};
#    $prefix =~ s/_$//;
  }

  print $timelog join("\t", "#start_time       ", "done_time        ", "elapsed", "seconds", "task", "prefix", "file"), "\n";

  ## Time stamps are parsed as year_month_day, any single character,
  ## then hours, minutes and seconds (2 digits each)
  my $timestamp_regexp = qr/(\d{4})_(\d{2})_(\d{2}).(\d{2})(\d{2})(\d{2})/;

  ## Convert the fields of a time stamp into a number of seconds.
  ## timegm() is used rather than timelocal() because only the
  ## difference between two dates matters here; as before, daylight
  ## saving transitions are not taken into account. The result is
  ## undefined if the fields do not form a valid date.
  my $to_epoch = sub {
    my ($year, $month, $day, $hour, $min, $sec) = @_;
    my $epoch = eval { Time::Local::timegm($sec, $min, $hour, $day, $month - 1, $year) };
    return $epoch;
  };

  foreach my $key (@timelog_keys) {
    my $file = $main::outfile{$key};
    my $start = "NA";
    my $done = "NA";
    my $elapsed = "NA";
    my $seconds = "NA";

    ## Collect the time information from the comment lines of the file
    if (-e $file) {
      my ($in) = &OpenInputFile($file);
      while (<$in>) {
	if (/^;\s*Job started\s+(\S+)/i) {
	  $start = $1;
	} elsif (/^;\s*Job done\s+(\S+)/i) {
	  $done = $1;
	} elsif (/^;\s*Seconds\s+(\S+)/i) {
	  $seconds = $1;
	}
      }
      close $in;
    }

    ## Compute the elapsed time (in seconds) from the complete dates.
    ## The previous computation assumed a difference of one day for
    ## tasks overlapping two months, and returned a negative value for
    ## tasks overlapping two years.
    my @start_fields = ($start =~ $timestamp_regexp);
    my @done_fields = ($done =~ $timestamp_regexp);
    if (@start_fields && @done_fields) {
      my $start_epoch = $to_epoch->(@start_fields);
      my $done_epoch = $to_epoch->(@done_fields);
      if (defined($start_epoch) && defined($done_epoch)) {
	$elapsed = $done_epoch - $start_epoch;
      }
    }
    print $timelog join("\t", $start, $done, $elapsed, $seconds, $key, $prefix, $file), "\n";
  }
  close $timelog;
  &RSAT::message::TimeWarn("Time log file", $main::outfile{timelog}) if ($main::verbose >= 1);

  ## Create a HTML version of the timelog table
  my $cmd = $SCRIPTS."/text-to-html";
  $cmd .= " -font variable";
  $cmd .= " -i ".$main::outfile{timelog};
  $cmd .= " -o ".$main::outfile{timelog_html};
  &one_command($cmd, 1);
  &RSAT::message::TimeWarn("Time log html", $main::outfile{timelog_html}) if ($main::verbose >= 1);
}

################################################################
## Generate a synthetic table summarizing the main results with links
## to the original result files.
sub Synthesis {
  ## Write the HTML synthesis file ($main::outfile{synthesis}): HTML
  ## header with embedded style sheet, command line, then a table
  ## whose sections are filled by the Synthesis...() and
  ## SyntheticTable...() subs called below.
  ##
  ## $syn (output handle) and $synthesis_path (directory of the
  ## synthesis file) are declared with "local": they are package
  ## variables, visible from the subs called by this one (presumably
  ## the way the helper subs access the output handle -- to be
  ## confirmed), and restored when this sub returns.
  local $syn = &OpenOutputFile($main::outfile{synthesis});
  local $synthesis_path = `dirname $main::outfile{synthesis}`;
  chomp($synthesis_path);

  ## HTML Header and title
  print $syn "<html>\n";
  print $syn "<head>\n";
  print $syn "<title>peak-motifs ".$main::param{title}."</title>\n";

  ## Page styles: the content of the RSAT style sheet is embedded in
  ## the HTML header
  print $syn "<style type='text/css'>\n";
  print $syn `cat $ENV{RSAT}/perl-scripts/lib/results.css`;
  print $syn "</style>\n";
#  print $syn "<link rel='stylesheet' type='text/css' href='".$ENV{RSAT}."/public_html/main.css'/>\n";

  print $syn "</head>\n";
  print $syn "<body>\n";
  print $syn "<h1>Result: peak-motifs ".$main::param{title}."</h1>\n";

  ## Print the command line
  print $syn "<pre>";
  print $syn "<b>Command:</b>  peak-motifs ";
  &PrintArguments($syn);
  print $syn "</pre>";

  ## Header of the synthetic table
  print $syn "<h2>Synthetic report</h2>";
  print $syn "<p><table class='sortable'>\n";

  ## Sequence composition
  &SynthesisSequenceComposition();

  ## Reference motif
  &SynthesisRefMotif();

  ## Header line
  &SyntheticTableAddHeaderRow("Pattern discovery");

  ## One call per pattern type
  foreach my $pattern_type (@pattern_types) {
    &SynthesisPatternDisco($pattern_type);
  }

  ## Word comparison
  &SynthesisWordCompa();

  ## Motif comparison (currently disabled)
#  &SynthesisMotifCompa();

  ## Comparison between discovered motifs and reference motif
  &SynthesisMotifsVersusReference() if (defined($main::infile{ref_motifs}));

  ## Comparison between discovered motifs and database(s)
  &SynthesisMotifsVersusDatabase() if (scalar(@motif_databases) > 0);

  ## Log files: parameters and time log
  &SyntheticTableAddHeaderRow("Log files");
  &SyntheticTableAddRow("Parameters and files",
			"",
 			"txt"=>$main::outfile{log},
 		       );
  &SyntheticTableAddRow("Time log",
			"",
 			"txt"=>$main::outfile{timelog},
 			"html"=>$main::outfile{timelog_html},
 		       );


  ## Close the synthetic table
  print $syn "</table></p>\n";

  ## Log file (the inclusion of the log content in the page is disabled)
#  print $syn "<h2>Log</h2>";
#  print $syn "<pre>";
#  print $syn `cat $main::outfile{log}`;
#  print $syn "</pre>";

  ## End of the HTML file
  print $syn "</body>\n";
  print $syn "</html>\n";
  close $syn;
  &RSAT::message::TimeWarn("Synthetic report", $main::outfile{synthesis}) if ($main::verbose >= 1);
}

# ################################################################
# ## Compute a file path relative to the synthesis file
# sub RelativePath {
#   my ($file) = @_;
#   my ($link, $shared_path) = &RSAT::util::RelativePath($main::outfile{synthesis}, $file);
#   return $link;
# }

################################################################
## Add a row to the synthetic table
sub SyntheticTableAddRow {
  ## Print one 3-column row of the synthetic table to the $syn handle
  ## (which must have been opened by the caller).
  ##
  ## Usage:
  ##   &SyntheticTableAddRow($type, $summary, $label1, $file1, $label2, $file2, ...);
  ##
  ## Arguments:
  ##   $type     content of the first cell
  ##   $summary  content of the second cell (HTML is printed as is)
  ##   @files    flat list of (link label, file path) pairs for the third cell
  ##
  ## A file that exists is shown as a link, with a path relative to the
  ## synthesis file. A file that is missing, or whose path is undefined, is
  ## shown as a red label without link.
  my ($type, $summary, @files) = @_;
  print $syn "<tr>\n";
  print $syn "<td>$type</td>\n";
  print $syn "<td>$summary</td>\n";
  print $syn "<td>\n";
  while (@files) {
    my $key = shift(@files);
    last unless $key;	## as before, a false label ends the list
    my $file = shift(@files);

    ## Many callers pass $main::outfile{...} entries which may not be
    ## defined for the current run: test definedness first, to avoid an
    ## "uninitialized value" warning from the -e file test.
    if (defined($file) && (-e $file)) {
      &RSAT::message::Debug($key, $file) if ($main::verbose >= 5);
      $file = &RSAT::util::RelativePath($main::outfile{synthesis}, $file);
      print $syn "<a href='".$file."'>[".$key."]</a><br>\n";
    } else {
      print $syn "<font color='red'>[".$key."]</font><br>\n";
    }
  }
  print $syn "</td>\n";
  print $syn "</tr>\n";
}


################################################################
## Add a header row to the synthetic table
sub SyntheticTableAddHeaderRow {
  ## Print a section title to the $syn handle, as a table row made of a
  ## single header cell spanning the 3 columns of the synthetic table.
  ##
  ## Usage:
  ##   &SyntheticTableAddHeaderRow($header);
  my ($header) = @_;
  foreach my $chunk ("<tr>\n",
                     "<th colspan=3>\n",
                     $header,
                     "</th>\n") {
    print $syn $chunk;
  }
  print $syn "</tr>\n";
}

################################################################
## Add sequence composition on the synthetic report
sub SynthesisSequenceComposition {
  ## For each sequence type listed in @seq_types, add to the synthetic
  ## table:
  ##  - a header row;
  ##  - a row with sequence length statistics (number of peaks, total
  ##    size in kb, min/mean/max length) and the length distribution graph;
  ##  - one row per oligo length of @profiles_oligo_lengths, with the
  ##    transition heatmap and the position profile.
  ##
  ## The statistics are taken from the lines of the sequence length
  ## distribution file containing "; count", "; min", "; mean", "; max"
  ## and "; sum". The file is read once in Perl (no shell command), so that
  ## paths containing spaces or shell metacharacters are handled properly.
  ## A statistic that cannot be found is reported as "NA".
  foreach my $seq_type (@seq_types) {

    ## Header line
    &SyntheticTableAddHeaderRow("Sequence composition (".$seq_type." sequences)");

    my $peak_nb = "NA";
    my $mean_peak_len = "NA";
    my $min_peak_len = "NA";
    my $max_peak_len = "NA";
    my $seq_size = "NA";

    my $distrib_file = $main::outfile{$seq_type."_seqlen_distrib"};
    if (defined($distrib_file) && (-e $distrib_file)) {
      &RSAT::message::Info("Collecting sequence length statistics from file", $distrib_file) if ($main::verbose >= 3);

      ## Load the candidate statistic lines (only those containing "; ")
      my @stat_lines = ();
      if (open(my $distrib, "<", $distrib_file)) {
        @stat_lines = grep { index($_, "; ") >= 0 } <$distrib>;
        close $distrib;
      } else {
        &RSAT::message::Warning("Cannot read sequence length file.", $distrib_file, $!);
      }

      ## Return the text following "; <tag>" on the matching line(s), or
      ## $default if no line contains this tag.
      my $get_stat = sub {
        my ($tag, $default) = @_;
        my $value = join("", grep { index($_, "; ".$tag) >= 0 } @stat_lines);
        return $default if ($value eq "");
        chomp($value);
        $value =~ s/.*\Q$tag\E\s*//;
        return $value;
      };

      $peak_nb = $get_stat->("count", $peak_nb);
      $min_peak_len = $get_stat->("min", $min_peak_len);
      $mean_peak_len = $get_stat->("mean", $mean_peak_len);
      $max_peak_len = $get_stat->("max", $max_peak_len);

      ## Total sequence size, converted from bp to kb. The conversion is
      ## only attempted when the value starts with a number, otherwise the
      ## size stays "NA" rather than being reported as 0.
      my $sum = $get_stat->("sum", "");
      if ($sum =~ /^[-+]?\.?\d/) {
        $seq_size = round($sum/1000);
      }

      &RSAT::message::Debug("Nb of peaks=".$peak_nb,
                            "sequence size=".$seq_size) if ($main::verbose >= 5);
    } else {
      &RSAT::message::Warning("Sequence length file does not exist. Cannot collect statistics for the synthesis.\n", $distrib_file);
    }

    ## Sequence lengths
    my $img = &RSAT::util::RelativePath($main::outfile{synthesis}, $main::outfile{$seq_type."_seqlen_distrib_graph"});
    my $seq_stats = "Nb of peaks: ".$peak_nb;
    $seq_stats .= "<br>\nTotal seq. size: ".$seq_size." kb";
    $seq_stats .= "<br>\nMin length: ".$min_peak_len." bp";
    $seq_stats .= "<br>\nMean length: ".$mean_peak_len." bp";
    $seq_stats .= "<br>\nMax length: ".$max_peak_len." bp";
    &SyntheticTableAddRow($seq_stats,
                          "<a  href='".$img."'><img height=150 src='".$img."'></a>",
                          "converted",$main::outfile{$seq_type."_converted"},
                          "purged",$main::outfile{$seq_type."_purged"},
                          "lengths",$main::outfile{$seq_type."_seqlen"},
                          "distrib",$main::outfile{$seq_type."_seqlen_distrib"},
                          "graph",$main::outfile{$seq_type."_seqlen_distrib_graph"},
                         );


    ## Residue composition
    for my $ol (@profiles_oligo_lengths) {
      my $table = "<table><tr>";
      my $img = &RSAT::util::RelativePath($main::outfile{synthesis}, $main::outfile{$ol."nt_".$seq_type."_heatmap"});
      $table .= "<td>Transition frequencies<br><a  href='".$img."'><img width=200 src='".$img."'></a></td>";
      $img = &RSAT::util::RelativePath($main::outfile{synthesis}, $main::outfile{$ol."nt_".$seq_type."_profiles_graph"});
      $table .= "<td>Position profile<br><a  href='".$img."'><img height=150 src='".$img."'></a></td>";
      $table .= "</tr></table>";
      &SyntheticTableAddRow($ol."nt composition",
                            $table,
                            $ol."nt freq",$main::outfile{$ol."nt_".$seq_type."_freq"},
                            $ol."nt transitions",$main::outfile{$ol."nt_".$seq_type."_transit"},
                            "inclusive bg model",$main::outfile{$ol."nt_".$seq_type."_inclusive"},
                            "profile table",$main::outfile{$ol."nt_".$seq_type."_profiles"},
                            "individual profiles",$main::outfile{$ol."nt_".$seq_type."_profiles_index"},
                           );
    }
  }
}

################################################################
## Add the reference motif to the report
sub SynthesisRefMotif {
  ## Add the reference motif section to the synthetic table: a header
  ## row, then one row containing a table of logos (one line per
  ## reference motif: direct logo + reverse complement logo).
  ## Nothing is added if no reference motif file was specified.
  return unless (defined($main::infile{ref_motifs}));

  &SyntheticTableAddHeaderRow("Reference motif(s)");

  ## The number of reference motifs is the number of direct logo files
  ## (i.e. files without the _rc suffix) found for the logo prefix.
  my @direct_logos = grep {!/_rc\./} glob($main::outfile{ref_motifs_logo}."_m*.".$main::param{img_format});
  my $motif_nb = scalar(@direct_logos);

  ## Build the table of logos
  my $logo_table = "<table cellpadding=0 cellspacing=0 align=center border=0>";
  foreach my $motif_idx (1..$motif_nb) {
    $logo_table .= "<tr>\n";
    $logo_table .= "<td>Ref motif ".$motif_idx."</td>\n";

    ## Each cell: logo file, cell alignment, warning if the file is missing
    my @cells = (
      [$main::outfile{ref_motifs_logo}."_m".$motif_idx.".".$main::param{img_format},
       "right",
       "Cannot find reference logo"],
      [$main::outfile{ref_motifs_logo}."_m".$motif_idx."_rc.".$main::param{img_format},
       "left",
       "Cannot find reverse complementary logo for the reference motif"],
    );
    foreach my $cell (@cells) {
      my ($logo_path, $align, $missing_msg) = @{$cell};
      if (-e $logo_path) {
        my $href = &RSAT::util::RelativePath($main::outfile{synthesis}, $logo_path);
        $logo_table .= "<td align='".$align."'><a  href='".$href."'><img height=70 src='".$href."'></a></td>\n";
      } elsif ($main::verbose >= 1) {
        &RSAT::message::Warning($missing_msg, $logo_path);
      }
    }
    $logo_table .= "</tr>\n";
  }
  $logo_table .= "</table>\n";

  ## Links to the converted reference motif files
  my @links = ("tf", $main::outfile{ref_motifs_transfac},
               "tab", $main::outfile{ref_motifs_tab});
  &SyntheticTableAddRow("Reference motif", $logo_table, @links);
}

################################################################
## Synthesis for one pattern discovery algorithm
## Usage:
##   &SynthesisPatternDisco($pattern_type);
## Where pattern type can be oligos, dyads, local_words, oligo-diff
sub SynthesisPatternDisco {
  ## Add to the synthetic table the row summarizing one pattern
  ## discovery algorithm.
  ##
  ## If the assembly file of this pattern type exists, the row shows, for
  ## each motif number from 1 to $main::param{matrix_nmotifs}, the pattern
  ## type (asmb or isol), its significance, the word and its reverse
  ## complement, with the corresponding logos. Scores > 10 are displayed in
  ## bold, scores >= 75 in bold red. If the assembly file is missing, the
  ## row only contains a "Missing files" notice and the file labels.
  my ($pattern_type) = @_;

  if (-e $main::outfile{$pattern_type.'_asmb'}) {

    ## Synthesize results of pattern assembly (assembly consensus + sig scores)
    my ($asmb) = &OpenInputFile($main::outfile{$pattern_type.'_asmb'});
    my $pattern_nb = 0;
    my $asmb_or_isol = "";

    ## Per-pattern information, indexed by pattern number. This must be a
    ## lexical array: a package-level array would keep the entries
    ## collected for the previously treated pattern type.
    my @pattern_info = ();
    while (<$asmb>) {
      next if /^#/;		## Skip header line
      next unless /\S/;		## Skip empty lines
      if (/assembly\s+\#\s+(\d+)/i) {
        ## Assembly number
        $pattern_nb = $1;
        $asmb_or_isol = "asmb";
        next;
      } elsif (/Isolated patterns/) {
        ## Isolated patterns at the end of the assembly file
        $asmb_or_isol = "isol";
        next;
      }
      next if /^;/;		## Skip comments
      if ($asmb_or_isol eq "isol") {
        ## Each isolated pattern gets the next pattern number
        $pattern_nb++;
      } elsif (!/consensus/) {
        ## Within an assembly, only the consensus line is retained
        next;
      }
      chomp;			## Avoid a trailing newline in the last field
      my ($word, $rc_word, $score) = split "\t";
      $pattern_info[$pattern_nb] = {type=>$asmb_or_isol,
                                    word=>$word,
                                    rc_word=>$rc_word,
                                    score=>$score};
    }
    close $asmb;

    ## Synthesize matrix logos
    my $pattern_table = "<table cellpadding='0' cellspacing='0' align='center' border='0'>";
    foreach my $logo_nb (1..$main::param{matrix_nmotifs}) {
      $pattern_table .= "<tr>\n";
      my $logo_file = $main::outfile{$pattern_type.'_pssm_logo'.$logo_nb};
      my $logo_file_rc = $main::outfile{$pattern_type.'_pssm_logo_rc'.$logo_nb};

      ## Information collected for this motif number. Missing fields are
      ## displayed as empty strings.
      my $info = $pattern_info[$logo_nb] || {};
      my $type = defined($info->{type}) ? $info->{type} : "";
      my $word = defined($info->{word}) ? $info->{word} : "";
      my $rc_word = defined($info->{rc_word}) ? $info->{rc_word} : "";
      my $score = $info->{score};
      my $score_label = defined($score) ? $score : "";

      ## Emphasis tags depending on the significance
      my $start_format="";
      my $end_format="";
      if (defined($score) && &IsReal($score)) {
        if ($score >10) {
          $start_format .= '<b>';
          $end_format .= '</b>';
          if ($score >= 75) {
            $start_format .= "<font color='red'>";
            $end_format = "</font>".$end_format;
          }
        }
      }

      &RSAT::message::Debug("Logo", $logo_nb, $logo_file) if ($main::verbose >= 5);

      ## Direct logo
      if (defined($logo_file) && (-e $logo_file)) {
        my $img = &RSAT::util::RelativePath($main::outfile{synthesis}, $logo_file);
        $pattern_table .= "<td align='right'>";
        $pattern_table .= $start_format;
        $pattern_table .= $type;	## type of this pattern, not of the last parsed one
        $pattern_table .= " (sig=".$score_label.")";
        $pattern_table .= "&nbsp;"x5;
        $pattern_table .= $word;
        $pattern_table .= "<br><a  href='".$img."'><img height=70 src='".$img."'></a>";
        $pattern_table .= $end_format;	## close the emphasis tags
        $pattern_table .= "</td>\n";
      } elsif ($main::verbose >= 1) {
        &RSAT::message::Warning("Cannot find logo", $logo_file);
      }

      ## Reverse complementary logo
      if (defined($logo_file_rc) && (-e $logo_file_rc)) {
        my $img = &RSAT::util::RelativePath($main::outfile{synthesis}, $logo_file_rc);
        $pattern_table .= "<td align='left'>";
        $pattern_table .= $start_format;
        $pattern_table .= $rc_word;
        $pattern_table .= "<br><a  href='".$img."'><img height=70 src='".$img."'></a>";
        $pattern_table .= $end_format;	## close the emphasis tags
        $pattern_table .= "</td>\n";
      } elsif ($main::verbose >= 1) {
        &RSAT::message::Warning("Cannot find reverse complementary logo", $logo_file_rc);
      }
      $pattern_table .= "</tr>\n";
    }
    $pattern_table .= "</table>\n";

    &SyntheticTableAddRow($pattern_type,
                          $pattern_table,
                          $pattern_type,$main::outfile{$pattern_type},
                          "assembly",$main::outfile{$pattern_type.'_asmb'},
                          "sig matrix",$main::outfile{$pattern_type.'_pssm_sig'},
                          "matrices .tab",$main::outfile{$pattern_type.'_pssm_counts'},
                          "matrices .tf",$main::outfile{$pattern_type.'_pssm_tf'},
                          "sites",$main::outfile{$pattern_type.'_pssm_sites'},
                          "site distrib",$main::outfile{$pattern_type.'_pssm_site_distrib'},
                          "site distrib graph",$main::outfile{$pattern_type.'_pssm_site_distrib_graph'},
                         );
  } else {
    ## No assembly file: report the missing files
    &SyntheticTableAddRow($pattern_type,
                          "<font color='red'>Missing files</font>",
                          $pattern_type,$main::outfile{$pattern_type},
                          "assembly",$main::outfile{$pattern_type.'_asmb'},
                          "sig matrix",$main::outfile{$pattern_type.'_pssm_sig'},
                          "matrices",$main::outfile{$pattern_type.'_pssm_counts'},
                          "sites",$main::outfile{$pattern_type.'_pssm_sites'},
                          "site distrib",$main::outfile{$pattern_type.'_pssm_site_distrib'},
                          "site distrib graph",$main::outfile{$pattern_type.'_pssm_site_distrib_graph'},
                         );
  }
}

################################################################
## Synthesis of word comparisons
sub SynthesisWordCompa {
  ## Add the word comparison row to the synthetic table: thumbnail of
  ## the merged words heatmap + links to the merged words files.
  my $href = &RSAT::util::RelativePath($main::outfile{synthesis}, $main::outfile{merged_words_heatmap});
  my $thumbnail = "<a  href='".$href."'><img height=150 src='".$href."'></a>";

  ## (link label, file) pairs
  my @links = ("tab", $main::outfile{merged_words},
               "html", $main::outfile{merged_words_html},
               "heatmap", $main::outfile{merged_words_heatmap});
  &SyntheticTableAddRow("Word comparisons", $thumbnail, @links);
}

################################################################
## Synthesis of motif comparisons
sub SynthesisMotifCompa {
  ## Add the motif comparison row to the synthetic table: thumbnail of
  ## the comparison graph + links to the comparison and clustering files.
  my $href = &RSAT::util::RelativePath($main::outfile{synthesis}, $main::outfile{motifs_disco_compa_png});
  my $thumbnail = "<a  href='".$href."'><img height=300 src='".$href."'></a>";

  ## Each entry: link label, key of the file in %main::outfile
  my @link_specs = (
    ["motifs", "motifs_discovered"],
    ["table (txt)", "motifs_disco_compa"],
    ["table (html)", "motifs_disco_compa_html"],
    ["gml (for CytoScape)", "motifs_disco_compa_gml"],
    ["png (low resolution)", "motifs_disco_compa_png"],
    ["MCL clusters", "motifs_disco_clusters_tab"],
    ["intra-cluster degree", "motifs_disco_compa_cluster_intra_degree"],
    ["MCL cluster graph (tab)", "motifs_disco_clusters_graph"],
    ["MCL cluster graph (gml)", "motifs_disco_clusters_graph_gml"],
    ["MCL cluster graph (png)", "motifs_disco_clusters_graph_png"],
  );
  my @links = map { ($_->[0], $main::outfile{$_->[1]}) } @link_specs;
  &SyntheticTableAddRow("Motif comparisons", $thumbnail, @links);
}

################################################################
## Synthesis of comparisons between discovered motifs and reference motif
sub SynthesisMotifsVersusReference {
  ## Add to the synthetic table the section comparing discovered motifs
  ## with the reference motif: a header row, then one row with the
  ## thumbnail of the comparison graph and links to the result files.
  &SyntheticTableAddHeaderRow("Discovered motifs versus reference motif");

  my $href = &RSAT::util::RelativePath($main::outfile{synthesis}, $main::outfile{"motifs_vs_ref_png"});
  my $thumbnail = "<a  href='".$href."'><img height=300 src='".$href."'></a>";

  ## Each entry: link label, suffix appended to "motifs_vs_ref" to obtain
  ## the key of the file in %main::outfile
  my @result_specs = (
    ["table (txt)", ""],
    ["table (html)", "_html"],
    ["alignments", "_alignments_1ton"],
    ["aligned logos", "_alignments_1ton_html"],
    ["gml (for CytoScape)", "_gml"],
    ["png (low resolution)", "_png"],
  );
  my @links = ("Reference motif", $main::infile{ref_motifs});
  foreach my $spec (@result_specs) {
    my ($label, $suffix) = @{$spec};
    push @links, $label, $main::outfile{"motifs_vs_ref".$suffix};
  }
  &SyntheticTableAddRow("Motifs versus reference motif", $thumbnail, @links);
}

################################################################
## Synthesis of comparisons between discovered motifs and motif databases
sub SynthesisMotifsVersusDatabase {
  ## Add to the synthetic table the section comparing discovered motifs
  ## with motif databases: a header row, then one row per database of
  ## @motif_databases, with the thumbnail of the comparison graph and
  ## links to the result files.
  &SyntheticTableAddHeaderRow("Discovered motifs versus transcription factor databases");

  ## Each entry: link label, suffix appended to "motifs_vs_db_<db_name>"
  ## to obtain the key of the file in %main::outfile
  my @result_specs = (
    ["table (txt)", ""],
    ["table (html)", "_html"],
    ["alignments", "_alignments_1ton"],
    ["aligned logos", "_alignments_1ton_html"],
    ["gml (for CytoScape)", "_gml"],
    ["png (low resolution)", "_png"],
  );

  foreach my $db_name (@motif_databases) {
    my $result_key = "motifs_vs_db_".$db_name;
    my $href = &RSAT::util::RelativePath($main::outfile{synthesis}, $main::outfile{$result_key."_png"});
    my $thumbnail = "<a  href='".$href."'><img height=300 src='".$href."'></a>";

    my @links = ($db_name." DB", $main::infile{"motif_db_".$db_name});
    foreach my $spec (@result_specs) {
      my ($label, $suffix) = @{$spec};
      push @links, $label, $main::outfile{$result_key.$suffix};
    }
    &SyntheticTableAddRow("Motifs versus ".$db_name." database", $thumbnail, @links);
  }
}

################################################################
## Delete purged sequence files after analysis has been completed.
sub CleanSequences {
  ## For each sequence type of @seq_types, run (via one_command) the
  ## "rm -f" command removing its purged sequence file.
  for my $seq_set (@seq_types) {
    if ($main::verbose >= 1) {
      &RSAT::message::TimeWarn("Cleaning sequences", $seq_set);
    }
    my $rm_cmd = "rm -f ".$main::outfile{$seq_set."_purged"};
    &one_command($rm_cmd, 1);
  }
}


################################################################
## Read arguments 
sub ReadArguments {
  &RSAT::message::TimeWarn("Reading arguments") if ($main::verbose >= 1);
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-i test_seq_file>

Test peak sequence file (mandatory).

For single-set analysis, this file contains the peak sequences of the
unique set.  For test versus control analysis, it contains the test
sequences.

=cut
    } elsif ($arg eq "-i") {
      $main::infile{test_seq} = shift(@arguments);

=pod

=item B<-ctrl control_seq_file>

Control peak sequence file (optional).

The control sequence file is used:
- as control sequence for oligo-diff
- to estimate the background models for oligo-analysis and
  dyad-analysis.

Control sequences are supposed to contain a large number of sequences
without particular enrichment for any motif. The choice of appropriate
background sequences is crucial to detect relevant motifs.

The file should be sufficiently large (several Mb) to provide a robust
estimate of prior probabilities (frequencies expected at random) for
oligonucleotides and dyads.

Typical examples of control sequences:

- random fragments of the genome of interest
  (e.g. obtained with I<random-genome-fragments>)

- sets of sequences pulled down in a mock experiment (without the
  antibody) and characterized by ChIP-seq or ChIP-chip.

- sets of peaks for a compendium of transcription factors different
  from the factor of interest.

=cut
    } elsif ($arg eq "-ctrl") {
      $main::infile{ctrl_seq} = shift(@arguments);


=pod

=item B<-max_seq_len msl>

Maximal sequence length. Larger sequences are truncated at the
specified length around the sequence center (from -msl/2 to +msl/2).

=cut
    } elsif ($arg eq "-max_seq_len") {
      my $max_seq_len = shift(@arguments);
      &RSAT::error::FatalError($max_seq_len, "is not a valid value for max sequence length. Should be a Natural number.")
	unless ((&IsNatural($max_seq_len)));
      if ($max_seq_len > 0) {
	$main::param{max_seq_len} = $max_seq_len;
      } else {
	&RSAT::message::Info("Max seq len = 0 interpreted as no limit") if ($main::verbose >= 2);
      }

=pod

=item B<-top_peaks N>

Restrict the analysis to the N peaks at the top of the input sequence
file. Some peak calling programs return the peaks sorted by score. In
such case, the -top_peaks option allows to restrict the analysis to
the highest scoring peaks. In some cases, the top-scoring peaks might
contain a higher density of binding sites, allowing to detect motifs
with a higher significance.

This option can also be convenient for performing quick tests,
parameter selection and debugging before running the full analysis of
large sequence sets.

=cut
    } elsif ($arg eq "-top_peaks") {
      my $top_peaks = shift(@arguments);
      &RSAT::error::FatalError($top_peaks, "is not a valid value for max sequence length. Should be a Natural number.")
	unless ((&IsNatural($top_peaks)));
      if ($top_peaks > 0) {
	$main::param{top_peaks} = $top_peaks;
      } else {
	&RSAT::message::Info("Top peak number = 0 interpreted as no limit") if ($main::verbose >= 2);
      }

=pod

=item B<-ref_motifs reference_motif>

Reference motif (optional).

In some cases, a reference motif is already available, for example
the motif annotated in some transcription factor database
(e.g. RegulonDB, Jaspar, TRANSFAC) for the transcription factor of
interest. These annotations may come from low-throughput experiments,
and rely on a small number of sites, but the reference motif may
nevertheless be informative, because it is based on several
independent studies.

Each discovered motif can be compared to the reference motif, in order
to evaluate its correspondence with the binding motif of the factor of
interest.

=cut
    } elsif ($arg eq "-ref_motifs") {
      $main::infile{ref_motifs} = shift(@arguments);


=pod

=item B<-motif_db db_name db_format db_file>

File containing a database of transcription factor binding motifs
(e.g. JASPAR, TRANSFAC, RegulonDB, ...) which will be compared to the
discovered motifs (task motifs_vs_db). 

The option requires three arguments:

 - DB name

 - matrix format. same supported formats as convert-matrices, but we
   recommend to use a format that includes an ID and a name for each
   motif (e.g. TRANSFAC)

 - file containing the DB motifs

The option can be called iteratively on the same command line in order
to compare discovered motifs with several databases.

Examples:

 -motif_db TRANSFAC transfac transfac_download_dir/cgi-bin/data/matrix.dat

   will load a file containing all matrices from the TRANSFAC
   database.

 -motif_db JASPAR jaspar jaspar_file.tf

   will load a file containing motifs from the JASPAR database that
   have previously been converted to TRANSFAC format.

=cut

    } elsif ($arg eq "-motif_db") {
      my $db_name = shift(@arguments);
      $db_name =~ s/\s/_/g;
      push @main::motif_databases, $db_name;
      $main::motif_db_format{$db_name} = shift(@arguments);
      my $db_file = shift(@arguments);
      unless (-e $db_file) {
	&RSAT::error::FatalError("Motif DB file does not exist.", $db_file);
      }
      $main::infile{"motif_db_".$db_name} = $db_file;

=pod

=item	B<-outdir output_directory>

Output directory (mandatory).

The result files and index files produced by the different programs
will be stored in this directory.

=cut
    } elsif ($arg eq "-outdir") {
      $main::dir{output} = shift(@arguments);


=pod

=item	B<-prefix output_prefix>

Prefix for the output files.

=cut
    } elsif ($arg eq "-prefix") {
      $main::param{prefix} = shift(@arguments);

=pod

=item	B<-title graph_title>

Title displayed on top of the graphs.

=cut
    } elsif ($arg eq "-title") {
      $main::param{title} = shift(@arguments);

=pod

=item	B<-img_format img_format>

Image format. 

All the formats supported by XYgraph can be used.

=cut
    } elsif ($arg eq "-img_format") {
      $main::param{img_format} = shift(@arguments);

=pod

=item B<-task>

Specify a subset of tasks to be executed.

By default, the program runs all necessary tasks. However, in some
cases, it can be useful to select one or several tasks to be executed
separately.

Beware: task selection requires expertise, because most tasks depends
on the prior execution of some other tasks in the workflow. Selecting
tasks before their prerequisite tasks have been completed will provoke
fatal errors.

I<Available Tasks.>

=over

=item I<all> (default)

Run all supported tasks.

=item I<purge>

Purge input sequences (test set and, if specified, control set) to
mask redundant fragments before applying pattern discovery
algorithms. Sequence purging is necessary because redundant fragments
would violate the hypothesis of independence underlying the binomial
significance test, resulting in a large number of false positive
patterns.

=item I<seqlen>

Compute sequence lengths and their distribution. 

Sequence lengths are useful for the negative control (selection of
random genome fragments).

Sequence length distribution is informative to get an idea about the
variability of peak lengths.

=item I<composition>

Compute compositional profiles, i.e. distributions of residues and
dinucleotide frequencies per position (using I<position-analysis>).

Residue profiles may reveal composition biases in the neighborhood of
the peak sequences. Dinucleotide profiles can reveal (for example) an
enrichment in CpG island.

Note that I<peak-motifs> also runs I<position-analysis> with
larger oligonucleotide length (see option -l) to detect motifs on the
basis of positionally biased oligonucleotides (see task B<positions>).

=item I<ref_motifs>

This task combines various operations.

=over

=item Formatting of the reference motif

Perform various format conversion for the reference motif (compute
parameters, consensus, logo).

=item Motif enrichment

Generate an enriched motif by scanning the peak sequence set with the
reference motif.

=item Motif comparison

Compare all discovered motifs with the reference motif.

=back

=item I<oligos>

Run I<oligo-analysis> to detect over-represented oligonucleotides of a
given length (k, specified with option -l) in the test set (van Helden
et al., 1998). Prior frequencies of oligonucleotides are taken from
Markov model of order m (see option -markov) estimated from the test
set sequences themselves.

=item I<dyads>

Run I<dyad-analysis> to detect over-represented dyads, i.e. pairs of
short oligonucleotides (monads) spaced by a region of fixed width but
variable content (van Helden et al., 2000). Spaced motifs are typical
of certain classes of transcription factors forming homo- or
heterodimers.

By default, peak-motifs analyzes pairs of trinucleotides with
any spacing between 0 and 20.

The expected frequency of each dyad is estimated as the product of its
monad frequencies in the input sequences (option -bg monads of
dyad-analysis).

=item I<positions>

Run I<position-analysis> to detect oligonucleotides showing a
positional bias, i.e. have a non-homogeneous distribution in the peak
sequence set.

This method was initially developed to analyze termination and
poly-adenylation signals in downstream sequences (van Helden et al.,
2001), and it turns out to be very efficient for detecting motifs
centred on the ChIP-seq peaks. For ChIP-seq analysis, the reference
position is the center of each sequence.

Note that I<peak-motifs> also uses I<position-analysis> for the
task B<composition>, in order to detect compositional biases (residues,
dinucleotides) in the test sequence set.

=item I<local_words>

Run I<local-word-analysis> to detect locally over-represented
oligonucleotides and dyads. 

The program I<local-word-analysis> (Matthieu Defrance, unpublished)
tests the over-representation of each possible word (oligo, dyad)
in positional windows in the input sequence set.

Two types of background models are supported: (i) Markov model of
order m estimated locally (within the window under consideration); (ii)
the frequency observed for a word in the whole sequence set is used as
estimator of the prior probability of this word in the window.

According to our first trials, this program gives excellent results
with ChIP-seq datasets, because its sensitivity increases with large
numbers of sequences (several hundreds/thousands), and its background
model is more stringent than for programs computing the global
over-representation (oligo-analysis, dyad-analysis).

=item I<merge_words>

Merge the words (oligos-dyads) discovered by the different
algorithms. Those words will then be used as seeds for extracting
over-represented position-specific scoring matrices from the sequences
(using the program I<matrix-from-patterns>).

The table of merged words has one row per word (oligo or dyad) and one
column per pattern discovery program. This allows to analyze the
consistency between the words detected by different approaches,
e.g. show that a word is both over-represented (oligo-analysis,
dyad-analysis) and positionally biased (position-analysis,
local-words). A heatmap is also exported to provide a graphical
representation of the significance of each word (row) for each
algorithm (column).


=item I<motifs_vs_ref>

Compare each discovered motif to the reference motifs.

=item I<motifs_vs_db>

Compare each discovered motif to a database of known motifs
(e.g. Jaspar, TRANSFAC, RegulonDB, UniProbe, ...)

=item I<timelog>

Generate a log file summarizing the time spent in the different tasks.

=item I<synthesis>

Generate the HTML file providing a synthesis of the results and
pointing towards the individual result files.

=item I<clean_seq>

Delete the purged sequence files after the analysis, in order to save
space.

This task is executed only when it is called explicitly. It is not
part of the tasks running with the option "-task all".

=back

=cut
    } elsif ($arg eq "-task") {
      my @requested_tasks = split ",", shift (@arguments);
      foreach my $task (@requested_tasks) {
	next unless $task;
	if ($supported_task{$task}) {
	  $task{$task} = 1;
	} else {
	  &RSAT::error::FatalError("Task '$task' is not supported. \n\tSupported: $supported_tasks");
	}
      }

=pod

=item B<-nmotifs max_motif_number>

Maximal number of motifs (matrices) to return for pattern discovery
algorithms. Note the distinction between the maximal number of motifs
(matrices) and the maximum number of patterns (words, dyads): a motif
generally corresponds to mutually overlapping several patterns (dyads,
words).

=cut

    } elsif ($arg eq "-nmotifs") {
      $main::param{matrix_nmotifs} = shift (@arguments);

=pod

=item B<-l oligo_len>


Oligonucleotide length for word-counting approaches (oligo-analysis,
position-analysis, local-word-analysis, oligo-diff).

In our experience, optimal results are obtained with hexanucleotides
and heptanucleotides.

Note: the monad length used for dyad-analysis is not affected by those
options. Instead it is fixed to 3. Indeed, dyad-analysis can detect
larger motifs by sampling various spacings between the two
trinucleotide monads.

=item B<-minol oligo_min_len>

=item B<-maxol oligo_max_len>

Minimal (-minol) and maximal (-maxol) oligonucleotide lengths. If
those options are used, the program iterates over the specified range
of oligonucleotide lengths.

=cut
    } elsif ($arg eq "-l") {
      my $oligo_len = shift (@arguments);
      $main::param{oligo_min_len} = $oligo_len;
      $main::param{oligo_max_len} = $oligo_len;
    } elsif ($arg eq "-minol") {
      $main::param{oligo_min_len} = shift (@arguments);
    } elsif ($arg eq "-maxol") {
      $main::param{oligo_max_len} = shift (@arguments);

=pod

=item B<-markov>

Order of the Markov model used to estimate
expected oligonucleotide frequencies for I<oligo-analysis> and
I<local-word-analysis>.

Higher order Markov models are more stringent, lower order are more
sensitive, but tend to return a large number of false positives.

Markov models can be specified with either a positive or a negative
value. Positive value indicate the length of the prefix in the
transition matrix. Negative value indicate the order of the Markov
model relative to the oligonucleotide length. For example, the option
-markov -2 gives a model of order m=k-2 (thus, an order 5 for
heptanucleotides, an order 4 for hexanucleotides).

The optimal Markov order depends on the number of sequences in the
test set. Since ChIP-seq data typically contain hundreds to thousands
of peaks, high Markov orders are generally good, because they are
stringent and still sensitive enough.  In our experience, motifs are
well detected with the most stringent Markov order (-markov -2).

=item B<-min_markov min_markov_order>

=item B<-max_markov max_markov_order>

A minimal and a maximal value can be specified for the Markov
order. The program then iterates over all markov values between
min_markov_order and max_markov_order.


=cut
    } elsif ($arg eq "-markov") {
      $main::param{oligo_min_mkv} = $main::param{oligo_max_mkv} = shift (@arguments);
    } elsif ($arg eq "-min_markov") {
      $main::param{oligo_min_mkv} = shift (@arguments);
    } elsif ($arg eq "-max_markov") {
      $main::param{oligo_max_mkv} = shift (@arguments);

=pod

=item B<-1str | -2str>

Single-strand (-1str) or double-strand (-2str) analysis.

The default is double-strand analysis, since ChIP-seq results have no
particular strand orientation.

=cut
    } elsif ($arg eq "-1str") {
      $main::param{strand} = "-1str";
    } elsif ($arg eq "-2str") {
      $main::param{strand} = "-2str";

=pod

=item B<-noov | -ovlp>

Treatment of self-overlapping words for motif discovery: count (-ovlp)
or do not count (-noov) overlapping occurrences. In -noov mode, only
renewing occurrences are counted.

It is recommended to use the -noov mode (default) to avoid the effect
of self-overlap, which violates the hypothesis of independence of
successive occurrences underlying the binomial significance test
(oligo-analysis, dyad-analysis).

B<Beware>: the options -noov and -ovlp only apply to pattern discovery,
and not to compositional profiles. Dinucleotide frequencies are always
computed with the option -ovlp (count all occurrences), to avoid weird
effects. Since those composition profiles further serve to estimate the
probability of larger words, which may include repeated residues, we
need to count all dinucleotide occurrences. Indeed with the -noov mode
(renewing occurrences only), the transition tables of the first order
Markov model would be unbalanced: the expected frequency of all the
repeated dinucleotides (AA, TT, CC, GG) would be under-estimated,
leading to an under-estimation of the expected frequency of
repeat-containing words (e.g. AAAAAA, AAAGGG, ...).

=cut
    } elsif ($arg eq "-noov") {
      $main::param{disco_noov} = "-noov";
    } elsif ($arg eq "-ovlp") {
      $main::param{disco_noov} = "-ovlp";

=pod

=item B<-ci class_interval>

Class interval for I<position-analysis>.

=cut

    } elsif ($arg eq "-ci") {
      $main::param{profiles_ci} = shift(@arguments);
      &RSAT::error::FatalError($main::param{profiles_ci}, "is not a valid value for class interval. Should be a strictly positive Natural number.")
	unless ((&IsNatural($main::param{profiles_ci})) && ($main::param{profiles_ci} > 0));

      ## Other parameters are not accepted
    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Check arguments
##
## Validate the options collected from the command line and derive the
## settings that depend on them. Takes no argument; works on the global
## variables %infile, %dir, %param, %outfile, %task, @motif_databases,
## @seq_types and @tasks.
##
## Steps:
##  - the test sequence file (option -i) is mandatory and must exist;
##  - the control sequence file is optional but must exist if specified;
##  - the output directory (-outdir) and file prefix (-prefix) are mandatory;
##  - the title defaults to the prefix;
##  - the names of the log, time log and synthesis files are computed;
##  - the motif comparison tasks are checked against the options they require;
##  - the list of tasks to run (@tasks) is built from %task, in the order
##    of @supported_tasks.
##
## Invalid settings are reported through FatalError (presumably aborting
## the program -- defined outside of this section).
sub CheckArguments {
  &RSAT::message::TimeWarn("Checking arguments") if ($main::verbose >= 2);

  ## Input sequence file (mandatory)
  if ($infile{test_seq}) {
    unless (-e $infile{test_seq}) {
      &FatalError("Test sequence file does not exist", $infile{test_seq});
    }
    @main::seq_types = ("test");
  } else {
    &FatalError("You must define the test sequence set (option -i)");
  }

  ## If control file has been specified, check that it exists
  if ($infile{ctrl_seq}) {
    unless (-e $infile{ctrl_seq}) {
      &FatalError("Control sequence file does not exist", $infile{ctrl_seq});
    }
    push @main::seq_types, ("ctrl");
  }

  ## Output directory (mandatory)
  if ($main::dir{output}) {
    &RSAT::util::CheckOutDir($main::dir{output});
  } else {
    &FatalError("You must define the output directory (option -outdir)");
  }

  ## Prefix (mandatory)
  unless ($main::param{prefix}) {
    &FatalError("You must define a prefix for the output files (option -prefix)");
  }

  ## Title: use the prefix if no title has been specified
  unless ($main::param{title}) {
    $main::param{title} = $main::param{prefix};
  }

  ## Log files
  $main::outfile{log} = &OutFileName("reports", ".txt", "log");
  $main::outfile{timelog} = &OutFileName("reports", ".txt", "timelog");
  $main::outfile{timelog_html} = &OutFileName("reports", ".html", "timelog");

  ## Synthesis file
  $main::outfile{synthesis} = &OutFileName("", ".html", "synthesis");


  ## Modalities of motif comparisons: the generic task motif_compa
  ## activates the specific comparison tasks for which the required
  ## input (reference motifs, motif databases) is available.
  if ($task{motif_compa}) {
   $task{merge_motifs} = 1;
#   $task{cluster_motifs} = 1;
   $task{motifs_vs_ref} = 1 if (defined($main::infile{ref_motifs}));
   $task{motifs_vs_db} = 1 if (scalar(@motif_databases) > 0);
  }

  ## Check that reference motif has been specified if required
  if ($task{motifs_vs_ref}) {
    &RSAT::error::FatalError("The task motifs_vs_ref requires to specify a file containing the reference motif (option -ref_motifs).")
      unless (defined($main::infile{ref_motifs}));
  }

  ## Check that motif DB has been specified if required
  if ($task{motifs_vs_db}) {
    &RSAT::error::FatalError("The task motifs_vs_db requires to specify at least one file containing database motifs (option -motif_db).")
      unless (scalar(@motif_databases) > 0);
  }


  ## If all tasks are requested or if no task is defined, execute all
  ## tasks.
  if ((scalar(keys(%task)) == 0) || ($task{all})) {
    %task = %supported_task;

    ## "all" is not a task by itself, and sequence cleaning is never
    ## part of the default task list.
    delete($task{all});
    delete($task{clean_seq});

    ## With a control set, only the test-versus-control discovery
    ## (oligo_diff) is kept; without it, only the single-set
    ## discovery tasks are kept.
    if (defined($main::infile{ctrl_seq})) {
      delete($task{oligos});
      delete($task{dyads});
      delete($task{positions});
      delete($task{local_words});
    } else {
      delete($task{oligo_diff});
    }

    ## Motif comparisons are only run when their input is available
    delete($task{motifs_vs_ref}) unless (defined($main::infile{ref_motifs}));
    delete($task{motifs_vs_db}) unless (scalar(@motif_databases) > 0);
  }

  ## Build the list of selected tasks, in the order of @supported_tasks
  foreach my $task (@supported_tasks) {
   push (@tasks, $task) if $task{$task};
  }
  &RSAT::message::Info("; Tasks: ", join (",", @tasks)) if ($main::verbose >= 1);
}

################################################################
## Verbose message
##
## Print a report of the analysis settings on the output stream
## $main::out: command-line arguments, program version, parameter
## values (in the order of @param_list), input files, threshold values
## and output files. Takes no argument.
sub Verbose {
    print $main::out "; peak-motifs ";
    &PrintArguments($main::out);
    printf $main::out "; %-22s\t%s\n", "Program version", $program_version;

    ## Parameter values
    print $main::out "; Parameter values\n";
    foreach my $param_name (@param_list) {
      print $main::out sprintf ";\t%-22s\t%s\n", $param_name, $param{$param_name};
    }

    ## Input file(s)
    ## A hash in boolean context is true if it contains at least one
    ## key. The former test defined(%hash) is deprecated and became a
    ## fatal compilation error in Perl 5.22.
    if (%main::infile) {
        print $main::out "; Input files\n";
        ## Keys are sorted to get a reproducible report, as for the
        ## output files below.
        foreach my $key (sort keys %main::infile) {
          my $value = $main::infile{$key};
          printf $main::out ";\t%-22s\t%s\n", $key, $value;
        }
    }

    ## Threshold values
    ## NOTE(review): unqualified $out is kept as in the original code;
    ## presumably the same handle as $main::out -- confirm.
    print $out &PrintThresholdValues();

    ## Output files
    if (%main::outfile) {
        print $main::out "; Output files\n";
        foreach my $key (sort keys %main::outfile) {
          my $value = $main::outfile{$key};
          printf $main::out ";\t%-30s\t%s\n", $key, $value;
        }
    }
}

=pod

=head1 REFERENCES

The program I<peak-motifs> combines a series of tried-and-tested
programs which have been detailed in the following publications.

=over

=item I<oligo-analysis>

van Helden, J., Andre, B. and Collado-Vides, J. (1998). Extracting
regulatory sites from the upstream region of yeast genes by
computational analysis of oligonucleotide frequencies. J Mol Biol 281,
827-42.

=item I<dyad-analysis>

van Helden, J., Rios, A. F. and Collado-Vides, J. (2000). Discovering
regulatory elements in non-coding sequences by analysis of spaced
dyads. Nucleic Acids Res 28, 1808-18.

=item I<position-analysis>

van Helden, J., del Olmo, M. and Perez-Ortin,
J. E. (2000). Statistical analysis of yeast genomic downstream
sequences reveals putative polyadenylation signals. Nucleic Acids Res
28, 1000-10.

=item I<matrix-scan>

Turatsinze, J. V., Thomas-Chollier, M., Defrance, M. and van Helden,
J. (2008). Using RSAT to scan genome sequences for transcription
factor binding sites and cis-regulatory modules. Nat Protoc 3,
1578-88.

=back

=head1 SEE ALSO

=over

=item I<oligo-analysis>

=item I<dyad-analysis>

=item I<position-analysis>

=item I<matrix-scan>


=back

=head1 WISH LIST

=over

=item B<background models from ctrl sequences>

Estimate background models from control sequences, for oligo-analysis,
dyad-analysis, and local-word-analysis. This should in principle
reduce the rate of false positives.

=item B<partial synthesis>

For the Web server: generate temporary synthetic table showing the
results already obtained so far, and finishing by a message "Partial
results, please don't forget to reload the file later".

=item B<motif_cluster>

Compare all discovered motifs (plus reference motif if specified) and
cluster them in order to extract a consensus motif.

=item B<weeder>

Add a task to run Weeder on the peak sequences.

 weederlauncher.out input organism large S M T5

=item B<meme_bg>

Compute a background model for MEME.

=item B<all_oligos>

Run oligo-analysis without any threshold in order to produce a plot of
observed versus expected occurrences for all the
oligonucleotides. This analysis is performed with the option
-two_tails, which detects both under- and over-represented patterns.

=item B<full HTML report>

- link to the directories for each algorithm/task

- link from the result page to the link table returned by
  position-analysis (file *_graph_index.html).

=back


=cut

__END__
