#!/usr/bin/perl -w
############################################################
#
# $Id: chip-seq-analysis,v 1.62 2010/06/12 23:20:58 jvanheld Exp $
#
############################################################

## This script has been retired in favour of chip-motifs. The die below
## is the first statement executed at run time, so none of the run-time
## code that follows (the require statements and the main block) is
## reached; only the compile-time BEGIN block and "use" statements run
## before it.
die "ERROR: chip-seq-analysis has been renamed chip-motifs\n";


## use strict;

=pod

=head1 NAME

chip-seq-analysis

=head1 VERSION

$program_version

=head1 DESCRIPTION

Pipeline for discovering motifs from ChIP-seq (or ChIP-chip) peak
sequences.

=head1 AUTHORS

=over

=item Jacques van Helden <Jacques.van.Helden@ulb.ac.be>

Conception and implementation of the work flow.

=item Morgane Thomas-Chollier <thomas-c@molgen.mpg.de>

Conception of the work flow + implementation of Web interface.

=item Matthieu Defrance <defrance@ccg.unam.mx>

Algorithms used in the work flow (local-word-analysis).

=item Olivier Sand <oly@bigre.ulb.ac.be> for the Web services

Web services.

=item Carl Herrmann <carl.herrmann@univmed.fr> and Denis Thieffry <thieffry@tagc.univ-mrs.fr>

Definition of optimal conditions of utilization. Comparisons between
motifs, motif clustering and consensus.

=back


=head1 CATEGORY

Pattern discovery

=head1 USAGE

chip-seq-analysis [-i inputfile] [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

The program takes as input either one (test) or two sequence files
(test versus control).

All input sequences formats supported by convert-sequences are
supported.

=head1 OUTPUT FORMAT

The pipeline runs a series of programs, each generating one or several
result files. An HTML index is generated in order to synthesize the
results and give access to the individual result files.

The index file is formed from the output directory (option -outdir)
and the file prefix (option -prefix).

  [output_dir]/[prefix]_synthesis.html

=cut


## Add a "lib/" directory to @INC at compile time. The directory prefix
## is the part of $0 that precedes its trailing run of characters other
## than "/", "(" and ")" (the prematch of the pattern below).
BEGIN {
    push(@INC, $`."lib/") if ($0 =~ /([^(\/)]+)$/);
}
require "RSA.lib";
require "RSA.disco.lib";
require "footprint.lib.pl";
use RSAT::util;


################################################################
## Main package
##
## Work flow: initialize the global tables, read and check the
## arguments, derive all output file names, then run each requested
## task (those flagged in %task) in pipeline order, and finally report
## the execution time.
package main;
{

  ################################################################
  ## Initialize parameters
  local $start_time = &RSAT::util::StartScript();

  ## Check that the RSAT paths of the programs required for the script are specified
  $PYTHON =  $ENV{RSAT}."/python_scripts" unless ($PYTHON);
  $SCRIPTS = $ENV{RSAT}."/perl-scripts" unless ($SCRIPTS);
  $BIN = $ENV{RSAT}."/bin" unless ($BIN);

  ## Program version, extracted from the CVS revision keyword
  $program_version = do { my @r = (q$Revision: 1.62 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  ## Parameter values, and ordered list of parameter names
  %main::param = ();
  @main::param_list = ();

  ## Directories, input files and output files, indexed by key
  %main::dir = ();
  %main::infile = ();
  %main::outfile = ();

  $main::verbose = 0;
  $main::out = STDOUT;

  ## Sequence types (can be either 'test' or 'test' + 'ctrl')
  @seq_types = ();

  ## Pattern types (oligos, positions, local-word-analysis,... with the oligo length suffix)
  @pattern_types = ();
  @timelog_keys = ();


  ################################################################
  ## Supported tasks
  @supported_tasks = qw (
			 all
			 seqlen
			 purge
			 profiles

			 oligos
			 dyads
			 positions
			 local_words

			 oligo_diff

			 ref_motif
			 word_compa

			 merge_motifs
			 cluster_motifs
                         motifs_vs_ref
                         motifs_vs_db
			 motif_compa

			 scan

			 timelog
			 synthesis
			 clean_seq
			);

  ## Tasks planned but not yet supported (not used in this block)
  my @future_tasks = qw(
			to_bed
		       );
  $supported_tasks = join ",", @supported_tasks;
  %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }
  %task = ();			## List of tasks to be executed

  @motif_databases = ();
  @motif_db_format = ();

  ################################################################
  ## Set default parameters
  &DefaultParameters();

  ################################################################
  ## Read argument values
  &ReadArguments();

  &CheckArguments();

  &SetOutFileNames();

  ################################################################
  ## Open output stream
  $main::out = &OpenOutputFile($main::outfile{log});

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose);

  ################################################################
  ## Per-sequence-set tasks.
  ##
  ## SequenceLengths() runs before PurgeOneSeq(): when a maximal
  ## sequence length is specified, $seqfile{$seq_type} is the truncated
  ## sequence file, and that file is written by SequenceLengths(). The
  ## purge step reads $seqfile{$seq_type}, so it must come afterwards.
  foreach my $seq_type (@seq_types) {
    &SequenceLengths($seqfile{$seq_type}, $seq_type) if ($task{seqlen});
    &PurgeOneSeq($seqfile{$seq_type}, $seq_type) if ($task{purge});
    &CompositionProfiles($seqfile{$seq_type}, $seq_type) if ($task{profiles});
  }


  ################################################################
  ## Pattern discovery: test versus control when a control set is
  ## provided, single-set analyses otherwise
  if ($main::infile{ctrl_seq}) {
    &OligoDiff() if ($task{oligo_diff});
  } else {
    &OligoAnalysis() if ($task{oligos});
    &DyadAnalysis() if ($task{dyads});
    &PositionAnalysis() if ($task{positions});
    &LocalWords() if ($task{local_words});
  }

  if (defined($main::infile{ref_motif})) {
    &RefMotif() if ($task{ref_motif});
  }

  &WordsVersusWords() if ($task{word_compa});

  &MergeMotifs() if ($task{merge_motifs});

  &ClusterMotifs() if ($task{cluster_motifs});

  &MotifsVersusReference() if ((defined($main::infile{ref_motif}))
			       && ($task{motifs_vs_ref}));

  &MotifsVersusDatabase() if ((scalar(@motif_databases) > 0)
			      && ($task{motifs_vs_db}));

  &ScanSequences() if ($task{scan});

  &TimeLog() if ($task{timelog});

  &Synthesis() if ($task{synthesis});

  &CleanSequences() if ($task{clean_seq});

  if ($main::verbose >= 1) {
    &TimeWarn("Log file", $main::outfile{log});
  }

  ################################################################
  ## Close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $main::out $exec_time if ($main::verbose >= 1);	## only report exec time if verbosity is specified
  ## NOTE(review): the stream was opened from $main::outfile{log} but is
  ## closed only when $main::outfile{output} is set -- confirm the key.
  close $main::out if ($main::outfile{output});
  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message: render the POD of this script with
## pod2text, then exit.
sub PrintHelp {
  ## List form of system(): the script path is handed to pod2text as a
  ## single argument without going through a shell, so a path that
  ## contains spaces or shell metacharacters is handled correctly.
  system("pod2text", "-c", $0);
  exit()
}

################################################################
## Display short help message.
## There is no separate short version here: the full help is displayed
## by delegating to PrintHelp().
sub PrintOptions {
  PrintHelp();
}


################################################################
## Set parameter values for all the programs.
##
## Fills %main::param with the default value of each parameter. The
## parameters set through $set_listed are additionally registered, in
## order, in @main::param_list; the other ones are only stored in
## %main::param. Also initializes @main::profiles_oligo_lengths.
sub DefaultParameters {
  &RSAT::message::TimeWarn("Setting default parameter values") if ($main::verbose >= 1);

  ## Store a default value and register the parameter name in
  ## @main::param_list. Going through a single helper guarantees that
  ## the registered name is always the name of the parameter being set
  ## (min_zscore was previously registered under the name "min_ratio").
  my $set_listed = sub {
    my ($key, $value) = @_;
    $main::param{$key} = $value;
    push(@main::param_list, $key);
  };

  ## Formats
  $set_listed->("seq_format", "fasta");
  $set_listed->("img_format", "png");

  ## Motif formats
  $set_listed->("ref_motif_format", "transfac");

  ## Purge-seq
  $set_listed->("purge_match_length", 30);
  $set_listed->("purge_mismatches", 0);

  ## residue profiles (position-analysis)
  $set_listed->("profiles_ci", 20);
  $set_listed->("profiles_max_graphs", 20);
  $set_listed->("profiles_strand", "-2str");
  @main::profiles_oligo_lengths = (1,2);
  $set_listed->("profiles_oligo_lengths", join(',',@profiles_oligo_lengths));

  ################################################################
  ## Pattern discovery options
  $set_listed->("strand", "-2str");
  $set_listed->("noov", "-noov");
  $set_listed->("oligo_min_len", 6);
  $set_listed->("oligo_max_len", 7);
  $set_listed->("patterns_max_rank", 25);

  ## Thresholds for oligo-analysis, dyad-analysis and oligo-diff
  $set_listed->("min_ratio", 1);
  $set_listed->("min_zscore", 6);
  $set_listed->("min_sig", 0);

  ## oligo-analysis (negative Markov orders are relative to the oligo
  ## length: they are converted by adding the oligo length)
  $set_listed->("oligo_min_mkv", -3);
  $set_listed->("oligo_max_mkv", -3);

  ## dyad-analysis

  ## position-analysis
  $main::param{positions_ci} = 50; ## Class interval
  $main::param{positions_min_occ} = 1; ## Min number of occurrences

  ## local-word-analysis
#  $main::param{local_words_heuristic} = "slices"; ## Heuristic
  $main::param{local_words_window} = 50; ## Window width

  ## matrix-from-patterns
  $set_listed->("matrix_nmotifs", 3);

  ## Matrix comparisons
  $main::param{matrix_compa_min_cor} = 0.75;
  $main::param{matrix_compa_min_w} = 5;
  $main::param{matrix_compa_min_Ncor} = 0.3;


  ## matrix-scan-quick
  $main::param{scan_min_score} = 7.5;
  $main::param{scan_strands} = "-2str";

}


################################################################
## Build one output file name.
##
## The name is the output directory ($dir{output}), a "/", the file
## prefix ($main::param{prefix}), the name elements joined with "_",
## and finally the extension, which is appended as is (the caller has
## to include the leading dot).
##
## Usage: my $file = &OutFileName($extension, @name_elements);
sub OutFileName {
  my ($extension, @name_elements) = @_;
  return($dir{output}."/".$main::param{prefix}.join("_", @name_elements).$extension);
}

################################################################
## Set output file names.
##
## Fills %main::outfile with the path of each result file (built with
## OutFileName), sets $seqfile{$seq_type} for each sequence type, fills
## @pattern_types with the keys of the pattern discovery results, and
## appends to @timelog_keys the keys of the selected output files. Also
## sets the parameters scan_markov_order and scan_bg_file.
sub SetOutFileNames {
  foreach my $seq_type (@seq_types) {

    ## Purged sequences. When a maximal sequence length is specified,
    ## a truncated sequence file is defined and used as sequence file,
    ## and the purged file name carries the maximal length.
    my @purged_name = ($seq_type,
		       "purged",
		       "ml".$main::param{purge_match_length},
		       "mis".$main::param{purge_mismatches});
    if (defined($main::param{max_seq_len})) {
      my $maxlen = "maxlen".$main::param{max_seq_len};
      $main::outfile{"truncated_".$seq_type} = &OutFileName(".fasta", $seq_type, $maxlen);
      push @purged_name, $maxlen;
      $seqfile{$seq_type} = $main::outfile{"truncated_".$seq_type};
    } else {
      $seqfile{$seq_type} = $main::infile{$seq_type."_seq"};
    }
    $main::outfile{"purged_".$seq_type} = &OutFileName(".fasta", @purged_name);

    ## Sequence lengths
    $main::outfile{$seq_type."_seqlen"} = &OutFileName(".tab", $seq_type."_seqlen");
    $main::outfile{$seq_type."_seqlen_distrib"} = &OutFileName(".tab", $seq_type."_seqlen_distrib");
    $main::outfile{$seq_type."_seqlen_distrib_graph"} = &OutFileName(".".$main::param{img_format}, $seq_type."_seqlen_distrib");

    ## Compositional profiles. All the file names, including the
    ## profile graph, are defined for each length of
    ## @profiles_oligo_lengths, which is the list iterated by
    ## CompositionProfiles().
    for my $ol (@profiles_oligo_lengths) {
      my $key_prefix = $ol."nt_".$seq_type;
      my $single_strand = "-1str".$main::param{noov};

      ## Profiles of oligo frequencies as a function of the position
      $main::outfile{$key_prefix."_profiles"} = &OutFileName(".tab", $seq_type."_profiles".$main::param{profiles_strand}.$main::param{noov},$ol."nt",
							      "ci".$main::param{profiles_ci});
      push @timelog_keys, $key_prefix."_profiles";

      ## oligo frequencies in the sequence set
      $main::outfile{$key_prefix."_freq"} = &OutFileName(".tab", $seq_type."_freq".$single_strand,$ol."nt");
      push @timelog_keys, $key_prefix."_freq";
      $main::outfile{$key_prefix."_transit"} = &OutFileName(".tab", $seq_type."_transitions".$single_strand,$ol."nt");
      $main::outfile{$key_prefix."_inclusive"} = &OutFileName(".txt", $seq_type."_inclusive".$single_strand,$ol."nt");
      $main::outfile{$key_prefix."_heatmap"} = &OutFileName(".".$main::param{img_format}, $seq_type."_heatmap".$single_strand,$ol."nt");

      ## HTML index to the individual oligonucleotide profiles, and
      ## graph of the profiles: same path as the profile table, with
      ## the .tab extension replaced
      (my $profile_base = $main::outfile{$key_prefix."_profiles"}) =~ s/\.tab$//;
      $main::outfile{$key_prefix."_profiles_index"} = $profile_base."_graph_index.html";
      $main::outfile{$key_prefix."_profiles_graph"} = $profile_base.".".$main::param{img_format};
    }
  }


  ################################################################
  ## Background model for matrix scanning: the "inclusive" file of the
  ## test set, for oligos of length (Markov order + 1)
  $main::param{scan_markov_order} = 1;
  my $bg_ol = $main::param{scan_markov_order}+1;
  $main::param{scan_bg_file} = $main::outfile{$bg_ol."nt_test_inclusive"};
  &RSAT::message::Debug("bg model", 
			"order=".$main::param{scan_markov_order},
			"bg_ol=".$bg_ol,
			"key=".$bg_ol."nt_test_inclusive",
			"file=".$main::param{scan_bg_file}
		       );

  ################################################################
  ## Pattern discovery results
  my $pattern_type = "";
  if ($main::infile{ctrl_seq}) {
    for my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {
      ## oligo-diff
      $pattern_type = 'oligo_diff_'.$len.'nt';
      $main::outfile{$pattern_type} = &OutFileName(".tab", "oligo_diff".$main::param{strand}.$main::param{noov},$len."nt");
      push @pattern_types, $pattern_type;
    }
  } else {
    for my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {
      for my $markov ($main::param{oligo_min_mkv}..$main::param{oligo_max_mkv}) {
	## Convert negative markov orders relative to the oligo length
	$markov += $len if ($markov < 0);
	## oligo-analysis
	$pattern_type = 'oligos_'.$len.'nt_mkv'.$markov;
	$main::outfile{$pattern_type} = &OutFileName(".tab", "oligos".$main::param{strand}.$main::param{noov},
						     $len."nt", "mkv".$markov);
	push @pattern_types, $pattern_type;

	## local-word-analysis
	$pattern_type = 'local_words_'.$len.'nt_mkv'.$markov;
	$main::outfile{$pattern_type} = &OutFileName(".tab", "local_words".$main::param{strand}.$main::param{noov},
						     $len."nt", "wind".$main::param{local_words_window},
#						     $main::param{local_words_heuristic},
						     "mkv".$markov);
	push @pattern_types, $pattern_type;
      }

      ## position-analysis
      $pattern_type = 'positions_'.$len.'nt';
      $main::outfile{$pattern_type} = &OutFileName(".tab", "positions".$main::param{strand}.$main::param{noov},
						   $len."nt", "ci".$main::param{positions_ci});
      push @pattern_types, $pattern_type;
    }

    ## dyad-analysis
    $main::outfile{dyads} = &OutFileName(".tab", "dyads".$main::param{strand}.$main::param{noov},
					 "3nt_sp0-20_bg_monads");
    push @pattern_types, "dyads";
  }

  &RSAT::message::Info("Pattern types", join (",", @pattern_types)) if ($main::verbose >= 1);
  &RSAT::message::Info("Sequence types", join (",", @seq_types)) if ($main::verbose >= 1);

  ## Conversion from patterns to matrices + logos. All these files
  ## share a prefix: the pattern file without its .tab extension,
  ## followed by "_pssm".
  foreach my $pattern_type (@pattern_types) {
    push @timelog_keys, $pattern_type;
    (my $pssm = $main::outfile{$pattern_type}) =~ s/\.tab$//;
    $pssm .= "_pssm";
    $main::outfile{$pattern_type.'_pssm'} = $pssm;
    $main::outfile{$pattern_type.'_2pssm'} = $pssm."_log.txt"; push @timelog_keys, $pattern_type.'_2pssm';
    $main::outfile{$pattern_type.'_asmb'} = $pssm.".asmb";
    $main::outfile{$pattern_type.'_pssm_sig'} = $pssm."_sig_matrices.txt";
    $main::outfile{$pattern_type.'_pssm_gibbs'} = $pssm."_gibbs_matrices.txt";
    $main::outfile{$pattern_type.'_pssm_counts'} = $pssm."_count_matrices.txt";
    $main::outfile{$pattern_type.'_pssm_tf'} = $pssm."_count_matrices.tf";
    $main::outfile{$pattern_type.'_pssm_sites'} = $pssm."_sites.tab";
    $main::outfile{$pattern_type.'_pssm_site_distrib'} = $pssm."_site_distrib.tab";
    $main::outfile{$pattern_type.'_pssm_site_distrib_graph'} = $pssm."_site_distrib.png";
    foreach my $logo_nb (1..$main::param{matrix_nmotifs}) {
      $main::outfile{$pattern_type.'_pssm_logo'.$logo_nb} =
	$pssm."_count_matrices_logo_m".$logo_nb.".".$main::param{img_format};
      $main::outfile{$pattern_type.'_pssm_logo_rc'.$logo_nb} =
	$pssm."_count_matrices_logo_m".$logo_nb."_rc.".$main::param{img_format};
    }

  }

  ## Comparison between significance of the discovered patterns
  $main::outfile{word_compa} = &OutFileName(".tab", "word_comparison");
  $main::outfile{word_compa_heatmap} = &OutFileName(".png", "word_comparison_heatmap");


  ## Clustering of the discovered motifs + comparison with reference motif
  $main::outfile{motifs_discovered} = &OutFileName(".tf", "motifs_discovered");
  $main::outfile{motifs_disco_compa} = &OutFileName(".tab", "motifs_disco_compa");
  push @timelog_keys, "motifs_disco_compa";
  $main::outfile{motifs_disco_compa_gml} = &OutFileName(".gml", "motifs_disco_compa");
  $main::outfile{motifs_disco_compa_png} = &OutFileName(".png", "motifs_disco_compa");
  $main::outfile{motifs_disco_compa_clusters_mcl} = &OutFileName(".mcl", "motifs_disco_compa_clusters");
  $main::outfile{motifs_disco_compa_clusters_tab} = &OutFileName(".tab", "motifs_disco_compa_clusters");
  $main::outfile{motifs_disco_compa_clusters_graph} = &OutFileName(".tab", "motifs_disco_compa_clusters_graph");
  $main::outfile{motifs_disco_compa_cluster_intra_degree} = &OutFileName(".tab", "motifs_disco_compa_cluster_intra_degree");
  #    $main::outfile{motifs_disco_ref} = &OutFileName(".tf", "motifs_disco_ref");


  if (defined($main::infile{ref_motif})) {
    ## Motif(s) considered as reference for the testing set.
    ## OutFileName appends the extension as is, so the dot has to be
    ## part of the extension argument.
    $main::outfile{"ref_motif"} = &OutFileName(".".$main::param{ref_motif_format}, "ref_motif");
    $main::outfile{"ref_motif_transfac"} = &OutFileName(".tf", "ref_motif");
    $main::outfile{"ref_motif_info"} = &OutFileName(".tab", "ref_motif_info");
    $main::outfile{"ref_motif_logo"} = &OutFileName("", "ref_motif_logo");
    $main::outfile{"ref_motif_enriched"} = &OutFileName(".tab", "ref_motif_enriched");

    ## Comparison between discovered motifs and reference motif
    $main::outfile{"motifs_vs_ref"} = &OutFileName(".tab", "motifs_vs_ref");
    push @timelog_keys, "motifs_vs_ref";
    $main::outfile{"motifs_vs_ref_gml"} = &OutFileName(".gml", "motifs_vs_ref");
    $main::outfile{"motifs_vs_ref_png"} = &OutFileName(".png", "motifs_vs_ref");
    $main::outfile{"motifs_vs_ref_matrices"} = &OutFileName(".tab", "motifs_vs_ref_matrices");
    $main::outfile{"motifs_vs_ref_matrices_html"} = &OutFileName(".html", "motifs_vs_ref_matrices");
  }

  ## Comparison between discovered motifs and database(s)
  if (scalar(@motif_databases) > 0) {
    foreach my $db_name (@motif_databases) {
      $main::outfile{"motifs_vs_db_".$db_name} = &OutFileName(".tab", "motifs_vs_db_".$db_name);
      push @timelog_keys, "motifs_vs_db_".$db_name;
      $main::outfile{"motifs_vs_db_".$db_name."_gml"} = &OutFileName(".gml", "motifs_vs_db_".$db_name);
      $main::outfile{"motifs_vs_db_".$db_name."_png"} = &OutFileName(".png", "motifs_vs_db_".$db_name);
      $main::outfile{"motifs_vs_db_".$db_name."_matrices"} = &OutFileName(".tab", "motifs_vs_db_".$db_name."_matrices");
      $main::outfile{"motifs_vs_db_".$db_name."_matrices_html"} = &OutFileName(".html", "motifs_vs_db_".$db_name."_matrices");
    }
  }

}

################################################################
## Purge sequences.
##
## Runs convert-seq on $seq_file (conversion from the input format to
## fasta, with the option "-mask non-dna") and pipes the result to
## purge-sequence, which writes the file $main::outfile{"purged_".$seq_type}.
##
## Usage: &PurgeOneSeq($seq_file, $seq_type);
sub PurgeOneSeq {
  my ($seq_file, $seq_type) = @_;
  &RSAT::message::TimeWarn("Purging sequences", $seq_type) if ($main::verbose >= 1);

  ## First element of the pipe: format conversion + masking
  my $convert_cmd = join("",
			 $SCRIPTS."/convert-seq",
			 " -i ".$seq_file,
			 " -from ".$main::param{seq_format},
			 " -to fasta",
			 " -mask non-dna",
			);

  ## Second element of the pipe: purge
  my $purge_cmd = join("",
		       "| $SCRIPTS/purge-sequence -dna",
		       " -ml ".$main::param{purge_match_length},
		       " -mis ".$main::param{purge_mismatches},
		       " -o ".$main::outfile{"purged_".$seq_type},
		      );

  &one_command($convert_cmd.$purge_cmd, 1);
}


################################################################
## Compute sequence lengths.
##
## Three steps, each run with one_command:
##  1. sequence-lengths on the input sequence file, then classfreq on
##     the second column of its result (length distribution);
##  2. XYgraph to draw the length distribution;
##  3. only if the parameter max_seq_len is defined: sub-sequence to
##     write the truncated sequence file, with limits defined relative
##     to the sequence center.
##
## The sequences are read from $main::infile{$seq_type.'_seq'}; the
## argument $seq_file is received but not used.
##
## Usage: &SequenceLengths($seq_file, $seq_type);
sub SequenceLengths {
  my ($seq_file, $seq_type) = @_;
  &RSAT::message::TimeWarn("Computing sequence lengths", $seq_type) if ($main::verbose >= 1);

  my $input_seq = $main::infile{$seq_type.'_seq'};
  my $len_file = $main::outfile{$seq_type.'_seqlen'};
  my $distrib_file = $main::outfile{$seq_type.'_seqlen_distrib'};

  ## Sequence lengths + distribution of lengths
  my $len_cmd = join("",
		     $SCRIPTS."/sequence-lengths",
		     " -i ".$input_seq,
		     " -o ".$len_file,
		     " ; cut -f 2 ".$len_file,
		     " | ".$SCRIPTS."/classfreq -v 1 -ci ".$main::param{profiles_ci},
		     " -o ".$distrib_file,
		    );
  &one_command($len_cmd, 1);

  ## Graph of the length distribution
  my $graph_title = "Sequence lengths";
  if ($main::param{title}) {
    $graph_title .= "; ".$main::param{title};
  }
  my $graph_cmd = join("",
		       $SCRIPTS."/XYgraph -lines -pointsize 0 -legend",
		       " -format ".$main::param{img_format},
		       " -title '".$graph_title."'",
		       " -ysize 200 -ycol 4 -yleg1 'Number of peaks'",
		       " -xsize 800 -xcol 3 -xleg1 'Peak length'",
		       " -xmin 0 -ymin 0",
		       " -i ".$distrib_file,
		       " -o ".$main::outfile{$seq_type.'_seqlen_distrib_graph'},
		      );
  &one_command($graph_cmd, 1);

  ## Truncate sequences if required
  if (defined($main::param{max_seq_len})) {
    my $first_pos = -round($main::param{max_seq_len}/2);
    my $last_pos = $first_pos + $main::param{max_seq_len} -1;
    my $truncate_cmd = join("",
			    $SCRIPTS."/sub-sequence",
			    " -i ".$input_seq,
			    " -origin center",
			    " -from ".$first_pos,
			    " -to ".$last_pos,
			    " -o ".$main::outfile{"truncated_".$seq_type},
			   );
    &one_command($truncate_cmd, 1);
  }
}

################################################################
## Run position-analysis to compute composition profiles (residues,
## dinucleotides).
##
## For each oligo length of @profiles_oligo_lengths, three commands are
## run with one_command:
##  1. position-analysis on the purged sequences;
##  2. XYgraph on the transposed profile table, to draw the profiles;
##  3. count-words followed by two convert-background-model
##     conversions (to "inclusive" and to "transitions") and
##     draw-heatmap on selected columns of the transition table.
##
## The sequences are read from $main::outfile{"purged_".$seq_type}; the
## argument $seq_file is received but not used.
##
## Usage: &CompositionProfiles($seq_file, $seq_type);
sub CompositionProfiles {
  my ($seq_file, $seq_type) = @_;
  my $purged_seq = $main::outfile{"purged_".$seq_type};

  for my $ol (@profiles_oligo_lengths) {
    &RSAT::message::TimeWarn("Computing composition profiles", $ol."nt") if ($main::verbose >= 1);

    my $key_prefix = $ol."nt_".$seq_type;
    my $profile_file = $main::outfile{$key_prefix."_profiles"};
    my $freq_file = $main::outfile{$key_prefix."_freq"};
    my $transit_file = $main::outfile{$key_prefix."_transit"};

    ## Positional profiles of the oligonucleotides
    my $profile_cmd = join("",
			   $SCRIPTS."/position-analysis -v 1",
			   " -i ".$purged_seq,
			   " -format fasta",
			   " -sort ",
			   " -return chi,sig,distrib,graphs,rank",
			   " ".$main::param{profiles_strand},
			   " ".$main::param{noov},
			   " -seqtype dna",
			   " -l ".$ol,
			   " -ci ".$main::param{profiles_ci},
			   " -img_format ".$main::param{img_format},
			   " -title '".$main::param{title}."'",
			   " -origin center ",
			   " -o ".$profile_file,
			  );
    &one_command($profile_cmd, 1);

    ## Draw the XY graph with composition profiles.
    ## The color option is only added if the palette file exists.
    my $color_file = $ENV{RSAT}."/perl-scripts/lib/color_palettes/".$ol."nt".$main::param{profiles_strand}."_colors.tab";
    my $color_option = "";
    if (-e $color_file) {
      $color_option = " -colors ".$color_file;
    } else {
      &RSAT::message::Warning("Cannot find residue color specification file", $color_file);
    }
    my $col_nb = 4**$ol + 1; ## last Y column of the graph
    my $graph_title = $ol."nt composition profiles : "."; ".$seq_type." sequence";
    if ($main::param{title}) {
      $graph_title .= "; ".$main::param{title};
    }
    my $graph_cmd = join("",
			 'grep -v ";" '.$profile_file.' | sort | '.$SCRIPTS.'/transpose-table | grep -P \'(^id)|(^\-?\d+)\'',
			 " | ".$SCRIPTS."/XYgraph -xcol 1 -ycol 2-".$col_nb,
			 " -lines -pointsize 0 -legend -header",
			 $color_option,
			 " -title '".$graph_title."'",
			 " -xleg1 'Position' -xsize 800",
			 " -yleg1 'Occurrences' -ysize 300",
			 " -o ".$main::outfile{$key_prefix."_profiles_graph"},
			);
    &one_command($graph_cmd, 1);

    ## Count the oligos in the purged sequences, convert the counts to
    ## a background model in INCLUSIVE format (for matrix-scan-quick)
    ## and to a transition table, then draw the heatmap of the
    ## transition table.
    my $bg_cmd = join("",
		      $BIN."/count-words -v 1",
		      " -i ".$purged_seq,
		      " -l ".$ol,
		      " -1str",
		      " ".$main::param{noov},
		      " > ".$freq_file,
		      "; ".$SCRIPTS."/convert-background-model -from oligos -to inclusive ",
		      " -i ".$freq_file,
		      " -o ".$main::outfile{$key_prefix."_inclusive"},
		      "; ".$SCRIPTS."/convert-background-model -from oligos -to transitions ",
		      " -i ".$freq_file,
		      " -o ".$transit_file,
		      " ; cut -f 1-5,7 ".$transit_file,
		      " | ".$SCRIPTS."/draw-heatmap -min 0 -max 1  -out_format png -col_width 50",
		      " -o ".$main::outfile{$key_prefix."_heatmap"},
		     );
    &one_command($bg_cmd, 1);

  }
}


################################################################
## Run oligo-analysis on the test set.
##
## One run per combination of oligo length (oligo_min_len to
## oligo_max_len) and Markov order (oligo_min_mkv to oligo_max_mkv) on
## the purged test sequences. After each run, MatrixFromPatterns is
## called on the result file.
sub OligoAnalysis {
  my @oligo_lengths = ($main::param{oligo_min_len}..$main::param{oligo_max_len});
  foreach my $len (@oligo_lengths) {
    foreach my $mkv ($main::param{oligo_min_mkv}..$main::param{oligo_max_mkv}) {
      ## Negative Markov orders are relative to the oligo length
      my $markov = ($mkv < 0) ? $mkv + $len : $mkv;
      my $pattern_type = 'oligos_'.$len.'nt'.'_mkv'.$markov;
      my $pattern_file = $main::outfile{$pattern_type};
      &RSAT::message::TimeWarn("Running oligo-analysis", $len."nt", "markov=".$markov) if ($main::verbose >= 1);
      my $cmd = join("",
		     $SCRIPTS."/oligo-analysis -v 1",
		     " -quick",
		     " -i ".$main::outfile{"purged_test"},
		     " -format fasta",
		     " -sort -lth ratio ".$main::param{min_ratio},
		     " -sort -lth occ_sig ".$main::param{min_sig},
		     " -uth rank ".$main::param{patterns_max_rank},
		     " -return occ,proba,rank",
		     " ".$main::param{strand},
		     " ".$main::param{noov},
		     " -seqtype dna",
		     " -l ".$len,
		     " -markov ".$markov,
		     " -pseudo 0.01",
		     " -o ".$pattern_file,
		    );
      &one_command($cmd, 1);
      &MatrixFromPatterns($pattern_file, $main::outfile{$pattern_type.'_pssm'}, $pattern_type);
    }
  }
}

################################################################
## Run oligo-diff to compare the test set to the control set.
##
## One run per oligo length (oligo_min_len to oligo_max_len), on the
## purged test and control sequences. After each run,
## MatrixFromPatterns is called on the result file.
sub OligoDiff {
  foreach my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {
    my $pattern_type = 'oligo_diff_'.$len.'nt';
    my $pattern_file = $main::outfile{$pattern_type};
    &RSAT::message::TimeWarn("Running oligo-diff", $len."nt") if ($main::verbose >= 1);
    my $cmd = join("",
		   $SCRIPTS."/oligo-diff -v 2",
		   " -test ".$main::outfile{"purged_test"},
		   " -ctrl ".$main::outfile{"purged_ctrl"},
		   " -nopurge", ## The input sequences have already been purged
		   " -l ".$len,
		   " ".$main::param{strand},
		   " ".$main::param{noov},
		   " -side test",
		   " -lth ratio ".$main::param{min_ratio},
		   " -lth occ_sig ".$main::param{min_sig},
		   " -uth rank ".$main::param{patterns_max_rank},
		   " -o ".$pattern_file,
		  );
    &one_command($cmd, 1);
    &MatrixFromPatterns($pattern_file, $main::outfile{$pattern_type.'_pssm'}, $pattern_type);
  }
}


################################################################
## Run dyad-analysis on the test set.
##
## A single run on the purged test sequences, with the options
## "-l 3 -sp 0-20" and "-bg monads". MatrixFromPatterns is then called
## on the result file.
sub DyadAnalysis {
  &RSAT::message::TimeWarn("Running dyad-analysis") if ($main::verbose >= 1);
  my $dyad_file = $main::outfile{dyads};
  my $cmd = join("",
		 $SCRIPTS."/dyad-analysis -v 2",
		 " -i ".$main::outfile{"purged_test"},
		 " -quick",
		 " -format fasta",
		 " -sort -lth ratio ".$main::param{min_ratio},
		 " -sort -lth occ_sig ".$main::param{min_sig},
		 " -uth rank ".$main::param{patterns_max_rank}." -return occ,proba,ratio,zscore,rank",
		 " ".$main::param{strand},
		 " ".$main::param{noov},
		 " -seqtype dna",
		 " -l 3 -sp 0-20 ",
		 " -bg monads",
		 " -pseudo 0.01",
		 " -o ".$dyad_file,
		);
  &one_command($cmd, 1);
  &MatrixFromPatterns($dyad_file, $main::outfile{dyads_pssm}, "dyads");
}

################################################################
## Run position-analysis to discover oligonucleotides with positional
## biases.
##
## One run per oligo length (oligo_min_len to oligo_max_len) on the
## purged test sequences. After each run, MatrixFromPatterns is called
## on the result file.
sub PositionAnalysis {
  foreach my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {
    my $pattern_type = 'positions_'.$len.'nt';
    my $pattern_file = $main::outfile{$pattern_type};
    &RSAT::message::TimeWarn("Running position-analysis", $len."nt") if ($main::verbose >= 1);
    my $cmd = join("",
		   $SCRIPTS."/position-analysis -v 2",
		   " -i ".$main::outfile{"purged_test"},
		   " -format fasta",
		   " -sort ",
		   " -return chi,sig,distrib,graphs,rank",
		   " -max_graphs ".$main::param{patterns_max_rank},
		   " ".$main::param{strand},
		   " ".$main::param{noov},
		   " -seqtype dna",
		   " -l ".$len,
		   " -ci ".$main::param{positions_ci},
		   " -lth_occ ".$main::param{positions_min_occ},
		   " -lth_sig ".$main::param{min_sig},
		   " -uth_rank ".$main::param{patterns_max_rank},
		   " -img_format ".$main::param{img_format},
		   " -title '".$main::param{title}."'",
		   " -origin center ",
		   " -o ".$pattern_file,
		  );
    &one_command($cmd, 1);
    &MatrixFromPatterns($pattern_file, $main::outfile{$pattern_type."_pssm"}, $pattern_type);
  }
}


################################################################
## Run local-word-analysis on the test set.
##
## One run per combination of oligo length (oligo_min_len to
## oligo_max_len) and Markov order (oligo_min_mkv to oligo_max_mkv) on
## the purged test sequences. The result is written by redirecting the
## standard output of the program. After each run, MatrixFromPatterns
## is called on the result file.
sub LocalWords {
  ## Center position: half of the window width
  my $center_pos = &RSAT::util::round($main::param{local_words_window}/2);
  for my $len ($main::param{oligo_min_len}..$main::param{oligo_max_len}) {
    for my $markov ($main::param{oligo_min_mkv}..$main::param{oligo_max_mkv}) {
      ## Convert negative markov orders relative to the oligo length
      $markov += $len if ($markov < 0);
      my $pattern_type = 'local_words_'.$len.'nt'.'_mkv'.$markov;
      &RSAT::message::TimeWarn("Running local-word-analysis", $len."nt") if ($main::verbose >= 1);
      my $cmd = $PYTHON."/local-word-analysis -v 3";
      $cmd .= " -i ".$main::outfile{"purged_test"};
      #  $cmd .= " -format fasta";
      $cmd .= " --min=occ_sig ".$main::param{min_sig};
      $cmd .= " --sort=-occ_sig";
      $cmd .= " --max=rank ".$main::param{patterns_max_rank};
      $cmd .= " --max=w_rank ".1; ## Only return the most significant window for each word
      # " -return occ,proba,rank";

      ## Strands: the syntax differs from the one of the Perl programs
      if ($main::param{strand} eq "-1str") {
	$cmd .= " +";
      } else {
	$cmd .= " +-";
      }

      ## Overlapping occurrences. The parameter noov holds the option
      ## as it is passed to the other programs, i.e. with its leading
      ## dash (default "-noov"), so the overlap mode has to be
      ## recognized with the dash; the dash-less form is accepted too.
      $cmd .= " --overlap" if ($main::param{noov} =~ /^-?ovlp$/);
      $cmd .= " --center=".$center_pos;
      $cmd .= " --window=".$main::param{local_words_window};
#      $cmd .= " --heuristic=".$main::param{local_words_heuristic};
      #  $cmd .= " -seqtype dna";
      $cmd .= " -l ".$len;
      $cmd .= " --markov ".$markov;
      #  $cmd .= " -pseudo 0.01";
      $cmd .= " > ".$main::outfile{$pattern_type};
      &one_command($cmd, 1);
      &MatrixFromPatterns($main::outfile{$pattern_type}, $main::outfile{$pattern_type.'_pssm'}, $pattern_type);
    }
  }
}

################################################################
## Copy the reference motif to the output directory and convert it
## (TRANSFAC format with logos, and tab-delimited counts).
sub RefMotif {
  my $ref_motif_file = $main::infile{ref_motif};

  ## Copy the reference motif in the output directory
  my $copy_cmd = join("", "rsync -ruptL ", " ", $ref_motif_file, " ", $main::outfile{ref_motif});
  &one_command($copy_cmd, 1);

  ## Both conversions read the same input file in the same format
  my $convert_prefix = join("",
                            $SCRIPTS, "/convert-matrix -v 0",
                            " -i ", $ref_motif_file,
                            " -from ", $main::param{ref_motif_format},
                           );

  ## TRANSFAC export: this format holds information about the motif
  ## name, ID etc. The logo and matrix parameters are exported by the
  ## same command.
  my $transfac_cmd = join("",
                          $convert_prefix,
                          " -to transfac -decimals 1",
                          " -return counts,logo,consensus,parameters",
                          " -logo_file ", $main::outfile{ref_motif_logo},
                          " -o ", $main::outfile{ref_motif_transfac},
                         );
  &one_command($transfac_cmd, 1);

  ## Tab-delimited export of the count matrices
  my $tab_cmd = join("",
                     $convert_prefix,
                     " -to tab",
                     " -return counts",
                     " -o ", $main::outfile{ref_motif_info},
                    );
  &one_command($tab_cmd, 1);
}


################################################################
## Compare the significance of words (oligos, dyads) discovered by
## the different approaches, and draw a heatmap of the comparison
## table.
##
## Each existing pattern file is passed to compare-scores with the
## column holding its significance score. A pattern type for which no
## score column is known is skipped entirely (with a warning), so that
## it is never added to the command without its -sc option.
sub WordsVersusWords {

  ## Score column per family of pattern types. The rules are tested in
  ## this order and the first matching rule wins.
  my @sig_col_rules = (
                       [qr/oligos_/, 8],
                       [qr/oligo_diff/, 11],
                       [qr/dyads/, 8],
                       [qr/local_words/, 9],
                       [qr/positions/, 9],
                      );

  my $cmd = $SCRIPTS."/compare-scores -v 1 ";
  my $file_nb = 0;
  foreach my $pattern_type (@pattern_types) {
    my $pattern_file = $main::outfile{$pattern_type};
    unless (-e $pattern_file) {
      &RSAT::message::Warning("Missing pattern file", $pattern_type, $pattern_file);
      next;
    }

    ## Identify the score column BEFORE adding the file to the command
    my $sig_col;
    foreach my $rule (@sig_col_rules) {
      my ($type_regexp, $col) = @{$rule};
      if ($pattern_type =~ $type_regexp) {
        $sig_col = $col;
        last;
      }
    }
    unless (defined($sig_col)) {
      &RSAT::message::Warning("Unknown score column for pattern type", $pattern_type);
      next;
    }

    ## The index of the -sc option must match the rank of the -i option
    $file_nb++;
    $cmd .= " -i ".$pattern_file;
    $cmd .= " -sc".$file_nb." ".$sig_col;
  }
  $cmd .= " -ic 2 -lc";
  $cmd .= " -null .";
  $cmd .= " -suppress ".$main::dir{output}."/";
  $cmd .= " -suppress ".$main::param{prefix};
  $cmd .= " -suppress '\.tab'";
  $cmd .= " -o ".$main::outfile{word_compa};

  ## Heatmap of the comparison table, chained in the same shell command
  $cmd .= " ; ".$SCRIPTS."/draw-heatmap -min 0 -max 10  -out_format png";
  $cmd .= " -col_width 40 -rownames -gradient fire";
  $cmd .= " -row_height 16";
  $cmd .= " -i ".$main::outfile{word_compa};
  $cmd .= " -o ".$main::outfile{word_compa_heatmap};

  &one_command($cmd, 1);

  &RSAT::message::TimeWarn("Word comparison table", $main::outfile{word_compa}) if ($main::verbose >= 2);
}


################################################################
## Merge all discovered motifs in a single file.
##
## The merged file is rebuilt from scratch: one shell command removes
## the previous version, then appends the TRANSFAC matrix file of each
## pattern type. Pattern types without a matrix file are reported with
## a warning and left out.
sub MergeMotifs {

  ## Remove previous versions of the merged motif file
  my $cmd = "rm -f ".$main::outfile{motifs_discovered};

  ## Initialization of the merged file with the reference motif is
  ## currently disabled:
#  if (defined($main::infile{ref_motif})) {
#    $cmd = "; cp -f";
#    $cmd .= " ".$main::outfile{ref_motif_transfac};
#    $cmd .= " ".$main::outfile{motifs_discovered};
#  }

  ################################################################
  ## Concatenate all discovered motifs (matrices) in a single file.
  ## Use TRANSFAC format because it allows to associate a name to each
  ## matrix.
  foreach my $pattern_type (@pattern_types) {
    my $matrix_file = $main::outfile{$pattern_type.'_pssm_tf'};

    unless (-e $matrix_file) {
      ## TEMPORARY: ensure conversion for data sets of previous versions.
      ## This command is run on its own, so it must not start with a
      ## ";" separator (a leading ";" is a shell syntax error).
      my $convert_cmd = $SCRIPTS."/convert-matrix";
      $convert_cmd .= " -i ".$main::outfile{$pattern_type.'_pssm_counts'};
      $convert_cmd .= " -from tab -to transfac -return counts,consensus";
      $convert_cmd .= " -prefix $pattern_type";
      $convert_cmd .= " -o ".$matrix_file;
      &one_command($convert_cmd, 1);
    }

    ## Test again: the conversion above may have created the file
    if (-e $matrix_file) {
      $cmd .= "; cat ".$matrix_file." >> ".$main::outfile{motifs_discovered};
    } else {
      &RSAT::message::Warning("Missing matrix file", $pattern_type, $matrix_file);
    }
  }
  &one_command($cmd, 1);

  &RSAT::message::TimeWarn("Merged discovered motifs", $main::outfile{motifs_discovered}) if ($main::verbose >= 2);
}

################################################################
## Compare all discovered motifs to each other, partition the
## resulting similarity graph into clusters with MCL, compute
## intra-cluster degrees, and export the comparison graph (GML + PNG).
sub ClusterMotifs {
  my $motif_file = $main::outfile{motifs_discovered};
  my $compa_file = $main::outfile{motifs_disco_compa};
  my $mcl_input = $compa_file.".mcl";
  my $clusters_mcl = $main::outfile{motifs_disco_compa_clusters_mcl};
  my $clusters_tab = $main::outfile{motifs_disco_compa_clusters_tab};
  my $clusters_graph = $main::outfile{motifs_disco_compa_clusters_graph};
  my $gml_file = $main::outfile{motifs_disco_compa_gml};

  ## All-against-all comparison of the discovered matrices (the same
  ## file is given as file1 and file2; comparison with the reference
  ## motif file is disabled here).
  my $compare_cmd = join("",
                         $SCRIPTS, "/compare-matrices -v 1",
                         " -format1 transfac -file1 ", $motif_file,
                         " -format2 transfac -file2 ", $motif_file,
                         " -DR -distinct -triangle",
                         " -sort cor",
                         " -uth rank 1", ## Only report the best matching shift between a pair of matrices
                         " -lth w ", $main::param{matrix_compa_min_w}, ## Min number of aligned columns
                         " -lth cor ", $main::param{matrix_compa_min_cor}, ## Min correlation
                         " -lth Ncor ", $main::param{matrix_compa_min_Ncor}, ## Min normalized correlation
                         " -return matrix_name,direction,Ncor,SW,cor,NdEucl,width,consensus",
                         " -o ", $compa_file,
                        );
  &one_command($compare_cmd, 1);

  ## MCL is located through the environment variable mcl_dir
  my $mcl_dir = $ENV{mcl_dir};
  if (!$mcl_dir) {
    &RSAT::error::FatalError("Motif comparison requires to install MCL and indicate its path in the file $ENV{RSAT}/RSAT_config.props");
  }

  ## Strip comment lines, run MCL on the result, and convert the MCL
  ## clusters to a tab-delimited class file
  my $mcl_cmd = join("",
                     "grep -v '^;' ", $compa_file, ">", $mcl_input,
                     "; ", $mcl_dir, "/mcl ", $mcl_input,
                     " -I 1.8 --abc -V all ",
                     " -o ", $clusters_mcl,
                     " ; ${SCRIPTS}/convert-classes -i ", $clusters_mcl,
                     " -from mcl -to tab ",
                     " -o ", $clusters_tab,
                    );
  &one_command($mcl_cmd, 1);

  ## Split the motif graph into the clusters defined by MCL, and
  ## compute the intra-cluster degree (k) and weighted degree (wk) of
  ## each node
  my $degree_cmd = join("",
                        $SCRIPTS, "/graph-get-clusters -i ", $compa_file,
                        " -in_format tab -scol 1 -tcol 2 -wcol 4 -return clusters ",
                        " -clusters ", $clusters_tab,
                        " -out_format tab -o ", $clusters_graph,
                        " ; ", $SCRIPTS, "/graph-connex-components -v 1",
                        " -i ", $clusters_graph,
                        " -wcol 3",
                        " -o ", $main::outfile{motifs_disco_compa_cluster_intra_degree},
                       );
  &one_command($degree_cmd, 1);

  ## GML export of the comparison graph (can be opened with CytoScape or Yed)
  my $gml_cmd = join("",
                     $SCRIPTS, "/convert-graph -i ", $compa_file,
                     " -ewidth -ecolors fire",
                     " -layout spring",
                     " -from tab -to gml -scol 1 -tcol 2 -wcol 3",
                     " -o ", $gml_file,
                    );
  &one_command($gml_cmd, 1);

  ## PNG figure of the comparison graph
  my $png_cmd = join("",
                     $SCRIPTS, "/display-graph",
                     " -in_format gml -i ", $gml_file,
                     " -ewidth",
                     " -layout none",
                     " -out_format png -o ", $main::outfile{motifs_disco_compa_png},
                    );
  &one_command($png_cmd, 1);

}


################################################################
## Compare discovered motifs to the reference motif, then export the
## comparison result as a graph (GML + PNG).
sub MotifsVersusReference {
  if ($main::verbose >= 1) {
    &RSAT::message::TimeWarn("Comparing discovered motifs with reference motif");
  }

  my $compa_table = $main::outfile{"motifs_vs_ref"};
  my $gml_file = $main::outfile{"motifs_vs_ref_gml"};

  ## Reference motif (file1) against the discovered matrices (file2)
  my $compare_cmd = join("",
                         $SCRIPTS, "/compare-matrices -v 2",
                         " -format1 transfac -file1 ", $main::outfile{ref_motif_transfac},
                         " -format2 transfac -file2 ", $main::outfile{motifs_discovered},
                         " -DR",
                         " -sort cor",
                         " -uth rank 1", ## Only report the best matching shift between a pair of matrices
                         " -lth w ", $main::param{matrix_compa_min_w}, ## Min number of aligned columns
                         " -lth cor ", $main::param{matrix_compa_min_cor}, ## Min correlation
                         " -lth Ncor ", $main::param{matrix_compa_min_Ncor}, ## Min normalized correlation
                         " -return matrix_name,direction,Ncor,SW,cor,width,consensus,aligned_matrices",
                         " -o ", $compa_table,
                         " -out_matrices ", $main::outfile{"motifs_vs_ref_matrices"},
                        );
  &one_command($compare_cmd, 1);

  if ($main::verbose >= 2) {
    &RSAT::message::TimeWarn("Dicovered versus reference motif", $main::outfile{motifs_vs_ref});
  }

  ## GML export of the comparison result (can be opened with CytoScape or Yed)
  my $gml_cmd = join("",
                     $SCRIPTS, "/convert-graph -i ", $compa_table,
                     " -ewidth -ecolors fire",
                     " -layout spring",
                     " -from tab -to gml -scol 1 -tcol 2 -wcol 3",
                     " -o ", $gml_file,
                    );
  &one_command($gml_cmd, 1);

  ## PNG figure of the comparison graph
  my $png_cmd = join("",
                     $SCRIPTS, "/display-graph",
                     " -in_format gml -i ", $gml_file,
                     " -ewidth",
                     " -layout none",
                     " -out_format png -o ", $main::outfile{"motifs_vs_ref_png"},
                    );
  &one_command($png_cmd, 1);

}

################################################################
## Compare discovered motifs to each motif database listed in
## @motif_databases. For each database, the comparison table is
## exported as a graph (GML + PNG).
sub MotifsVersusDatabase {
  foreach my $db_name (@motif_databases) {
    &RSAT::message::TimeWarn("Comparing discovered motifs with database", $db_name) if ($main::verbose >= 1);

    ## Output files are indexed per database
    my $compa_key = "motifs_vs_db_".$db_name;

    ## Comparison between discovered matrices (file1) and the database (file2)
    my $cmd = $SCRIPTS."/compare-matrices -v 2";
    $cmd .= " -format1 transfac -file1 ".$main::outfile{motifs_discovered};
    $cmd .= " -format2 ".$main::motif_db_format{$db_name};
    $cmd .= " -file2 ".$main::infile{"motif_db_".$db_name};
    $cmd .= " -DR";
    $cmd .= " -sort cor";
    $cmd .= " -uth rank 1"; ## Only report the best matching shift between a pair of matrices
    $cmd .= " -lth w ".$main::param{matrix_compa_min_w}; ## Min number of aligned columns
    $cmd .= " -lth cor ".$main::param{matrix_compa_min_cor}; ## Min correlation
    $cmd .= " -lth Ncor ".$main::param{matrix_compa_min_Ncor}; ## Min normalized correlation
    $cmd .= " -return matrix_name,direction,Ncor,SW,cor,width,consensus,aligned_matrices";
    $cmd .= " -o ".$main::outfile{$compa_key};
    $cmd .= " -out_matrices ".$main::outfile{$compa_key."_matrices"};
    &one_command($cmd, 1);

    ## Report the file actually written for this database (the key
    ## includes the database name)
    &RSAT::message::TimeWarn("Dicovered versus DB", $main::outfile{$compa_key}) if ($main::verbose >= 2);

    ## Generate a GML graph with the matrix comparison result (can be opened with CytoScape or Yed)
    $cmd = $SCRIPTS."/convert-graph -i ".$main::outfile{$compa_key};
    $cmd .= " -ewidth -ecolors fire";
    $cmd .= " -layout spring";
    $cmd .= " -from tab -to gml -scol 1 -tcol 2 -wcol 3";
    $cmd .= " -o ".$main::outfile{$compa_key."_gml"};
    &one_command($cmd, 1);

    ## Generate a figure of the motif comparison graph
    $cmd = $SCRIPTS."/display-graph";
    $cmd .= " -in_format gml -i ".$main::outfile{$compa_key."_gml"};
    $cmd .= " -ewidth";
    $cmd .= " -layout none";
    $cmd .= " -out_format png -o ".$main::outfile{$compa_key."_png"};
    &one_command($cmd, 1);
  }
}

################################################################
## Convert word assemblies into PSSMs by running matrix-from-patterns.
##
## Usage:
##   &MatrixFromPatterns($pattern_file, $pssm_file, $pattern_type);
## where $pattern_file is the file of discovered patterns, $pssm_file
## is passed as output (-o) and $pattern_type is used as prefix.
sub MatrixFromPatterns {
  my ($pattern_file, $pssm_file, $pattern_type) = @_;

  if ($verbose >= 1) {
    &RSAT::message::TimeWarn("\tMatrix from patterns", $pattern_type);
  }

  my $matrix_cmd = join("",
                        $SCRIPTS, "/matrix-from-patterns -v 1 ",
                        " -seq ", $infile{test_seq},
                        " -pl ", $pattern_file,
                        " -bgfile ", $main::param{scan_bg_file},
                        " -max_asmb_nb ", $main::param{matrix_nmotifs},
                        " -prefix ", $pattern_type,
                        " -flanks 2",
                        " -collect_method matrix-scan-quick",
                        " -logo",
                        " -o ", $pssm_file,
                       );
  &one_command($matrix_cmd, 1);
}


################################################################
## Scan peak sequences with the discovered motifs, compute the
## positional distribution of the predicted sites and draw it.
##
## BEWARE: THIS IS NOT YET WORKING, BECAUSE matrix-scan-quick ONLY
## USES THE FIRST MATRIX OF EACH FILE.
sub ScanSequences {
  ## The score threshold is set here, whatever its previous value
  $main::param{scan_min_score} = 7.5;
  if ($main::verbose >= 1) {
    &RSAT::message::TimeWarn("Scanning sequences");
  }

  ## Note: an outer loop over @seq_types is disabled; only the test
  ## sequences are scanned.
  foreach my $pattern_type (@pattern_types) {
    my $sites_file = $main::outfile{$pattern_type.'_pssm_sites'};
    my $distrib_file = $main::outfile{$pattern_type.'_pssm_site_distrib'};

    ## Predict sites with the matrices of this pattern type
    my $scan_cmd = join("",
                        $BIN, "/matrix-scan-quick -v 1",
                        " -i ", $infile{test_seq},
                        " -m ", $main::outfile{$pattern_type.'_pssm_counts'},
                        " -bgfile ", $main::param{scan_bg_file},
                        " ", $main::param{scan_strands},
                        " -origin center -return sites",
                        " -t ", $main::param{scan_min_score},
                        " >", $sites_file,
                       );
    &one_command($scan_cmd, 1);

    ## Compute the positional distribution of sites: awk prints the
    ## mean of columns 5 and 6 of each line, classfreq bins the values
    my $distrib_cmd = join("",
                           "awk '{print (\$6+\$5)/2}'",
                           " ", $sites_file,
                           " | ", $SCRIPTS, "/classfreq -v 1",
                           " -ci ", $main::param{profiles_ci},
                           " -o ", $distrib_file,
                          );
    &one_command($distrib_cmd, 1);

    ## Draw the graph of predicted site positions
    my $graph_cmd = join("",
                         'XYgraph',
                         " -i ", $distrib_file,
                         " -lines -xcol 3 -ycol 4",
                         " -ysize 200 -ycol 4 -yleg1 'Number of sites'",
                         " -xsize 800 -xcol 3 -xleg1 'Sequence position relative to peak center'",
                         " -title1 'Predicted sites : $pattern_type'",
                         " -o ", $main::outfile{$pattern_type.'_pssm_site_distrib_graph'},
                        );
    &one_command($graph_cmd, 1);
  }
}


################################################################
## Generate a file summarizing the time spent in the different tasks.
##
## For each key of @timelog_keys, the corresponding output file is
## scanned for the comment lines "; Job started", "; Job done" and
## "; Seconds". One row per task is written to the time log file,
## which is then converted to HTML.
##
## The elapsed time is computed from the start and done time stamps
## (format YYYY_MM_DD.HHMMSS). Both stamps are converted to epoch
## seconds, so that tasks overlapping a month or a year boundary are
## handled correctly. Fields that cannot be determined are reported as
## "NA".
sub TimeLog {
  require Time::Local; ## Core module, only needed by this routine

  my $timelog = &OpenOutputFile($main::outfile{timelog});

  my $prefix = "NA";
  if (defined($main::param{prefix})) {
    $prefix = $main::param{prefix};
    $prefix =~ s/_$//;
  }

  print $timelog join("\t", "#start_time       ", "done_time        ", "elapsed", "seconds", "task", "prefix", "file"), "\n";

  ## Time stamp: year_month_day, one separator character, hour min sec
  my $stamp_regexp = qr/(\d{4})_(\d{2})_(\d{2}).(\d{2})(\d{2})(\d{2})/;

  ## Convert the fields of a time stamp to epoch seconds. Both stamps
  ## are treated as if they were in the same time zone, so only their
  ## difference is meaningful. Returns undef for an impossible date
  ## (timegm dies on out-of-range fields).
  my $stamp_to_epoch = sub {
    my ($year, $month, $day, $hour, $min, $sec) = @_;
    return eval { Time::Local::timegm($sec, $min, $hour, $day, $month - 1, $year) };
  };

  foreach my $key (@timelog_keys) {
    my $file = $main::outfile{$key};
    my $start = "NA";
    my $done = "NA";
    my $elapsed = "NA";
    my $seconds = "NA";

    ## Collect the time information from the comment lines of the file
    if (-e $file) {
      my ($in) = &OpenInputFile($file);
      while (<$in>) {
        if (/^;\s*Job started\s+(\S+)/i) {
          $start = $1;
        } elsif (/^;\s*Job done\s+(\S+)/i) {
          $done = $1;
        } elsif (/^;\s*Seconds\s+(\S+)/i) {
          $seconds = $1;
        }
      }
      close $in;
    }

    ## Elapsed time, only when both stamps are valid
    my @start_fields = ($start =~ $stamp_regexp);
    my @done_fields = ($done =~ $stamp_regexp);
    if (@start_fields && @done_fields) {
      my $start_epoch = $stamp_to_epoch->(@start_fields);
      my $done_epoch = $stamp_to_epoch->(@done_fields);
      if (defined($start_epoch) && defined($done_epoch)) {
        $elapsed = $done_epoch - $start_epoch;
      }
    }

    print $timelog join("\t", $start, $done, $elapsed, $seconds, $key, $prefix, $file), "\n";
  }
  close $timelog;
  &RSAT::message::TimeWarn("Time log file", $main::outfile{timelog}) if ($main::verbose >= 1);

  ## Create a HTML version of the timelog table
  my $cmd = $SCRIPTS."/text-to-html";
  $cmd .= " -font variable";
  $cmd .= " -i ".$main::outfile{timelog};
  $cmd .= " -o ".$main::outfile{timelog_html};
  &one_command($cmd, 1);
  &RSAT::message::TimeWarn("Time log html", $main::outfile{timelog_html}) if ($main::verbose >= 1);
}

################################################################
## Generate the HTML synthesis page: a table summarizing the main
## results, with links to the original result files.
sub Synthesis {
  ## The output handle and the directory of the synthesis file are
  ## declared with "local" rather than "my": the routines called below
  ## (e.g. SyntheticTableAddRow, RelativePath) read them as globals.
  local $syn = &OpenOutputFile($main::outfile{synthesis});
  local $synthesis_path = `dirname $main::outfile{synthesis}`;
  chomp($synthesis_path);

  my $page_title = "chip-seq-analysis ".$main::param{title};

  ## HTML header: title + inline copy of the result style sheet
  ## (a <link> to public_html/main.css was used previously and is disabled)
  print $syn "<html>\n"."<head>\n"."<title>".$page_title."</title>\n";
  my @css_lines = `cat $ENV{RSAT}/perl-scripts/lib/results.css`;
  print $syn "<style type='text/css'>\n".join("", @css_lines)."</style>\n";
  print $syn "</head>\n"."<body>\n"."<h1>Result: ".$page_title."</h1>\n";

  ## Command line
  print $syn "<pre>"."<b>Command:</b>  chip-seq-analysis ";
  &PrintArguments($syn);
  print $syn "</pre>";

  ## Open the synthetic table
  print $syn "<h2>Synthetic report</h2>"."<p><table class='sortable'>\n";

  ## Sequence composition, then reference motif
  &SynthesisSequenceComposition();
  &SynthesisRefMotif();

  ## One row per pattern discovery result
  &SyntheticTableAddHeaderRow("Pattern discovery");
  for my $pattern_type (@pattern_types) {
    &SynthesisPatternDisco($pattern_type);
  }

  ## Word comparison, then motif comparison
  &SynthesisWordCompa();
  &SynthesisMotifCompa();

  ## Discovered motifs versus reference motif (only if one was given)
  if (defined($main::infile{ref_motif})) {
    &SynthesisMotifsVersusReference();
  }

  ## Discovered motifs versus database(s) (only if some were given)
  if (scalar(@motif_databases) > 0) {
    &SynthesisMotifsVersusDatabase();
  }

  ## Links to the log files
  &SyntheticTableAddHeaderRow("Log files");
  my @param_log_links = ("txt", $main::outfile{log});
  &SyntheticTableAddRow("Parameters and files", "", @param_log_links);
  my @time_log_links = ("txt", $main::outfile{timelog},
                        "html", $main::outfile{timelog_html});
  &SyntheticTableAddRow("Time log", "", @time_log_links);

  ## Close the synthetic table. The inline display of the log file
  ## content (section "Log") is disabled.
  print $syn "</table></p>\n";

  ## End of the HTML file
  print $syn "</body>\n"."</html>\n";
  close $syn;
  if ($main::verbose >= 1) {
    &RSAT::message::TimeWarn("Synthetic report", $main::outfile{synthesis});
  }
}

################################################################
## Compute a file path relative to the synthesis file.
##
## Usage:
##   my $rel_file = &RelativePath($file);
##
## The first occurrence of "$synthesis_path/" is removed from $file.
## $synthesis_path is the global set by Synthesis(). The directory is
## matched literally (\Q...\E): it is a file path, not a regular
## expression, so characters such as "." or "+" must not be interpreted
## as metacharacters. If $synthesis_path is not set, the file is
## returned unchanged.
sub RelativePath {
  my ($file) = @_;
#  &RSAT::message::Debug("RelativePath", $file) if ($main::verbose >= 10);
  my $rel_file = $file;
  if (defined($rel_file) && defined($synthesis_path) && ($synthesis_path ne "")) {
    $rel_file =~ s|\Q${synthesis_path}\E/||;
  }
  return ($rel_file);
}

################################################################
## Add a row to the synthetic table.
##
## Usage:
##   &SyntheticTableAddRow($type, $summary, $key1, $file1, $key2, $file2, ...);
##
## The row has three cells: the type, the summary, and one link per
## (key, file) pair. File paths are made relative to the synthesis
## file. The list of links stops at the first key having a false value.
sub SyntheticTableAddRow {
  my ($type, $summary, @files) = @_;

  ## First two cells, then opening of the cell with the links
  print $syn "<tr>\n"."<td>$type</td>\n"."<td>$summary</td>\n"."<td>\n";

  ## Consume the remaining arguments as (key, file) pairs
  while (scalar(@files) > 0) {
    my ($link_label, $target) = splice(@files, 0, 2);
    last unless ($link_label);
    if ($main::verbose >= 5) {
      &RSAT::message::Debug($link_label, $target);
    }
    my $href = &RelativePath($target);
    print $syn "<a href='".$href."'>[".$link_label."]</a><br>\n";
  }

  print $syn "</td>\n"."</tr>\n";
}


################################################################
## Add a header row to the synthetic table: a single header cell
## spanning the three columns. The row is printed to the global
## handle $syn.
sub SyntheticTableAddHeaderRow {
  my ($header) = @_;
  my $header_row = "<tr>\n"
    ."<th colspan=3>\n"
      .$header
        ."</th>\n"
          ."</tr>\n";
  print $syn $header_row;
}

################################################################
## Add the sequence composition to the synthetic report: for each
## sequence type, one row with the sequence length statistics, then one
## row per oligo length with the composition heatmap and profiles.
sub SynthesisSequenceComposition {
  foreach my $seq_type (@seq_types) {

    ## Header line
    &SyntheticTableAddHeaderRow("Sequence composition (".$seq_type." sequences)");

    ## Extract the statistics from the comment lines of the sequence
    ## length distribution file. Each value is obtained by grepping the
    ## line "; <stat>" and removing everything up to the stat name.
    my $distrib_file = $main::outfile{$seq_type."_seqlen_distrib"};
    my %length_stat = ();
    foreach my $stat ("count", "min", "mean", "max", "sum") {
      my $stat_line = `grep '; $stat' $distrib_file`;
      chomp($stat_line);
      $stat_line =~ s/.*$stat\s*//;
      $length_stat{$stat} = $stat_line;
    }

    ## Number of peaks, and total sequence size divided by 1000 (shown as kb)
    my $peak_nb = $length_stat{count};
    my $seq_size = round($length_stat{sum}/1000);

    if ($main::verbose >= 0) {
      &RSAT::message::Debug("Nb of peaks=".$peak_nb,
                            "sequence size=".$seq_size);
    }

    ## Row with the sequence length statistics
    my $img = &RelativePath($main::outfile{$seq_type."_seqlen_distrib_graph"});
    my $seq_stats = join("<br>\n",
                         "Nb of peaks: ".$peak_nb,
                         "Total seq. size: ".$seq_size." kb",
                         "Min length: ".$length_stat{min}." bp",
                         "Mean length: ".$length_stat{mean}." bp",
                         "Max length: ".$length_stat{max}." bp",
                        );
    my @length_links = ("lengths", $main::outfile{$seq_type."_seqlen"},
                        "distrib", $main::outfile{$seq_type."_seqlen_distrib"},
                        "graph", $main::outfile{$seq_type."_seqlen_distrib_graph"});
    &SyntheticTableAddRow($seq_stats,
                          "<a  href='".$img."'><img height=150 src='".$img."'></a>",
                          @length_links);

    ## Residue composition: one row per oligo length
    foreach my $ol (@profiles_oligo_lengths) {
      my $file_key = $ol."nt_".$seq_type;
      my $heatmap_img = &RelativePath($main::outfile{$file_key."_heatmap"});
      my $profile_img = &RelativePath($main::outfile{$file_key."_profiles_graph"});
      my $table = join("",
                       "<table><tr>",
                       "<td>Transition frequencies<br><a  href='", $heatmap_img, "'><img width=200 src='", $heatmap_img, "'></a></td>",
                       "<td>Position profile<br><a  href='", $profile_img, "'><img height=150 src='", $profile_img, "'></a></td>",
                       "</tr></table>",
                      );
      my @composition_links = ($ol."nt freq", $main::outfile{$file_key."_freq"},
                               $ol."nt transitions", $main::outfile{$file_key."_transit"},
                               "inclusive bg model", $main::outfile{$file_key."_inclusive"},
                               "profile table", $main::outfile{$file_key."_profiles"},
                               "individual profiles", $main::outfile{$file_key."_profiles_index"});
      &SyntheticTableAddRow($ol."nt composition", $table, @composition_links);
    }
  }
}

################################################################
## Add the reference motif to the report: one table row showing the
## direct and reverse complementary logos of each reference matrix.
## Nothing is added when no reference motif was specified.
sub SynthesisRefMotif {
  return unless (defined($main::infile{ref_motif}));

  &SyntheticTableAddHeaderRow("Reference motif");

  my $logo_prefix = $main::outfile{ref_motif_logo};
  my $img_ext = $main::param{img_format};

  ## Count the reference logos: all files matching the logo prefix,
  ## reverse complementary logos excluded
  my @direct_logos = grep {!/_rc\./} glob($logo_prefix."_m*.".$img_ext);
  my $logo_nb = scalar(@direct_logos);

  ## Prepare a table with one line per reference matrix
  my $logo_table = "<table cellpadding=0 cellspacing=0 align=center border=0>";
  foreach my $i (1..$logo_nb) {
    $logo_table .= "<tr>\n"."<td>Ref motif ".$i."</td>\n";

    ## Direct logo
    my $logo_file = $logo_prefix."_m".$i.".".$img_ext;
    my $logo_file_rc = $logo_prefix."_m".$i."_rc.".$img_ext;
    if (-e $logo_file) {
      my $img = &RelativePath($logo_file);
      $logo_table .= "<td align='right'><a  href='".$img."'><img height=70 src='".$img."'></a></td>\n";
    } elsif ($main::verbose >= 1) {
      &RSAT::message::Warning("Cannot find reference logo", $logo_file);
    }

    ## Reverse complementary logo
    if (-e $logo_file_rc) {
      my $img = &RelativePath($logo_file_rc);
      $logo_table .= "<td align='left'><a  href='".$img."'><img height=70 src='".$img."'></a></td>\n";
    } elsif ($main::verbose >= 1) {
      &RSAT::message::Warning("Cannot find reverse complementary logo for the reference motif", $logo_file_rc);
    }

    $logo_table .= "</tr>\n";
  }
  $logo_table .= "</table>\n";

  ## Links to the converted reference motif (a link to the original
  ## input file is disabled)
  my @ref_links = ("tf", $main::outfile{ref_motif_transfac},
                   "info", $main::outfile{ref_motif_info});
  &SyntheticTableAddRow("Reference motif", $logo_table, @ref_links);
}

################################################################
## Synthesis for one pattern discovery algorithm
## Usage:
##   &SynthesisPatternDisco($pattern_type);
## Where pattern type can be oligos, dyads, local_words, oligo-diff
##
## If the assembly file of the pattern type exists, it is parsed to
## collect, for each assembly and each isolated pattern, the consensus
## word, its reverse complement and its significance. One table row is
## then added with the logos of the matrices, annotated with this
## information. Otherwise, a row flagged "Missing files" is added.
##
## The pattern information is stored in a lexical array, so that
## nothing is inherited from the pattern type treated by a previous
## call.
sub SynthesisPatternDisco {
  my ($pattern_type) = @_;

  my $asmb_file = $main::outfile{$pattern_type.'_asmb'};
  if (defined($asmb_file) && (-e $asmb_file)) {

    ## Synthesize results of pattern assembly (assembly consensus + sig scores)
    my ($asmb) = &OpenInputFile($asmb_file);
    my $pattern_nb = 0;
    my $asmb_or_isol = "";
    my @pattern_info = (); ## Indexed by pattern number (starting at 1)
    while (my $line = <$asmb>) {
      chomp($line); ## Avoid a trailing newline in the last field
      next if ($line =~ /^#/);		## Skip header line
      next unless ($line =~ /\S/);	## Skip empty lines
      if ($line =~ /assembly\s+\#\s+(\d+)/i) {
        ## Assembly number
        $pattern_nb = $1;
        $asmb_or_isol = "asmb";
        next;
      } elsif ($line =~ /Isolated patterns/) {
        ## Isolated patterns at the end of the assembly file
        $asmb_or_isol = "isol";
        next;
      }
      next if ($line =~ /^;/);		## Skip comments
      if ($asmb_or_isol eq "isol") {
        ## Each isolated pattern gets the next pattern number
        $pattern_nb++;
      } elsif ($line !~ /consensus/) {
        ## Within an assembly, only the consensus line is used
        next;
      }
      my ($word, $rc_word, $score) = split("\t", $line);
      $pattern_info[$pattern_nb] = {
                                    type=>$asmb_or_isol,
                                    word=>$word,
                                    rc_word=>$rc_word,
                                    score=>$score,
                                   };
    }
    close $asmb;

    ## Synthesize matrix logos
    my $pattern_table = "<table cellpadding='0' cellspacing='0' align='center' border='0'>";
    foreach my $logo_nb (1..$main::param{matrix_nmotifs}) {
      $pattern_table .= "<tr>\n";
      my $logo_file = $main::outfile{$pattern_type.'_pssm_logo'.$logo_nb};
      my $logo_file_rc = $main::outfile{$pattern_type.'_pssm_logo_rc'.$logo_nb};

      ## Information collected for this pattern (there may be fewer
      ## patterns than requested matrices)
      my $info = $pattern_info[$logo_nb] || {};
      my $type = defined($info->{type}) ? $info->{type} : "";
      my $word = defined($info->{word}) ? $info->{word} : "";
      my $rc_word = defined($info->{rc_word}) ? $info->{rc_word} : "";
      my $score = defined($info->{score}) ? $info->{score} : "";

      ## Highlight significant patterns: bold above 10, red above 300.
      ## Only scores starting like a number are compared.
      my $start_format = "";
      my $end_format = "";
      if ($score =~ /^\s*[-+]?\.?\d/) {
        if ($score > 10) {
          $start_format .= "<b>";
          $end_format .= "</b>";
          if ($score > 300) {
            $start_format .= "<font color='red'>";
            $end_format = "</font>".$end_format;
          }
        }
      }

      &RSAT::message::Debug("Logo", $logo_nb, $logo_file) if ($main::verbose >= 5);

      ## Direct logo
      if (-e $logo_file) {
        my $img = &RelativePath($logo_file);
        $pattern_table .= "<td align='right'>";
        $pattern_table .= $start_format;
        $pattern_table .= $type;
        $pattern_table .= " (sig=".$score.")";
        $pattern_table .= "&nbsp;"x5;
        $pattern_table .= $word;
        $pattern_table .= "<br><a  href='".$img."'><img height=70 src='".$img."'></a>";
        $pattern_table .= $end_format; ## Close the tags opened by $start_format
        $pattern_table .= "</td>\n";
      } elsif ($main::verbose >= 1) {
        &RSAT::message::Warning("Cannot find logo", $logo_file);
      }

      ## Reverse complementary logo
      if (-e $logo_file_rc) {
        my $img = &RelativePath($logo_file_rc);
        $pattern_table .= "<td align='left'>";
        $pattern_table .= $start_format;
        $pattern_table .= $rc_word;
        $pattern_table .= "<br><a  href='".$img."'><img height=70 src='".$img."'></a>";
        $pattern_table .= $end_format; ## Close the tags opened by $start_format
        $pattern_table .= "</td>\n";
      } elsif ($main::verbose >= 1) {
        &RSAT::message::Warning("Cannot find reverse complementary logo", $logo_file_rc);
      }
      $pattern_table .= "</tr>\n";
    }
    $pattern_table .= "</table>\n";

    &SyntheticTableAddRow($pattern_type,
                          $pattern_table,
                          $pattern_type,$main::outfile{$pattern_type},
                          "assembly",$main::outfile{$pattern_type.'_asmb'},
                          "sig matrix",$main::outfile{$pattern_type.'_pssm_sig'},
                          "matrices .tab",$main::outfile{$pattern_type.'_pssm_counts'},
                          "matrices .tf",$main::outfile{$pattern_type.'_pssm_tf'},
                          "sites",$main::outfile{$pattern_type.'_pssm_sites'},
                          "site distrib",$main::outfile{$pattern_type.'_pssm_site_distrib'},
                          "site distrib graph",$main::outfile{$pattern_type.'_pssm_site_distrib_graph'},
                         );
  } else {
    ## No assembly file: report the missing result, with the links to
    ## the expected files
    &SyntheticTableAddRow($pattern_type,
                          "<font color='red'>Missing files</font>",
                          $pattern_type,$main::outfile{$pattern_type},
                          "assembly",$main::outfile{$pattern_type.'_asmb'},
                          "sig matrix",$main::outfile{$pattern_type.'_pssm_sig'},
                          "matrices",$main::outfile{$pattern_type.'_pssm_counts'},
                          "sites",$main::outfile{$pattern_type.'_pssm_sites'},
                          "site distrib",$main::outfile{$pattern_type.'_pssm_site_distrib'},
                          "site distrib graph",$main::outfile{$pattern_type.'_pssm_site_distrib_graph'},
                         );
  }
}

################################################################
## Add the word comparison row to the synthetic table: thumbnail of
## the heatmap, with links to the comparison table and to the heatmap.
sub SynthesisWordCompa {
  my $heatmap = &RelativePath($main::outfile{word_compa_heatmap});
  my $thumbnail = "<a  href='".$heatmap."'><img height=150 src='".$heatmap."'></a>";
  my @links = ("tab", $main::outfile{word_compa},
               "heatmap", $main::outfile{word_compa_heatmap});
  &SyntheticTableAddRow("Word comparisons", $thumbnail, @links);
}

################################################################
## Add the motif comparison row to the synthetic table: thumbnail of
## the comparison graph, with links to the merged motifs, the
## comparison table, and the graph in GML and PNG formats.
sub SynthesisMotifCompa {
  my $graph_img = &RelativePath($main::outfile{motifs_disco_compa_png});
  my $thumbnail = "<a  href='".$graph_img."'><img height=300 src='".$graph_img."'></a>";
  my @links = ("motifs", $main::outfile{motifs_discovered},
               "tab", $main::outfile{motifs_disco_compa},
               "gml", $main::outfile{motifs_disco_compa_gml},
               "png", $main::outfile{motifs_disco_compa_png});
  &SyntheticTableAddRow("Motif comparisons", $thumbnail, @links);
}

################################################################
## Add to the synthetic table the comparison between discovered
## motifs and the reference motif: a header row, then one row with
## the thumbnail of the comparison graph and links to the result files.
sub SynthesisMotifsVersusReference {
  &SyntheticTableAddHeaderRow("Discovered motifs versus reference motif");

  my $graph_img = &RelativePath($main::outfile{"motifs_vs_ref_png"});
  my $thumbnail = "<a  href='".$graph_img."'><img height=300 src='".$graph_img."'></a>";
  my @links = ("Reference motif", $main::infile{ref_motif},
               "table", $main::outfile{"motifs_vs_ref"},
               "alignments", $main::outfile{"motifs_vs_ref_matrices"},
               "aligned logos", $main::outfile{"motifs_vs_ref_matrices_html"},
               "gml", $main::outfile{"motifs_vs_ref_gml"},
               "png", $main::outfile{"motifs_vs_ref_png"});
  &SyntheticTableAddRow("Motifs versus reference motif", $thumbnail, @links);
}

################################################################
## Add to the synthetic table the comparisons between discovered
## motifs and motif databases: a header row, then one row per
## database with the thumbnail of the comparison graph and links to
## the result files.
sub SynthesisMotifsVersusDatabase {
  &SyntheticTableAddHeaderRow("Discovered motifs versus transcription factor databases");

  for my $db_name (@motif_databases) {
    ## All result files of a database share this key prefix
    my $result_key = "motifs_vs_db_".$db_name;

    my $graph_img = &RelativePath($main::outfile{$result_key."_png"});
    my $thumbnail = "<a  href='".$graph_img."'><img height=300 src='".$graph_img."'></a>";
    my @links = ($db_name." DB", $main::infile{"motif_db_".$db_name},
                 "table", $main::outfile{$result_key},
                 "alignments", $main::outfile{$result_key."_matrices"},
                 "aligned logos", $main::outfile{$result_key."_matrices_html"},
                 "gml", $main::outfile{$result_key."_gml"},
                 "png", $main::outfile{$result_key."_png"});
    &SyntheticTableAddRow("Motifs versus ".$db_name." database", $thumbnail, @links);
  }
}

################################################################
## Delete purged sequence files after analysis has been completed.
sub CleanSequences {
  ## Remove the purged sequence file of each sequence type in @seq_types
  for my $seq_type (@seq_types) {
    if ($main::verbose >= 1) {
      &RSAT::message::TimeWarn("Cleaning sequences", $seq_type);
    }

    ## The file is removed through the shell; the second argument (1) is
    ## passed to &one_command() unchanged.
    my $purged_file = $main::outfile{"purged_".$seq_type};
    my $rm_command = "rm -f ".$purged_file;
    &one_command($rm_command, 1);
  }
}


################################################################
## Read arguments 
sub ReadArguments {
  ## Parse the command line (@ARGV) and store the option values in the
  ## global variables $main::verbose, %main::infile, %main::dir,
  ## %main::param, %task, @main::motif_databases and
  ## %main::motif_db_format.
  ##
  ## The POD blocks interleaved with the code document each option right
  ## next to the branch that handles it.
  ##
  ## Unknown options are reported with &FatalError().
  &RSAT::message::TimeWarn("Reading arguments") if ($main::verbose >= 1);
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      ## The verbosity level is optional: only inspect the next argument
      ## when there is one (avoids testing an undefined value when -v is
      ## the last argument of the command line).
      if ((scalar(@arguments) >= 1) && (&IsNatural($arguments[0]))) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      ## NOTE(review): the POD above says "Same as -h", but this branch
      ## calls &PrintOptions() whereas -h calls &PrintHelp() -- confirm
      ## which description is intended.
      &PrintOptions();

=pod

=item B<-i test_seq_file>

Test peak sequence file (mandatory).

For single-set analysis, this file contains the peak sequences of the
unique set.  For test versus control analysis, it contains the test
sequences.

=cut
    } elsif ($arg eq "-i") {
      $main::infile{test_seq} = shift(@arguments);

=pod

=item B<-ctrl control_seq_file>

Control peak sequence file (optional).

This option is used only for the test versus control analysis.

=cut
    } elsif ($arg eq "-ctrl") {
      $main::infile{ctrl_seq} = shift(@arguments);

=pod

=item B<-max_seq_len msl>

Maximal sequence length. Larger sequences are truncated at the
specified length around the sequence center (from -msl/2 to +msl/2).

=cut
    } elsif ($arg eq "-max_seq_len") {
      $main::param{max_seq_len} = shift(@arguments);
      &RSAT::error::FatalError($main::param{max_seq_len}, "is not a valid value for max sequence length. Should be a strictly positive Natural number.")
        unless ((&IsNatural($main::param{max_seq_len})) && ($main::param{max_seq_len} > 0));


=pod

=item B<-ref_motif reference_motif>

Reference motif (optional).

In some cases, a reference motif is already available, for example the
motif annotated in some transcription factor database (e.g. RegulonDB,
Jaspar, TRANSFAC) for the transcription factor of interest. These
annotations may come from low-throughput experiments, and rely on a
small number of sites, but the reference motif may nevertheless be
informative, because it is based on several independent studies.

Each discovered motif can be compared to the reference motif, in order
to evaluate its correspondence with the binding motif of the factor of
interest.

=cut
    } elsif ($arg eq "-ref_motif") {
      $main::infile{ref_motif} = shift(@arguments);


=pod

=item B<-motif_db db_name db_format motif_db_file>

File containing a database of transcription factor binding motifs
(e.g. JASPAR, TRANSFAC, RegulonDB, ...) which will be compared to the
discovered motifs (task motifs_vs_db).

The option requires three arguments:

 - DB name
 - matrix format (same supported formats as convert-matrices)
 - file containing the DB motifs

The option can be called iteratively on the same command line in order
to compare discovered motifs with several databases.

Example:

 -motif_db JASPAR jaspar jaspar_file.tab
 -motif_db TRANSFAC transfac transfac_download_dir/cgi-bin/data/matrix.dat


=cut

    } elsif ($arg eq "-motif_db") {
      ## The three values are shifted in a row: make sure they are all
      ## present rather than silently storing undefined values.
      &RSAT::error::FatalError("The option -motif_db requires three arguments: DB name, matrix format and file containing the DB motifs.")
        unless (scalar(@arguments) >= 3);
      my $db_name = shift(@arguments);
      $db_name =~ s/\s/_/g; ## the DB name is used to build hash keys
      push @main::motif_databases, $db_name;
      $main::motif_db_format{$db_name} = shift(@arguments);
      $main::infile{"motif_db_".$db_name} = shift(@arguments);

=pod

=item B<-outdir output_directory>

Output directory (mandatory).

The result files and index files produced by the different programs
will be stored in this directory.

=cut
    } elsif ($arg eq "-outdir") {
      $main::dir{output} = shift(@arguments);


=pod

=item B<-prefix output_prefix>

Prefix for the output files

=cut
    } elsif ($arg eq "-prefix") {
      $main::param{prefix} = shift(@arguments);

=pod

=item B<-title graph_title>

Title displayed on top of the graphs.

=cut
    } elsif ($arg eq "-title") {
      $main::param{title} = shift(@arguments);

=pod

=item B<-img_format img_format>

Image format.

All the formats supported by XYgraph can be used.

=cut
    } elsif ($arg eq "-img_format") {
      $main::param{img_format} = shift(@arguments);

=pod

=item B<-task>

Specify a subset of tasks to be executed.

By default, the program runs all necessary tasks. However, in some
cases, it can be useful to select one or several tasks to be executed
separately.

Beware: task selection requires expertise, because most tasks depend
on the prior execution of some other tasks in the workflow. Selecting
tasks before their prerequisite tasks have been completed will provoke
fatal errors.

I<Available Tasks.>

=over

=item I<all> (default)

Run all supported tasks.

=item I<purge>

Purge input sequences (test set and, if specified, control set) to
mask redundant fragments before applying pattern discovery
algorithms. Sequence purging is necessary because redundant fragments
would violate the hypothesis of independence underlying the binomial
significance test, resulting in a large number of false positive
patterns.

=item I<seqlen>

Compute sequence lengths and their distribution.

Sequence lengths are useful for the negative control (selection of
random genome fragments).

Sequence length distribution is informative to get an idea about the
variability of peak lengths.

=item I<profiles>

Compute compositional profiles, i.e. distributions of residues and
dinucleotide frequencies per position (using I<position-analysis>).

Residue profiles may reveal composition biases in the neighborhood of
the peak sequences. Dinucleotide profiles can reveal (for example) an
enrichment in CpG island.

Note that I<chip-seq-analysis> also runs I<position-analysis> with
larger oligonucleotide length (see option -l) to detect motifs on the
basis of positionally biased oligonucleotides (see task B<positions>).

=item I<ref_motif>

This task combines various operations.

=over

=item Formatting of the reference motif

Perform various format conversions for the reference motif (compute
parameters, consensus, logo).

=item Motif enrichment

Generate an enriched motif by scanning the peak sequence set with the
reference motif.

=item Motif comparison

Compare all discovered motifs with the reference motif.

=back

=item I<oligos>

Run I<oligo-analysis> to detect over-represented oligonucleotides of a
given length (k, specified with option -l) in the test set (van Helden
et al., 1998). Prior frequencies of oligonucleotides are taken from
Markov model of order m (see option -markov) estimated from the test
set sequences themselves.

=item I<dyads>

Run I<dyad-analysis> to detect over-represented dyads, i.e. pairs of
short oligonucleotides (monads) spaced by a region of fixed width but
variable content (van Helden et al., 2000). Spaced motifs are typical
of certain classes of transcription factors forming homo- or
heterodimers.

By default, chip-seq-analysis analyzes pairs of trinucleotides with
any spacing between 0 and 20.

The expected frequency of each dyad is estimated as the product of its
monad frequencies in the input sequences (option -bg monads of
dyad-analysis).

=item I<positions>

Run I<position-analysis> to detect oligonucleotides showing a
positional bias, i.e. have a non-homogeneous distribution in the peak
sequence set.

This method was initially developed to analyze termination and
poly-adenylation signals in downstream sequences (van Helden et al.,
2000), and it turns out to be very efficient for detecting motifs
centred on the ChIP-seq peaks. For ChIP-seq analysis, the reference
position is the center of each sequence.

Note that I<chip-seq-analysis> also uses I<position-analysis> for the
task B<profiles>, in order to detect compositional biases (residues,
dinucleotides) in the test sequence set.

=item I<local_words>

Run I<local-word-analysis> to detect locally over-represented
oligonucleotides and dyads.

The program I<local-word-analysis> (Matthieu Defrance, unpublished)
tests the over-representation of each possible word (oligo, dyad)
in positional windows in the input sequence set.

Two types of background models are supported: (i) Markov model of
order m estimated locally (within the window under consideration);
(ii) the frequency observed for a word in the whole sequence set is
used as estimator of the prior probability of this word in the window.

After our first trials, this program gives excellent results in
ChIP-seq datasets, because its sensitivity increases with large
number of sequences (several hundreds/thousands), and its background
model is more stringent than for programs computing the global
over-representation (oligo-analysis, dyad-analysis).

=item I<word_compa>

Compare the words (oligos-dyads) discovered by the different
algorithms in order to assess their consistency. Draw a heatmap
showing the significance estimated for each word (row) by each
algorithm (column).

=item I<motif_compa>

Motifs are compared in three ways.

=over

=item I<Discovered versus discovered (task cluster_motifs)>

Perform pairwise comparisons between all motifs (matrices) discovered
by the different algorithms, to assess their consistency.

=item I<Discovered versus reference>

Compare each discovered motif to the reference motif.

=item I<Discovered versus database (task motifs_vs_db)>

Compare each discovered motif to a database of known motifs
(e.g. Jaspar, TRANSFAC, RegulonDB, UniProbe, ...)

=back

=item I<timelog>

Generate a log file summarizing the time spent in the different tasks.

=item I<synthesis>

Generate the HTML file providing a synthesis of the results and
pointing towards the individual result files.

=item I<clean_seq>

Delete the purged sequence files after the analysis, in order to save
space.

This task is executed only when it is called explicitly. It is not
part of the tasks running with the option "-task all".

=back

=cut
    } elsif ($arg eq "-task") {
      ## Tasks are given as a comma-separated list; empty fields are skipped
      my @requested_tasks = split ",", shift (@arguments);
      foreach my $task (@requested_tasks) {
        next unless $task;
        if ($supported_task{$task}) {
          $task{$task} = 1;
        } else {
          &RSAT::error::FatalError("Task '$task' is not supported. \n\tSupported: $supported_tasks");
        }
      }

=pod

=item B<-nmotifs max_motif_number>

Maximal number of motifs (matrices) to return for pattern discovery
algorithms. Note the distinction between the maximal number of motifs
(matrices) and the maximum number of patterns (words, dyads): a motif
generally corresponds to several mutually overlapping patterns (dyads,
words).

=cut

    } elsif ($arg eq "-nmotifs") {
      $main::param{matrix_nmotifs} = shift (@arguments);

=pod

=item B<-l oligo_len>


Oligonucleotide length for word-counting approaches (oligo-analysis,
position-analysis, local-word-analysis, oligo-diff).

In our experience, optimal results are obtained with hexanucleotides
and heptanucleotides.

Note: the monad length used for dyad-analysis is not affected by those
options. Instead it is fixed to 3. Indeed, dyad-analysis can detect
larger motifs by sampling various spacings between the two
trinucleotide monads.

=item B<-minol oligo_min_len>

=item B<-maxol oligo_max_len>

Minimal (-minol) and maximal (-maxol) oligonucleotide lengths. If
those options are used, the program iterates over the specified range
of oligonucleotide lengths.

=cut
    } elsif ($arg eq "-l") {
      ## A single length is stored as a range with identical bounds
      my $oligo_len = shift (@arguments);
      $main::param{oligo_min_len} = $oligo_len;
      $main::param{oligo_max_len} = $oligo_len;
    } elsif ($arg eq "-minol") {
      $main::param{oligo_min_len} = shift (@arguments);
    } elsif ($arg eq "-maxol") {
      $main::param{oligo_max_len} = shift (@arguments);

=pod

=item B<-markov>

Order of the Markov model used to estimate expected oligonucleotide
frequencies for I<oligo-analysis> and I<local-word-analysis>.

Higher order Markov models are more stringent, lower order are more
sensitive, but tend to return a large number of false positives.

Markov models can be specified with either a positive or a negative
value. Positive values indicate the length of the prefix in the
transition matrix. Negative values indicate the order of the Markov
model relative to the oligonucleotide length. For example, the option
-markov -2 gives a model of order m=k-2 (thus, an order 5 for
heptanucleotides, an order 4 for hexanucleotides).

The optimal Markov order depends on the number of sequences in the
test set. Since ChIP-seq data typically contain hundreds to thousands
of peaks, high Markov orders are generally good, because they are
stringent and still sensitive enough.  In our experience, motifs are
well detected with the most stringent Markov order (-markov -2).

=item B<-min_markov min_markov_order>

=item B<-max_markov max_markov_order>

A minimal and a maximal value can be specified for the Markov
order. The program then iterates over all markov values between
min_markov_order and max_markov_order.


=cut
    } elsif ($arg eq "-markov") {
      ## A single order is stored as a range with identical bounds
      $main::param{oligo_min_mkv} = $main::param{oligo_max_mkv} = shift (@arguments);
    } elsif ($arg eq "-min_markov") {
      $main::param{oligo_min_mkv} = shift (@arguments);
    } elsif ($arg eq "-max_markov") {
      $main::param{oligo_max_mkv} = shift (@arguments);

=pod

=item B<-1str | -2str>

Single-strand (-1str) or double-strand (-2str) analysis.

The default is double-strand analysis, since ChIP-seq results have no
particular strand orientation.

=cut
    } elsif ($arg eq "-1str") {
      $main::param{strand} = "-1str";
    } elsif ($arg eq "-2str") {
      $main::param{strand} = "-2str";

=pod

=item B<-noov | -ovlp>

Treatment of self-overlapping words: count (-ovlp) or do not count
(-noov) overlapping occurrences. In -noov mode, only renewing
occurrences are counted.

It is recommended to use the -noov mode (default) to avoid the effect
of self-overlap, which violates the hypothesis of independence of
successive occurrences underlying the binomial significance test
(oligo-analysis, dyad-analysis).

=cut
    } elsif ($arg eq "-noov") {
      $main::param{noov} = "-noov";
    } elsif ($arg eq "-ovlp") {
      $main::param{noov} = "-ovlp";

=pod

=item B<-ci class_interval>

Class interval for I<position-analysis>.

=cut

    } elsif ($arg eq "-ci") {
      $main::param{profiles_ci} = shift(@arguments);
      &RSAT::error::FatalError($main::param{profiles_ci}, "is not a valid value for class interval. Should be a strictly positive Natural number.")
        unless ((&IsNatural($main::param{profiles_ci})) && ($main::param{profiles_ci} > 0));

      ## Other parameters are not accepted
    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Check arguments
sub CheckArguments {
  ## Validate the values collected by &ReadArguments() and derive the
  ## settings that depend on them:
  ##  - check the sequence files and fill @main::seq_types,
  ##  - check the output directory,
  ##  - define the names of the log and synthesis files,
  ##  - resolve the list of tasks to run (@tasks).
  ## Any inconsistency is reported as a fatal error.
  &RSAT::message::TimeWarn("Checking arguments") if ($main::verbose >= 1);

  ## Test sequences are mandatory
  if ($infile{test_seq}) {
    unless (-e $infile{test_seq}) {
      &FatalError("Test sequence file does not exist", $infile{test_seq});
    }
    @main::seq_types = ("test");
  } else {
    &FatalError("You must define the test sequence set (option -i)");
  }

  ## Control sequences are optional
  if ($infile{ctrl_seq}) {
    unless (-e $infile{ctrl_seq}) {
      &FatalError("Control sequence file does not exist", $infile{ctrl_seq});
    }
    push @main::seq_types, ("ctrl");
  }

  ## output directory
  if ($main::dir{output}) {
    &RSAT::util::CheckOutDir($main::dir{output});
  } else {
    &FatalError("You must define the output directory (option -outdir)");
  }

  ## Log files
  $main::outfile{log} = &OutFileName(".txt", "log");
  $main::outfile{timelog} = &OutFileName(".txt", "timelog");
  $main::outfile{timelog_html} = &OutFileName(".html", "timelog");

  ## Synthesis file
  $main::outfile{synthesis} = &OutFileName(".html", "synthesis");


  ## Modalities of motif comparisons: the task motif_compa activates
  ## the sub-tasks, the last two only when their input was provided.
  if ($task{motif_compa}) {
   $task{merge_motifs} = 1;
   $task{cluster_motifs} = 1;
   $task{motifs_vs_ref} = 1 if (defined($main::infile{ref_motif}));
   $task{motifs_vs_db} = 1 if (scalar(@motif_databases) > 0);
  }

  ## Check that reference motif has been specified if required
  if ($task{motifs_vs_ref}) {
    &RSAT::error::FatalError("The task motifs_vs_ref requires to specify a file containing the reference motif (option -ref_motif).")
      unless (defined($main::infile{ref_motif}));
  }

  ## Check that motif DB has been specified if required
  if ($task{motifs_vs_db}) {
    &RSAT::error::FatalError("The task motifs_vs_db requires to specify at least one file containing database motifs (option -motif_db).")
      unless (scalar(@motif_databases) > 0);
  }


  ## If all tasks are requested or if no task is defined, execute all
  ## tasks.
  if ((scalar(keys(%task)) == 0) || ($task{all})) {
    %task = %supported_task;
    delete($task{all});
    delete($task{clean_seq}); ## clean_seq only runs when explicitly requested

    ## With a control set, the single-set discovery tasks are dropped;
    ## without one, oligo_diff is dropped.
    if (defined($main::infile{ctrl_seq})) {
      delete($task{oligos});
      delete($task{dyads});
      delete($task{positions});
      delete($task{local_words});
    } else {
      delete($task{oligo_diff});
    }

    ## Comparisons whose input was not provided are dropped
    delete($task{motifs_vs_ref}) unless (defined($main::infile{ref_motif}));
    delete($task{motifs_vs_db}) unless (scalar(@motif_databases) > 0);
  }

  ## List the selected tasks in the order of @supported_tasks
  foreach my $task (@supported_tasks) {
   push (@tasks, $task) if $task{$task};
  }
  &RSAT::message::Info("Tasks: ", join (",", @tasks)) if ($main::verbose >= 1);
}

################################################################
## Verbose message
sub Verbose {
    ## Write a report of the analysis settings to the handle $main::out:
    ## program name, arguments (via &PrintArguments), program version,
    ## parameter values, input files, threshold values and output files.
    ## Every report line written here starts with ";".
    ##
    ## The filehandle is always written as {$main::out}, so that the
    ## expression following it cannot be mistaken for an operand.
    print {$main::out} "; chip-seq-analysis ";
    &PrintArguments($main::out);
    printf {$main::out} "; %-22s\t%s\n", "Program version", $program_version;

    ## Parameter values
    print {$main::out} "; Parameter values\n";
    foreach my $param_name (@param_list) {
      printf {$main::out} ";\t%-22s\t%s\n", $param_name, $param{$param_name};
    }

    ## Input file(s)
    ## A hash is tested directly for emptiness: defined(%hash) is a
    ## fatal error since Perl 5.22.
    if (%main::infile) {
        print {$main::out} "; Input files\n";
        ## Sorted keys give a reproducible order, as for the output files
        foreach my $key (sort keys %main::infile) {
          printf {$main::out} ";\t%-22s\t%s\n", $key, $main::infile{$key};
        }
    }

    print {$main::out} &PrintThresholdValues();

    ## Output files
    if (%main::outfile) {
        print {$main::out} "; Output files\n";
        foreach my $key (sort keys %main::outfile) {
          my $value = $main::outfile{$key};
          printf {$main::out} ";\t%-30s\t%s\n", $key, $value;
        }
    }
}

=pod

=head1 REFERENCES

The program I<chip-seq-analysis> combines a series of tried-and-tested
programs which have been detailed in the following publications.

=over

=item I<oligo-analysis>

van Helden, J., Andre, B. and Collado-Vides, J. (1998). Extracting
regulatory sites from the upstream region of yeast genes by
computational analysis of oligonucleotide frequencies. J Mol Biol 281,
827-42.

=item I<dyad-analysis>

van Helden, J., Rios, A. F. and Collado-Vides, J. (2000). Discovering
regulatory elements in non-coding sequences by analysis of spaced
dyads. Nucleic Acids Res 28, 1808-18.

=item I<position-analysis>

van Helden, J., del Olmo, M. and Perez-Ortin,
J. E. (2000). Statistical analysis of yeast genomic downstream
sequences reveals putative polyadenylation signals. Nucleic Acids Res
28, 1000-10.

=item I<matrix-scan>

Turatsinze, J. V., Thomas-Chollier, M., Defrance, M. and van Helden,
J. (2008). Using RSAT to scan genome sequences for transcription
factor binding sites and cis-regulatory modules. Nat Protoc 3,
1578-88.

=back

=head1 SEE ALSO

=over

=item I<oligo-analysis>

=item I<dyad-analysis>

=item I<position-analysis>

=item I<matrix-scan>


=back

=head1 WISH LIST

=over

=item B<partial synthesis>

For the Web server: generate temporary synthetic table showing the
results already obtained so far, and finishing by a message "Partial
results, please don't forget to reload the file later".

=item B<motif_cluster>

Compare all discovered motifs (plus reference motif if specified) and
cluster them in order to extract a consensus motif.

=back


=cut

__END__
