#!/usr/bin/env perl
############################################################
#
# $Id: peak-quality,v 1.9 2012/10/16 07:09:13 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

peak-quality

=head1 VERSION

$program_version

=head1 DESCRIPTION

Evaluate the quality of a set of peaks by measuring its enrichment for
one or several reference motifs.


=head1 AUTHORS

Jacques.van-Helden\@univ-amu.fr

=head1 CATEGORY

=over

=item NGS

=back

=head1 USAGE

peak-quality [-i inputfile] [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

The program requires three input files : 

=over

=item B<Peak coordinates>

A set of peaks coordinates sorted by decreasing score, in BED format.

=item B<Peak sequences>

A set of sequences for these peaks, in fasta format (must be in the
same order as the peak coordinates in the bed file).

=item B<Reference motifs>

A file containing one or several reference matrix (matrices), in
transfac format.

=back

=head1 OUTPUT FORMAT

Various statistics are computed on the peaks, and drawings are
generated to give an intuition of the data.


=head2 Score versus length

Peak score as a function of its length.

=head2 Peak score distribution

Histogram of peak score distribution. 

=head2 Peak score by rank

Decreasing curve of peak scores (score as a function of the rank).

=head1 SEE ALSO

=head1 WISH LIST

=over

=item B<convert-features>

Add option -from swembl to the command I<convert-features> in ordefr
to automatically produce a consistent bed file: score at the 5th
column and name at the 4th column. The name should be the same as the
one generated by I<fetch-sequences>.

=item B<-task peakmo>

Run peak-motifs in the whole peak set and in each slice separately, to
compare discovered motifs.


=item B<-motif_db db_name db_format db_file>

File containinf a database of transcription factor binding motifs
(e.g. JASPAR, TRANSFAC, RegulonDB, ...) which will be compared to the
discovered motifs (task motifs_vs_db).

The option requires three arguments:

 - DB name

 - matrix format. same supported formats as convert-matrices, but we
   recommend to use a format that includes an ID and a name for each
   motif (e.g. TRANSFAC)

 - file containing the DB motifs

=back

=cut

BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
require "RSA.disco.lib";
require "footprint.lib.pl";
use RSAT::util;
use RSAT::MatrixReader;
use RSAT::SeqUtil;
use Data::Dumper;
use File::Basename;

################################################################
## Main package
package main;
{
  ## Check that the RSAT paths of the programs required for the script are specified

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  our %infile = ();
  our %outfile = ();
  our %dir = ();

  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;

  our @peaks = (); ## peak information
  our @peak_ids = ();
  our @peak_len = ();

  our %param = ();
  our @param_list = (); ## Parameters to print out in the report
  $param{nb_slices} = 5; push @param_list, "nb_slices"; ## Number of sequence slices
  $param{seq_per_slice} = 0; push @param_list, "seq_per_slice"; ## Number of sequences per slice
  $param{strand} = "-2str"; push @param_list, "strand"; ## Single- or two-strand analysis
  $param{seq_len_distrib_ci} = 100; push @param_list, "seq_len_distrib_ci"; ## class interval for sequence length distribution
  $param{nb_peaks_from_bed} = 0; push @param_list, "nb_peaks_from_bed"; ## Number of peaks defined in the bed file
  $param{nb_peaks_from_seq} = 0; push @param_list, "nb_peaks_from_seq"; ## Number of peaks found in the sequence file
  $param{top_matrices} = 0; push @param_list, "top_matrices"; ## Number of top matrices taken in consideration in the reference matrix file
  $param{matrix_permutations} = 5; push @param_list, "matrix_permutations"; ## Number of matrix permutations for matrix-quality (negative control)
  $param{decimals} = 1; push @param_list, "decimals"; ## Number of decimals for matrix scanning
  $param{scan_markov_order} = 1; push @param_list, "scan_markov_order"; ## Order of the markov model used for scanning sequences with the motif
  $param{one_site_per_peak} = 1; push @param_list, "one_site_per_peak"; ## Only analyze the top-scoring site per peak

  # Supported tasks
  our @supported_tasks = qw(seq_len
			    bg_model
			    slices
			    enrichment
			    peakmo
			  );
  our $supported_tasks = join ",", @supported_tasks;
  our %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }

#			    peakmmo

   ## Define tasks to execute
#   $task{slices} = 1; ## Extract peak sequence slices
#   $task{seq_len} = 1; ## Compute sequence lengths
#   $task{bg_model} = 1; ## compute background model
#   $task{enrichment} = 1; ## Analyze sequence slices with matrix-quality
#  $task{peakmo} = 1; ## Analyze sequence slices with peak-motifs

  ################################################################
  ## Read argument values
  &ReadArguments();

  ## If no task is specified, run all of them
  unless (scalar(keys(%task))) {
    %task = %supported_task;
  }

  ## If option -task all was called, activate all tasks
  if ($task{all}) {
    %task = %supported_task;
  }

  ################################################################
  ## Check argument values

  ## Output directory
  if ($dir{output}) {
    &RSAT::util::CheckOutDir($dir{output}, "", 755);
  } else {
    &RSAT::error::FatalError("You must define the output directory (option -outdir)");
  }

  ## Prefix
  unless ($param{prefix}) {
    $param{prefix} = "peak_quality";
#    &RSAT::error::FatalError("You must define a prefix for the output files (option -prefix)");
  }
  push @param_list, "prefix";

   ## Reference motif file
  if ($task{enrichment}) {
    unless ($infile{ref_motifs}) {
      &RSAT::error::FatalError("You must define a reference motif file (option -ref_motifs) to run enrichment analysis.");
    }
  }

  ################################################################
  ## Open log file
  $outfile{log} = &OutFileName("", ".txt", "log");
  $out = &OpenOutputFile($outfile{log});

  ################################################################
  ## Read bed coordinates
  &RSAT::message::TimeWarn("Reading bed file", $infile{bed});
  my $peak_nb = 0;
  my ($bed_handle) = &OpenInputFile($infile{bed});
  my $l = 0 ; ## Line counter
  my $bed_errors = 0;
  while (<$bed_handle>) {
    $l++; ## Increment line counter
    next if (/^#/); ## Skip header/comment rows
    next if (/^;/); ## Skip comment rows
    next unless (/\S/); ## Skip empty rows
    chomp();
    my ($chrom, $start, $end, $name, $score) = split(/\t/);

    ## Check score
    unless (&RSAT::util::IsReal($score)) {
      $bed_errors++;
      &RSAT::message::Warning("Line", $l, "skipped because of invalid score", $score);
      next;
    }

    ## Check coordinates
    unless (&RSAT::util::IsNatural($start)) {
      $bed_errors++;
      &RSAT::message::Warning("Line", $l, "skipped because of invalid start", $start);
      next;
    }
    unless (&RSAT::util::IsNatural($end)) {
      $bed_errors++;
      &RSAT::message::Warning("Line", $l, "skipped because of invalid end", $end);
      next;
    }

    my $len = $end - $start + 1;
    if ($len < 0) {
      $bed_errors++;
      &RSAT::message::Warning("Line", $l, "skipped because start (".$start.") smaller than end (".$end.")");
      next;
    }

    ## Check name
    unless ($name) {
      $name = join ("", $chrom, "_", $start, "_", $end, "+");
    }

    $peak_nb++; ## Increment feature counter

    $peaks[$peak_nb]->{chrom} = $chrom;
    $peaks[$peak_nb]->{start} = $start;
    $peaks[$peak_nb]->{end} = $end;
    $peaks[$peak_nb]->{name} = $name;
    $peaks[$peak_nb]->{score} = $score;
    $peaks[$peak_nb]->{len} = $len;
  }
  close $bed_handle;
  if ($main::verbose >= 1) {
    &RSAT::message::TimeWarn("Read ".$peak_nb." peaks");
    &RSAT::message::TimeWarn("Skipped ".$bed_errors." invalid rows.");
  }
  $param{nb_peaks_from_bed} = $peak_nb;

  ################################################################
  ## Sequence lengths
  &RSAT::message::TimeWarn("Computing sequence lengths");
  $outfile{seq_len} = &OutFileName("seq_stats", ".tab", "seq_len");
  $outfile{seq_len_distrib} = &OutFileName("seq_stats", ".tab", "seq_len_distrib_all_peaks");
  @seq_len_distrib_files = ($outfile{seq_len_distrib});
  if ($task{seq_len}) {
    my $cmd =  &RSAT::server::GetProgramPath("sequence-lengths");
    $cmd .= " -i ".$infile{seq};
    $cmd .= " -o ".$outfile{seq_len};
    $cmd .= "; classfreq -v 1";
    $cmd .= " -i ".$outfile{seq_len};
    $cmd .= " -ci ".$param{seq_len_distrib_ci};
    $cmd .= " -o ".$outfile{seq_len_distrib};
    &one_command($cmd, 1);
  }

  ################################################################
  ## Read sequence lengths and check that they are compatible with
  ## slice nb and size.
  ##
  ## We need to take this information from the sequence file rather
  ## than the peak file, because some peaks could correspond to no
  ## sequence (e.g. there are sometimes problem with peaks on the Mt
  ## chromosome).
  my ($len_handle) = &OpenInputFile($outfile{seq_len});
  while (<$len_handle>) {
    next if (/^;/); # Skip comment lines
    next if (/^#/); # Skip header line
    next unless (/\S/); # Skip empty lines
    chomp();
    my ($id, $len) = split ("\t");
    push @peak_ids, $id;
    push @peak_len, $len;
  }
  close ($len_handle);


  ################################################################
  ## Define number of slices per sequence if required
  $param{nb_peaks_from_seq} = scalar(@peak_ids);
  unless ($param{seq_per_slice}) {
    $param{seq_per_slice} = $param{nb_peaks_from_seq} / $param{nb_slices};
  }

  if (scalar(@peak_ids) < $param{nb_slices} * $param{seq_per_slice}) {
    my $warning = join("\n",
		       "Number of peak sequences ($param{nb_peaks_from_seq}) is insuffient to extract",
		       $param{nb_slices}, "slices of",
		       $param{seq_per_slice}, "sequences.",
		       "Changed to ".$param{nb_peaks_from_seq} / $param{nb_slices});
    &RSAT::message::Warning($warning);
    print $out "\n\nWARNING\n", $warning , "\n";
    $param{seq_per_slice} = $param{nb_peaks_from_seq} / $param{nb_slices};
  }

  ################################################################
  ## Prepare sequence slices
  $param{slice_spacing} = ($param{nb_peaks_from_seq} - $param{nb_slices}*$param{seq_per_slice}) / ($param{nb_slices}-1);
  push @param_list, "slice_spacing"; ## Number of sequences skipped between two successive slices
  foreach my $s (1..$param{nb_slices}) {
    my $first_seq = &RSAT::util::round(($s-1) * ($param{seq_per_slice} + $param{slice_spacing}))+1;
    my $last_seq = &RSAT::util::round($first_seq + $param{seq_per_slice} -1);
    $last_seq = &RSAT::stats::min($last_seq, $param{nb_peaks_from_seq}); ## Make sure that rounding effects don't provoke an excessive value for last_seq
    &RSAT::message::TimeWarn("Extracting slice",
			     $s."/".$param{nb_slices},
			     $first_seq." -> ".$last_seq,
			     $param{seq_per_slice}." sequences"
			    )
      if ($main::verbose >= 1);


    ## Compute slice statistics
    my %slice_stats = &RSAT::stats::summary(@peak_len[($first_seq-1)..($last_seq-1)]);
    $slice_stats[$s]->{s} = $s;
    $slice_stats[$s]->{L_n} = $slice_stats{n};
    $slice_stats[$s]->{L_sum} = $slice_stats{sum};
    $slice_stats[$s]->{L_mean} = sprintf("%.1f", $slice_stats{mean});
    $slice_stats[$s]->{L_med} = $slice_stats{median};
    $slice_stats[$s]->{L_sd} = sprintf("%.1f", $slice_stats{sd});
    $slice_stats[$s]->{L_min} = $slice_stats{min};
    $slice_stats[$s]->{L_max} = $slice_stats{max};
    $slice_stats[$s]->{first} = $first_seq;
    $slice_stats[$s]->{last} = $last_seq;
    $slice_stats[$s]->{seq_nb} = $last_seq - $first_seq + 1;
    $slice_stats[$s]->{id} = "slice".$s."_".$first_seq."-".$last_seq;

    ## Extract sequence slices from the input sequence file
    $outfile{"seq_slice_".$s} = &OutFileName("sequences", ".fasta", $slice_stats[$s]->{id}, "seq");
    if ($task{slices}) {
      my $skip_seq = $first_seq -1;
      my $cmd =  &RSAT::server::GetProgramPath("convert-seq");
      $cmd .= " -i ".$infile{seq};
      $cmd .= " -from fasta -to fasta";
      $cmd .= " -dna";
      $cmd .= " -mask non-dna";
      $cmd .= " -skip ".$skip_seq;
      $cmd .= " -top ".$last_seq;
      $cmd .= " -o ".$outfile{"seq_slice_".$s};
      &one_command($cmd, 1);
    }

    ## Compute seq length distribution for each slice
    $outfile{"seq_len_distrib_slice_".$s} = &OutFileName("seq_stats", ".tab", "seq_len_distrib_slice_".$s);
    push @seq_len_distrib_files, $outfile{"seq_len_distrib_slice_".$s};
    if ($task{seq_len}) {
      $cmd =  &RSAT::server::GetProgramPath("sequence-lengths");
      $cmd .= " -i ".$outfile{"seq_slice_".$s};
      $cmd .= " | classfreq -v 1";
      $cmd .= " -ci ".$param{seq_len_distrib_ci};
      $cmd .= " -o ".$outfile{"seq_len_distrib_slice_".$s};
      &one_command($cmd, 1);
    }
  }

  ## Compare sequence length distributions between slices
  $outfile{seq_len_distrib_compa} = &OutFileName("seq_stats", ".tab", "seq_len_distrib_comparison");
  $outfile{seq_len_distrib_compa_plot_png} = &OutFileName("seq_stats", ".png", "seq_len_distrib_comparison");
  $outfile{seq_len_distrib_compa_plot_pdf} = &OutFileName("seq_stats", ".pdf", "seq_len_distrib_comparison");
  if ($task{seq_len}) {
    $cmd =  &RSAT::server::GetProgramPath("compare-scores");
    $cmd .= " -v 1";
    $cmd .= " -ic 3 -numeric -null .";
    $cmd .= " -sc 9";
    $cmd .= " -suppress '.tab'";
    $cmd .= " -suppress ".$dir{output}."/seq_stats/";
    $cmd .= " -suppress ".$param{prefix}."_seq_len_distrib_";
    $cmd .= " -o ".$outfile{seq_len_distrib_compa};
    $cmd .= " -files ";
    $cmd .= join " ", @seq_len_distrib_files;
    &one_command($cmd, 1);

    ## Generate an XY plot of sequence length distributions
    foreach my $img_format ("png", "pdf") {
      my $last_col = 1 + scalar(@seq_len_distrib_files);
      $cmd =  &RSAT::server::GetProgramPath("XYgraph");
      $cmd .= " -i ".$outfile{seq_len_distrib_compa};
      $cmd .= " -xsize 800";
      $cmd .= " -ysize 400";
      $cmd .= " -xcol 1";
      $cmd .= " -xmin 0";
      $cmd .= " -ymin 0 -ymax 1";
      $cmd .= " -hline violet 0.5";
      $cmd .= " -ygstep1 0.1";
#      $cmd .= " -xgstep1 ".$param{seq_len_distrib_ci};
      $cmd .= " -ycol 2-".$last_col;
      $cmd .= " -lines -pointsize 0 -legend";
      $cmd .= " -title1 'Sequence length distributions'";
      $cmd .= " -title2 ".$param{prefix};
      $cmd .= " -xleg1 'Sequence length'";
      $cmd .= " -yleg1 'Decreasing cumulative frequency: F(L <= x)'";
      $cmd .= " -format ".$img_format;
      $cmd .= " -o ".$outfile{"seq_len_distrib_compa_plot_".$img_format};
      &one_command($cmd, 1);
    }
  }

  ## Print out slice stats
  my @slice_stat_fields = qw(s
			     id
			     first
			     last
			     seq_nb
			     L_n
			     L_min
			     L_max
			     L_sum
			     L_mean
			     L_med
			     L_sd
			    );

  my %stat_descr = ();
  $stat_descr{s} = "Slice number";
  $stat_descr{id} = "Slice identifier";
  $stat_descr{first} = "Index of first sequence in the slice";
  $stat_descr{last} = "Index of last sequence in the slice";
  $stat_descr{seq_nb} = "Number of sequences in the slice";
  $stat_descr{L_n} = "Number of sequences in the slice";
  $stat_descr{L_min} = "Min sequence length in the slice";
  $stat_descr{L_max} = "Max sequence length in the slice";
  $stat_descr{L_sum} = "Sum of sequence lengths in the slice";
  $stat_descr{L_mean} = "Mean sequence length in the slice";
  $stat_descr{L_med} = "Median sequence length in the slice";
  $stat_descr{L_sd} = "Standard deviation of sequence length in the slice";
  $outfile{slice_stats} = &OutFileName("seq_stats", ".tab", "slice_statistics");
  my $slice_stat_hdl = &OpenOutputFile($outfile{slice_stats});
  my $s = 0;
  foreach my $stat (@slice_stat_fields) {
    $s++;
    print $slice_stat_hdl join ("\t", ";", $s, $stat, $stat_descr{$stat}), "\n";
  }
  print $slice_stat_hdl "#", join ("\t",  @slice_stat_fields), "\n";
  foreach my $s (1..$param{nb_slices}) {
    my @stats = ();
    foreach my $stat (@slice_stat_fields) {
      push @stats, $slice_stats[$s]->{$stat};
    }
    print $slice_stat_hdl join ("\t", @stats), "\n";
  }
  close $slice_stat_hdl;


  ################################################################
  ## Compute background model
  $param{bg_oligo_len} = $param{scan_markov_order} + 1;
  $outfile{bg_model_freq} = &OutFileName("bg_models", ".tab", $param{bg_oligo_len}."nt", "freq");
  $outfile{bg_model_inclusive} = &OutFileName("bg_models", ".inclusive", $param{bg_oligo_len}."nt", "freq");
  $outfile{bg_model_transitions} = &OutFileName("bg_models", ".tab", "markov".$param{scan_markov_order}, "transitions");
  $outfile{bg_model_heatmap} = &OutFileName("bg_models", ".png", "markov".$param{scan_markov_order}, "transitions");

  if ($task{bg_model}) {
    &RSAT::message::TimeWarn("Computing background model") if ($main::verbose >= 1);

    ## Compute oligonucleotide frequencies (k = m+1) in input sequence file
    $cmd =  &RSAT::server::GetProgramPath("count-words");
    $cmd .= " -v 1";
    $cmd .= " -i ".$main::infile{seq};
    $cmd .= " -l ".$param{bg_oligo_len};
    $cmd .= " -1str";
    $cmd .= " > ".$outfile{bg_model_freq};
    &one_command($cmd, 1);

    ## Convert background model in INCLUSIVE format for matrix-scan-quick
    $cmd =  &RSAT::server::GetProgramPath("convert-background-model");
    $cmd .= " -from oligos -to inclusive ";
    $cmd .= " -i ".$main::outfile{bg_model_freq};
    $cmd .= " -o ".$main::outfile{bg_model_inclusive};

    ## Convert background model to transition table and draw the heatmap of transition probabilities
    $cmd .= "; ". &RSAT::server::GetProgramPath("convert-background-model");
    $cmd .= " -from oligos -to transitions ";
    $cmd .= " -i ".$main::outfile{bg_model_freq};
    $cmd .= " -o ".$main::outfile{bg_model_transitions};
    $cmd .= " ; cut -f 1-5,7 ".$main::outfile{bg_model_transitions};
    $cmd .= " | ". &RSAT::server::GetProgramPath("draw-heatmap");
    $cmd .= " -min 0 -max 1  -out_format png -col_width 50";
    $cmd .= " -o ".$main::outfile{bg_model_heatmap};
    &one_command($cmd, 1);

    foreach my $s (1..$param{nb_slices}) {
      $outfile{"bg_model_slice".$s."_freq"} = &OutFileName("bg_models", ".tab", "slice".$s, $param{bg_oligo_len}."nt", "freq");
      $outfile{"bg_model_slice".$s."_inclusive"} = &OutFileName("bg_models", ".inclusive", "slice".$s, $param{bg_oligo_len}."nt", "freq");
      $outfile{"bg_model_slice".$s."_transitions"} = &OutFileName("bg_models", ".tab", "slice".$s, "markov".$param{scan_markov_order}, "transitions");
      $outfile{"bg_model_slice".$s."_heatmap"} = &OutFileName("bg_models", ".png", "slice".$s, "markov".$param{scan_markov_order}, "transitions");
      &RSAT::message::TimeWarn("Computing background model for slice", $s."/".$param{nb_slices}) if ($main::verbose >= 1);

      ## Compute oligonucleotide frequencies (k = m+1) in input sequence file
      $cmd =  &RSAT::server::GetProgramPath("count-words");
      $cmd .= " -v 1";
      $cmd .= " -i ".$outfile{"seq_slice_".$s};
      $cmd .= " -l ".$param{bg_oligo_len};
      $cmd .= " -1str";
      $cmd .= " > ".$outfile{"bg_model_slice".$s."_freq"};
      &one_command($cmd, 1);

      ## Convert background model in INCLUSIVE format for matrix-scan-quick
      $cmd =  &RSAT::server::GetProgramPath("convert-background-model");
      $cmd .= " -from oligos -to inclusive ";
      $cmd .= " -i ".$main::outfile{"bg_model_slice".$s."_freq"};
      $cmd .= " -o ".$main::outfile{"bg_model_slice".$s."_inclusive"};

      ## Convert background model to transition table and draw the heatmap of transition probabilities
      $cmd .= "; ". &RSAT::server::GetProgramPath("convert-background-model");
      $cmd .= " -from oligos -to transitions ";
      $cmd .= " -i ".$main::outfile{"bg_model_slice".$s."_freq"};
      $cmd .= " -o ".$main::outfile{"bg_model_slice".$s."_transitions"};
      $cmd .= " ; cut -f 1-5,7 ".$main::outfile{"bg_model_slice".$s."_transitions"};
      $cmd .= " | ". &RSAT::server::GetProgramPath("draw-heatmap");
      $cmd .= " -min 0 -max 1  -out_format png -col_width 50";
      $cmd .= " -o ".$main::outfile{"bg_model_slice".$s."_heatmap"};
      &one_command($cmd, 1);
    }
  }

  ################################################################
  ## Run peak-motifs to discover exceptional motifs in every slice.
  foreach my $s (1..$param{nb_slices}) {
      &RSAT::message::TimeWarn("Running peak-motifs on slice", $s."/".$param{nb_slices}) if ($main::verbose >= 1);
      $outfile{"peak_motifs_slice".$s} = &OutFileName("discovered_motifs", "", "peak-motifs_slice".$s);
      my $prefix = join ("_", $param{prefix}, $slice_stats[$s]->{id});
      my $cmd =  &RSAT::server::GetProgramPath("peak-motifs");
      $cmd .= " -v ".$verbose;
      $cmd .= " -i ".$outfile{"seq_slice_".$s};
      $cmd .= " -title '".$prefix."'";
      $cmd .= " -prefix '".$prefix."'";
      $cmd .= " -markov auto";
      $cmd .= " -disco oligos,positions";
      $cmd .= " -nmotifs 3";
      $cmd .= " -minol 7";
      $cmd .= " -maxol 7";
      $cmd .= " ".$param{strand};
      #	$cmd .= " -motif_db jaspar_core_vertebrates tf $RSAT/public_html/motif_databases/JASPAR/jaspar_core_vertebrates_2009_10.tf";
      $cmd .= " -source galaxy";
      $cmd .= " -task purge,seqlen,composition,disco,merge_words,collect_motifs,synthesis,scan";
      $cmd .= " -prefix peak-motifs";
      $cmd .= " -noov";
      $cmd .= " -img_format png";
      $cmd .= " -outdir ".$outfile{"peak_motifs_slice".$s};
      if ($task{peakmo}) {
	&one_command($cmd, 1);
      }
    }


  ################################################################
  ## Use matrix-quality to compute motif enrichment in the different
  ## slices
  $outfile{quality_prefix} = &OutFileName("quality", "", "slice_enrichment");

  if ($task{enrichment}) {
    &RSAT::message::TimeWarn("Running matrix-quality") if ($main::verbose >= 1);
    my $cmd =  &RSAT::server::GetProgramPath("matrix-quality");
    $cmd .= " -v ".$main::verbose;
    $cmd .= " -m ".$infile{ref_motifs};
    $cmd .= " -matrix_format transfac";
    $cmd .= " -top ".$param{top_matrices};
    $cmd .= " -no_cv";
    $cmd .= " ".$param{strand};
    $cmd .= " -perm all_peaks ".$param{matrix_permutations};
    $cmd .= " -decimals ".$param{decimals};
    $cmd .= " -bgfile ".$outfile{bg_model_inclusive};
    $cmd .= " -bg_format inclusive";
    foreach my $s (1..$param{nb_slices}) {
      $cmd .= " -seq ".$slice_stats[$s]->{id}." ".$outfile{"seq_slice_".$slice_stats[$s]->{s}};
      if ($param{one_site_per_peak}) {
	$cmd .= " -scanopt $slice_stats[$s]->{id} ' -uth rank 1'"; ## only consider the top-scoring site per matrix
      }
    }
    ## Scan all sequences only in quick mode (and thus skip it in "one site per peak" mode)
    unless ($param{one_site_per_peak}) {
      $cmd .= " -seq all_peaks ".$infile{seq};
#      if ($param{one_site_per_peak}) {
#	$cmd .= " -scanopt all_peaks ' -uth rank 1'"; ## only consider the top-scoring site per matrix
#      }
    }
    $cmd .= " -o ".$outfile{quality_prefix};
    &one_command($cmd, 1);
  }

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose >= 1);

  ################################################################
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
  print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $out if ($outfile{output});

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message 
sub PrintHelp {
  system "pod2text -c $0";
  exit()
}

################################################################
## Display short help message
sub PrintOptions {
  &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();


=pod

=item B<-bed peak_coordinate_file>

Peak coordinates and score provided in bed format.
Each row corresponds to one peak.
Columns must be provided in the following order:
1. Chromosome
2. Start
3. End
4. Name
5. Score

B<Important remarks>

=over


=item I<Name>

peak names of the bed files should be consistent with those of the
fasta file.

=item I<Score column>

The score coliumn is required, since it is used for computing
various statistics.

=item SWEMBL files must be converted in order to get the name and
score at the right position.

=back

=cut
    } elsif ($arg eq "-bed") {
      $infile{bed} = shift(@arguments);

=pod

=item B<-seq peak_sequence_file>

Name of the file containing the peak sequences (must be in fasta
format).

B<Important remarks>

=over

=item Peak names of the sequence files should be consistent with
those of the bed file.

=item Sequences should be sorted in the same order as the bed file.

=back

=cut
    } elsif ($arg eq "-seq") {
      $infile{seq} = shift(@arguments);

=pod

=item B<-ref_motifs reference_motif>

Reference motif(s)

A file containing one or several reference motif(s)
(i.e. position-specific scoring matrices) for the peak sequences,
i.e. the motifs for which the enrichment will be assessed in the
different peak slices.

Must be in TRANSFAC format (use I<convert-matrix> to obtain
transfac-formatted matrices).

=cut
    } elsif ($arg eq "-ref_motifs") {
      $infile{ref_motifs} = shift(@arguments);

=pod

=item B<-top_matrices top_matrices>

Maximal number of reference matrices to take in consideration for
matrix-quality.

Default: 0 (means that all matrices of the ref_motif file are analyzed).

Reference motif files may contain one or several matrices. In some
cases, databases contain distinct matrices for the same transcription
factor, for different reasons (method to collect the sites, or species
accepted for building the matrix). 

If the reference file contains multiple motifs, each of them will be
analyzed in turn, unless the number of motifs is restrictd with the
parameter -top_matrices.

=cut
    } elsif ($arg eq "-top_matrices") {
      $param{top_matrices} = shift(@arguments);
      &RSAT::error::FatalError($param{top_matrices}, "Invalid value for option -top. Must be a strictly positive Natural number")
	unless ((&IsNatural($param{top_matrices})) && ($param{top_matrices} > 0));

=pod

=item	B<-o output_directory>

Output directory. All result files will be stored in this directory.

Mandatory argument.

=cut
    } elsif ($arg eq "-o") {
      $dir{output} = shift(@arguments);

=pod

=item	B<-prefix output_prefix>

Prefix for the output files.

=cut
    } elsif ($arg eq "-prefix") {
      $param{prefix} = shift(@arguments);


=pod

=item B<-slices nb_slices>

Number of "slices", i.e. subsets of the sorted peaks submitted to
enrichment analysis.

Default: 5

=cut
    } elsif ($arg eq "-slices") {
      $param{nb_slices} = shift(@arguments);
      &RSAT::error::FatalError($param{nb_slices}, "Invalid number of slices, must be a strictly positive Natural number")
	unless (&RSAT::util::IsNatural($param{nb_slices}));


=pod

=item B<-seq_per_slice nb_seq_per_slice>

Number of sequences per slice.  If not specified, this parameter is
automatically set to N / s, where N is the number of peak sequences
and s the number of slices.

=cut
    } elsif ($arg eq "-seq_per_slice") {
      $param{seq_per_slice} = shift(@arguments);
      &RSAT::error::FatalError($param{seq_per_slice}, "Invalid number of sequences per slice, must be a strictly positive Natural number")
	unless (&RSAT::util::IsNatural($param{seq_per_slice}));

=pod

=item B<-one_site_per_peak>

Active by default. Can be inactivated with the option I<-all_sites>.

Adapt I<matrix-quality> parameters to consider only the top-scoring
site for each matrix.  By default, I<matrix-quality> analyzes the full
empirical distribution of scores along all positions of the
sequences. However, the biological model behind ChIP-seq analysis is
that, ideally, each peak corresponds to one binding
location. Consequently, the enrichment of a peak collection for a
given motif (matrix) is better estimated by considering the
top-scoring site in each peak.

Warning: the option -one_site_per_peak costs time, because sequences
are scanned with the slow Perl script matrix-scan rather than the fast
C program matrix-scan-quick.

=cut

    } elsif ($arg eq "-one_site_per_peak") {
      $param{one_site_per_peak} = 1;

=pod

=item B<-all_sites>

Analyze score distribution of all sites (rather than the top-scoring
site per peak). See option I<-one_site_per_peak>.

=cut

    } elsif ($arg eq "-all_sites") {
      $param{one_site_per_peak} = 0;

=pod

=item B<-task>

Specify a subset of tasks to be executed.

By default, the program runs all necessary tasks. However, in some
cases, it can be useful to select one or several tasks to be executed
separately.

Beware: task selection requires expertise, because most tasks depends
on the prior execution of some other tasks in the workflow. Selecting
tasks before their prerequisite tasks have been completed will provoke
fatal errors.

B<Default tasks>

=over

=item I<all> (default)

Run all supported tasks.

=item I<slices>

Extract peak sequence slices.

=item I<seq_len>

Compute sequence lengths.

=item I<bg_model>

Compute background model.

=item I<enrichment>

Analyze sequence slices with I<matrix-quality>. This requires to enter a
reference motif file.

=item I<peakmo>

Analyze sequence slices with I<peak-motifs>.

=back

=cut
    } elsif ($arg eq "-task") {
      my @requested_tasks = split ",", shift (@arguments);
      foreach my $task (@requested_tasks) {
	next unless $task;
	if ($supported_task{$task}) {
	  $task{$task} = 1;
	} else {
	  &RSAT::error::FatalError("Task '$task' is not supported. \n\tSupported: $supported_tasks");
	}
      }

=pod

=item B<-1str | -2str>

Single-strand (-1str) or double-strand (-2str) analysis.

The default is double-strand analysis (-2str), since ChIP-seq results
have no particular strand orientation.

However, for some data types such as CLIP-seq it might be relevant to
analyze single-strand motifs (option -1str).

=cut
    } elsif ($arg eq "-1str") {
      $param{strand} = "-1str";
    } elsif ($arg eq "-2str") {
      $param{strand} = "-2str";


    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }

=pod

=back

=cut
  }

}


################################################################
## Verbose message
sub Verbose {
  print $out "; peak-quality ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;

  ## List tasks
  printf $out "; %-22s\t%s\n", "Requested tasks", join(",", sort keys %task);

  ## List directories
  if (%dir) {
    print $out "; Directories\n";
    foreach my $key (sort keys (%dir)) {
      my $value = $dir{$key};
#    while (my ($key,$value) = each %dir) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }

  ## List input files
  if (%infile) {
    print $out "; Input files\n";
    foreach my $key (sort keys (%infile)) {
      my $value = $infile{$key};
#    while (my ($key,$value) = each %infile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }

  ## List output files
  if (%outfile) {
    print $out "; Output files\n";
    foreach my $key (sort keys (%outfile)) {
      my $value = $outfile{$key};
#    while (my ($key,$value) = each %outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }

  ## List parameter values
  print $out "; Parameter values\n";
  foreach my $param_name (@param_list) {
    print $out sprintf ";\t%-22s\t%s\n", $param_name, $param{$param_name};
  }
}


__END__
