#!/usr/bin/perl

## This script is overly complex (French idiom: "une usine a gaz", literally "a gas factory")

############################################################
#
# $Id: gene-cluster-motifs,v 1.300 2013/08/09 05:29:57 jvanheld Exp $
#
# Time-stamp: <2003-10-21 01:19:12 jvanheld>
#
############################################################
BEGIN {
    ## Add the "lib/" directory located beside this script to the
    ## library search path. The prematch variable holds whatever
    ## precedes the script name in $0, i.e. the directory part of the
    ## invocation path (empty when the script is called without path).
    if ($0 =~ /([^(\/)]+)$/) {
	my $script_dir = $`;
	push (@INC, $script_dir."lib/");
    }

    ## Legacy RSAT libraries, loaded in this exact order
    foreach my $library ("RSA.lib",
			 "RSA.seq.lib",
			 "RSA.disco.lib",
			 "RSA.help.pl",
			 "RSA2.cgi.lib") {
	require $library;
    }

    ## Parsing utilities, found relative to the RSAT parsers directory
    push (@INC, $ENV{RSAT}."/perl-scripts/parsers/");
    foreach my $library ("lib/load_classes.pl",
			 "lib/util.pl",
			 "lib/parsing_util.pl") {
	require $library;
    }

    ## Class definitions, loaded through their absolute path
    require $ENV{RSAT}."/perl-scripts/lib/RSA.classes";
}

#require RSAT::server; ## For program paths
require RSAT::util;
require RSAT::server;
require RSAT::matrix;
require RSAT::pattern;
require RSAT::Analysis;
require RSAT::Family;
require RSAT::MatrixReader;

package main;
{

  ## Starting time of the script, passed to
  ## &RSAT::util::ReportExecutionTime() at the end of the main block.
  local $start_time = &RSAT::util::StartScript();
  ## Program version, built from the numbers found in the CVS Revision
  ## keyword: the first number, a dot, then the remaining number(s).
  local $program_version = do { my @r = (q$Revision: 1.300 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

  ## Paths of the programs called by gene-cluster-motifs
  local $count_words_cmd = &RSAT::server::GetProgramPath("count-words");
  local $matrix_scan_cmd = $ENV{RSAT}."/perl-scripts/matrix-scan";

  ## When set, SetFileNames() uses the old "_up<length>" file prefix
  ## (maintained for backward compatibility) instead of "_from<from>_to<to>".
  $size_names = 0;

  ################################################################
  ## initialize global variables
  ##
  ## All the variables below are package globals: they are read and
  ## modified by the subroutines of this script (presumably including
  ## &ReadArguments(), which is not visible from here - TODO confirm).
  $toppat = 50;
  $organism = "";		## Replaced by a RSAT::organism object when an organism name is specified
  $org_fam = 0;			## If set, organisms are read from the second column of the family file
  $taxon = "";			## If set, appended to the file prefixes and used to name the ortholog files
  $force_calib=1;
  $null="NA";			## Null character for the exports
  $skip=0;			## Skip the first clusters
  $last=-1;			## Stop after a few clusters
  $batch = 0;		   ## Run task in batch mode (on a PC cluster)
  @batch_commands = ();	       ## Set of commands to run in batch mode
  #$batch_script = ""; ## Script collecting the commands to be run on the same node of the PC cluster
  $min_genes = 1;		## Min number of genes per family
  $max_genes = -1;		## Max number of genes per family (a negative value disables the upper limit)
  $rel_w = 1;	  ## Threshold on relative weight for compare-patterns
  $sliding_window_size = 0;
  $die_on_error = 1;
  $analyze_purged_sequences = 1;	## If set, the purged sequence file is used as input for motif discovery
  $calibN_repet=100;		### Repetitions for the calibN
  $db_site_name="";		## Included in the names of the "__vs__db_" comparison files
  %dir = ();			## Directories, indexed by role (main, output, calib1, ...)
  %lth = ();			## Presumably lower thresholds, indexed by parameter name - TODO confirm
  $lth{occ} = 1;
  %uth = ();			## Presumably upper thresholds, indexed by parameter name - TODO confirm
  ## The main directory is the working directory at the time the script is started
  $dir{main} = `pwd`;
  chomp($dir{main});
  %family = ();			## Index of clusters per name
  @families = ();		## Families
  @selected = ();	  ## Clusters selected with the option -select

  $markov = 0; ## Default method for oligos/dyads background model is not Markov. Set to 1 with the option -markov
  $markov_order = 2;

  $dir{calib1} = "calibrations_1gene";
  %max_score = ();

  ## Supported tasks
  ##
  ## The tasks are stored in three forms: the ordered list
  ## (@supported_tasks), an index for fast membership testing
  ## (%supported_task), and a comma-separated string
  ## ($supported_tasks, presumably displayed in the help and error
  ## messages - TODO confirm).
  ##
  ## NOTE: comments cannot be inserted inside the qw() list below,
  ## since they would be treated as list elements.
  @supported_tasks = qw (upstream
  		         sequences
			 upstream_ensembl
  		         sequences_ensembl
			 purge

			 compute_bg
			 calibrate
			 calibN
			 oligos
			 merge_oligos
			 assemble_oligos
			 oligo_maps
			 db_match_oligos
			 sig_distrib_oligos
			 validate_oligos

			 dyads
			 dyad_maps
			 db_match_dyads
			 sig_distrib_dyads
			 validate_dyads

			 pssm
			 oligos_pssm
			 dyads_pssm
			 orm_pssm

			 orm
			 assemble_orm
			 orm_maps
			 sig_distrib_orm
			 validate_orm

			 merge_patterns
			 assemble_patterns
			 maps

			 slide

			 consensus
			 consensus_maps

			 gibbs
			 gibbs_maps

			 AlignACE
			 AlignACE_maps

			 infogibbs
			 infogibbs_maps

			 MotifSampler

			 meme
			 meme_maps

			 motifs_vs_db

			 report
			 synthesis
			 sql
			 clean
			 all
			);
  ## Index the supported tasks by name
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }
  $supported_tasks = join ",", @supported_tasks;


  ## Keys for sorting the results in the summary table
  %supported_sort_key = (
			 "score"=>1,
			 "name"=>1
			);
  ## Comma-separated list of the supported sort keys, in alphabetical order
  $supported_sort_keys  = join ",", sort keys %supported_sort_key;


  ## Background models for string-based motif discovery
  %supported_background = (
			   "intergenic"=>1,
			   "upstream"=>1,
			   "upstreamL"=>1,
			   "upstream-noorf"=>1,
			   "calib1"=>1,
			   "calibN"=>1,
			   "upstream-rm"=>1,
			   "upstream-noorf-rm"=>1
			  );
  ## Comma-separated list of the supported background models, in alphabetical order
  $supported_background  = join ",", sort keys %supported_background;

  ## Background models
  local %exp_freq_file = (); ## specified with the options -oligo_exp_freq and -dyad_exp_freq
  local %bg_model_file = (); ## Background model files, indexed by pattern type (e.g. "dyads", see SetFileNames)

  ## input files
  $family_file = "";		## File describing the gene clusters (read with &ReadClasses)
  $sequence_file_list = "";	## If set, sequence files are read with &ReadSequenceList instead of being retrieved
  $all_seq_file = "";
  $all_seq_format = "fasta"; ## Sequence format for the input sequences (only useful in combination with the option -seq)
  $bg_seq_file = "";		## If set, background models are estimated from these sequences (see SetFileNames)

  ## feature-map options
  $htmaps=0;
  $draw_maps=1;
  $scalestep = 100;
  $img_format = $ENV{rsat_img_format} || "png"; ## Image format: taken from the environment if defined, png otherwise
  $map_origin = "-0";
  $scan_origin = "end";
  $quick_scan = "-quick";

  ## retrieve-seq options
  $noorf = "-noorf";
  $repeat_masked = 0;		## If set, the suffix "-rm" is appended to the background model name
  $seq_type = "upstream";
  $seq_format = "fasta"; ## Sequence format for the output files (upstream and purged sequences)
  $seq_dir = "";
  $seq_ext = "fasta";

  ## purge-seq options
  $purge_ml = 40;
  $purge_mis = 3;

  ## motif discovery options
  $background = "upstream-noorf";
  $noov = "-noov";
  $sort_key = "score";
  $two_tails = 0; ## two-tails test for oligo-analysis and dyad-analysis

  ## oligo-analysis options
  $min_oligo_len = 6;
  $max_oligo_len = 6;

  ## dyad-analysis options
  $monad_length = 3;
  $min_sp = 0;			## Minimal spacing (the dyad spacing is built as min_sp-max_sp in SetFileNames)
  $max_sp = 20;			## Maximal spacing

  ## Default matrix-based motif discovery options
  $matrix_width = 16;
  $expected_sites_per_seq = 2;	## Multiplied by the number of genes to obtain the expected sites per family
  $nmotifs = 3;
  $seed = undef;		## Random seed; only passed to the programs when defined
  $matrix_pseudo = 1;

  ## Program-specific keys for sorting matrices (PSSM)
  ## Values are defined below for each matrix-based motif discovery program
  %pssm_sort_key = ();
  %pssm_sort_order = ();

  ## Default MotifSampler options
  $MS_bg_order = 0;
  ## The default background model file is only defined when the
  ## environment variable MOTIF_SAMPLER_DIR is set. Otherwise,
  ## SetFileNames() uses a family-specific background file name.
  if (defined($ENV{MOTIF_SAMPLER_DIR})) {
    $MS_b =$ENV{MOTIF_SAMPLER_DIR}."/background_models/yeast_up800_nomit_noorf_o3.bg"; ### TEMPORARY
  }
  $MS_p = 0.2; ## Sets prior probability of 1 motif copy. (default 0.5).
  $MS_M = 0; ## Maximal number of motif instances per sequence. (default unset = 0)
  $MS_n = $nmotifs;	## Number of different motifs per sequence set
  $MS_w = $matrix_width;
  $MS_x = 1;			## allowed overlap between motifs
  $MS_r = 1;	       ## Number of repetitions of the gibbs per motif
  $pssm_sort_key{MotifSampler} = 'MS.ic';
  $pssm_sort_order{MotifSampler} = 'desc';

  ## Default MEME options
  ##
  ## The hash %MEME_options holds the option values, whereas the array
  ## @MEME_options records the option names in their order of definition.
  $MEME_options{text} = ""; push @MEME_options, "text";	## Output format = text
  $MEME_options{dna} = ""; push @MEME_options, "dna"; ## Sequence type == dna
  $MEME_options{mod} = "anr"; push @MEME_options, "mod"; ## Accept any number of occurrences per sequence
  $MEME_options{minw} = 8; push @MEME_options, "minw"; ## Minimal motif width
  $MEME_options{maxw}=20; push @MEME_options, "maxw"; ## Maximal motif width
  $MEME_options{nmotifs}=$nmotifs; push @MEME_options, "nmotifs"; ## Number of motifs
  $MEME_options{evt}=1; push @MEME_options, "evt"; ## upper threshold on E-value
  $pssm_sort_key{meme} = 'meme.E-value';
  $pssm_sort_order{meme} = 'asc';

  #$MEME_options{bfile}="";

  ## Default orm options
  $orm_lth_width = 10;	## Minimal window width for variable window size
  $orm_uth_wrank = 1;		     ## Max rank per word
  $orm_uth_rank=50;		     ## max rank (all words)
  $orm_lth_occ_sig= 0;	  ## lower threshold on occurrence significance
  #$orm_fixedsizewindow=0; ## Fixed window size
  $orm_overlap=0; ## Allow overlapping occurrences for self-overlapping words
  $orm_strand="+-";		   ## Strands
  $orm_word_length=6;		   ## Word length
  $orm_window=100;	       ## Window size for the background model
  $orm_markov_order=-1;	      ## markov order for the background model

  ## Default consensus options
  $pssm_sort_key{consensus} = 'cons.ln.Eval';
  $pssm_sort_order{consensus} = 'asc';

  ## Default gibbs options
  $pssm_sort_key{gibbs} = 'MAP.per.site';
  $pssm_sort_order{gibbs} = 'desc';

  ## Default AlignACE options
  $AlignACE_gcback=0.4332384392880; ## Background fractional GC content (passed to AlignACE with -gcback)
  $pssm_sort_key{AlignACE} = 'MAP.per.site';
  $pssm_sort_order{AlignACE} = 'desc';

  ## Default infogibbs options
  $pssm_sort_key{infogibbs} = 'total.information';
  $pssm_sort_order{infogibbs} = 'desc';
  $infogibbs_g = 1; ## Matthieu: what is this option used for?

  ## Additional infogibbs options
  ##
  ## The hash %infogibbs_options holds the option values, whereas the
  ## array @infogibbs_options records the option names in order. In
  ## SetFileNames(), one-letter names are passed with a single dash
  ## and longer names with a double dash.
  %infogibbs_options = ();
  $infogibbs_options{v} = 1; push @infogibbs_options, "v"; ## Verbosity
#  $infogibbs_options{l} = $matrix_width; push @infogibbs_options, "l"; ## Motif length is defined with option -width
#  $infogibbs_options{m} = $nmotifs; push @infogibbs_options, "m"; ## Number of motifs to return is defined with option -nmotifs
#  $infogibbs_options{e} = $expected_sites_per_seq; push @infogibbs_options, "e"; ## Expected number of sites per sequence is defined with option -sps
#  $infogibbs_options{s} = $strands; push @infogibbs_options, "s"; ## Strand(s) is defined with option -1str or -2str
  $infogibbs_options{n} = 1000; push @infogibbs_options, "n"; ## Number of iterations
  $infogibbs_options{d} = 5; push @infogibbs_options, "d"; ## Minimal distance between 2 motif occurrences
  $infogibbs_options{nrun} = 3; push @infogibbs_options, "nrun"; ## Number of runs per motif

  ## Comparison between discovered and known motifs
  $known_site_file = "";	## When non-empty, known sites are loaded with &ReadKnownSites()
  $known_sites_provided = 0;
  $min_matching_score = 4;
  $known_site_max_len=50;	## ignore known sites larger than 50bp

  ## miscellaneous
  $verbose = 0;			## Verbosity level (messages are issued when $verbose >= 1)

  #### data export options
  $schema = "multifam";
  $host="localhost";
  ## Default user name for the data export. getlogin() returns undef
  ## when the process has no controlling terminal (cron job, batch
  ## queue on a PC cluster), so fall back on the name associated with
  ## the real user ID, as recommended in the perlfunc documentation.
  $user = getlogin() || getpwuid($<);
  $password="";

  ################################################################
  ### MAIN

  #### Parse the command-line arguments
  &ReadArguments();

  #### Adapt the name of the background model to the sequence options:
  #### the default "upstream-noorf" model becomes "upstream" when the
  #### noorf option is inactive, and repeat-masked sequences require
  #### the corresponding "-rm" model.
  $background = 'upstream' if (($background eq 'upstream-noorf') && !($noorf));
  $background .= '-rm' if ($repeat_masked);

  #### Validate the parameters
  &CheckParameters();

  ################################################################
  ## Create class factories
  ##
  ## One factory is created per object type (analysis, family,
  ## pattern, matrix). The out fields presumably define the attributes
  ## exported for each object - TODO confirm in classes::ClassFactory.

  ## Unique prefix for each analysis, to allow merging them in a database
  ##
  ## The prefix is obtained by asking mktemp to create a file with a
  ## random 5-character name in the current directory. Only the name
  ## is of interest, so the file is removed immediately.
  local $run_date = &AlphaDate();
  local $run_prefix = `mktemp XXXXX`;
  chomp($run_prefix);
  system "rm $run_prefix" if (-f $run_prefix);

  ## Class holder for the analysis
  local $analysis_factory = classes::ClassFactory->new_class(object_type=>"RSAT::Analysis");
  $current_analysis = $analysis_factory->new_object(id=>$run_prefix);

  ## Class holder for clusters
  local $family_factory = classes::ClassFactory->new_class(object_type=>"RSAT::Family",
							   prefix=>$run_prefix."_fam");
  $family_factory->set_out_fields(qw(id analysis organism size name  genes));

  ## Class holder for patterns (oligos and dyads)
  ##
  ## NOTE: comments cannot be inserted inside the qw() lists below,
  ## since they would be treated as list elements.
  local $pattern_factory = classes::ClassFactory->new_class(object_type=>"RSAT::pattern",
							    prefix=>$run_prefix."_pat");
  @pattern_out_fields = qw(id
			   family_id
			   family
			   type
			   sequence
			   rev_compl
			   occ
			   exp_occ
			   occ_P
			   occ_E
			   occ_sig
			   rank
			   zscore
			   ratio
			  );
  $pattern_factory->set_out_fields(@pattern_out_fields);

  ## Class holder for the matrices (gibbs, AlignACE, meme, consensus, MotifSampler, infogibbs)
  ##
  ## The scalar fields are grouped by origin: general description,
  ## consensus strings, information content, then program-specific
  ## scores (prefixes cons., MAP/gibbs., meme., MS.) and provenance.
  local $matrix_factory = classes::ClassFactory->new_class(object_type=>"RSAT::matrix",
							   prefix=>$run_prefix."_mat");
  @matrix_scalar_out_fields = qw(id
				 family_id
				 family
				 program
				 sites
				 matrix.nb
				 ncol
				 nrow
				 type
				 alphabet.size

				 consensus.IUPAC
				 consensus.IUPAC.rc
				 consensus.strict
				 consensus.strict.rc
				 consensus.regexp
				 consensus.regexp.rc

				 total.information
				 info.log.base
				 information.per.column
				 max.bits
				 max.possible.info.per.col
				 min.prior

				 cons.Pval
				 cons.ln.Pval
				 cons.Eval
				 cons.ln.Eval
				 cons.adjusted.information
				 cons.unadjusted.information

				 MAP
				 MAP.per.site
				 gibbs.betaprior.map
				 gibbs.model.map
				 seed

				 meme.llr
				 meme.E-value

				 MS.ic
				 MS.ll
				 MS.cs
				 MS.consensus
				 MS.sequences

				 input_file
				 command
				);

  ## Matrix fields declared separately from the scalar ones
  ## (presumably multi-valued attributes - TODO confirm)
  @matrix_array_out_fields = qw(site_ids
				sequences
				alphabet
				prior
				parameters
				column.information
			       );
  $matrix_factory->set_out_fields(@matrix_scalar_out_fields, @matrix_array_out_fields);

  local %matrix_by_file = (); ## index for the matrices already treated

  ################################################################
  ## Read data
  ##
  ## Two input modes are supported: either a list of sequence files
  ## (one per family), or a family file describing the gene clusters,
  ## which fills the global index %family.

  ## Known sites
  if ($known_site_file) {
    &ReadKnownSites($known_site_file);
  }

  if ($sequence_file_list) {
    ## List of sequence files: the sequences are provided by the user,
    ## so the task "sequences" is disabled.
    $task{sequences} = 0;
    &ReadSequenceList();

  } else {
    ## Read clusters

    #### Check organism name
    if ($organism_name) {
      ### read gene name and identifiers
      &RSAT::message::TimeWarn("Reading ORF information")
	if ($verbose >= 1);

      ## Initialize the organism, which will be used both for
      ## identifying features and for choosing background models.
      ## The constructor is called with the arrow syntax rather than
      ## the indirect object syntax ("new RSAT::organism()"), which
      ## perl can mis-parse as a function call if a sub named "new"
      ## is visible in the current package.
      $organism = RSAT::organism->new();
      $organism->check_name($organism_name);
      $organism->set_attribute("name", $organism_name);

      ## Load the features of the organism in order to identify the
      ## genes. This is skipped when the sequences are provided in a
      ## single sequence file.
      unless ($all_seq_file) {
	$organism->DefineAcceptedFeatureTypes(sort keys %accepted_feature_types);
	$organism->LoadFeatures($annotation_table);
	$organism->LoadSynonyms();
      }

    } elsif ($org_fam) {
      &RSAT::message::Info("Organisms will be read from the second column of the family file") if ($verbose >= 1);
    } else {
      ## NOTE(review): the message mentions the option -seq, but this
      ## branch is also reached when a sequence file is specified
      ## without organism - confirm whether this is intended.
      &RSAT::error::FatalError("You must either specify an organism (-org), or an input sequence file (-seq)");
    }

    ### Read cluster file
    ### The second argument of ReadClasses() is 0 when the organism is
    ### either read from the family file or irrelevant (sequence file),
    ### and 1 when the organism object is passed as third argument.
    &RSAT::message::TimeWarn("Reading gene clusters") if ($main::verbose >= 1);
    if (($org_fam) || ($all_seq_file)) {
      %family = &ReadClasses($family_file, 0);
    } else {
      %family = &ReadClasses($family_file, 1, $organism);
    }
    &RSAT::message::Info("Gene clusters read", scalar(keys(%family))) if ($main::verbose >= 1);
  }


  ################################################################
  ## Filter the clusters: families which are too small, too large, or
  ## not selected by the user are first registered in
  ## %family_to_delete, then removed from the index %family.

  my %family_to_delete = ();
  @families = sort (keys (%family));


  ## Size filter (a negative $max_genes means "no upper limit")
  &RSAT::message::TimeWarn("Checking min and max number of genes") if ($main::verbose >= 1);
  foreach my $family_name (@families) {
    my $gene_nb = scalar(@{$family{$family_name}->{members}});
    my $too_small = ($gene_nb < $min_genes);
    my $too_large = (($max_genes >= 0) && ($gene_nb > $max_genes));

    if ($too_small) {
      if ($verbose >= 2) {
	&RSAT::message::Warning("Removing family",
				$fam_count, $family_name,
				$gene_nb." genes",
				"< min = ".$min_genes);
      }
      $family_to_delete{$family_name}++;
    }

    if ($too_large) {
      if ($verbose >= 2) {
	&RSAT::message::Warning("Removing family",
				$fam_count, $family_name,
				$gene_nb." genes",
				"> max = ".$max_genes);
      }
      $family_to_delete{$family_name}++;
    }
  }

  ## User-defined selection: each selected family must exist in the
  ## family file, and all the other families are discarded
  if (@selected) {
    &RSAT::message::Info("Selecting user-defined clusters") if ($main::verbose >= 1);
    my %selected = ();
    foreach my $family_name (@selected) {
      $selected{$family_name}++;
      unless (defined($family{$family_name})) {
	&FatalError(join ("\t", "Selected family", $family_name, "is not found in the family file", $family_file));
      }
      &RSAT::message::Warning("Selected family", $family_name) if ($verbose >= 3);
    }

    ## Register the non-selected clusters for deletion
    foreach my $family_name (grep { !$selected{$_} } keys (%family)) {
      $family_to_delete{$family_name}++;
    }
  }

  ## Remove the registered clusters and refresh the sorted list of names
  foreach my $family_name (keys(%family_to_delete)) {
    delete $family{$family_name};
    &RSAT::message::Warning("Deleted family", $family_name) if ($verbose >= 2);
  }
  @families = sort keys %family;

  &RSAT::message::Info("Remaining clusters after filtering", scalar(@families)) if ($verbose >= 1);

  ################################################################
  ## Skip the first or last clusters if required
  ##
  ## Both options refer to the ranks (starting at 1) of the clusters
  ## in the sorted list @families, as it stands before any deletion:
  ## the clusters ranked 1 to $skip are discarded, as well as those
  ## ranked above $last.
  if (($skip > 0) || ($last > 0)) {

    ## Apply the options -skip (suppress first clusters)
    if ($skip > 0) {
      &RSAT::message::Info("Skipping", $skip, "first clusters") if ($main::verbose >= 1);

      ## Never iterate beyond the end of the list, which would
      ## register an undefined family name for deletion
      my $skip_nb = ($skip < scalar(@families)) ? $skip : scalar(@families);
      for my $f (1..$skip_nb) {
	my $skipped_family = $families[$f-1];
	$family_to_delete{$skipped_family}++;
	&RSAT::message::Debug("Skipping first family", $f, $skipped_family) if ($verbose >= 2);
      }
    }

    ## Apply the options -last (suppress last clusters)
    if ($last > 0) {
      &RSAT::message::Debug("Skipping last", $last, "clusters", (scalar(@families)-$last)) if ($verbose >= 2);

      ## $f is an array index (starting at 0): the clusters ranked
      ## 1..$last occupy the indices 0..$last-1 and must be kept, so
      ## the deletion starts at index $last and includes the final
      ## element of the list.
      for my $f ($last..$#families) {
	my $skipped_family = $families[$f];
	$family_to_delete{$skipped_family}++;
	&RSAT::message::Debug("Skipping last family", $f+1, $skipped_family) if ($verbose >= 2);
      }
    }

    ## Update the list of clusters
    foreach my $family_name (keys(%family_to_delete)) {
      delete $family{$family_name};
      &RSAT::message::Warning("Deleted family", $family_name) if ($verbose >= 2);
    }
    @families = sort keys %family;

    &RSAT::message::Info(join("\t", "Remaining clusters after applying options -first and -last", scalar(@families))) if ($verbose >= 1);
  }

  ## Compute the names of all the files, for each family
  &SetFileNames();

  ## Create the output directories, then work from the output directory.
  ##
  ## Note: the main directory is an absolute path, whereas the output
  ## directory is generally specified relative to the main directory.
  ## This is why the script first moves to the main directory.
  chdir($dir{main});
  &RSAT::util::CheckOutDir($dir{output});
  chdir($dir{output});
  &MakeDirectories();

  #### Collect the verbose message
  if ($verbose >= 1) {
    $verbose_message = &Verbose();
  }

  ################################################################
  ## Define the names of the background model files (only relevant
  ## when background sequences have been specified)
  &define_bg_model_files() if ($bg_seq_file);

  ################################################################
  ### Run the analysis. The family-per-family analysis is always
  ### executed, whereas the other steps depend on the requested tasks.
  if ($task{compute_bg}) {
    &ComputeBG();
  }
  &AnalyzeFamilies();
  if ($task{report}) {
    &MDCreport();
  }
  if ($task{validate}) {
    &Validate();
  }

  ## Distributions of significance, for any of the pattern types
  my $sig_distrib_requested = ($task{sig_distrib_oligos}) || ($task{sig_distrib_dyads}) || ($task{sig_distrib_orm});
  if ($sig_distrib_requested) {
    &SigDistrib();
  }
  if ($task{sql}) {
    &ExportSQL();
  }
  if ($task{synthesis}) {
    &SyntheticTable();
  }
  if ($task{db_match}) {
    &DatabaseMatch();
  }

  ### Indicate where the results have been stored
  if ($verbose >= 1) {
    &RSAT::message::Info("Results were stored in directory\t".$dir{output});
    &RSAT::message::Info("Synthetic table\t", $dir{output}."/".$outfile{table});
    if ($task{report}) {
      foreach my $report (["Result report (MDC 2004)\t", $outfile{results}],
			  ["Parameter report (MDC 2004)\t", $outfile{parameters}]) {
	my ($report_label, $report_file) = @{$report};
	&RSAT::message::Info($report_label, $dir{output}."/".$report_file);
      }
    }
  }

  ## The execution time has to be computed by all scripts, but it is
  ## only printed when some verbosity is requested
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  if ($main::verbose >= 1) {
    print $exec_time;
  }
  exit(0);

}

################################################################
#
#                 SUBROUTINE DEFINITIONS
#
################################################################

################################################################

#### set the file names as a function of parameters
sub SetFileNames {
  ## Options for dyad-analysis
  $dyad_spacing = $min_sp."-".$max_sp;
  $dyad_options = "-v";
  #    $dyad_options .= " -lth occ 1"; ## This is to avoid calculating P-value for patterns not rpesent in the dataset, and for having a correct correction for multi-testing.
  $dyad_options .= " -sort";
  $dyad_options .= " ".$strands;
  $dyad_options .= " -type ".$dyad_type;
  $dyad_options .= " ".$noov;
  $dyad_options .= " -l ".$monad_length;
  $dyad_options .= &ThresholdOptions();
  #    $dyad_options .= " -lth occ_sig $lth{occ_sig} ";
  $dyad_options .= " -sp ".$dyad_spacing;

  $dyad_suffix = "_dyads";
  if ($bg_seq_file) {
    $dyad_suffix .= "_bgseq";
  } elsif ($background) {
    #	if ($repeat_masked) {        ### Should be here?
    #	    $background .= "-rm";    ### Should be here?
    #	}                            ### Should be here?
    $dyad_suffix .= "_bg_".$background; ### TEMP
  } else {
    $dyad_suffix .= "_monad";	### TEMP
  }
  $dyad_suffix .= "_l".$monad_length;
  $dyad_suffix .= "_sp".$dyad_spacing;
  $dyad_suffix .= $strands;
  $dyad_suffix .= "_".$dyad_type;
  $dyad_suffix .= &ThresholdSuffix();
  #    $dyad_suffix .= "_sig".$lth{occ_sig};
  $dyad_suffix .= $noov;

  #### Background model for dyad-analysis
  if ($bg_seq_file) {
    ## Background model estimated from user-specified background
    ## sequences. If the background model file does not exist, compute
    ## it from background sequences.
    &define_bg_model_files() unless (defined($bg_model_file{"dyads"}));
#    &compute_bg_dyads() unless (-e $bg_model_file{dyads});
    $dyad_options .= " -expfreq ".$bg_model_file{"dyads"};

  } elsif ($exp_freq_file{dyads}) {
    ## Convert relative path in absolute path
    unless ($exp_freq_file{dyads} =~ /^\//) {
      $exp_freq_file{dyads} = $dir{main}."/".$exp_freq_file{dyads};
    }
    ### Manually specified expected frequencies
    $dyad_options .= " -expfreq $exp_freq_file{dyads}";

  } elsif ($background) {
    $dyad_options .= " -bg $background ";
    $dyad_options .= " -org ".$organism_name if ($organism_name);

  } elsif ($background eq "mncf") {
    $dyad_options .= " -mncf ";
    $dyad_options .= " -org ".$organism_name if ($organism_name);
    $dyad_suffix .= "_mncf";
  }

  #### suffix for the oligo-analysis file
  $noorf_string = "_noorf" if ($noorf);
  $oligo_suffix = "_oligos";
  if ($bg_seq_file) {
    $oligo_suffix .= "_bgseq";
  } elsif ($background) {
    $oligo_suffix .= "_bg_".$background;
  } elsif ($markov) {
    $oligo_suffix .= "_mkv".$markov_order;
  }
  $oligo_suffix .= "_".$min_oligo_len."-".$max_oligo_len."nt";
  $oligo_suffix .= $strands;
  $oligo_suffix .= &ThresholdSuffix();
  #    $oligo_suffix .= "_sig".$lth{occ_sig};
  #    if (&IsReal($lth{ms_freq})) {$oligo_suffix .= "_msf".$lth{ms_freq} ; }
  #    if (&IsReal($lth{ms_sig})) {$oligo_suffix .= "_mssig".$lth{ms_sig} ; }
  $oligo_suffix .= $noov;

  foreach my $family_name (@families) {
    my $gene_nb = scalar(@{$family{$family_name}->{members}});
    #	warn join ("\t", "Number of genes", $family_name, $gene_nb), "\n";

    my $expected_sites = $gene_nb*$expected_sites_per_seq;
    $family{$family_name}->{family_file} = $family_name."/".$family_name."_".$feature_types.".fam";
    $family{$family_name}->{known_site_file} = $family_name."/".$family_name."_known_sites";

    ## Prefix for files
    $family{$family_name}->{file_prefix} = $family_name;

    unless ($all_seq_file) {
	if ($size_names) { ## Old naming system, maintained only for backward compatibility
	    $family{$family_name}->{file_prefix} .= "_up".$seq_length unless ($sequence_file_list);
	} else {
	    $family{$family_name}->{file_prefix} .= "_from".$from."_to".$to unless ($sequence_file_list);
	}
	$family{$family_name}->{file_prefix} .= "_".$feature_types;
	$family{$family_name}->{file_prefix} .= $noorf_string;
	unless ($size_names) {	## only for backward compatibility
	    $family{$family_name}->{file_prefix} .= '-rm' if ($repeat_masked);
	}
	if ($taxon) {
	    $family{$family_name}->{file_prefix} .= "_".$taxon;
	    $family{$family_name}->{ortho_file} = $family_name."/".$family_name."_".$feature_types."_orthologs_".$taxon.".fam";
	}
    }

    ## Sequence files	unless ($sequence_file_list) {
    unless ($sequence_file_list) {
      $family{$family_name}->{seq_file} = $family_name."/".$family{$family_name}->{file_prefix}.".".$seq_format;
      $family{$family_name}->{seq_file_purged} = $family_name."/".$family{$family_name}->{file_prefix}."_purged.${seq_format}";
    }
    if ($analyze_purged_sequences) {
      $family{$family_name}->{input_seq_file} = $family{$family_name}->{seq_file_purged};
    } else {
      $family{$family_name}->{input_seq_file} = $family{$family_name}->{seq_file};
    }
    $family{$family_name}->{seq_len_file} = $family_name."/".$family{$family_name}->{file_prefix}."_seq_len.tab";

    ## Known sites for this family
    if ($task{validate}) {
      #	    $family{$family_name}->{known_sites} = $family_name."/".$family{$family_name}->{file_prefix}."_known_sites.tab";
      $family{$family_name}->{known_sites} = $family_name."/".$family_name."_known_sites.tab";
    }

    ################################################################
    ## oligo-analysis results
    $family{$family_name}->{oligo_file} =  "${family_name}/oligos_${family_name}/".$family{$family_name}->{file_prefix}.$oligo_suffix;
    $family{$family_name}->{oligo_assembly_file} = $family{$family_name}->{oligo_file}.".asmb";
    $family{$family_name}->{oligo_pssm_file} = $family{$family_name}->{oligo_file}."_pssm";
    $family{$family_name}->{oligo_selection} = $family{$family_name}->{oligo_file}."_selection";
    $family{$family_name}->{oligo_ft_prefix} = $family{$family_name}->{oligo_file};
    #	$family{$family_name}->{selection_ft_prefix} = $family{$family_name}->{oligo_ft_prefix}."_report";
    $family{$family_name}->{oligo_ft_file} = $family{$family_name}->{oligo_ft_prefix}.".ft";
    $family{$family_name}->{oligo_vs_known} = $family{$family_name}->{oligo_file}."__vs__known_relw".$rel_w.".tab";
    $family{$family_name}->{oligo_vs_known_weight_table} = $family{$family_name}->{oligo_file}."__vs__known_weigth_table.tab";
    $family{$family_name}->{oligo_vs_known_relw_table} = $family{$family_name}->{oligo_file}."__vs__known_relw_table.tab";
    $family{$family_name}->{oligo_vs_db} = $family{$family_name}->{oligo_file}."__vs__db_".$db_site_name."_relw".$rel_w.".tab";
    $family{$family_name}->{oligo_vs_db_weight_table} = $family{$family_name}->{oligo_file}."__vs__db_".$db_site_name."_weigth_table.tab";
    $family{$family_name}->{oligo_vs_db_relw_table} = $family{$family_name}->{oligo_file}."__vs__db_".$db_site_name."_relw_table.tab";

    ################################################################
    ## dyad-analysis results
    $family{$family_name}->{dyad_file} = "$family_name/dyads_${family_name}/".$family{$family_name}->{file_prefix}.$dyad_suffix;
    $family{$family_name}->{dyad_assembly_file} = $family{$family_name}->{dyad_file}.".asmb";
    $family{$family_name}->{dyad_pssm_file} = $family{$family_name}->{dyad_file}."_pssm";
    $family{$family_name}->{dyad_ft_prefix} = $family{$family_name}->{dyad_file};
    $family{$family_name}->{dyad_ft_file} = $family{$family_name}->{dyad_ft_prefix}.".ft";
    $family{$family_name}->{fmap_file} = $family{$family_name}->{dyad_file}.".".$img_format;
    $family{$family_name}->{htmap_file} = $family{$family_name}->{dyad_file}.".html";
    $family{$family_name}->{dyad_vs_known} = $family{$family_name}->{dyad_file}."__vs__known_relw".$rel_w.".tab";
    $family{$family_name}->{dyad_vs_known_weight_table} = $family{$family_name}->{dyad_file}."__vs__known_weight_table.tab";
    $family{$family_name}->{dyad_vs_known_relw_table} = $family{$family_name}->{dyad_file}."__vs__known_relw_table.tab";
    $family{$family_name}->{dyad_vs_db} = $family{$family_name}->{dyad_file}."__vs__db_".$db_site_name."relw".$rel_w.".tab";
    $family{$family_name}->{dyad_vs_db_weight_table} = $family{$family_name}->{dyad_file}."__vs__db_".$db_site_name."weigth_table.tab";
    $family{$family_name}->{dyad_vs_db_relw_table} = $family{$family_name}->{dyad_file}."__vs__db_".$db_site_name."relw_table.tab";

    ################################################################
    ## oligos + dyads
    $family{$family_name}->{merged_file} =  "${family_name}/merged_${family_name}/".$family{$family_name}->{file_prefix}.$oligo_suffix.$dyad_suffix;
    $family{$family_name}->{merged_ft_file} = $family{$family_name}->{merged_file}.".ft";

    ################################################################
    ## Andrew Neuwald's Gibbs 95 options and files
    my $gibbs_options = "";
    my $gibbs_suffix = "";

    ## input file
    $family{$family_name}->{gibbs_input_seq_file} = $family{$family_name}->{input_seq_file}.".gibbs";
    $gibbs_options .= " ".$family{$family_name}->{gibbs_input_seq_file};

    ## Matrix width
    my @gibbs_widths = ();
    my @gibbs_expected_sites = ();
    my $gene_nb = scalar(@{$family{$family_name}->{members}});
    my $expected_sites = $gene_nb*$expected_sites_per_seq;
    for my $i (1..$nmotifs) {
      push @gibbs_widths, $matrix_width;
      push @gibbs_expected_sites, $expected_sites;
    }
    $gibbs_options .= " ".join(",",  @gibbs_widths);
    $gibbs_suffix .= "-nmotifs".$nmotifs;
    $gibbs_suffix .= "-L".$matrix_width;

    ## Expected number of sites
    $gibbs_options .= " ".join(",", @gibbs_expected_sites);
    $gibbs_suffix .= "-n".$expected_sites;

    ## DON'T use fragmentation (i.e., column sampler)
    $gibbs_options .= " -d";
    $gibbs_suffix .= "-d";

    ## Use nucleic acid alphabet
    $gibbs_options .= " -n";
    $gibbs_suffix .= "-n";

    ## Random seed
    if (defined($seed)) {
      $gibbs_options .= " -s ".$seed;
      $gibbs_suffix .= "-s".$seed;
    }

    ## Assign gibbs options to the family
    $family{$family_name}->{gibbs_suffix} = $gibbs_suffix;
    $family{$family_name}->{gibbs_options} = $gibbs_options;
    $family{$family_name}->{gibbs_dir} = ${family_name}."/gibbs_".${family_name};
    $family{$family_name}->{gibbs_file} = $family{$family_name}->{gibbs_dir}."/".$family{$family_name}->{file_prefix}.$gibbs_suffix;

    ################################################################
    ## Roth's AlignACE
    my $AlignACE_options = "";
    my $AlignACE_suffix = "";

    $family{$family_name}->{AlignACE_input_seq_file} = $family{$family_name}->{input_seq_file}.".AlignACE";

    ## input file
    $AlignACE_options .= " -i ".$family{$family_name}->{AlignACE_input_seq_file};

    ## Matrix width
    $AlignACE_options .= " -numcols ".$matrix_width;
    $AlignACE_suffix .= "-L".$matrix_width;

    ## Expected number of sites
    my $gene_nb = scalar(@{$family{$family_name}->{members}});
    my $expected_sites = $gene_nb*$expected_sites_per_seq;
    $alignace_options .= " -expect ".$expected_sites;
    $AlignACE_suffix .= "-n".$expected_sites;

    ## background fractional GC content of input sequence (0.38)
    $AlignACE_options .= " -gcback ".$AlignACE_gcback;
    $AlignACE_suffix .= "-gcback".sprintf("%.2f",$AlignACE_gcback);

    ## Random seed
    if (defined($seed)) {
      $AlignACE_options .= " -s ".$seed;
      $AlignACE_suffix .= "-s".$seed;
    }

    ## Assign AlignACE options to the family
    $family{$family_name}->{AlignACE_suffix} = $AlignACE_suffix;
    $family{$family_name}->{AlignACE_options} = $AlignACE_options;
    $family{$family_name}->{AlignACE_dir} = ${family_name}."/AlignACE_".${family_name};
    $family{$family_name}->{AlignACE_file} = $family{$family_name}->{AlignACE_dir}."/".$family{$family_name}->{file_prefix}.$AlignACE_suffix;

    ################################################################
    ## infogibbs
    ## version developed by Matthieu Defrance
    my $infogibbs_options = "";
    my $infogibbs_suffix = "";

    ## input file
    $infogibbs_options .= " -i ".$family{$family_name}->{input_seq_file};

    ## Matrix width
    $infogibbs_options .= " -l ".$matrix_width;
    $infogibbs_suffix .= "-l".$matrix_width;

    ## Expected number of sites
    my $gene_nb = scalar(@{$family{$family_name}->{members}});
    $infogibbs_options .= " -e ".$expected_sites_per_seq;
    $infogibbs_suffix .= "-e".$expected_sites_per_seq;

    ## Strand
    $infogibbs_suffix .= $strands;
    if ($strands eq "-2str") {
      $infogibbs_options .= " -s +- ";
    } else {
      $infogibbs_options .= " -s + ";
    }

    ## Number of motifs is not yet implemented
    #	$infogibbs_options .= " -m ".$nmotifs;
    #	$infogibbs_suffix .= "-m".$nmotifs;

    ## Add other options
    foreach my $option (@infogibbs_options) {
      if (length($option) == 1) {
	$infogibbs_options .= " -".$option." ".$infogibbs_options{$option};
      } elsif (length($option) > 1) {
	$infogibbs_options .= " --".$option." ".$infogibbs_options{$option};
      }
      unless ($option eq "bfile") {
	$infogibbs_suffix .= "_".$option.$infogibbs_options{$option};
      }
    }

    ## Amplification factor
    #	$infogibbs_options .= " -e ".$infogibbs_e;
    #	$infogibbs_suffix .= "-e".$infogibbs_e;

    ## Random seed
    # if (defined($seed)) {
    #     $infogibbs_options .= " -s ".$seed;
    #     $infogibbs_suffix .= "-s".$seed;
    # }

    ## Assign infogibbs options to the family
    $family{$family_name}->{infogibbs_suffix} = $infogibbs_suffix;
    $family{$family_name}->{infogibbs_options} = $infogibbs_options;
    $family{$family_name}->{infogibbs_dir} = ${family_name}."/infogibbs_".${family_name};
    $family{$family_name}->{infogibbs_file} = $family{$family_name}->{infogibbs_dir}."/".$family{$family_name}->{file_prefix}.$infogibbs_suffix;

    ################################################################
    ## Gert Thijs' MotifSampler options and files
    my $MS_options = "";
    my $MS_suffix = "";

    ## input file
    $MS_options = " -f ".$family{$family_name}->{input_seq_file};

    ## background model
    if ($MS_b) {
      $MS_options .= " -b ".$MS_b;
    } else {
      $family{$family_name}->{MotifSamplerBackground} = $dir{output}."/".$family{$family_name}->{input_seq_file}."_MotifSampler_bg".$MS_bg_order;
      $MS_options .= " -b ".$family{$family_name}->{MotifSamplerBackground};
    }

    ## strand
    if ($strands eq "-1str") {
      $MS_options .= " -s 0";
      $MS_suffix .= "-s0";
    } else {
      $MS_options .= " -s 1";
      $MS_suffix .= "-s1";
    }

    ## prior
    $MS_options .= " -p ".$MS_p;
    $MS_suffix .= "-p".$MS_p;

    ## Maximal number of motif instances per sequence. (default unset = 0)
    $MS_options .= " -M ".$MS_M;
    $MS_suffix .= "-M".$MS_M;

    ## Sets number of different motifs to search for (default 1).
    $MS_options .= " -n ".$MS_n;
    $MS_suffix .= "-n".$MS_n;

    ## Sets length of the motif (default 8).
    $MS_options .= " -w ".$matrix_width;
    $MS_suffix .= "-w".$matrix_width;

    ## Sets allowed overlap between different motifs. (default 1)
    $MS_options .= " -x ".$MS_x;
    $MS_suffix .= "-x".$MS_x;

    ## Set number of times the MotifSampler should be repeated
    $MS_options .= " -r ".$MS_r;
    $MS_suffix .= "-r".$MS_r;

    ## output file

    $family{$family_name}->{MotifSampler_dir} = "${family_name}/MotifSampler_${family_name}";
    $family{$family_name}->{MotifSampler_file} = $family{$family_name}->{MotifSampler_dir}."/".$family{$family_name}->{file_prefix}.$MS_suffix.".sites";
    $MS_options .= " -o ".$family{$family_name}->{MotifSampler_file};

    $family{$family_name}->{MotifSampler_matrix} = $family{$family_name}->{MotifSampler_dir}."/".$family{$family_name}->{file_prefix}.$MS_suffix.".matrix";
    $MS_options .= " -m ".$family{$family_name}->{MotifSampler_matrix};

    ## Assign MotifSampler options to the family
    $family{$family_name}->{MotifSampler_suffix} = $MS_suffix;
    $family{$family_name}->{MotifSampler_options} = $MS_options;


    ################################################################
    ## MEME
    my $meme_options = "";
    my $meme_suffix = "";

    ## input file
    $family{$family_name}->{meme_input_seq_file} = $family{$family_name}->{input_seq_file}.".meme";
    $meme_options .= " ".$family{$family_name}->{meme_input_seq_file};

    ## strands
    $meme_suffix .= $strands;
    unless ($strands eq "-1str") {
      $meme_options .= " -revcomp";
    }

    ## If background sequences have been specified, check that
    ## background model file exists.
    if ($bg_seq_file) {
      &define_bg_model_files() unless (defined( $bg_model_file{"meme"}));
      #	  &compute_bg_meme() unless (-e $bg_model_file{meme});
      $meme_options .= " -bfile ".$bg_model_file{meme};
    }

    ## Background frequency file specified with the option -MEME_bfile
    if ($MEME_bfile) {
      $meme_options .= " -bfile ".$MEME_bfile;
    }

    foreach my $option (@MEME_options) {
      $meme_options .= " -".$option." ".$MEME_options{$option};
      unless ($option eq "bfile") {
	$meme_suffix .= "_".$option.$MEME_options{$option};
      }
    }

    ## Output file
    $family{$family_name}->{meme_dir} = ${family_name}."/"."meme_".${family_name};
    $family{$family_name}->{meme_file} = $family{$family_name}->{meme_dir}."/".$family{$family_name}->{file_prefix}.$meme_suffix;
    $meme_options .= " > ".$family{$family_name}->{meme_file};

    $family{$family_name}->{meme_options} = $meme_options;
    $family{$family_name}->{meme_suffix} = $meme_suffix;

    ################################################################
    ## orm

    my $orm_options = "";

    ## Options for orm
    my $orm_suffix = "";

    ## Word length
    $orm_options .= " --length=".$orm_word_length;
    $orm_suffix .= "w".$orm_word_length;

    ## Location
    #my $orm_location = $from.":".$to;
    #$orm_options .= " --location=".$orm_location;
    #$orm_suffix .= $from."_".$to;

    ## Window size
    $orm_options .= " --window=".$orm_window;
    $orm_suffix .= "_W".$orm_window;

    ## Strand
    $orm_options .= " --strand=".$orm_strand;
    if ($orm_strand eq "+-") {
      $orm_suffix .= "-2str";
    } else {
      $orm_suffix .= "-1str";
    }

    ## Self-overlapping words
    if ($orm_overlap) {
      $orm_options .= " --overlap";
      $orm_suffix .= "-ovlp";
    } else {
      $orm_suffix .= "-noov";
    }

    ## Markov model
    if ($orm_markov_order >= 0) {
      $orm_options .= " --markov=".$orm_markov_order;
      $orm_suffix .= "_m".$orm_markov_order;
    } elsif ($exp_freq_file{oligos}) {
      ## Convert relative path in absolute path
      unless ($exp_freq_file{oligos} =~ /^\//) {
	$exp_freq_file{oligos} = $dir{main}."/".$exp_freq_file{oligos};

      }
      $orm_options .= " --bgoligo=". $exp_freq_file{oligos};

    }

    ## Number of top motifs
    $orm_options .= " --max=rank ".$orm_uth_rank;
    $orm_suffix .= "_rank".$orm_uth_rank;

    ## Occurrence significance
    $orm_options .= " --min=occ_sig ".$orm_lth_occ_sig;
    $orm_suffix .= "_occ_sig".$orm_lth_occ_sig;
    $orm_options .= " --sort=-occ_sig";

    ## Window width
    $orm_options .= " --min=width ".$orm_lth_width;
    $orm_suffix .= "_minwidth".$orm_lth_width;

    ## Word rank
    $orm_options .= " --max=w_rank ".$orm_uth_wrank;
    $orm_suffix .= "_wrank".$orm_uth_wrank;

    ## Fixed window size
    #	if ($ORM_fixedsizewindow) {
    #	    $orm_options .= " --fixedsizewindow";
    #	    $orm_suffix .= "_z";
    #	}

    ## input file
    $orm_options .= " --input=".$family{$family_name}->{input_seq_file};

    ## Output file
    $family{$family_name}->{orm_dir} = ${family_name}."/"."orm_".${family_name};
    $family{$family_name}->{orm_file} = $family{$family_name}->{orm_dir}."/".$family{$family_name}->{file_prefix}."_".$orm_suffix;
    $orm_options .= " --output=".$family{$family_name}->{orm_file};
    $family{$family_name}->{orm_options} = $orm_options;
    $family{$family_name}->{orm_suffix} = $orm_suffix;

    $family{$family_name}->{orm_assembly_file} = $family{$family_name}->{orm_file}.".asmb";
    $family{$family_name}->{orm_pssm_file} = $family{$family_name}->{orm_file}."_pssm";
    #	$family{$family_name}->{orm_selection} = $family{$family_name}->{orm_file}."_selection";
    $family{$family_name}->{orm_ft_prefix} = $family{$family_name}->{orm_file};
    #	$family{$family_name}->{selection_ft_prefix} = $family{$family_name}->{orm_ft_prefix}."_report";
    $family{$family_name}->{orm_ft_file} = $family{$family_name}->{orm_ft_prefix}.".ft";
    $family{$family_name}->{orm_vs_known} = $family{$family_name}->{orm_file}."__vs__known_relw".$rel_w.".tab";
    $family{$family_name}->{orm_vs_known_weight_table} = $family{$family_name}->{orm_file}."__vs__known_weigth_table.tab";
    $family{$family_name}->{orm_vs_known_relw_table} = $family{$family_name}->{orm_file}."__vs__known_relw_table.tab";

    ################################################################
    ## Consensus options and files
    my $consensus_options = "";
    my $consensus_suffix = "";

    ## Number of matrices to print
    $consensus_options .= " -pf 1 -pt 1";

    ## strands
    if ($strands eq "-1str") {
      $consensus_options .= " -c0"; ## Single-strand search
      $consensus_suffix .= "-c0";
    } else {
      $consensus_options .= " -c2"; ## Double-strand search
      $consensus_suffix .= "-c2";
    }

    ## Matrix width
    $consensus_options .= " -L ".$matrix_width;
    $consensus_suffix .= "-L".$matrix_width;

    ## alphabet
    $consensus_options .= " -A a:t 0.325 c:g 0.175";

    ## Use designated prior frequencies
    #    $consensus_options .= " -d";

    ## Expected number of sites
    $consensus_options .= " -n ".$expected_sites;
    $consensus_suffix .= "-n".$expected_sites;
    $family{$family_name}->{consensus_suffix} = $consensus_suffix;
    $family{$family_name}->{consensus_options} = $consensus_options;
    $family{$family_name}->{consensus_dir} = "${family_name}/consensus_${family_name}";
    $family{$family_name}->{consensus_file} = $family{$family_name}->{consensus_dir}."/".$family{$family_name}->{file_prefix}.$consensus_suffix;
  }


  ################################################################
  ## Suffix for the synthetic table
  if ($family_file) {
    $table_suffix = $family_file;
  } elsif ($sequence_file_list) {
    $table_suffix .= $sequence_file_list;
    $table_suffix =~  s/\.tab$//;
    $table_suffix =~  s/\.txt$//;
  } else {
    $table_suffix = $organism_name;
  }
  if ($bg_seq_file) {
    $table_suffix .= "_bgseq";
  } elsif ($background) {
    $table_suffix .= "_bg_${background}";
  } else {
    $table_suffix .= "_mkv${markov_order}";
  }

  if ($family_file) {
    $table_suffix .= "_up".$from     if (defined($from));
    $table_suffix .= "_".$to     if (defined($to));
    $table_suffix .= "_".$feature_types  if (defined($feature_types));
    $table_suffix .= $noorf_string;
  }
  if ($analyze_purged_sequences) {
    $table_suffix .= "-purge";
  } else {
    $table_suffix .= "-nopurge";
  }
  $table_suffix .= "_".$min_oligo_len."nt";
  $table_suffix .= "_".$max_oligo_len."nt";
  $table_suffix .= $noov;
  $table_suffix .= $strands;
  $table_suffix .= &ThresholdSuffix();
  #    $table_suffix .= "_sig".$lth{occ_sig};
  $table_suffix .= "_mtx_width".$matrix_width;
  $table_suffix = `basename $table_suffix`;
  chomp $table_suffix;

  ################################################################
  ## Name of the report for the Motif Discovery Competition 2004
  $dir{mdc_report} = "mdc_report";
  $outfile{results} = $dir{mdc_report}."/".$table_suffix."_results.txt";
  $outfile{parameters} = $dir{mdc_report}."/".$table_suffix."_parameters.txt";
}

################################################################
##################### SUBROUTINE DEFINITION ####################


## ##############################################################
## Store the sequences for one family if sequences have been provided
## with the option -all_seq
sub StoreSequences  {
  ## Export the sequences of the current family (global $family_name) to
  ## the family's seq_file. Sequences are looked up in %all_sequences by
  ## lower-cased member ID; a member without a stored sequence triggers a
  ## warning and is skipped.
  ##
  ## NOTE: the output handle is kept in the package variable $out, which
  ## is closed before returning.
  my @family_members = @{$family{$family_name}{members}};
  $out = &OpenOutputFile($family{$family_name}{seq_file});

  for my $gene_id (@family_members) {
    my $key = lc($gene_id);

    ## Member absent from the sequence collection: report and move on
    unless (defined($all_sequences{$key})) {
      &RSAT::message::Warning("No sequence with ID", $gene_id,"in  file", $all_seq_file);
      next;
    }

    ## Copy the values before handing them to the printer
    my $sequence = $all_sequences{$key};
    my $comment = $all_sequences_comment{$key};
    &PrintNextSequence($out, $seq_format, 0, $sequence, $gene_id, $comment);
  }

  close $out;
  &RSAT::message::Warning($family_name, "Stored sequence in file", $family{$family_name}{seq_file}) if ($main::verbose >= 2);
}

################################################################
#### retrieve upstream sequences
sub RetrieveSequences {
  ## Build the shell command that retrieves the sequences of the current
  ## family (global $family_name), then either queue it in
  ## @main::batch_commands (batch mode) or run it through &doit.
  ##
  ## When $taxon is set, the command chains get-orthologs and
  ## retrieve-seq-multigenome; otherwise it is a plain retrieve-seq call.
  &RSAT::message::TimeWarn ("Retrieving upstream sequences for family ".$family_name, $organism_name) if ($verbose >= 2);

  ## Program-specific head of the command
  my @pieces;
  if ($taxon) {
    @pieces = ("get-orthologs",
	       " -org ", $organism_name,
	       " -taxon ", $taxon,
	       " -i ", $family{$family_name}->{family_file},
	       " -o ", $family{$family_name}->{ortho_file},
	       "; retrieve-seq-multigenome",
	       " -label organism_name,id,name ",
	       " -i ", $family{$family_name}->{ortho_file});
  } else {
    @pieces = ("retrieve-seq",
	       " -label id,name ",
	       " -i ", $family{$family_name}->{family_file},
	       " -org ", $organism_name);
  }

  ## Options shared by both retrieval programs
  push @pieces, " -imp_pos -type ", $seq_type;
  push @pieces, " -noorf " if ($noorf);
  push @pieces, " -rm " if ($repeat_masked);
  push @pieces, " -o ", $family{$family_name}->{seq_file};
  push @pieces, " -from ", $from if (defined($from));
  push @pieces, " -to ", $to if (defined($to));
  push @pieces, " -feattype $feature_types " if ($feature_types);
  push @pieces, " -format $seq_format";

  my $command = join("", @pieces);
  if ($batch) {
    push @main::batch_commands, $command;
  } else {
    &doit($command, $dry_run, $die_on_error, $verbose);
  }
}

sub RetrieveSequencesEnsEMBL {
  ## Build the retrieve-ensembl-seq.pl command for the current family
  ## (global $family_name), then either queue it in @main::batch_commands
  ## (batch mode) or run it through &doit.
  ##
  ## NOTE: the "_EnsEMBL" tag is removed from the package variable
  ## $organism_name itself, so the change persists after this call.
  ## NOTE(review): the Ensembl host is hard-coded to "xserve2".
  &RSAT::message::TimeWarn ("Retrieving upstream sequences for family ".$family_name, $organism_name) if ($verbose >= 2);
  $organism_name =~ s/_EnsEMBL//;

  my @pieces = ("retrieve-ensembl-seq.pl",
		" -ensemblhost xserve2 -alltranscripts",
		" -i ", $family{$family_name}->{family_file},
		" -org ", $organism_name,
		" -type upstream -maskcoding");
  push @pieces, " -noorf " if ($noorf);
  push @pieces, " -rm " if ($repeat_masked);
  push @pieces, " -o ", $family{$family_name}->{seq_file};
  push @pieces, " -from ", $from if (defined($from));
  push @pieces, " -to ", $to if (defined($to));
  push @pieces, " -feattype $feature_types " if ($feature_types);

  my $command = join("", @pieces);
  if ($batch) {
    push @main::batch_commands, $command;
  } else {
    &doit($command, $dry_run, $die_on_error, $verbose);
  }
}

################################################################
#### Purge input sequences
sub PurgeSequences {
  ## Build the convert-seq | purge-sequence pipeline for the current family
  ## (global $family_name): seq_file is read, seq_file_purged is written.
  ## The pipeline is queued in @main::batch_commands (batch mode) or run
  ## through &doit. The package variable $command is overwritten.
  &RSAT::message::TimeWarn("Purging sequences for family\t".$family_name) if ($verbose >= 2);

  $command = join("",
		  ## Remove sequences of length 0
		  "convert-seq -dna -from fasta -to fasta -lw 0 -skip_short 1",
		  " -i ", $family{$family_name}->{seq_file},
		  ## Purge the remaining sequences
		  " | purge-sequence -format fasta -ml ", $purge_ml, " -mis ", $purge_mis,
		  " -o ", $family{$family_name}->{seq_file_purged});

  if ($batch) {
    push @main::batch_commands, $command;
  } else {
    &doit($command, $dry_run, $die_on_error, $verbose);
  }
}


################################################################
### Calculate options and file names according to the selected
### parameters
sub CalcOligoOptions {
  ## Build the oligo-analysis command for one oligonucleotide length, for
  ## the current family (global $family_name).
  ##
  ## Argument:
  ##   $oligo_len   oligonucleotide length (passed to oligo-analysis -l)
  ##
  ## Returns the list ($command, $oligo_file, $oligo_suffix, $oligo_options).
  ##
  ## Side effects: the package variables $oligo_options, $oligo_file and
  ## $command are overwritten, and a relative $exp_freq_file{oligos} is
  ## converted in place to an absolute path.
  ##
  ## The background model is chosen with the following precedence:
  ## $bg_seq_file, $exp_freq_file{oligos}, $background eq "calib1",
  ## $background eq "calibN", ($organism_name and $background), $markov.
  ## If none applies, &RSAT::error::FatalError is called.
  my ($oligo_len) = @_;
  $oligo_options = "-v";
  $oligo_options .= " -two_tails" if ($two_tails);
  $oligo_options .= " -pseudo ".$oligo_pseudo if ($oligo_pseudo);
  $oligo_options .= " -sort ";
  $oligo_options .= " ".$strands;
  $oligo_options .= " -l ".$oligo_len;
  $oligo_options .= &ThresholdOptions();

  if ($mask) {
    $oligo_options .= " -mask ".$mask;
  }

  if ($bg_seq_file) {
    ## Background model estimated from user-specified background
    ## sequences. If the background model file does not exist, compute
    ## it from background sequences.
    my $bg_key = "oligos_".$oligo_len."nt";
    &define_bg_model_files() unless (defined($bg_model_file{$bg_key}));

    ## Read the file name only now, so that a name that has just been
    ## defined is taken into account (presumably define_bg_model_files
    ## fills %bg_model_file -- it is not visible from here).
    my $bg_model_file = $bg_model_file{$bg_key};
    &compute_bg_oligos() unless (-e $bg_model_file);
    $oligo_options .= " -expfreq ".$bg_model_file;

  } elsif ($exp_freq_file{oligos}) {
    ### Manually specified expected frequencies

    ## Convert relative path in absolute path
    unless ($exp_freq_file{oligos} =~ /^\//) {
      $exp_freq_file{oligos} = $dir{main}."/".$exp_freq_file{oligos};
    }
    $oligo_options .= " -expfreq ".$exp_freq_file{oligos};

  } elsif ($background eq "calib1") {
    ### Single-sequence based calibrated occurrences (mean and var per sequence)
    my $calib_length = $family{$family_name}->{calib_length};
    my $calib_file = &CalibrationPrefix($calib_length, $oligo_len);
    $calib_file .= "_negbin.tab";

    ### Check whether the calibration file exists (plain or gzipped)
    unless ((-e $calib_file) || (-e $calib_file.".gz")) {
      &RSAT::error::FatalError ("Calibration file not found\t".$calib_file);
    }
    $oligo_options .= " -calib1 ".$calib_file;
    &RSAT::message::Info("Oligonucleotide calibration", $calib_length, $calib_file) if ($verbose >= 3);

  } elsif ($background eq "calibN") {
    ### Set-based calibrated occurrences (mean and var per set of N sequences)
    my $calib_length = $family{$family_name}->{calib_length};
    my $N = scalar(@{$family{$family_name}->{members}});
    my $calib_file = &CalibrationPrefix($calib_length, $oligo_len, $N);
    $calib_file .= "_negbin.tab";

    ### Check whether the calibration file exists (plain or gzipped)
    unless ((-e $calib_file) || (-e $calib_file.".gz")) {
      ### temporary: if there is no R10000 file, try R1000
      $calib_file =~ s/10000/1000/g;
      if ((-e $calib_file) || (-e $calib_file.".gz")) {
	&RSAT::message::Warning("Using calibration with 1000 repetitions\t".$calib_file);
      } else {
	### temporary: if there is no R1000 file, try R100
	$calib_file =~ s/1000/100/g;
	if ((-e $calib_file) || (-e $calib_file.".gz")) {
	  &RSAT::message::Warning("Using calibration with 100 repetitions\t".$calib_file);
	} else {
	  &RSAT::error::FatalError ("Calibration file not found\t".$calib_file);
	}
      }
    }
    $oligo_options .= " -calibN ".$calib_file;
    &RSAT::message::Info("Oligonucleotide calibration", $calib_length, $calib_file) if ($verbose >= 3);

  } elsif (($organism_name) && ($background)) {
    ### Pre-calculated expected frequency files
    $oligo_options .= " -bg ".$background;
    $oligo_options .= " -org ".$organism_name;

  } elsif ($markov) {
    ### Markov chain model
    $oligo_options .= " -markov ".$markov_order;

  } else {
    &RSAT::error::FatalError("You must specify a method for estimating the background model.");
  }
  $oligo_options .= " -return occ,mseq,freq,proba,rank,zscore,ratio";
  $oligo_options .= " ".$noov;

  ## Suffix of the result file: encodes the background model, the oligo
  ## length, the strands, the thresholds and the overlap mode.
  my $oligo_suffix = "_oligos";
  if ($bg_seq_file) {
    $oligo_suffix .= "_bgseq";
  } elsif ($background) {
    $oligo_suffix .= "_bg_".$background;
  } elsif ($markov) {
    $oligo_suffix .= "_mkv".$markov_order;
  }
  $oligo_suffix .= "_${oligo_len}nt";
  $oligo_suffix .= $strands;
  $oligo_suffix .= &ThresholdSuffix();
  $oligo_suffix .= $noov;

  $oligo_file = "$family_name/oligos_${family_name}/".$family{$family_name}->{file_prefix}.$oligo_suffix;
  $command = "oligo-analysis -i $family{$family_name}->{input_seq_file} -format $seq_format -o $oligo_file $oligo_options";

  return ($command, $oligo_file, $oligo_suffix, $oligo_options);
}


################################################################
### oligo-analysis
sub OligoAnalysis {
  ## Run (or queue, in batch mode) the complete oligonucleotide analysis
  ## of the current family (global $family_name): one oligo-analysis
  ## command per length between $min_oligo_len and $max_oligo_len,
  ## followed by the commands returned by &MergeOligoLengths and
  ## &AssembleOligos.
  &RSAT::message::TimeWarn("Analyzing oligonucleotides for family $family_name",
			   "min len: $min_oligo_len",
			   "max len: $max_oligo_len") if ($verbose >= 2);
  my $oligo_dir = "${family_name}/oligos_${family_name}";

  ## Move to the output directory and make sure the result dir exists
  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($oligo_dir);

  ## The input sequences must be readable, except when nothing is
  ## executed right now (batch mode or dry run)
  unless ($batch || $dry_run || (-r $family{$family_name}->{input_seq_file})) {
    &RSAT::error::FatalError( "Cannot read file $family{$family_name}->{input_seq_file}");
  }

  ## One oligo-analysis command per oligo length
  my @oligo_commands = ();
  for $oligo_len ($min_oligo_len..$max_oligo_len) {
    my ($length_command) = &CalcOligoOptions($oligo_len);
    push @oligo_commands, $length_command;
  }

  ## Merge the results obtained with different oligo lengths
  push @oligo_commands, scalar(&MergeOligoLengths());

  ## Assemble the merged patterns
  push @oligo_commands, scalar(&AssembleOligos());

  if ($batch) {
    push @main::batch_commands, @oligo_commands;
  } else {
    my $full_command = join "; \\\n", @oligo_commands;
    &doit($full_command, $dry_run, $die_on_error, $verbose);
  }
}

################################################################
## Merge the results of oligo-analysis for different oligo lengths
sub MergeOligoLengths {
  ## Concatenate the oligo-analysis results obtained for each length
  ## between $min_oligo_len and $max_oligo_len into the family's
  ## oligo_file (removed first, then filled with "cat ... >>").
  ##
  ## When $task{oligos} is set, the merging command is returned to the
  ## caller instead of being executed; otherwise it is queued in
  ## @main::batch_commands (batch mode) or run through &doit.
  ## The package variables $command and $oligo_file are overwritten.
  &RSAT::message::TimeWarn("Merging oligonucleotides for family ".$family_name,
			   "min len: ".$min_oligo_len,
			   "max len:".$max_oligo_len,
			   $family{$family_name}->{oligo_file}) if ($verbose >= 2);
  my $oligo_dir = "${family_name}/oligos_${family_name}";

  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($oligo_dir);
  unless ($batch || $dry_run || (-r $family{$family_name}->{input_seq_file})) {
    &RSAT::error::FatalError("Cannot read file ".$family{$family_name}->{input_seq_file});
  }

  ## Start from a clean merged file, then append one file per length
  my @merge_commands = ("rm -f ".$family{$family_name}->{oligo_file});
  for $oligo_len ($min_oligo_len..$max_oligo_len) {
    ($command, $oligo_file) = &CalcOligoOptions($oligo_len);
    push @merge_commands, "cat ".$oligo_file." >> ".$family{$family_name}->{oligo_file};
  }
  my $merge_command = join "; \\\n", @merge_commands;

  ## Called as part of the oligos task: the caller runs the command
  return $merge_command if ($task{oligos});

  if ($batch) {
    push @main::batch_commands, @merge_commands;
  } else {
    &doit($merge_command, $dry_run, 0, $verbose);
  }
}


################################################################
### Assemble the merged oligos into patterns (pattern-assembly) and,
### if requested, convert the assemblies into PSSMs
sub AssembleOligos {
  ## Assemble the merged oligonucleotides of the current family (global
  ## $family_name) with pattern-assembly and, when $task{pssm} is set,
  ## append the matrix-from-patterns command built by &MatrixFromPatterns.
  ##
  ## When $task{oligos} is set, the resulting command is returned to the
  ## caller instead of being executed; otherwise it is queued in
  ## @main::batch_commands (batch mode) or run through &doit.
  &RSAT::message::TimeWarn("Assembling oligonucleotides for family ".$family_name,
			   "min len: ".$min_oligo_len,
			   "max len:".$max_oligo_len,
			   $family{$family_name}->{oligo_file}) if ($verbose >= 2);
  my $dir = "${family_name}/oligos_${family_name}";

  chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($dir);
  &RSAT::error::FatalError("Cannot read file ".$family{$family_name}->{input_seq_file}) unless (($batch) || ($dry_run) ||(-r $family{$family_name}->{input_seq_file}));
  my @assemble_commands = ();

  ### pattern assembly
  &RSAT::message::TimeWarn("Assembling oligo patterns", $family{$family_name}->{oligo_assembly_file})
    if ($verbose >= 2);
  my $assembly_command = "pattern-assembly -v 1";
  $assembly_command .= " -maxfl 1 -toppat ".$toppat." -subst 1 ".$strands;
  $assembly_command .= " -max_asmb_nb ".$nmotifs;
  $assembly_command .= " -i ".$family{$family_name}->{oligo_file};
  $assembly_command .= " -o ".$family{$family_name}->{oligo_assembly_file};
  push @assemble_commands, $assembly_command;

  ### extract PSSM from assembled patterns
  if ($task{pssm}) {
    my $pssm_command = &MatrixFromPatterns($family{$family_name}->{oligo_assembly_file},
					   $family{$family_name}->{oligo_pssm_file}, "oligos");
    push @assemble_commands, $pssm_command;
  }

  my $assemble_command = join "; \\\n", @assemble_commands;
  if ($task{oligos}) {
    return $assemble_command;
  } else {
    if ($batch) {
      push @main::batch_commands, @assemble_commands;
    } else {
      &doit($assemble_command, $dry_run, 0, $verbose);
    }
  }
}


################################################################
## Match discovered patterns (oligos and dyads only)
sub MatchPatterns {
   ## Match a list of patterns against the sequences of the current family
   ## (global $family_name) with dna-pattern, and convert the matches into
   ## a feature file with features-from-dnapat.
   ##
   ## Arguments:
   ##   $oligo_file   pattern file (default: the family's oligo_file)
   ##   $fmap_prefix  prefix of the feature file (default: $oligo_file)
   ##   $options      extra text appended to the dna-pattern call
   ##
   ## The command is queued in @main::batch_commands (batch mode) or run
   ## through &doit; a non-zero result from &doit only triggers a warning.
   my ($pattern_file, $fmap_prefix, $options) = @_;

   ## check file names
   $pattern_file ||= $family{$family_name}->{oligo_file};
   $fmap_prefix ||= $pattern_file;

   ### pattern matching
   unless ($batch || $dry_run || (-r $family{$family_name}->{seq_file})) {
       &RSAT::error::FatalError("Cannot read file $family{$family_name}->{seq_file}");
   }
   my $feature_file = $fmap_prefix.".ft";
   if ($verbose >= 2) {
       &RSAT::message::Info("Matching patterns",
			    "\n;\t", $pattern_file,
			    "\n;\t", $feature_file,
			   );
   }

   my $command = join("",
		      "dna-pattern -i ", $family{$family_name}->{seq_file},
		      " -format ", $seq_format,
		      " -pl ", $pattern_file,
		      " -origin ", $map_origin,
		      " ", $strands,
		      " -return sites,limits ",
		      $options,
		      " | features-from-dnapat -o ", $feature_file);

   ## Check if pattern matching is OK (sometimes there are no patterns)
   if ($batch) {
       push @main::batch_commands, $command;
   } else {
       my $error = &doit($command, $dry_run, 0, $verbose);
       if ($error) {
	   &RSAT::message::Warning("Feature map skipped because dna-pattern returned an error");
	   return;
       }
   }
}


################################################################
## Scan the sequences with a position-specific scoring matrix
sub ScanSequences {
   ## Scan the sequences of the current family (global $family_name) with
   ## one or several position-specific scoring matrices, using the program
   ## stored in $matrix_scan_cmd. The result is written to
   ## "<fmap_prefix>.ft".
   ##
   ## Arguments:
   ##   $matrix_file    matrix file (mandatory)
   ##   $matrix_format  format of the matrix file (mandatory)
   ##   $fmap_prefix    prefix of the feature file (mandatory)
   ##   $options        optional extra options appended to the command
   ##
   ## The command is queued in @main::batch_commands (batch mode) or run
   ## through &doit; a non-zero result from &doit only triggers a warning.
   my ($matrix_file, $matrix_format, $fmap_prefix, $options) = @_;

   ## check file names
   &RSAT::error::FatalError("ScanSequences require to specify a matrix file") unless ($matrix_file);
   &RSAT::error::FatalError("ScanSequences require to specify a matrix format") unless ($matrix_format);
   &RSAT::error::FatalError("ScanSequences require to specify a prefix for the scanning result") unless ($fmap_prefix);

   ### pattern matching
   &RSAT::error::FatalError("Cannot read file $family{$family_name}->{seq_file}") unless (($batch) || ($dry_run) || (-r $family{$family_name}->{seq_file}));
   my $feature_file = $fmap_prefix.".ft";
   &RSAT::message::Info("Scaning sequences with matrix",
			"\n;\t", $matrix_file,
			"\n;\t", $feature_file,
		       )  if ($verbose >= 2);

   my $command = "$matrix_scan_cmd -v 1 -bginput -markov 1 -i ".$family{$family_name}->{seq_file};
   $command .= " ".${quick_scan};
   $command .= " -seq_format ".$seq_format;
   $command .= " -matrix_format ".$matrix_format;
   $command .= " ".$strands;
   $command .= " -m ".$matrix_file;
   $command .= " -consensus_name";
   $command .= " -origin ".$scan_origin;
   $command .= " -return limits,sites,pval,normw ";
   $command .= " -o ".$feature_file;

   ## Extra options must be separated from the output file name, otherwise
   ## they would be glued to it when passed without a leading space.
   $command .= " ".$options if (defined($options) && length($options));

   ## Check if scanning is OK (sometimes there are no matrices)
   if ($batch) {
       push @main::batch_commands, $command;
   } else {
       my $error = &doit($command, $dry_run, 0, $verbose);
       if ($error) {
	   &RSAT::message::Warning("Feature map skipped because matrix-scan returned an error");
	   return;
       }
   }
}

################################################################
#### draw feature-map for the result of dna-pattern
sub DrawFeatureMap {
   ## Draw a feature map from the feature file "<fmap_prefix>.ft" with the
   ## feature-map program, then delete the feature file.
   ##
   ## Arguments:
   ##   $fmap_prefix  prefix of the feature/image files (default: the
   ##                 family's oligo_ft_prefix)
   ##   $title        map title (default: &ShortFileName($fmap_prefix))
   ##   $options      extra options appended to the feature-map call
   ##
   ## The command is queued in @main::batch_commands (batch mode) or run
   ## through &doit. The package variables $fmap_file, $htmap_file and
   ## $command are overwritten.
   ##
   ## NOTE(review): when the family has a calib_length, the package
   ## variables $from and $to are overwritten as well, although the
   ## -from/-to options are no longer added to the command -- confirm
   ## whether this side effect is still wanted.
   my ($fmap_prefix, $title, $options) = @_;

   ## check file names
   $fmap_prefix ||= $family{$family_name}->{oligo_ft_prefix};
   $title ||= &ShortFileName($fmap_prefix);

   ### pattern matching
   unless ($batch || $dry_run || (-r $family{$family_name}->{seq_file})) {
       &RSAT::error::FatalError("Cannot read file $family{$family_name}->{seq_file}");
   }
   my $feature_file = $fmap_prefix.".ft";
   $fmap_file = $fmap_prefix.".".$img_format;
   $htmap_file = $fmap_prefix.".html";
   if ($verbose >= 2) {
       &RSAT::message::Info("Drawing feature-map",
			    "\n;\t", $feature_file,
			    "\n;\t", $fmap_file,
			    "\n;\t", $htmap_file,
			   );
   }

   ## feature-map drawing
   if ($family{$family_name}->{calib_length}) {
       $from = -$family{$family_name}->{calib_length};
       $to = -1;
   }

   my @pieces = ("feature-map -i ", $feature_file,
		 " -format ", $img_format,
		 " -minfthick 2",
		 " -scalebar -scalestep $scalestep -legend ",
		 " -title '", $title, "'",
		 " -o $fmap_file -scorethick ",
		 " ", $options);
   push @pieces, " -htmap > $htmap_file " if ($htmaps);

   ## Delete the feature file to save disk space (the feature files
   ## occupy half of the result directory)
   push @pieces, "; rm -f ", $feature_file;
   $command = join("", @pieces);

   if ($batch) {
       push @main::batch_commands, $command;
   } else {
       &doit($command, $dry_run, 0, $verbose);
   }
}

################################################################
### dyad analysis
sub DyadAnalysis {
  ## Build the dyad-analysis pipeline for the current family (global
  ## $family_name): dyad-analysis, then pattern-assembly and, when
  ## $task{pssm} is set, the matrix-from-patterns command built by
  ## &MatrixFromPatterns. The pipeline is queued in @main::batch_commands
  ## (batch mode) or run through &doit.
  ##
  ## The package variables $dir and $command are overwritten.
  if ($verbose >= 2) {
    &RSAT::message::TimeWarn("Analyzing dyads for family $family_name",
			     "monad length: $monad_length",
			     "min_sp: $min_sp",
			     "max_sp: $max_sp",
			    );
  }

  ## Move to the output directory and make sure the result dir exists
  $dir = "${family_name}/dyads_${family_name}";
  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($dir);
  unless ($batch || $dry_run || (-r $family{$family_name}->{input_seq_file})) {
    &RSAT::error::FatalError("Cannot read file $family{$family_name}->{input_seq_file}");
  }

  ## Background model from user-specified sequences: compute it if the
  ## model file is missing
  if ($bg_seq_file) {
    &compute_bg_dyads() unless (-e $bg_model_file{dyads});
  }

  ## build the dyad-analysis command
  my @pieces = ("dyad-analysis",
		" -return occ,proba,rank,zscore,ratio",
		" -i ", $family{$family_name}->{input_seq_file},
		" -format ", $seq_format,
		" -o ", $family{$family_name}->{dyad_file},
		" ", $dyad_options);
  push @pieces, " -org ", $organism_name if ($org_fam);
  push @pieces, " -mask ", $mask if ($mask);

  ### pattern assembly
  if ($verbose >= 2) {
    &RSAT::message::TimeWarn("Assembling dyad patterns", $family{$family_name}->{dyad_assembly_file});
  }
  push @pieces, ("; pattern-assembly -v 1",
		 " -maxfl 1 -subst 0 -toppat ", $toppat,
		 " ", $strands,
		 " -max_asmb_nb ", $nmotifs,
		 " -i ", $family{$family_name}->{dyad_file},
		 " -o ", $family{$family_name}->{dyad_assembly_file});

  ### extract PSSM from assembled patterns
  if ($task{pssm}) {
    push @pieces, "; ", scalar(&MatrixFromPatterns($family{$family_name}->{dyad_assembly_file},
						   $family{$family_name}->{dyad_pssm_file}, "dyads"));
  }

  $command = join("", @pieces);
  if ($batch) {
    push @main::batch_commands, $command;
  } else {
    &doit($command, $dry_run, $die_on_error, $verbose);
  }
}


################################################################
## Convert assembled patterns (dyads, oligos) into PSSMs
sub MatrixFromPatterns {
  ## Build (but do not run) the matrix-from-patterns command line.
  ##
  ## Arguments:
  ##   $assembly_file  pattern assembly file (passed to -asmb)
  ##   $pssm_file      output prefix (passed to -o); the logo file
  ##                   option is derived from it by appending "_logo"
  ##   $type           pattern type, only used in the verbose message
  ##
  ## The sequence file is taken from the current family
  ## ($family{$family_name}) and the number of assemblies from $nmotifs.
  ##
  ## Returns the command string.
  my ($assembly_file, $pssm_file, $type) = @_;

  if ($verbose >= 2) {
    &RSAT::message::TimeWarn("Matrix from patterns", $type, $pssm_file);
  }

  my @pieces = (
		"matrix-from-patterns -v 1 ",
		" -seq ".$family{$family_name}->{seq_file},
		" -asmb ".$assembly_file,
		" -bginput -markov 0 ",
		" -max_asmb_nb ".$nmotifs,
		" -flanks 2",
		" -logo -logo_format png",
		" -logo_file ".$pssm_file."_logo",
		" -o ".$pssm_file,
	       );

  my $pssm_command = join("", @pieces);
  return ($pssm_command);
}

################################################################
## Compare discovered motifs with motif database
sub MotifsVsDB {
  ## Build the compare-matrices command comparing a file of discovered
  ## matrices with a motif database, then run it or queue it.
  ##
  ## Arguments:
  ##   $matrix_file    file containing the discovered matrices (-file1)
  ##   $matrix_format  format of $matrix_file (-format1)
  ##   $db_file        motif database file (-file2)
  ##   $db_format      format of $db_file (-format2)
  ##   $compa_prefix   output prefix: results go to <prefix>.tab and
  ##                   <prefix>_matrices.tab
  ##
  ## In batch mode the command is appended to @main::batch_commands,
  ## otherwise it is executed with &doit() (errors are not fatal).
  my ($matrix_file, $matrix_format, $db_file, $db_format, $compa_prefix) = @_;
  &RSAT::message::TimeWarn("Comparing motifs to database") if ($main::verbose >= 2);

  my $cmd = "compare-matrices -v 2 ";
  $cmd .= " -file1 ".$matrix_file;
  $cmd .= " -format1 ".$matrix_format;
  $cmd .= " -file2 ".$db_file;
  $cmd .= " -format2 ".$db_format;
  $cmd .= " -DR";
  $cmd .= " -sort cor";
  $cmd .= " -uth rank 1";
  $cmd .= " -lth w 5";
  $cmd .= " -lth cor 0.85";
  $cmd .= " -lth Ncor 0.4";
  $cmd .= " -return matrix_name,direction,Ncor,SW,cor,width,consensus";
# $cmd .= " -return aligned_matrices";
  $cmd .= " -o ".$compa_prefix.".tab";
  $cmd .= " -out_matrices ".$compa_prefix."_matrices.tab";

  ## Debugging information is only reported at the verbosity level
  ## used by the other Debug messages of this script. The working
  ## directory is obtained from the core Cwd module, and only when the
  ## message is actually printed (no external "pwd" process).
  if ($main::verbose >= 3) {
    require Cwd;
    &RSAT::message::Debug("Working dir", Cwd::getcwd(), "\n", $cmd);
  }

  if ($batch) {
    push @main::batch_commands, $cmd;
  } else {
    &doit($cmd, $dry_run, 0, $verbose);
  }
}

################################################################
## Merge patterns detected by oligo-analysis and dyad-analysis
sub MergePatterns {
  ## Concatenate the oligo-analysis and dyad-analysis result files of
  ## the current family ($family_name) into a single "merged" pattern
  ## file, then optionally draw the corresponding feature map
  ## (task "map").
  ##
  ## Dyads with a spacing of 0 (containing the string "{0}") duplicate
  ## plain oligonucleotides. With $filter_dyads they are discarded;
  ## otherwise the "n{0}" spacer is stripped from the dyad description.
  my $filter_dyads = 1;
  &RSAT::message::TimeWarn("Merging oligonucleotides and dyads for family ",
			   $family_name,
			   $family{$family_name}->{merged_file}
			  ) if ($verbose >= 2);
   my $dir = "${family_name}/merged_${family_name}";
   chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($dir);

   ## Oligos initialize the merged file
   my $command = "cat $family{$family_name}->{oligo_file}";
   $command .= " > $family{$family_name}->{merged_file};";

   ## Dyads are appended after filtering
   $command .= " cat $family{$family_name}->{dyad_file}";
   if ($filter_dyads) {
	$command .= " | grep -v '{0}'";
   } else {
	## The braces must reach the perl one-liner escaped: an
	## unescaped n{0} is a quantifier ("n, zero times") that
	## matches the empty string and removes nothing.
	$command .= " | perl -pe 's/n\\{0\\}//g'";
   }
   $command .= " >> $family{$family_name}->{merged_file};";

   ## Report the size of the merged file
   $command .= " wc $family{$family_name}->{merged_file}";
   if ($batch) {
       push @main::batch_commands, $command;
   } else {
       &doit($command, $dry_run, $die_on_error, $verbose);
   }
   &DrawMergedFeatureMap() if ($task{map});
}


################################################################
#### draw feature-map for the result of merged oligonucleotides and dyads
sub DrawMergedFeatureMap {
  ## Two successive commands are built for the current family:
  ##  1. dna-pattern | features-from-dnapat: match the merged patterns
  ##     against the sequences and store the matches as features;
  ##  2. feature-map: draw the features, then delete the feature file.
  ## Each command is either queued (batch mode) or run with &doit().
  if ($verbose >= 2) {
    &RSAT::message::TimeWarn("Drawing feature-map with results of merged oligonucleotides and dyads for family", $family_name);
  }

  ## The sequence file is required, except in batch or dry-run mode
  unless (($batch) || ($dry_run) || (-r $family{$family_name}->{seq_file})) {
    &RSAT::error::FatalError("Cannot read file $family{$family_name}->{seq_file}");
  }

  ## Output files (package variables)
  $fmap_file = $family{$family_name}->{merged_file}.".".$img_format;
  $htmap_file = $family{$family_name}->{merged_file}.".html";

  ## Pattern matching
  my $matching_command = join("",
			      "dna-pattern",
			      " -i ".$family{$family_name}->{seq_file},
			      " -format ".$seq_format,
			      " -pl ".$family{$family_name}->{merged_file},
			      " -origin ".$map_origin,
			      " -N 4",
			      " -return sites,limits ",
			      " ".$strands,
			      " | features-from-dnapat -o $family{$family_name}->{merged_ft_file} ",
			     );

  if ($batch) {
    push @main::batch_commands, $matching_command;
  } else {
    ## A failure of the matching step is not fatal, but there is
    ## nothing to draw in that case.
    my $error = &doit($matching_command, $dry_run, 0, $verbose);
    if ($error) {
      &RSAT::message::Warning("Feature map skipped because dna-pattern returned an error");
      return;
    }
  }

  ## Map limits (package variables). They are assigned here but the
  ## corresponding -from/-to options are not passed to feature-map.
  if ($family{$family_name}->{calib_length}) {
    $from = $family{$family_name}->{calib_length};
    $to = -1;
  }

  ## Feature-map drawing
  my $map_title = &ShortFileName($family{$family_name}->{merged_ft_file});
  my $drawing_command = " feature-map -i ".$family{$family_name}->{merged_ft_file}
    ." -scalebar -scalestep $scalestep -legend"
    ." -format ".$img_format
    ." -title ".$map_title
    ." -o $fmap_file -scorethick ";
  $drawing_command .= " -htmap > $htmap_file " if ($htmaps);

  ## Delete the feature file to save disk space (the feature files
  ## occupy half of the result directory)
  $drawing_command .= "; rm -f ".$family{$family_name}->{merged_ft_file};

  if ($batch) {
    push @main::batch_commands, $drawing_command;
  } else {
    &doit($drawing_command, $dry_run, 0, $verbose);
  }
}


################################################################
## Run Gert Thijs' MotifSampler
sub MotifSampler {
  ## Build the command chain MotifSampler ; MotifRanking ;
  ## convert-matrix for the current family ($family_name), and either
  ## queue it (batch mode) or run it with &doit().
  &RSAT::message::TimeWarn ("Running MotifSampler for family $family_name") if ($verbose >= 2);

  ## Check output directory
  my $dir = $family{$family_name}->{MotifSampler_dir};
  chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($dir);

  ## MotifSampler itself
  my $MS_command = "MotifSampler ".$family{$family_name}->{MotifSampler_options};
  unless (($batch) || ($verbose >= 3)) {
    ## Suppress verbosity: send stdout and stderr to a log file. The
    ## POSIX form "> file 2>&1" is used because the csh/bash shortcut
    ## ">& file" is rejected by strict POSIX shells such as dash.
    $MS_command = "(".$MS_command.") > MS_log.txt 2>&1";
  }

  ## Rank the matrices; the result is written next to the matrix file
  ## with the suffix _ICsorted
  $MS_command .= "; MotifRanking -m 2";
  $MS_command .= " -i ".$family{$family_name}->{MotifSampler_matrix};
  $MS_command .= " -o ".$family{$family_name}->{MotifSampler_matrix}."_ICsorted";

  ## Convert the MotifSampler result to tab-delimited matrices
  $MS_command .= "; ";
  $MS_command .= &ConvertMatrixCommand($family{$family_name}->{MotifSampler_file}, "MotifSampler");
  if ($batch) {
    push @main::batch_commands, $MS_command;
  } else {
    &doit($MS_command, $dry_run, $die_on_error, $verbose);
  }
}


################################################################
## Run Andrew Neuwald's gibbs program
sub Gibbs {
  ## Build the command chain convert-seq ; gibbs ; convert-matrix for
  ## the current family ($family_name), and either queue it (batch
  ## mode) or run it with &doit().
  chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($family{$family_name}->{gibbs_dir});

  ## For two-strand analyses, the -addrc option is passed to
  ## convert-seq (presumably to add the reverse complement of each
  ## sequence to the gibbs input -- confirm in convert-seq help).
  ## $addrc is a package variable.
  if ($strands eq "-2str") {
    $addrc = " -addrc";
  }

  ## Rewriting the sequences with convert-seq can be useful in case
  ## the file is compressed
  my $gibbs_command = "convert-seq ".$addrc;
  $gibbs_command .= " -from fasta -to fasta ";
  $gibbs_command .= " -i ".$family{$family_name}->{input_seq_file};
  $gibbs_command .= " -o ".$family{$family_name}->{gibbs_input_seq_file};
  $gibbs_command .= " ; gibbs ".$family{$family_name}->{gibbs_options};
  $gibbs_command .= " > ".$family{$family_name}->{gibbs_file};
  unless (($batch) || ($verbose >= 3)) {
    ## Suppress verbosity: send the remaining output (gibbs stdout
    ## already goes to the result file) to a log file. The POSIX form
    ## "> file 2>&1" is used because the csh/bash shortcut ">& file"
    ## is rejected by strict POSIX shells such as dash.
    $gibbs_command = "(".$gibbs_command.") > gibbs_log.txt 2>&1";
  }

  ## Convert the gibbs result to tab-delimited matrices
  $gibbs_command .= "; ";
  $gibbs_command .= &ConvertMatrixCommand($family{$family_name}->{gibbs_file}, "gibbs");

  if ($batch) {
    push @main::batch_commands, $gibbs_command;
  } else {
    &doit($gibbs_command, $dry_run, $die_on_error, $verbose);
  }
}


################################################################
## Run Roth's AlignACE program
sub AlignACE {
  ## Build the command chain convert-seq ; AlignACE ; convert-matrix
  ## for the current family ($family_name), and either queue it
  ## (batch mode) or run it with &doit().
  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($family{$family_name}->{AlignACE_dir});

  ## The package variable $addrc is set for two-strand analyses, but
  ## it is not included in the command built below.
  $addrc = " -addrc" if ($strands eq "-2str");

  ## Step 1: rewrite the sequences. This can be useful in case the
  ## file is compressed, and to convert n characters (masked
  ## nucleotides) into dots.
  my $conversion_step = "convert-seq "
    ." -from fasta -to fasta -dotmask"
    ." -i ".$family{$family_name}->{input_seq_file}
    ." -o ".$family{$family_name}->{AlignACE_input_seq_file};

  ## Step 2: AlignACE, with its output redirected to the result file
  my $discovery_step = "AlignACE ".$family{$family_name}->{AlignACE_options}
    ." > ".$family{$family_name}->{AlignACE_file};

  ## Step 3: convert the result to tab-delimited matrices
  my $matrix_step = &ConvertMatrixCommand($family{$family_name}->{AlignACE_file}, "AlignACE");

  my $AlignACE_command = $conversion_step." ; ".$discovery_step."; ".$matrix_step;

  if ($batch) {
    push @main::batch_commands, $AlignACE_command;
  } else {
    &doit($AlignACE_command, $dry_run, $die_on_error, $verbose);
  }
}

################################################################
## infogibbs (developed by Matthieu Defrance)
sub infogibbs {
  ## Build the command chain info-gibbs ; convert-matrix for the
  ## current family ($family_name), and either queue it (batch mode)
  ## or run it with &doit().
  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($family{$family_name}->{infogibbs_dir});

  ## info-gibbs is run with the family-specific options (no sequence
  ## conversion step is added here, contrary to gibbs and AlignACE);
  ## its output is redirected to the result file.
  my $discovery_step = "info-gibbs ".$family{$family_name}->{infogibbs_options}
    ." > ".$family{$family_name}->{infogibbs_file};

  ## Convert the result to tab-delimited matrices
  my $matrix_step = &ConvertMatrixCommand($family{$family_name}->{infogibbs_file}, "infogibbs");

  my $infogibbs_command = join("; ", $discovery_step, $matrix_step);

  if ($batch) {
    push @main::batch_commands, $infogibbs_command;
  } else {
    &doit($infogibbs_command, $dry_run, $die_on_error, $verbose);
  }
}


################################################################
## Run MEME program
sub MEME {
  ## Build the command chain convert-seq ; meme ; convert-matrix for
  ## the current family ($family_name), and either queue it (batch
  ## mode) or run it with &doit() (errors are not fatal).
  chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($family{$family_name}->{meme_dir});

  ## MEME does not accept sequences shorter than 8: shorter sequences
  ## (or sequences shorter than the minimal motif width, if larger)
  ## are discarded from the MEME input file.
  my $min_seq_len = &max(8, $MEME_options{minw});

  ## Run the command ($meme_command is a package variable; the path of
  ## the meme program is taken from $meme_cmd)
  $meme_command = "convert-seq ";
  $meme_command .= " -from fasta -to fasta -skip_short ".$min_seq_len;
  $meme_command .= " -i ".$family{$family_name}->{input_seq_file};
  $meme_command .= " -o ".$family{$family_name}->{meme_input_seq_file}." ; ";
  $meme_command .= $meme_cmd." ".$family{$family_name}->{meme_options};
  unless (($batch) || ($verbose >= 3)) {
    ## Redirect verbosity (stdout and stderr) to a log file. The POSIX
    ## form "> file 2>&1" is used because the csh/bash shortcut
    ## ">& file" is rejected by strict POSIX shells such as dash.
    $meme_command = "(".$meme_command.") > meme_log.txt 2>&1";
  }

  ## Convert the MEME result to tab-delimited matrices
  $meme_command .= "; ";
  $meme_command .= &ConvertMatrixCommand($family{$family_name}->{meme_file}, "meme");

  if ($batch) {
    push @main::batch_commands, $meme_command;
  } else {
    &doit($meme_command, $dry_run, 0, $verbose);
  }
}


################################################################
## Run orm program
sub orm {
  ## Build the orm command for the current family ($family_name),
  ## optionally followed by pattern-assembly and matrix-from-patterns
  ## (task "assemble_orm"), and either queue it (batch mode) or run it
  ## with &doit() (errors are not fatal).
  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($family{$family_name}->{orm_dir});

  ## The program path can be overridden with the environment variable ORM
  my $orm_program = $ENV{ORM} ? $ENV{ORM} : "orm";
  my @steps = ($orm_program." -v 1 ".$family{$family_name}->{orm_options});

  if ($task{assemble_orm}) {
    ## Pattern assembly
    if ($verbose >= 2) {
      &RSAT::message::TimeWarn("Assembling orm patterns", $family{$family_name}->{orm_assembly_file});
    }
    push @steps, join("",
		      "pattern-assembly -v 1",
		      " -maxfl 1 -subst 1 -toppat ".$toppat." ".$strands,
		      " -max_asmb_nb ".$nmotifs,
		      " -i ".$family{$family_name}->{orm_file},
		      " -o ".$family{$family_name}->{orm_assembly_file},
		     );

    ## Extract PSSMs from the pattern assembly
    if ($verbose >= 2) {
      &RSAT::message::TimeWarn("Matrix from orm patterns", $family{$family_name}->{orm_pssm_file});
    }
    push @steps, join("",
		      "matrix-from-patterns -v 1 ",
		      " -seq ".$family{$family_name}->{seq_file},
		      " -asmb ".$family{$family_name}->{orm_assembly_file},
		      " -bginput -markov 0 ",
		      " -o ".$family{$family_name}->{orm_pssm_file},
		     );
  }

  ## The steps are chained in a single shell command
  my $orm_command = join("; ", @steps);

  if ($batch) {
    push @main::batch_commands, $orm_command;
  } else {
    &doit($orm_command, $dry_run, 0, $verbose);
  }
}


################################################################
## Run Jerry Hertz' consensus program
sub Consensus {
  ## Build the command chain convert-seq | consensus ; convert-matrix
  ## for the current family ($family_name), and either queue it
  ## (batch mode) or run it with &doit() (errors are not fatal).
  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($family{$family_name}->{consensus_dir});

  ## The input sequences are converted from fasta to the "wc" format
  ## and piped to consensus, whose output goes to the result file.
  my $discovery_step = "convert-seq -dna -from fasta -to wc -i ".$family{$family_name}->{input_seq_file}
    ." | consensus ".$family{$family_name}->{consensus_options}
    ." > ".$family{$family_name}->{consensus_file};

  ## Convert the result to tab-delimited matrices
  my $matrix_step = &ConvertMatrixCommand($family{$family_name}->{consensus_file}, "consensus");

  ## $consensus_command is a package variable
  $consensus_command = $discovery_step."; ".$matrix_step;

  if ($batch) {
    push @main::batch_commands, $consensus_command;
  } else {
    &doit($consensus_command, $dry_run, 0, $verbose);
  }
}

################################################################
### Read the known sites for each family
sub ReadKnownSites {
  ## Load the known sites from a tab-delimited file into the package
  ## hashes %known_site and %known_site_source (both are reset first).
  ##
  ## Argument:
  ##   $known_site_file  file with one site per row. Columns:
  ##                     1) site sequence, 2) family name, 3) source.
  ##                     Rows starting with ";" or "#" and blank rows
  ##                     are ignored.
  ##
  ## Nothing is loaded if the file does not exist. If the file exists,
  ## the flag $known_sites_provided is set. Sites longer than
  ## $known_site_max_len are skipped.
  my ($known_site_file) = @_;
  %known_site = ();
  %known_site_source = ();
  if (-e $known_site_file) {
    $known_sites_provided = 1;

    ## Three-argument open on a lexical handle, with an explicit check:
    ## an unreadable file would otherwise silently yield no known site.
    open(my $known_site_handle, '<', $known_site_file)
      || &RSAT::error::FatalError("Cannot read known site file", $known_site_file, $!);

    ## A lexical line variable is used so that the caller's $_ is not
    ## clobbered.
    while (my $line = <$known_site_handle>) {
      next if ($line =~ /^;/);
      next if ($line =~ /^\#/);
      next unless ($line =~ /\S/);
      chomp($line);
      my @fields = split("\t", $line);
      my $site_sequence = &RSAT::util::trim(shift(@fields));
      $site_sequence =~ s/N/n/g; ## undefined residues in lowercase
      my $family_name = &RSAT::util::trim(shift(@fields));
      next if (length($site_sequence) > $known_site_max_len);
      my $source =  &RSAT::util::trim(shift(@fields));
      push @{$known_site{$family_name}}, $site_sequence;
      push @{$known_site_source{$family_name}}, $source;
    }
    close($known_site_handle);
  }
}


################################################################
### Generate a synthetic table
sub SyntheticTable {
  ## Generate the HTML synthetic table summarizing, for each family,
  ## the results of all the motif discovery programs.
  ##
  ## The cells are first collected in the package array @table
  ## ($table[$row][$col]): one row index per result type (stored in
  ## %row, column 0 contains the row label), one column index per
  ## family (stored in %column). The table is then printed to
  ## synthetic_tables/<table_suffix>[_tr].html, with one family per
  ## row, or one family per column if $transpose is set.
  my $dir = "synthetic_tables";

  $outfile{table} = $dir."/".$table_suffix;
  $outfile{table} .= "_tr" if ($transpose);
  $outfile{table} .= ".html";

  &RSAT::message::TimeWarn("Generating synthetic table ", $outfile{table}) if ($verbose >= 1);

  ## Parentheses around the mkdir arguments are required: without
  ## them, "||" applies to the permission mode and a failure of mkdir
  ## is never reported.
  unless (-d $dir) {
    mkdir($dir, 0755) || &RSAT::error::FatalError("Cannot create directory", $dir);
  }

  ################################################################
  ## Row indexes (headers). Optional rows are only present when the
  ## corresponding information is available.
  my @row_names = ("f", "family", "size", "genes");
  push(@row_names, "total_length", "avg_length") if ($seq_length_calculated);
  push(@row_names, "known") if ($known_sites_provided);
  push(@row_names, "oligos", "dyads", "orm");
  push(@row_names, "selection") if ($task{report});
  push(@row_names, "consensus", "gibbs", "AlignACE", "infogibbs", "meme", "MotifSampler");
  foreach my $r (0..$#row_names) {
    $row{$row_names[$r]} = $r;
  }
  $row_num = $#row_names;
  foreach my $header (keys %row) {
    $table[$row{$header}][0] = "<b>$header</B>";
  }

  ################################################################
  ## Synthesize the results of all clusters: one column per family
  my $col = 0;
  my $f = 0;
  my $nb_fam = scalar(@families);
  foreach my $family_name (@families) {
    $f++;
    &RSAT::message::TimeWarn("Indexing result files for family", $f."/".$nb_fam, $family_name) if ($verbose >= 2);
    my @members = @{$family{$family_name}->{members}};
    $gene_nb = scalar(@members);

    ## Check minimum number of genes. Skipped families do not get any
    ## column index.
    if ($gene_nb < $min_genes) {
      &RSAT::message::Warning("Skipping family",
			      $f, $family_name,
			      $gene_nb." genes",
			      "< min = ".$min_genes) if ($verbose >= 2);
      next;
    }

    $col++;
    #### Index columns associated to each family
    $column{$family_name} = $col;

    ### check directory
    my $dyad_dir = "${family_name}/dyads_${family_name}";
    unless (-d $dyad_dir) {
      &RSAT::message::Warning ("Cannot find directory $dyad_dir\n");
    }

    ### family number
    $table[$row{f}][$col] = "<b>$col</B>";

    ### family name
    if (($fam_link_prefix) || ($fam_link_suffix)) {
	$table[$row{family}][$col] = "<b><a href='".$fam_link_prefix.$family_name.$fam_link_suffix."'>".$family_name."</a></b>";
    } else {
	$table[$row{family}][$col] = "<b><a href='../".$family_name."'>".$family_name."</a></B>";
    }

    #### number of genes
    my $max_genes_to_list = 30; ## Max number of genes per family to list in the synthetic table
    my @genes = @members;
    my $genes = scalar(@genes);
    $table[$row{size}][$col] = $genes;

    #### sequence length
    if ($seq_length_calculated) {
      $table[$row{total_length}][$col] = $family{$family_name}->{total_length};
      if ($genes > 0) {
	$family{$family_name}->{avg_length} = $family{$family_name}->{total_length}/$genes;
	$table[$row{avg_length}][$col] = sprintf("%.1f", $family{$family_name}->{avg_length});
      } else {
	## The undefined average is reported as such (formatting "NA"
	## as a number would display a misleading 0.0)
	$family{$family_name}->{avg_length} = "NA";
	$table[$row{avg_length}][$col] = "NA";
      }
    }

    ### list of genes
    ## Either each gene is linked individually (gene link prefix or
    ## suffix specified), or the whole list is a single link to the
    ## family file. In the first case, the mention of the non-listed
    ## genes is linked to the family file. $anchor_open tracks the
    ## link to the family file, so that each opened anchor is closed
    ## exactly once.
    my $link_genes = (($gene_link_prefix) || ($gene_link_suffix));
    my $family_file_anchor = "<a href='../".$family{$family_name}->{family_file}."'>";
    my $anchor_open = 0;
    if ($link_genes) {
	$table[$row{genes}][$col] = "";
	foreach my $g (0..&RSAT::stats::min($#genes, $max_genes_to_list - 1)) {
	    $genes[$g] = "<a href='".$gene_link_prefix.$genes[$g].$gene_link_suffix."'>".$genes[$g]."</a>";
	}
    } else {
	$table[$row{genes}][$col] = $family_file_anchor;
	$anchor_open = 1;
    }
    if ($genes <= $max_genes_to_list) {
	$table[$row{genes}][$col] .= join(" ", @genes);
    } else {
	my $diff = $genes - $max_genes_to_list;
	$table[$row{genes}][$col] .= join(" ", @genes[0..($max_genes_to_list-1)]);
	unless ($anchor_open) {
	    $table[$row{genes}][$col] .= $family_file_anchor;
	    $anchor_open = 1;
	}
	$table[$row{genes}][$col] .= "<p> + ".$diff." other genes</p>";
    }
    if ($anchor_open) {
	$table[$row{genes}][$col] .= "</a>";
    }

    ### known patterns
    if ($known_sites_provided) {
      $table[$row{known}][$col] = "";
      ## Some families may have no known site
      my @fam_known_site = @{$known_site{$family_name} || []};
      my @fam_known_site_source = @{$known_site_source{$family_name} || []};
      foreach my $s (0..$#fam_known_site) {
	$table[$row{known}][$col] .= $fam_known_site[$s];
	$table[$row{known}][$col] .= "<BR>(".$fam_known_site_source[$s].")" if ($fam_known_site_source[$s]);
	$table[$row{known}][$col] .= "<P>";
      }
    }

    ################################################################
    #### Index results from oligo-analysis
    &IndexPatterns($family_name,
		   $row{oligos},
		   $col,
		   patterns=>$family{$family_name}->{oligo_file},
		   assembly=>$family{$family_name}->{oligo_assembly_file},
		   pssm=>$family{$family_name}->{oligo_pssm_file},
		   validation=>$family{$family_name}->{oligo_vs_known},
		   validation_table_relw=>$family{$family_name}->{oligo_vs_known_relw_table},
		   validation_table_weight=>$family{$family_name}->{oligo_vs_known_weight_table},
		   type=>"oligos",
		  );

    ################################################################
    #### Index results from dyad-analysis
    &IndexPatterns($family_name,
		   $row{dyads},
		   $col,
		   patterns=>$family{$family_name}->{dyad_file},
		   assembly=>$family{$family_name}->{dyad_assembly_file},
		   pssm=>$family{$family_name}->{dyad_pssm_file},
		   validation=>$family{$family_name}->{dyad_vs_known},
		   validation_table_relw=>$family{$family_name}->{dyad_vs_known_relw_table},
		   validation_table_weight=>$family{$family_name}->{dyad_vs_known_weight_table},
		   type=>"dyads",
		  );

    ################################################################
    #### Index results from orm
    &IndexPatterns($family_name,
		   $row{orm},
		   $col,
		   patterns=>$family{$family_name}->{orm_file},
		   assembly=>$family{$family_name}->{orm_assembly_file},
		   pssm=>$family{$family_name}->{orm_pssm_file},
		   validation=>$family{$family_name}->{orm_vs_known},
		   validation_table_relw=>$family{$family_name}->{orm_vs_known_relw_table},
		   validation_table_weight=>$family{$family_name}->{orm_vs_known_weight_table},
		   type=>"orm",
		  );

    ################################################################
    ### index results of manual pattern selection
    if ($task{report}) {
      &IndexPatterns($family_name,
		     $row{selection},
		     $col,
		     patterns=>$family{$family_name}->{oligo_selection},
		     type=>"manual"
		    );
    }


    ################################################################
    #### Index results of consensus
    &IndexConsensus($family_name, $row{consensus}, $col, matrix_file=>$family{$family_name}->{consensus_file});

    ################################################################
    #### Index results of gibbs
    &IndexGibbs($family_name, $row{gibbs}, $col, matrix_file=>$family{$family_name}->{gibbs_file});

    ################################################################
    #### Index results of AlignACE
    &IndexAlignACE($family_name, $row{AlignACE}, $col, matrix_file=>$family{$family_name}->{AlignACE_file});

    ################################################################
    #### Index results of infogibbs
    &IndexInfoGibbs($family_name, $row{infogibbs}, $col, matrix_file=>$family{$family_name}->{infogibbs_file});

    ################################################################
    #### Index results of meme
    &IndexMEME($family_name, $row{meme}, $col, matrix_file=>$family{$family_name}->{meme_file});

    ################################################################
    #### Index results of MotifSampler
    &IndexMotifSampler($family_name, $row{MotifSampler}, $col, matrix_file=>$family{$family_name}->{MotifSampler_file});
  }

  ###############################################################
  ## Print the header of the  synthetic table
  $syn = &OpenOutputFile($outfile{table});
  print $syn "<html>\n";
  print $syn "<title>",$table_suffix, "</title>","\n";
  print $syn "<body>\n";
  print $syn "<h1>".$table_suffix."</h1>\n";
  print $syn join( "\n",
		   "<pre>",
		   $verbose_message,
		   "</pre>"
		 ), "\n";

  ################################################################
  ## Add a link to the MDC report files. The links are printed before
  ## opening the table (headings and lists are not valid table
  ## content).
  if ($task{report}) {
    print $syn "<h4>Reports for the motif discovery competition</h4>";
    print $syn "<ul>";
    print $syn "<li><a href=../".$outfile{results}.">Results</a></li>";
    print $syn "<li><a href=../".$outfile{parameters}.">Parameters</a></li>";
    print $syn "</ul>";
  }

  print $syn "<table border=1>\n";

  ################################################################
  ## Sort clusters according to the selected criterion
  if ($sort_key eq "score") {
    #### sort clusters by score
    #### decreasing order
    @sorted_families = sort {$max_score{$b} <=> $max_score{$a}} @families;

  } else {
    #### sort clusters by family name
    @sorted_families = sort @families;
  }

  ## Only the indexed families are printed: a family skipped at the
  ## indexing step has no column index, and would otherwise be
  ## displayed with the content of column 0 (the row labels).
  my @printed_families = grep {$column{$_}} @sorted_families;

  ################################################################
  ## Print the table content
  if ($transpose) {
    ## One result type per row (starting with its label), one family
    ## per column
    foreach my $r (0..$row_num) {
      print $syn "<tr valign=top>\n";
      print $syn "<th>", $table[$r][0], "</th>\n";
      foreach my $printed_family (@printed_families) {
	my $c = $column{$printed_family};
	print $syn "<td>", $table[$r][$c], "</td>\n";
      }
      print $syn "</tr>\n";
    }
  } else {
    #### print table header
    print $syn "<tr>\n";
    foreach my $r (0..$row_num) {
      print $syn "<th>", $table[$r][0], "</th>\n";
    }
    print $syn "</tr>\n";
    #### print table content: one family per row
    foreach my $printed_family (@printed_families) {
      my $c = $column{$printed_family};
      print $syn "<tr valign=top>\n";
      foreach my $r (0..$row_num) {
	print $syn "<td>", $table[$r][$c], "</td>\n";
      }
      print $syn "</tr>\n";
    }
  }

  print $syn "</table>\n";
  print $syn "<hr>", &AlphaDate;
  print $syn "</body>\n";
  print $syn "</html>\n";
  close $syn;
}




################################################################
## Index the matrix from a consensus file
sub IndexConsensus {
   ## Thin wrapper: index a matrix file in consensus format.
   ## Arguments are forwarded unchanged to &IndexMatrix().
   my @index_args = @_;
   return &IndexMatrix(@index_args, program=>"consensus");
}

################################################################
##  Read a matrix and generate a summary for the synthetic table
sub IndexMatrix {
   ## Fill one cell of the synthetic table ($table[$row][$col]) with
   ## the summary of a matrix file: links to the result files, the
   ## consensus of the first matrices, and the logos if they exist.
   ##
   ## Arguments:
   ##   $family_name  name of the family
   ##   $row, $col    indexes of the cell in @table
   ##   %args         matrix_file => path of the matrix file
   ##                 program     => program which produced it (also
   ##                                used as matrix format)
   ##
   ## If the matrix file does not exist, the cell is replaced by a
   ## "File not found" notice.
   my ($family_name, $row, $col, %args) = @_;

   my $program = $args{program};
   my $base = "../"; ## the table is stored one level below the result directory
   my $matrix_file = $args{matrix_file};
   my $family_dir = $family_name."/";
   $family_dir =~ s|/+|/|g;
   my $matrix_dir = $family_dir.$program."_".$family_name."/";

   &RSAT::message::Info("Indexing matrices from", $program, "family", $family_name, $matrix_file) if ($main::verbose >= 3);

   if (-e $matrix_file) {
     $table[$row][$col] .= &LinkToFile($family_dir, $family_name, $base);
     $table[$row][$col] .= &LinkToFile($matrix_dir, "dir", $base);
     $table[$row][$col] .= &LinkToFile($matrix_file, $program, $base);
     $table[$row][$col] .= &LinkToFile($matrix_file.".tab", "tab", $base);
     $matrix_result = ""; ## package variable

     ################################################################
     ## Report the consensus of the first matrices of the file
     my @matrices = &ReadMatrices($family_name, $matrix_file, %args);
     if (scalar(@matrices) > 0) {
       my $max_matrices = 5; ## Maximum number of matrices per family in the synthetic table
       my $m = 0;
       foreach my $matrix (@matrices) {
	 $m++;
	 ## The limit is checked before writing the matrix header, so
	 ## that no header is printed for a matrix which is not shown.
	 if ($m > $max_matrices) {
	   my $remaining = scalar(@matrices) - $max_matrices;
	   $matrix_result .= ";\n; ... and ".$remaining." more matrices in the file.\n";
	   last;
	 }
	 $matrix_result .= "; MATRIX ".$m."/".scalar(@matrices)."\n";
	 &RSAT::message::Debug($family_name, "Indexing matrix", $m."/".scalar(@matrices),$program)
	   if ($verbose >= 3);
	 $matrix_result .= $matrix->toString(type=>"consensus");
       }
       $table[$row][$col] .= "<table cellpadding=3>\n";
       $table[$row][$col] .= "<TR>\n";
       $table[$row][$col] .= "<td colspan=2><pre>";
       $table[$row][$col] .= $matrix_result;
       $table[$row][$col] .= "</pre></td>\n";
       $table[$row][$col] .= "</TR>\n";
       $table[$row][$col] .= "</table>\n";

       ################################################################
       ## If the logos exist, include them in the synthetic table
       my $logo_expr = $matrix_file;
       $logo_expr =~ s/\.tab//;
       $logo_expr .= '*_logo.png';
       my @logo_files = glob ($logo_expr);
       if (scalar(@logo_files) >= 1) {
	 &RSAT::message::Debug("LOGO", $logo_expr, "FILES", @logo_files) if ($main::verbose >= 3);
	 foreach my $logo (@logo_files) {
	   $table[$row][$col] .= "<br><a href='".$base.$logo."'><img src='".$base.$logo."' height=100></a>\n";
	 }
       }

     } else {
       $table[$row][$col] .= "<br><font color=red>No matrix could be found in the result file</font>\n";
     }
   } else {
     $table[$row][$col] =  "<a href=".$base.$matrix_file."><font color=red>File not found</font></a><br>\n";
     &RSAT::message::Warning("File not found. Skipped.", $program, $matrix_file) if ($main::verbose >= 3);
   }
}

################################################################
## Convert a PSSM to tab-delimited format

## Compute the options for converting a PSSM
sub ConvertMatrixCommand {
  ## Build (but do not run) the convert-matrix command exporting a
  ## matrix file to tab-delimited format, with logos.
  ##
  ## Arguments:
  ##   $matrix_file  input matrix file; the output is $matrix_file.tab
  ##   $program      program which produced the file (input format)
  ##
  ## The pseudo-weight comes from $matrix_pseudo. Matrices are sorted
  ## only if a sort key is defined for the program in %pssm_sort_key
  ## (the order is then taken from %pssm_sort_order).
  ##
  ## Returns the command string.
  my ($matrix_file, $program) = @_;

  my @pieces = (
		"convert-matrix -v 1",
		" -pseudo ".$matrix_pseudo,
		" -from ".$program,
		" -to tab",
		" -return counts,parameters",
		" -return logo -logo_format png",
		" -i ".$matrix_file,
	       );

  ## Program-specific sorting
  my $sort_key = $pssm_sort_key{$program};
  my $sort_order = $pssm_sort_order{$program};
  push(@pieces, " -sort ".$sort_order." ".$sort_key) if ($sort_key);

  push(@pieces, " -o ".$matrix_file.".tab");

  my $command = join("", @pieces);
  if ($verbose >= 2) {
    &RSAT::message::Warning("Converting matrix", $command);
  }
  return $command;
}

## Convert the PSSM
sub ConvertMatrix {
  ## Run the convert-matrix command built by &ConvertMatrixCommand()
  ## for the given matrix file and program, with &doit().
  my ($matrix_file, $program) = @_;
  return &doit(scalar(&ConvertMatrixCommand($matrix_file, $program)),
	       $dry_run, $die_on_error, $verbose);
}

################################################################
##  Read a matrix from a matrix file and return a matrix object
##
## Matrix objects are used for displaying the top matrix in the
## synthetic table (-task synthesis), and for exporting matrices in
## tab-delimited files (-task sql).
##
sub ReadMatrices {
  ## Read all the matrices of a file, annotate them, sort them and
  ## register them in $matrix_factory.
  ##
  ## Arguments:
  ##   $family_name  name of the family (stored as matrix attribute)
  ##   $matrix_file  path of the matrix file
  ##   %args         program => program which produced the file (used
  ##                            as matrix format and to select the
  ##                            sorting criterion)
  ##
  ## Returns the list of matrix objects, or an empty list if the file
  ## does not exist or does not contain any matrix.
  ##
  ## Matrices are cached by file name in the package hash
  ## %matrices_by_file (one array reference per file), so that a
  ## second request for the same file returns the complete sorted
  ## list without reading the file (nor registering the matrices)
  ## again. The hash %matrix_by_file is still filled as before (last
  ## matrix read from each file) and is still honoured as a fallback,
  ## in case it is used by other parts of the program.
  ##
  ## Note: errors raised by the matrix reader are not trapped here.
  my ($family_name, $matrix_file, %args) = @_;

  my $program = $args{program};
  &RSAT::message::TimeWarn("Reading matrices from", $program, "family", $family_name, $matrix_file) if ($verbose >= 2);

  ## If the matrices have already been loaded, return the indexed matrices
  if ($matrices_by_file{$matrix_file}) {
    return (@{$matrices_by_file{$matrix_file}});
  }
  if ($matrix_by_file{$matrix_file}) {
    return ($matrix_by_file{$matrix_file});
  }

  unless (-e $matrix_file) {
    &RSAT::message::Warning("File not found. Skipped.", $program, $matrix_file) if ($verbose >= 3);
    return();
  }

  ################################################################
  ## Read matrices from the input file
  my @matrices = &RSAT::MatrixReader::readFromFile($matrix_file, $program);
  &RSAT::message::Info("Read",scalar(@matrices),"matrices from file",$matrix_file, $program) if ($verbose >= 3);
  foreach my $matrix (@matrices) {
    $matrix->force_attribute("pseudo",$matrix_pseudo);
    $matrix->force_attribute("family_id",$family_id);
    $matrix->force_attribute("family", $family_name);
    $matrix->force_attribute("format", $program);
    $matrix->set_attribute("input_file", $matrix_file);
    $matrix_by_file{$matrix_file} = $matrix;
    $matrix->calcConsensus();
    $matrix->calcInformation();

    my $id = RSAT::matrix->auto_id($run_prefix."_mat");
    $matrix->force_attribute("id", $id);
    $matrix->set_attribute("analysis", $run_prefix);

    &RSAT::message::Debug($family_name, $program, "Matrix", $matrix->get_attribute("id"),
			  $matrix->get_attribute("consensus.IUPAC"),
			 ) if ($verbose >= 3);
  }

  ################################################################
  ## Sort matrices
  my $sort_key = $pssm_sort_key{$program};
  my $sort_order = $pssm_sort_order{$program};
  if ($sort_key) {
    @matrices = &RSAT::MatrixReader::SortMatrices($sort_key, $sort_order, @matrices);
  }

  ## Declare the single-valued attributes and register the matrices
  ## in the factory
  my $first_matrix = $matrices[0];
  if ($first_matrix) {
    foreach my $attr (@matrix_scalar_out_fields) {
      $first_matrix->_set_attribute_cardinality($attr, "SCALAR");
    }
  }
  foreach my $matrix (@matrices) {
    $matrix_factory->add_object($matrix);
  }

  ## Check that the result file contains at least one matrix
  if (scalar(@matrices) > 0) {
    $matrices_by_file{$matrix_file} = [@matrices];
    return(@matrices);
  } else {
    &RSAT::message::Warning("Matrix file does no contain any valid matrix. Skipped.", $program, $matrix_file) if ($verbose >= 3);
    return();
  }
}

################################################################
## Index the matrix from gibbs file
sub IndexGibbs {
   ## Thin wrapper: index a matrix file in gibbs format.
   ## Arguments are forwarded unchanged to &IndexMatrix().
   my @index_args = @_;
   return &IndexMatrix(@index_args, program=>"gibbs");
}

################################################################
## Index the matrices found in an AlignACE result file.
##
## Thin wrapper: all the arguments received are forwarded unchanged to
## &IndexMatrix(), followed by the pair program=>"AlignACE". The value
## returned by &IndexMatrix() is returned as is.
sub IndexAlignACE {
   my @program_spec = (program=>"AlignACE");
   return(&IndexMatrix(@_, @program_spec));
}

################################################################
## Index the matrices found in an infogibbs result file.
##
## Thin wrapper: all the arguments received are forwarded unchanged to
## &IndexMatrix(), followed by the pair program=>"infogibbs". The value
## returned by &IndexMatrix() is returned as is.
sub IndexInfoGibbs {
   my @program_spec = (program=>"infogibbs");
   return(&IndexMatrix(@_, @program_spec));
}

################################################################
## Index the matrices found in a meme result file.
##
## Thin wrapper: all the arguments received are forwarded unchanged to
## &IndexMatrix(), followed by the pair program=>"meme". The value
## returned by &IndexMatrix() is returned as is.
sub IndexMEME {
   my @program_spec = (program=>"meme");
   return(&IndexMatrix(@_, @program_spec));
}

################################################################
## Index the matrices found in a MotifSampler result file.
##
## Thin wrapper: all the arguments received are forwarded unchanged to
## &IndexMatrix(), followed by the pair program=>"MotifSampler". The
## value returned by &IndexMatrix() is returned as is.
sub IndexMotifSampler {
   my @program_spec = (program=>"MotifSampler");
   return(&IndexMatrix(@_, @program_spec));
}


################################################################
## Build the HTML snippet pointing to one file of the synthetic table.
##
## Arguments:
##   $file   path of the file, tested for existence as given
##   $label  text displayed between square brackets
##   $base   prefix prepended to $file in the href attribute
##
## Returns a hyperlink (terminated by a newline) when the file exists,
## or the bracketed label in pale red, without hyperlink and without
## trailing newline, when it does not.
sub LinkToFile {
  my ($file, $label, $base) = @_;

  ## Missing file: display the label only, no hyperlink
  return("<font color='#FFBBBB'>[".$label."]</font>") unless (-e $file);

  return("<a href='".$base.$file."'>[".$label."]</a>\n");
}


################################################################
## Index the patterns discovered for the HTML synthetic table.
##
## Fills the cell $table[$row][$col] of the global synthetic table with
## links to the result files of one family, followed by a table listing
## the patterns (sequence, reverse complement when $strands is "-2str",
## score, and matching known sites when $known_sites_provided is set),
## and finally by the logo images found next to the pattern file.
##
## Arguments:
##   $family_name  name of the family
##   $row, $col    coordinates of the cell in @table
##   %args         named arguments: type, assembly, pssm, patterns,
##                 validation, validation_table_relw,
##                 validation_table_weight
##
## Globals written: @patterns, @sorted_patterns, %rc, %score, %type
## (reset on entry), $error (only when an assembly file is parsed) and
## @table.
##
## Returns nothing meaningful; returns early when the assembly file
## does not exist.
sub IndexPatterns {
   my ($family_name, $row, $col, %args) = @_;

   ## Reset the global containers before reading a new set of patterns
   undef @patterns;
   undef @sorted_patterns;
   undef %rc;
   undef %score;
   undef %type;

   my $base = "../";
   my $type = $args{type};
   my $assembly_file = $args{assembly};
   my $pssm_file = $args{pssm};
   my $pattern_file = $args{patterns};
   my $validation_file = $args{validation};
   my $validation_table_relw_file = $args{validation_table_relw};
   my $validation_table_weight_file = $args{validation_table_weight};
   my $family_dir = $family_name."/";
   $family_dir =~ s|/+|/|g;
   my $pattern_dir = $family_dir.$type."_".$family_name."/";
   my $map_link = $base.$pattern_file;

   &RSAT::message::Info(join("\t", "; Indexing patterns of type", $type, "for family", $family_name, $pattern_file)) if ($verbose >= 2);

   ## Read the patterns
   if (-e $assembly_file) {
       if (($type eq "dyads") ||
	   ($type eq "oligos") ||
	   ($type eq "orm")) {
	   ## Parse an assembly file (only retain contigs and isolated)
	   ($error, @patterns) = &ReadAssemblyFile($family_name, $assembly_file, $type);
       } else {
	   ## Parse a pattern file (retain all patterns)
	   ## NOTE(review): the global $error is not reset on this path, so
	   ## it keeps whatever value it had before the call -- confirm
	   ## that this is intended.
	   @patterns = &ReadPatterns($assembly_file);
       }
   } else {
       $table[$row][$col] = "<a href=".$base.$assembly_file."><font color=red>File not found</font></a><br>\n";
       &RSAT::message::Warning("Family $family_name", "assembly file ", $assembly_file, " does not exist. Skipped.")
	 if ($verbose >= 3);
       return;
   }

   if ($error) {
       $table[$row][$col] = "<font color=red>$error</font>\n";
   } else {
       @sorted_patterns = sort { $score{$b} <=> $score{$a} } @patterns;

       ## Links to the result files: [file, label] pairs, in display order
       my @links = (
		    [$family_dir, $family_name],
		    [$pattern_dir, "dir"],
		    [$pattern_file, $type],
		    [$assembly_file, "asmb"],
		    [$pattern_file.".".$img_format, "map"],
		   );
       push @links, [$pattern_file.".html", "html"] if ($htmaps);
       push @links, (
		     [$pssm_file."_sig_matrices.tf", "sig matrix"],
		     [$pssm_file."_count_matrices.tf", "PSSM-tf"],
		     [$pssm_file."_count_matrices.txt", "PSSM-tab"],
		     [$pssm_file.".".$img_format, "scan map"],
		     [$validation_file, "matches"],
		     [$validation_table_weight_file, "match_weight"],
		     [$validation_table_relw_file, "match_relw"],
		    );
       foreach my $link (@links) {
	   $table[$row][$col] .= &LinkToFile(@{$link}, $base);
       }

       ## Maximal score of the family
       $table[$row][$col] .= "<table>\n";
       if ($max_score{$type}{$family_name}) {
	   $table[$row][$col] .="<tr><td align=right>max.score</td><td>".$max_score{$type}{$family_name}."</td></tr>\n";
       } else {
	   $table[$row][$col] .= "<tr><td>no pattern</td></tr>\n";
       }
       $table[$row][$col] .="</table>\n";

       ## Pattern table: one column per attribute, one line per pattern.
       ## Within each column, the lines are separated by <BR>. The cells
       ## are collected in a list and joined, so that no <BR> follows the
       ## last pattern (the previous test compared the pattern object
       ## with the last array index, which never matched).
       $table[$row][$col] .= "<table>\n";
       $table[$row][$col] .= "<TR>\n";

       ## Wrap the content of a cell in bold tags when the pattern score is >= 1
       my $emphasize = sub {
	   my ($pattern, $content) = @_;
	   return($content) unless ($pattern->get_attribute("score") >= 1);
	   return("<b>".$content."</B>");
       };

       ### print oligo sequence
       my @sequence_cells = ();
       for my $p (@sorted_patterns) {
	   my $compressed = &compress_pattern($p->get_attribute("sequence"), "n");
	   $compressed =~ s/N/n/g;
	   push @sequence_cells, $emphasize->($p, $compressed);
       }
       $table[$row][$col] .= "<TD>\n";
       $table[$row][$col] .= join("<BR>", @sequence_cells);
       $table[$row][$col] .= "</TD>\n";

       ### print reverse complementary oligo
       if ($strands eq "-2str") {
	   my @rc_cells = ();
	   for my $p (@sorted_patterns) {
	       my $rc = &SmartRC($p->get_attribute("sequence"));
	       my $compressed = &compress_pattern($rc, "n");
	       $compressed =~ s/N/n/g;
	       push @rc_cells, $emphasize->($p, $compressed);
	   }
	   $table[$row][$col] .= "<TD>\n";
	   $table[$row][$col] .= join("<BR>", @rc_cells);
	   $table[$row][$col] .= "</TD>\n";
       }

       ### print pattern scores
       my @score_cells = ();
       for my $p (@sorted_patterns) {
	   push @score_cells, $emphasize->($p, $p->get_attribute("score"));
       }
       $table[$row][$col] .= "<TD><a href=".$map_link.">\n";
       $table[$row][$col] .= join("<BR>", @score_cells);
       $table[$row][$col] .= "</A></TD>\n";

       ### print matching patterns
       if ($known_sites_provided) {
	   my @match_cells = ();
	   for my $p (@sorted_patterns) {
	       my @matches = ();
	       foreach my $site (@site_library) {
		   if (($p->contains($site, min_score=>$min_matching_score)) ||
		       ($site->contains($p->get_attribute("sequence"), min_score=>$min_matching_score))) {
		       push @matches, $site->get_id();
		   }
	       }
	       push @match_cells, join(";",@matches);
	   }
	   $table[$row][$col] .= "<TD>\n";
	   $table[$row][$col] .= join("<BR>", @match_cells);
	   $table[$row][$col] .= "</TD>\n";
       }
       $table[$row][$col] .= "</TR>\n";
       $table[$row][$col] .= "</TABLE>\n";
   }


   ################################################################
   ## If  logos exist, include them in the synthetic table
   my $logo_expr = $pattern_file;
   $logo_expr =~ s/\.tab//;
   $logo_expr .= '*_logo.png';
   my @logo_files = glob ($logo_expr);
   if (scalar(@logo_files) >= 1) {
     &RSAT::message::Debug("LOGO", $logo_expr, "FILES", @logo_files) if ($main::verbose >= 3);
     foreach my $logo (@logo_files) {
       $table[$row][$col] .= "<br><a href='".$base.$logo."'><img src='".$base.$logo."' height=100></a>\n";
     }
   }
}




################################################################
#### Draw the feature map of the dyads detected in one family.
##
## Works on the family designated by the global $family_name, in two
## steps:
##   1. dna-pattern | features-from-dnapat locates the dyads in the
##      sequences and writes a feature file;
##   2. feature-map draws the map, then the feature file is deleted.
##
## In batch mode ($batch), both commands are appended to
## @main::batch_commands instead of being executed. Otherwise they are
## run with &doit(), and the map is skipped when the first command
## returns an error.
sub DrawDyadFeatureMap {
   &RSAT::message::TimeWarn( "Drawing feature-map with results of dyad-analysis for family $family_name") if ($verbose >= 2);

   ## Files of the current family
   my $seq_file = $family{$family_name}{seq_file};
   unless (($batch) || ($dry_run) || (-r $seq_file)) {
       &RSAT::error::FatalError("Cannot read file $seq_file");
   }
   my $dyad_file = $family{$family_name}{dyad_file};
   my $ft_file = $family{$family_name}{dyad_ft_file};
   my $fmap_file = $family{$family_name}{fmap_file};

   ## pattern matching
   my $match_cmd = join("",
			"dna-pattern",
			" -pl ", $dyad_file,
			" -format ", $seq_format,
			" -i ", $seq_file,
			" -origin ", $map_origin,
			" -N 4",
			" -return sites,limits ",
			" ", $strands,
			"| features-from-dnapat -o ", $ft_file, " ");
   if ($batch) {
       push @main::batch_commands, $match_cmd;
   } else {
       my $error = &doit($match_cmd, $dry_run, 0, $verbose);
       if ($error) {
	   &RSAT::message::Warning("Feature map skipped because dna-pattern returned an error");
	   return;
       }
   }

   ## feature-map drawing
   my @map_parts = (
		    "feature-map -i ".$ft_file." -o ".$fmap_file." ",
		    "-legend ",
		    "-scalebar -scalestep ".$scalestep." ",
		    "-scorethick ",
		    "-mlen 520 ",
		    "-title '".$family_name.$dyad_suffix."' ",
		   );
   if ($htmaps) {
       push @map_parts, "-htmap >  ".$family{$family_name}{htmap_file};
   }

   ## Delete the feature file to save disk space (the feature files
   ## occupy half of the result directory)
   push @map_parts, "; rm -f ".$ft_file;

   my $map_cmd = join("", @map_parts);
   if ($batch) {
       push @main::batch_commands, $map_cmd;
   } else {
       &doit($map_cmd, $dry_run, 0, $verbose);
   }
}


################################################################
#### Check the consistency of the parameters and set default values.
##
## Works exclusively on global variables. Dies (through
## &RSAT::error::FatalError) when a mandatory parameter is missing:
## output directory, at least one task, known sites for the validation
## tasks, db sites for the db_match tasks, family file or sequence file
## list, organism (unless -org_fam).
##
## Default values set here: accepted feature types (cds), expansion of
## the tasks "all" and "maps", upstream limits ($from, $to) taken from
## the organism, $seq_length, $dyad_type, $strands, $bg_strands and the
## lower threshold on occ_sig.
sub CheckParameters {

  ## Output directory
  &RSAT::error::FatalError("You must specify the output directory (option -outdir)")
    unless $dir{output};

  #### accepted feature types
  unless (%accepted_feature_types) {
    ## By default, only accept the feature type cds
    $accepted_feature_types{cds} = 1;
  }
  $feature_types = join ",", keys (%accepted_feature_types);

  ## For backward compatibility
  if ($task{upstream}) {
      &RSAT::message::Warning("Option -task upstream is obsolete, please use -task sequences");
      $task{sequences} = 1;
  }
  if ($task{upstream_ensembl}) {
      &RSAT::message::Warning("Option -task upstream_ensembl is obsolete, please use -task sequences_ensembl");
      $task{sequences_ensembl} = 1;
  }


  #### check selected tasks
  unless (%task) {
    &RSAT::error::FatalError("You should select at least one task.");
  }
  if ($task{all}) {
    foreach my $t (@supported_tasks) {
      $task{$t} = 1;
    }
    ## The option -nomap has priority over -task all
    unless ($draw_maps) {
      $task{maps} = 0;
      $task{oligo_maps} = 0;
      $task{dyad_maps} = 0;
      $task{orm_maps} = 0;
    }
  }
  if ($task{maps}) {
    $task{oligo_maps} = 1;
    $task{dyad_maps} = 1;
    $task{orm_maps} = 1;
  }

  ## Validation
  if (($task{validate_oligos}) ||
      ($task{validate_dyads}) ||
      ($task{validate_orm})) {
    $task{validate} = 1;
    &RSAT::error::FatalError("For the task 'validate', you must specify a list of known sites with the option -known")
      unless ($known_site_file);
  }

  ## Db match
  if (($task{db_match_oligos}) ||
      ($task{db_match_dyads})) {
    $task{db_match} = 1;
    &RSAT::error::FatalError("For the task 'db_match', you must specify a list of known sites with the option -db")
      unless ($db_site_file);
  }
  ### family file ###
  unless (($family_file) || ($sequence_file_list)) {
    &RSAT::error::FatalError("You must specify either a family file or a sequence file list");
  }

  ### organism ###
  if ($organism_name) {
    unless ($supported_organism{$organism_name}) {
      &RSAT::error::FatalError ("organism $organism_name is not supported");
    }
  } elsif ($org_fam) {
    ## The organism is specified family per family
  } else {
    &RSAT::error::FatalError ("You must specify an organism (option -org) or indicate the organism in the second column of the family file (-org_fam).");
  }

  ## Upstream region limits. The organism-specific defaults are only
  ## read when an organism has been specified: with -org_fam,
  ## $organism_name is empty at this stage, and dereferencing
  ## $supported_organism{$organism_name} would autovivify an entry with
  ## an empty name in %supported_organism.
  if ($organism_name) {
    $from = $supported_organism{$organism_name}->{'up_from'}  unless defined($from);
    $to = $supported_organism{$organism_name}->{'up_to'}  unless defined($to);
  }
  $seq_length = abs($to - $from) + 1;

  ## Dyad type
  $dyad_type = "any" unless ($dyad_type);

  ## Strands
  if (defined($force{strands})) {
    $strands = $force{strands}; ### force
  } else {
    $strands = "-2str";
  }

  ## Strands for the background model
  $bg_strands = $strands;
#  $bg_strands = '-1str';


  ## Threshold on occurrence significance. Any value specified by the
  ## user (including the null string $null) is left unchanged.
  unless (defined($lth{occ_sig})) {
    $lth{occ_sig} = 0;
    &RSAT::message::Info("Lower threshold on occ_sig automatically set to 0 for oligo-analysis and dyad-analysis.")
      if ($verbose >= 1);
  }


  ################################################################
  ## MEME directory
  if ($task{meme}) {
    &check_meme_path();
  }

}


################################################################
## Define the names of the background model files.
##
## Sets $dir{bg_models}, moves to the output directory (via the main
## directory), calls &RSAT::util::CheckOutDir() on the background model
## directory, then fills %bg_model_file with one entry per oligo length
## ("oligos_<len>nt"), one for dyads ("dyads") and one for MEME
## ("meme").
sub define_bg_model_files {
  ## Directory for BG models has to be defined relative to the output directory
  $dir{bg_models} = "bg_models";
  chdir($dir{main});
  chdir($dir{output});
  &RSAT::util::CheckOutDir($dir{bg_models});
  &RSAT::message::TimeWarn("Background model directory", $dir{bg_models}) if ($main::verbose >= 1);

  ## Suffix shared by the model files of oligo-analysis and dyad-analysis
  my $model_suffix = $bg_strands.$noov.".tab";

  ## Name of model files for oligo-analysis
  foreach my $len ($min_oligo_len..$max_oligo_len) {
    my $model_key = "oligos_".$len."nt";
    $bg_model_file{$model_key} = join("", $dir{bg_models}, "/bg_", $model_key, $model_suffix);
  }

  ## Name of model files for dyad-analysis
  $bg_model_file{"dyads"} = join("", $dir{bg_models}, "/bg_dyads_", $monad_length, "nt", $model_suffix);

  ## Name of model file for MEME
  $bg_model_file{"meme"} = join("", $dir{bg_models}, "/bg_meme_mkv", $markov_order, ".txt");
}

################################################################
## Compute all the background models (oligos, dyads, MEME) from the
## set of background sequences specified with the option -bg_seq.
##
## The computation is started from the main directory; the working
## directory is set to the output directory before returning.
sub ComputeBG {
  chdir($dir{main});
  &RSAT::message::TimeWarn("Computing background model from background sequence file") if ($main::verbose >= 1);

  ## One routine per type of background model, called in this order
  my @bg_routines = (\&compute_bg_oligos, \&compute_bg_dyads, \&compute_bg_meme);
  foreach my $bg_routine (@bg_routines) {
    &{$bg_routine}();
  }

  chdir($dir{output});
}


################################################################
## Compute background models for oligo-analysis.
##
## Runs count-words on the background sequence file ($bg_seq_file),
## once per oligo length between $min_oligo_len and $max_oligo_len.
## Files whose name ends with ".gz" are uncompressed on the fly with
## gunzip. Each result is stored in the model file defined in
## %bg_model_file (key "oligos_<len>nt").
sub compute_bg_oligos {
  ## Tip: the bg sequence file is defined relative to the main
  ## directory, but the bg model files must be defined relative to the
  ## output directory.
  chdir($dir{main});

  ## The input part of the command does not depend on the oligo
  ## length. The dot of the extension is escaped, in order to match a
  ## real ".gz" suffix only.
  my $input_cmd = "";
  if ($bg_seq_file =~ /\.gz$/) {
    $input_cmd .= "gunzip -c ".$bg_seq_file. "| $count_words_cmd -v 1";
  } else {
    $input_cmd .= "$count_words_cmd -v 1 -i ".$bg_seq_file;
  }

  for my $oligo_len ($min_oligo_len..$max_oligo_len) {
    my $bg_model_file =  $bg_model_file{"oligos_".$oligo_len."nt"};
    &RSAT::message::TimeWarn("\tComputing background model for oligos", $oligo_len."nt", $bg_model_file) if ($main::verbose >= 1);
    my $cmd = $input_cmd;
    $cmd .= " -nogrouprc $bg_strands $noov ";
    $cmd .= " -l ".$oligo_len;
    $cmd .= " -o ".$dir{output}."/".$bg_model_file;
    &doit($cmd, $dry_run, $die_on_error, $verbose);
  }
  chdir($dir{output});
}

################################################################
## Compute background models for dyad-analysis.
##
## Runs dyad-analysis (-quick, returning occurrences and frequencies)
## on the background sequence file ($bg_seq_file), and stores the
## result in the model file defined in $bg_model_file{"dyads"}.
sub compute_bg_dyads {
  ## Tip: the bg sequence file is defined relative to the main
  ## directory, but the bg model files must be defined relative to the
  ## output directory.
  chdir($dir{main});
  my $bg_model_file =  $bg_model_file{"dyads"};
  &RSAT::message::TimeWarn("\tComputing background model for dyads", $bg_model_file) if ($main::verbose >= 1);
  my $cmd = "";
  $cmd .= " dyad-analysis -v 2";
  $cmd .= " -quick";
  $cmd .= " -i ".$bg_seq_file;
  $cmd .= " -return occ,freq";
  ## Use the strands of the background model ($bg_strands), for
  ## consistency with the name of the model file and with the
  ## background models computed for oligo-analysis.
  $cmd .= " -nogrouprc $bg_strands $noov ";
#  $cmd .= " -type ".$dyad_type;
  $cmd .= " -l ".$monad_length;
  #   $dyad_spacing = $min_sp."-".$max_sp;
  $cmd .= " -sp ".$dyad_spacing;
  $cmd .= " -o ".$dir{output}."/".$bg_model_file;
  &doit($cmd, $dry_run, $die_on_error, $verbose);
  chdir($dir{output});
}

################################################################
## Locate the programs required to run MEME.
##
## Sets the globals $fasta_get_markov and $meme_cmd with the paths
## returned by &RSAT::server::GetProgramPath(), which receives
## $die_on_error as second argument.
sub check_meme_path {
  my $locate_program = sub {
    my ($program_name) = @_;
    return(&RSAT::server::GetProgramPath($program_name, $die_on_error));
  };
  $fasta_get_markov = $locate_program->("fasta-get-markov");
  $meme_cmd = $locate_program->("meme");
}

################################################################
## Compute background model for MEME.
##
## Sends the background sequence file ($bg_seq_file) to
## fasta-get-markov with the Markov order $markov_order, and stores
## the result in the model file defined in $bg_model_file{"meme"}.
## Files whose name ends with ".gz" are uncompressed on the fly with
## gunzip.
sub compute_bg_meme {
  ## Tip: the bg sequence file is defined relative to the main
  ## directory, but the bg model files must be defined relative to the
  ## output directory.
  &check_meme_path();
  chdir($dir{main});
  my $bg_model_file =  $bg_model_file{"meme"};
  &RSAT::message::TimeWarn("\tComputing background model for MEME", "markov".$markov_order, $bg_model_file) if ($main::verbose >= 1);
  my $cmd = "";
  ## The dot of the extension is escaped, in order to match a real
  ## ".gz" suffix only.
  if ($bg_seq_file =~ /\.gz$/) {
    $cmd .= "gunzip -c ".$bg_seq_file;
  } else {
    $cmd .= "cat ".$bg_seq_file;
  }
  $cmd .= " | ".$fasta_get_markov;
  $cmd .= " -m ".$markov_order;
  $cmd .= " > ".$dir{output}."/".$bg_model_file;
  &doit($cmd, $dry_run, $die_on_error, $verbose);
  chdir($dir{output});
}


################################################################
## Analyze all the clusters.
##
## After the optional calibration and the optional reading of the file
## containing all the sequences, iterates over the global list
## @families. For each family, the tasks selected in %task are run in
## a fixed order: sequence retrieval, purge, pattern discovery with the
## different programs, conversion to PSSM, feature maps, cleaning and
## comparison with motif databases.
##
## The loop variable is the global $family_name, which is read by the
## routines called without argument (e.g. &DrawDyadFeatureMap()).
##
## In batch mode ($batch), the commands are collected in
## @main::batch_commands and sent as a single script per family.
sub AnalyzeFamilies {
  my $fam_count=0;
  my $fam_nb = $#families+1;

  ## Execute a command immediately, or append it to the batch script
  ## when running in batch mode.
  my $run_or_queue = sub {
    my ($cmd) = @_;
    if ($main::batch) {
      push @main::batch_commands, $cmd;
    } else {
      &doit($cmd, $dry_run, $die_on_error, $verbose);
    }
  };

  ## Scan the sequences with the count matrices built from the
  ## significant patterns, and draw the corresponding feature map.
  ## The existence of the matrix file is only tested when the commands
  ## are executed immediately: in batch or dry-run mode, the file has
  ## not been generated yet at this stage.
  my $scan_with_pssm = sub {
    my ($pssm_prefix) = @_;
    my $count_matrix_file = $pssm_prefix."_count_matrices.txt";
    if (($batch) || ($dry_run) || (-e $count_matrix_file)) {
      &ScanSequences($count_matrix_file,"tab",
		     $pssm_prefix," -uth Pval 0.00025");
      &DrawFeatureMap($pssm_prefix);
    } else {
      &RSAT::message::Warning("Matrix file does not exist. Sequence scanning skipped.", $count_matrix_file)
	if ($verbose >= 2);
    }
  };

  ### Calibrate oligonucleotides for each sequence length
  if (($task{calibrate}) ||
      ($task{calibN}) ||
      ($sequence_file_list) ||
      ($background eq "calib1") ||
      ($background eq "calibN")
     ) {
    my @calib_lengths = &CalcCalibrationLengths();
    &CalibrateOligos(@calib_lengths) if ($task{calibrate});
    &CalibrateOligosN() if ($task{calibN});
    $seq_length_calculated = 1;
  }

  ## Read all sequences from an input file and create the family-specific sequence files
  if (($task{sequences}) && ($all_seq_file)) {
    &ReadAllSequences();
  }

  foreach $family_name (@families) {
    $fam_count++;
    my @members = @{$family{$family_name}->{members}};
    $gene_nb = scalar(@members);

    ## Check minimum number of genes
    if ($gene_nb < $min_genes) {
      &RSAT::message::Warning ("Skipping family",
			       $fam_count, $family_name,
			       $gene_nb." genes",
			       "< min = ".$min_genes) if ($verbose >= 2);
      next;
    }

    ## With the option -org_fam, the family name is used as organism name
    if ($org_fam) {
      $organism_name = $family_name;
      &RSAT::message::TimeWarn("Organism name", $organism_name);
    }


    &RSAT::message::TimeWarn("Treating cluster ", $family_name," (".$fam_count."/".$fam_nb.")",
			     $gene_nb." genes", $organism_name)
      if ($verbose >= 1);

    ### retrieve upstream sequences of the family
    if ($task{sequences}) {
      if ($all_seq_file) {
	&StoreSequences();
      } else {
	&RetrieveSequences();
      }
    }

    ### retrieve upstream sequences of the family from EnsEMBL
    ## NOTE(review): CheckParameters() declares the task upstream_ensembl
    ## obsolete in favour of sequences_ensembl, but only upstream_ensembl
    ## is tested here -- confirm whether sequences_ensembl should also
    ## trigger the retrieval.
    &RetrieveSequencesEnsEMBL() if ($task{upstream_ensembl});

    ### purge upstream sequences of the family
    &PurgeSequences() if ($task{purge});

    ### oligo-analysis
    if ($task{oligos}) {
      ## This method runs oligo-analysis + merges patterns of different length + runs pattern-assembly
      &OligoAnalysis();

    } else {
      ## oligo merging can be done a posteriori with the option -task merge_oligos
      &MergeOligoLengths() if ($task{merge_oligos});

      ## pattern assembly can be done a posteriori with the option -task assemble_oligos
      &AssembleOligos() if ($task{assemble_oligos});
    }

    ## Convert oligos to PSSM
    if ($task{oligos_pssm}) {
      my $pssm_command = &MatrixFromPatterns($family{$family_name}->{oligo_assembly_file},
					     $family{$family_name}->{oligo_pssm_file}, "oligos");
      $run_or_queue->($pssm_command);
    }

    if ($task{oligo_maps}) {
      ### feature-maps of oligonucleotide occurrences
      &MatchPatterns($family{$family_name}->{oligo_file},
		     $family{$family_name}->{oligo_file},
		     " -N 4");
      &DrawFeatureMap($family{$family_name}->{oligo_file});

      ## scan sequences with the PSSM built from significant oligos
      $scan_with_pssm->($family{$family_name}->{oligo_pssm_file});
    }



    ### dyad analysis
    &DyadAnalysis() if ($task{dyads});

    ## Convert dyads to PSSM
    if ($task{dyads_pssm}) {
      my $pssm_command = &MatrixFromPatterns($family{$family_name}->{dyad_assembly_file},
					     $family{$family_name}->{dyad_pssm_file}, "dyads");
      $run_or_queue->($pssm_command);
    }

    ### dyad feature-maps
    if ($task{dyad_maps}) {
      &DrawDyadFeatureMap();

      ## scan sequences with the PSSM built from significant dyads
      $scan_with_pssm->($family{$family_name}->{dyad_pssm_file});
    }

    ### Jerry Hertz' consensus
    if ($task{consensus}) {
      &Consensus();
    }
    if ($task{consensus_maps}) {
      &ScanSequences($family{$family_name}->{consensus_file}."_count_matrices.txt","consensus",
		     $family{$family_name}->{consensus_file}," -uth Pval 0.00025");
      &DrawFeatureMap($family{$family_name}->{consensus_file});
    }

    ### Andrew Neuwald's gibbs
    &Gibbs() if ($task{gibbs});
    if ($task{gibbs_maps}) {
      &ScanSequences($family{$family_name}->{gibbs_file}."_count_matrices.txt","gibbs",
		     $family{$family_name}->{gibbs_file}," -uth Pval 0.00025");
      &DrawFeatureMap($family{$family_name}->{gibbs_file});
    }

    ### Roth's AlignACE
    &AlignACE() if ($task{AlignACE});
    if ($task{AlignACE_maps}) {
      &ScanSequences($family{$family_name}->{AlignACE_file}."_count_matrices.txt","AlignACE",
		     $family{$family_name}->{AlignACE_file}," -uth Pval 0.00025");
      &DrawFeatureMap($family{$family_name}->{AlignACE_file});
    }

    ### infogibbs
    &infogibbs() if ($task{infogibbs});
    if ($task{infogibbs_maps}) {
      &ScanSequences($family{$family_name}->{infogibbs_file}."_count_matrices.txt","infogibbs",
		     $family{$family_name}->{infogibbs_file}," -uth Pval 0.00025");
      &DrawFeatureMap($family{$family_name}->{infogibbs_file});
    }

    ### Gert Thijs' MotifSampler
    &MotifSampler() if ($task{MotifSampler});

    ### MEME
    &MEME() if ($task{meme});
    if ($task{meme_maps}) {
      &ScanSequences($family{$family_name}->{meme_file}."_count_matrices.txt","meme",
		     $family{$family_name}->{meme_file}," -uth Pval 0.00025");
      &DrawFeatureMap($family{$family_name}->{meme_file});
    }

    ### orm
    &orm() if ($task{orm});

    ## Convert orm assembly to PSSM
    if ($task{orm_pssm}) {
      my $pssm_command = &MatrixFromPatterns($family{$family_name}->{orm_assembly_file},
					     $family{$family_name}->{orm_pssm_file}, "orm");
      $run_or_queue->($pssm_command);
    }


    ### ORM feature-maps
    if ($task{orm_maps}) {
      &MatchPatterns($family{$family_name}->{orm_file},
		     $family{$family_name}->{orm_file},
		     " -N 4");
      &DrawFeatureMap($family{$family_name}->{orm_file});
    }

    ### delete the sequence and feature files of the family
    if ($task{clean}) {
      &RSAT::message::TimeWarn("Cleaning files from the hard drive for family $family_name") if ($verbose >= 2);
      foreach my $file ($family{$family_name}->{seq_file},
			$family{$family_name}->{seq_file_purged},
			$family{$family_name}->{merged_ft_file},
			$family{$family_name}->{orm_ft_file},
			$family{$family_name}->{dyad_ft_file}
		       ) {
	&RSAT::message::Info("\t$file") if ($verbose >=2);
	$run_or_queue->("rm -f $file");
      }
    }

    ## Compare discovered motifs with motif databases
    if ($task{motifs_vs_db}) {
      my @matrix_types = ("oligo", "dyad", "orm");
      foreach my $matrix_type (@matrix_types) {
	my $matrix_prefix = $family{$family_name}->{$matrix_type."_pssm_file"};
	my $matrix_file = $matrix_prefix."_count_matrices.tf";
	if (-e $matrix_file) {
	  my $compa_prefix = $matrix_prefix."_vs_jaspar";
	  &MotifsVsDB($matrix_file, "transfac", $ENV{RSAT}."/public_html/motif_databases/JASPAR/jaspar_matrices.tf", "transfac", $compa_prefix);
	} else {
	  &RSAT::message::Warning("Motif versus DB", $matrix_type, "Matrix file does not exist", $matrix_file);
	}
      }
    }

    ## Send the batch script to the queue
    if (($batch) && (scalar(@batch_commands) > 0)) {
      my $batch_script = join ";", @main::batch_commands;
      &doit($batch_script, $dry_run, $die_on_error, $verbose, $batch, $family_name);
      @main::batch_commands = ();
    }
  }
}

################################################################
## Execute a command, with optional echo
#sub doit {
#    my ($command) = @_;
#    print "\n$command\n" if ($verbose >= 2);
#    system $command unless $dry_run;
#}

# ## Skip initial clusters if specified with -skip option
# sub CheckSkip {
#     my ($fam_count,$family_name) = @_;
#     if ($fam_count <= $skip) {
# 	warn join ("\t", "; Skipping family",
# 		   $fam_count, $family_name,
# 		       $gene_nb." genes",
# 		   "\t(skip ".$skip. ")",
# 		   ), "\n" if ($verbose >= 1);
# 	next;
#     }
# }

# ## Stop after a few clusters
# sub CheckLast {
#     my ($fam_count) = @_;
#     if (($last >0) && ($fam_count > $last)) {
# 	&RSAT::message::Warning("Stopped after $last genes");
# 	last;
#     }
# }

########################## subroutine definition ############################

sub PrintHelp {
#### display full help message #####
   $HELP_FAMILY_FILE = &help_message("class file");
   open HELP, "| more";
   print HELP <<End_of_help;
NAME
	gene-cluster-motifs

VERSION
       $program_version

AUTHOR
       Since 1999 by Jacques van Helden (Jacques.van-Helden\@univ-amu.fr)
       Since 2004 Olivier Sand (oly\@bigre.ulb.ac.be)

USAGE
       gene-cluster-motifs [-i inputfile] [-v]

DESCRIPTION
	Runs a combination of programs in order to discover significant
	patterns in the upstream regions of several gene clusters.

	This script is a way to automate the systematic analysis of
	multiple clusters of co-regulated genes, such as those
	obtained with DNA chip experiments.

	The program reads the composition of several clusters in a
	single text file, then executes different pattern discovery
	programs, and generates a summary report.

	Results are subdivided in directories created on the fly, one
	directory per family plus one summary directory.

CATEGORY
	sequences
	motif discovery

OPTIONS
	-h	(must be first argument) display full help message

	-help	(must be first argument) display options

	-v	verbose

	-i	family file (incompatible with -seq)

	-mask upper|lower
		Mask lower or uppercases, respectively, i.e. replace
		selected case by N characters.

	-seq_list
	        List of input sequences files (incompatible with -i)
		This option allows to specify a file containing a list
		of sequence files. The first word of each line must
		specify the path of a sequence file. Each sequence
		file must be in fasta format.

	-all_seq
		A single file containing all the sequences.
		This option is convenient to analyze for example
		results of ChIP-chip experiments, where there is one
		particular sequence associated to each probe.
		Mutually incompatible with -seq_list.

	-all_seq_format
		Input sequence format. This option is useful only in
		combination with the option -all_seq.

	-bg_seq

	        Specify a file containing a set of sequences used to
	        compute background models.  Background sequences must
	        be formatted in fasta.

	        Background models are only computed if the option
	        -task compute_bg is called.

	-skip # skip the first # data sets when performing the
		analyses (this is useful when the program has been
		interrupted after a considerable amount of work).

	-last # stop after the first # data sets when performing the
		analyses (this is useful to perform quick tests).

	-select	fam1[,fam2,fam3,...]
		Perform the analysis of selected clusters only.

	-maindir
		main directory (by default, the working directory is used)

	-outdir	output directory

	-mingenes #
		minimal number of genes per family. Clusters with less
		than this number are skipped.

	-maxgenes #
		maximal number of genes per family. Clusters with more
		than this number are skipped.

   Sequence retrieval options
	-org	organism


	-org_fam
		When this option is used, the name of the family
		(second column of the family file) is used as
		organism. This allows to automatically perform a
		genome-per-genome analysis of the orthologs of a
		regulon from some model organism.

		Typically, the input for this option is obtained from
		the program get-orthologs: starting from a regulon
		from some model organism (e.g. Saccharomyces
		cerevisiae), one wants to discover motifs in the
		corresponding orthologous genes in other species
		(e.g. each other species of Fungi).

		This option is incompatible with -org.

	-taxon	taxon
		If this option is specified, sequences are retrieved
		for all the orthologs of the input genes in the
		selected taxon. This approach relies on the detection
		of phylogenetic footprints (conserved elements in
		non-coding sequences). If the taxon is chosen in an
		appropriate way, it increases the sensitivity of the
		analysis.

	-noorf	exclude upstream ORFs from upstream sequences

	-rm	use repeat masked version  of the genome

	-orfov  do not exclude upstream ORFs from upstream sequences

	-from	upstream region left limit

	-to	upstream region right limit

       -feattype
		feature type (e.g. CDS, mRNA)

	-seq_type
	       sequence type (upstream, downstream, ORF)

        -size_names
                Use the old naming system (obsolete, only for backward
                compatibility). In the old naming system, file names
                included the upstream size. New names are more
                precise, indicating the -from and -to parameters. This
                avoids ambiguity in some cases.


   Sequence purging option

   	For motif discovery, it is essential to purge sequences
   	i.e. to mask redundant fragments. Such redundant fragments can
   	come from genomic repeats (e.g. in duplicated genes in
   	telomeric regions), or from neighbour genes sharing the same
   	promoter, or , when working with multi-taxa sequences, the
   	fact that several promoters were retrieved from closely
   	related species (e.g. various strains of E.coli). These
   	repeats give a strong bias on the statistical estimation of
   	motif over-representation, and thus lead to a large number
   	of false positive motifs.

	The approach followed here is to mask repeats (replace them
	with N characters) during the motif discovery step, but use
	the unmasked sequences for pattern matching, in order to
	locate all the putative instances of the discovered motifs.

	-purge	use purged sequences for motif discovery
		(default)

	-nopurge
		use non-purged sequences for motif discovery

        -purge_ml #
		matching length above which repeats have to be masked

        -purge_mis #
		number of accepted mismatches to consider two sequence
		segments as repeats to be masked.

   oligo-analysis and dyad-analysis options

   	-two_tails
		Perform a two-tails test for oligo-analysis and
		dyad-analysis. The two-tails test detects both
		under-represented and over-represented motifs.

	-1str   strand-sensitive analysis

	-2str   strand-insensitive analysis

	-htmaps	draw html maps (dynamic map with pointers to the features)
		These html files take space on the hard drive, so I
		suppressed them from the default output.

	-noov	prevent overlapping matches for self-overlapping patterns
		(default)

	-ovlp	allow overlapping matches for self-overlapping patterns

	-nomap	do not draw feature maps (for saving time and hard disk space)

	-bg	background frequency model
		This parameter indicates which background model has to
		be used for the estimation of expected pattern
		frequencies.
		Supported: $supported_background

	-calib_dir

		Directory containing the calibration files for the
		negative binomial. The calibration file name is then
		calculated automatically from the conditions (seq
		lengths, seq number, ...)

	-calibN_repet #
		Number of repetitions for the calibration.

	-thosig #
		Threshold on occurrence significance.
		(obsolete: use -lth occ_sig instead)

	-lth param value
		Lower threshold on some parameter. All patterns with a
		parameter value smaller than the threshold are
		discarded.
		Supported parameters: $supported_thresholds
	      	Example: select patterns with a positive value for the
	      	occurrence significance.

			 -lth occ_sig 0

	-uth param value
		Upper threshold on some parameter. All patterns with a
		parameter value higher than the threshold are
		discarded.
		Supported parameters: $supported_thresholds
		Example: to select no more than 50 patterns
		        -uth rank 50

	-toppat #
		(default $toppat)
		Maximal number of patterns to take into account for
		the assembly. This parameter is passed to the command
		pattern-assembly.

   oligo-analysis specific options
	-thmsf threshold on frequency of matching sequences (proportion
	        of sequences with at least one occurrence of the
	        pattern)

	-thmssig threshold on the significance of matching sequences
	        (sequences with at least one occurrence of the
	        pattern).

	-maxol	maximum oligonucleotide length

	-minol	minimum oligonucleotide length

	-oligo_exp_freq
		name of a file containing the expected frequencies for
		oligo-analysis.

	-pseudo	pseudo-weight (see oligo-analysis manual)

	-markov #
		calculate expected frequencies on the basis of a
		Markov chain model of order #

   dyad-analysis specific options
	-monad  monad length for dyad-analysis

	-minsp	minimum spacing for dyad-analysis

	-maxsp	maximum spacing for dyad-analysis

	-dyad_exp_freq
		name of a file containing the expected frequencies for
		dyad-analysis.

   feature-map options
   	-origin	 #
		Origin for pattern matching positions, that will
		determine the way patterns are displayed on the
		feature-maps (default: -0, i.e. negative coordinates).
		This parameter is passed to dna-pattern and matrix-scan.

	-scalestep
		Distance between the vertical ticks on the scale bar
		of the feature maps.
		This parameter is passed to feature-map.

   orm-specific options

	orm is a motif discovery program developed by Matthieu
	Defrance. It should be installed in the contribution dir:
		    $RSAT/python-scripts/orm

	orm.py detects windows containing over-represented words, by
	comparison with the background model. More information can be found
	in the ORM documentation.

   Matrix-based motif discovery options (consensus, gibbs, AlignACE, meme, MotifSampler, infogibbs)
       -width  matrix width (default $matrix_width)
       -sps	expected number of sites per sequence
		(default $expected_sites_per_seq)
       -nmotifs number of motifs to discover per sequence set (family)

   Gibbs sampler (Neuwald, 1995)
	-seed	 seed number for the random generator

   AlignACE (Roth, 1998)
	-seed	 seed number for the random generator

   MotifSampler (Thijs, 2001)
	-MS_b	background file
	-MS_p	prior probability of 1 motif copy (default $MS_p)
	-MS_M   Maximal number of motif instances per sequence. (default $MS_M; unset=0)
	-MS_n	number of different motifs to search for (default $MS_n).
	-MS_x	allowed overlap between different motifs. (default $MS_x)
	-MS_r	number of times the MotifSampler should be repeated (default = $MS_r).
	Note	-MS_w is replaced by -width

   MEME options (Bailey)
	Any option starting with -MEME_ will be passed to MEME.
	In particular, the following options are passed by default.
   	-MEME_minw	minimum motif width (default $MEME_minw)
   	-MEME_maxw	maximum motif width (default $MEME_maxw)
	-MEME_bfile     name of background Markov model file

   info-gibbs options (Defrance, 2008)
	Any option starting with -infogibbs_ will be passed to info-gibbs.
	The list of supported options can be obtained with the command
	    info-gibbs -h

   Output options
	-transp	transpose synthetic result table (columns become rows)
	-sort sort_key
	        sort key for the synthetic table
		Supported: $supported_sort_keys

  Options for the HTML synthesis table
    -fam_link_pref prefix
    -fam_link_suff suffix
       prefix and suffix to put before and after family names in
       order to build a specific HTML hyperlink for each family name.

       Example (with prefix only):
         -fam_link_pref 'http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=GO:'
	will replace the family name 0003997 by a link of the type
	  <a href=http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=GO:0003997>0003997</a>

    -gene_link_pref prefix
    -gene_link_suff suffix
       prefix and suffix to put before and after gene names in
       order to build a specific HTML hyperlink for each gene name.

       Example (with prefix only):
         -gene_link_pref 'http://db.yeastgenome.org/cgi-bin/locus.pl?locus='
	   will create a link from the gene YBR093C to the URL
	   http://db.yeastgenome.org/cgi-bin/locus.pl?locus=YBR093C

  Options for the automatically generated SQL scripts
	-schema database schema (default: $schema)
	-host	database host (default: $host)
	-user	database user (default: $user)
	-password
		database password (default: $password)

   Other options
	-task selected_task
		Select the tasks to be performed.
		Supported tasks: $supported_tasks

		Can be used iteratively on the same command line to
		select multiple tasks.

		Example:
		    -task upstream -task oligos -task synthesis
		For a full analysis, simply type
		    -task all

		See details below (TASK DESCRIPTION)

	-known	known_site_file
		A file containing a list of known sites/motifs for
		each of the gene clusters. These known sites are NOT
		used during the motif discovery. They are simply
		displayed besides the discovered patterns, for the
		sake of comparison.

		Format: the known site file is a tab-delimited text
		file, with the following column content:
		       1) site/motif sequence
		       2) family
		       3) source (optional)

		Example
		=======
		; sequence	family	source
		gGGAAAaTGAAACT	ISGF-3	TRANSFAC:R00001
		AGGAAATAGAAACT	ISGF-3	TRANSFAC:R00003
		ggagGGGCGGccct	Sp1	TRANSFAC:R14218
		...

    	-known_max_len #
		maximal length for using known sites. This allows to
		filter out some sites with large sequences annotated in
		TRANSFAC (default: $known_site_max_len)

        -db     database_file
                A file containing a list of known sites (from a database)

	-batch (advanced users)
		Send time-consuming jobs on a batch queue which will
		be executed on a PC cluster. This option only works on
		our lab cluster, but could be adapted for other
		configurations by adapting the method &doit() in the
		utilities ($RSAT/lib/RSA.lib).

	-nodie
		Continue the analysis even if errors are raised by
		sub-processes. By default, gene-cluster-motifs
		stops if an error has occurred in one of the
		sub-programs. However, in some particular cases the user
		might want to ignore such errors.

		In particular, this option is useful to circumvent a
		problem encountered with the gibbs sampler under some
		operating systems (Linux but not Mac OSX): the gibbs
		program returns an error code to the system even when
		the motif discovery was successful.

FILE FORMAT
   $HELP_FAMILY_FILE

QUICK START
   1) create a directory for the set of clusters.

   2) within this directory, create a text file containing the
      description of family composition, as described above.

   3) run the command. For example, with a bacterial genome :

      gene-cluster-motifs -org organism -i family_file \
          -v -from -200 -to 49 -task all

   4) With a web browser (e.g. Netscape), open the folder
      'synthetic_tables'. There should be a HTML file with the name
      of your family file as prefix, and the suffix indicating the
      other analysis parameters. Open this file and analyze the
      result.

TASK DESCRIPTION

   all		Perform all the tasks below.

   upstream	Retrieve upstream sequences for the clusters.
		(retrieve-seq)

   purge	Purge upstream sequences for the family.
		(purge-sequences)

   compute_bg 	Compute background models from a set of background
      		sequences. Background sequences must be specified with
      		the option -bg_seq.

   calibrate
   		Perform a word-specific calibration of expected
		frequencies, by calculating oligonucleotide
		distributions in the whole genome set of upstream
		sequences of the same size.

		Beware: this takes time, but, for higher organisms, it
		gives much less false positives than the standard
		background models.

   calibN
		Perform a family-specific calibration of expected
		oligonucleotide frequencies by calculating occurrences
		in N random selections of genes.

		Beware: this takes a lot of time, but it strongly
		reduces the rate of false positives for higher
		organisms. We did not yet measure the rate of true
		positives remaining after this calibration.

   oligos	Run oligo-analysis on each family.

   dyads	Run dyad-analysis on each family.

   maps	Draw feature maps with the results of oligo-analysis and
		dyad-analysis.
		(feature-map)

   merge	Merge the results of oligo-analysis and dyad-analysis.

   slide	Match discovered patterns with a sliding window.
		(dna-pattern)

   synthesis	Generate a HTML report in the directory synthetic_tables.

   sql		Export the discovered patterns in tab-delimited files,
		and export SQL scripts for loading these tables in a
		relational database.

   clean	Delete upstream sequences after processing, in order
		to save disk space.

   validate	Validate discovered patterns by comparing them to the list
   		of known sites. This requires to specify a collection
   		of known sites with the option -known.

   db_match     Compare discovered patterns to a list of known sites
                (for instance a whole database). This requires to specify
                the list of known sites with the option -db.

WISH LIST

  -bg_seq must still be adapted for several programs
  	  dyad-analysis
	  MotifSampler
	  info-gibbs (same bg format as MotifSampler)

End_of_help
 close HELP;
 exit;
}

################################################################
#### Read arguments
##
## Parse the command line (@ARGV) and store each option value in the
## corresponding package-level variable or hash (%dir, %force, %task,
## %lth, %uth, %MEME_options, %infogibbs_options, ...).
##
## Every position of @ARGV is inspected in turn. When an element is a
## recognized option name, its value is read from the following
## element ($ARGV[$a+1]; -lth and -uth also read $ARGV[$a+2]). No
## element is consumed: a value which itself looks like an option name
## is thus also interpreted as an option. Elements matching no option
## are silently ignored.
##
## Several options are matched by prefix (regular expression) rather
## than by exact name, so the order of the elsif branches matters.
##
## Takes no argument. Invalid values are reported through
## &RSAT::error::FatalError() or &FatalError(). The options -h and
## -help delegate to &PrintHelp() and &PrintOptions().
sub ReadArguments {

  foreach my $a (0..$#ARGV) {
    ### verbose ###
    if ($ARGV[$a] eq "-v") {
      $verbose = $ARGV[$a+1];
      ## -v without a valid level means "verbosity 1"
      unless (&IsNatural($verbose)) {
        $verbose = 1;
      }

      #### dry run
    } elsif ($ARGV[$a] eq "-n") {
      $dry_run = 1;

      #### Sequence purging options
    } elsif ($ARGV[$a] eq "-nopurge") {
      $analyze_purged_sequences = 0;

    } elsif ($ARGV[$a] eq "-purge") {
      $analyze_purged_sequences = 1;

      ## -purge_ml and -purge_mis take a numeric value (see the help
      ## message): store the value rather than a constant flag.
    } elsif ($ARGV[$a] eq "-purge_ml") {
      $purge_ml = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-purge_mis") {
      $purge_mis = $ARGV[$a+1];

      ### detailed help
    } elsif ($ARGV[$a] eq "-h") {
      &PrintHelp();

      ### list of options
    } elsif ($ARGV[$a] eq "-help") {
      &PrintOptions();

      ### family file
    } elsif (($ARGV[$a] eq "-i") ||
             ($ARGV[$a] eq "-fam") # for compatibility with previous versions
            ) {
      $family_file = $ARGV[$a+1];

      ### skip some data sets
    } elsif ($ARGV[$a] eq "-skip") {
      &FatalError("The option -skip is incompatible with the option -select") if (scalar(@selected) > 0);
      $skip = $ARGV[$a+1];
      &RSAT::error::FatalError("Invalid number with option -skip\t$skip") unless &IsNatural($skip);

      ### sequence file list
    } elsif ($ARGV[$a] eq "-seq_list") {
      $sequence_file_list = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-seq") {
      &RSAT::message::Warning("Option -seq is obsolete, you should use the option -seq_list.");
      $sequence_file_list = $ARGV[$a+1];

      ### Single file with all sequences
    } elsif ($ARGV[$a] eq "-all_seq") {
      $all_seq_file = $ARGV[$a+1];

      ### input sequence format
    } elsif ($ARGV[$a] eq "-all_seq_format") {
      $all_seq_format = $ARGV[$a+1];

      ### File containing sequences used to estimate the background model
    } elsif ($ARGV[$a] eq "-bg_seq") {
      $bg_seq_file = $ARGV[$a+1];
      &RSAT::error::FatalError("Background sequence file does not exists", $bg_seq_file) unless (-e $bg_seq_file);

      ### stop after some data sets
    } elsif ($ARGV[$a] eq "-last") {
      &FatalError("The option -last is incompatible with the option -select") if (scalar(@selected) > 0);
      $last = $ARGV[$a+1];
      &RSAT::error::FatalError("Invalid number with option -last\t$last") unless &IsNatural($last);

      ### selected data sets (comma-separated list)
    } elsif ($ARGV[$a] eq "-select") {
      &FatalError("The option -select is incompatible with the option -last") if ($last > 0);
      &FatalError("The option -select is incompatible with the option -skip") if ($skip > 0);
      push (@selected, (split ",", $ARGV[$a+1]));

      ## mask
    } elsif ($ARGV[$a] eq "-mask") {
      $mask = $ARGV[$a+1];
      &CheckMask($mask);

      ### main directory
    } elsif ($ARGV[$a] eq "-maindir") {
      $dir{main} = $ARGV[$a+1];

      ### output directory
    } elsif ($ARGV[$a] eq "-outdir") {
      $dir{output} = $ARGV[$a+1];

      ### min number of genes
    } elsif ($ARGV[$a] eq "-mingenes") {
      $min_genes = $ARGV[$a+1];

      ### max number of genes
    } elsif ($ARGV[$a] eq "-maxgenes") {
      $max_genes = $ARGV[$a+1];

      ### max number of patterns to assemble
    } elsif ($ARGV[$a] eq "-toppat") {
      $toppat = $ARGV[$a+1];

      ### upstream sequence limits
      ### (ignored when the following argument is not an integer)
    } elsif (($ARGV[$a] eq "-from") &&
             (&IsInteger($ARGV[$a+1]))) {
      $from = $ARGV[$a+1];
    } elsif (($ARGV[$a] eq "-to") &&
             (&IsInteger($ARGV[$a+1]))) {
      $to = $ARGV[$a+1];

      ### strands
    } elsif ($ARGV[$a] eq "-1str") {
      $force{strands} = "-1str";
    } elsif ($ARGV[$a] eq "-2str") {
      $force{strands} = "-2str";

      ### oligo-analysis parameters
    } elsif ($ARGV[$a] eq "-minol") {
      $min_oligo_len = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-maxol") {
      $max_oligo_len = $ARGV[$a+1];

      ### dyad-analysis parameters
    } elsif ($ARGV[$a] eq "-monad") {
      $monad_length = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-minsp") {
      $min_sp = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-maxsp") {
      $max_sp = $ARGV[$a+1];

      ## feature-map parameters
    } elsif ($ARGV[$a] eq "-origin") {
      $map_origin = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-scalestep") {
      $scalestep = $ARGV[$a+1];

      ### matrix-based motif discovery
    } elsif ($ARGV[$a] eq "-width") {
      $matrix_width = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-sps") {
      $expected_sites_per_seq = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-nmotifs") {
      ## The number of motifs is propagated to MEME and MotifSampler
      $nmotifs = $ARGV[$a+1];
      $MEME_options{nmotifs} = $nmotifs;
      $MS_n = $nmotifs;

      ### Gibbs sampler + AlignACE + infogibbs seed option
    } elsif ($ARGV[$a] eq "-seed") {
      $seed = $ARGV[$a+1];

      ### MotifSampler options
      ### The option name (without the leading dash) is used as the name
      ### of the package variable receiving the value (symbolic
      ### reference). The pattern is anchored so that it only matches
      ### option names, not option values containing "-MS_".
    } elsif ($ARGV[$a] =~ /^-(MS_\S+)/) {
      $$1 =  $ARGV[$a+1];

      ### MEME options
      ### @MEME_options keeps the order of first appearance of each
      ### option, %MEME_options its last value.
    } elsif ($ARGV[$a] =~ /^-MEME_(\S+)/) {
      my $option = $1;
      my $value =  $ARGV[$a+1];
      push @MEME_options, $option unless (defined($MEME_options{$option}));
      $MEME_options{$option} = $value;

      ### info-gibbs options
    } elsif ($ARGV[$a] =~ /^-infogibbs_(\S+)/) {
      my $option = $1;
      my $value =  $ARGV[$a+1];

      ## Replace long option name by short option name if defined
      $option =~ s/^verbosity$/v/;
      $option =~ s/^length$/l/;
      $option =~ s/^strand$/s/;
      $option =~ s/^iter$/n/;
      $option =~ s/^words$/w/;
      $option =~ s/^expected$/e/;
      $option =~ s/^motifs$/m/;
      #      $option =~ s/^bgfile$/b/;
      $option =~ s/^dmin$/d/;

      ## Avoid redundant parameters
      if ($option eq "s") {
        &RSAT::error::FatalError("info-gibbs option: Strand should be defined with options -2str|-1str");
      } elsif ($option eq "i") {
        &RSAT::error::FatalError("info-gibbs option: Input file is automaticaly specified by gene-cluster-motifs");
      } elsif ($option eq "l") {
        &RSAT::error::FatalError("info-gibbs option: Matrix width should be defined with option -width");
      } elsif ($option eq "m") {
        &RSAT::error::FatalError("info-gibbs option: Number of motifs should be defined with option -nmotifs");
      } elsif ($option eq "e") {
        &RSAT::error::FatalError("info-gibbs option: Expected number of sites per sequence should be defined with option -sps");
      }
      push @infogibbs_options, $option unless (defined($infogibbs_options{$option}));
      $infogibbs_options{$option} = $value;

      ### ORM options (stored by symbolic reference, like -MS_ options)
      ### Note: a second "-infogibbs_" branch used to follow this one;
      ### it could never be reached because the branch above already
      ### captures every argument matching -infogibbs_*.
    } elsif ($ARGV[$a] =~ /^-(ORM_\S+)/) {
      $$1 =  $ARGV[$a+1];

      ### organism
    } elsif ($ARGV[$a] eq "-org") {
      $organism_name = $ARGV[$a+1];

      ### organisms are specified as family names
    } elsif ($ARGV[$a] eq "-org_fam") {
      $org_fam = 1;

      ### Old naming system, maintained for backward compatibility
    } elsif ($ARGV[$a] eq "-size_names") {
      $size_names = 1;

      ### taxon
    } elsif ($ARGV[$a] eq "-taxon") {
      $taxon = $ARGV[$a+1];

      ### file with a collection of known motifs, for comparison between discovered and known motifs
    } elsif ($ARGV[$a] eq "-known") {
      $known_site_file = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-known_max_len") {
      $known_site_max_len = $ARGV[$a+1];

      ### file with a list of known sites from a database
    } elsif ($ARGV[$a] eq "-db") {
      $db_site_file = $ARGV[$a+1];

      ### name associated with the -db file (presumably the database name)
    } elsif ($ARGV[$a] eq "-dbname") {
      $db_site_name = $ARGV[$a+1];

      #### task selection (comma-separated list, option can be repeated)
    } elsif ($ARGV[$a] eq "-task") {
      my @requested_tasks = split ",", $ARGV[$a+1];
      foreach my $task (@requested_tasks) {
        next unless $task;
        if ($supported_task{$task}) {
          $task{$task} = 1;
        } else {
          &RSAT::error::FatalError("Unsupported task '$task'. \n\tSupported: $supported_tasks");
        }
      }

      #### transpose synthetic table
    } elsif ($ARGV[$a] =~ /^-transp/) {
      $transpose = 1;

      #### family link prefix
    } elsif ($ARGV[$a] =~ /^-fam_link_pref/) {
      $fam_link_prefix = $ARGV[$a+1];
      #### family link suffix
    } elsif ($ARGV[$a] =~ /^-fam_link_suff/) {
      $fam_link_suffix = $ARGV[$a+1];

      #### gene link prefix
    } elsif ($ARGV[$a] =~ /^-gene_link_pref/) {
      $gene_link_prefix = $ARGV[$a+1];
      #### gene link suffix
    } elsif ($ARGV[$a] =~ /^-gene_link_suff/) {
      $gene_link_suffix = $ARGV[$a+1];

      #### sort key
    } elsif ($ARGV[$a] =~ /^-sort/) {
      $sort_key = $ARGV[$a+1];
      unless ($supported_sort_key{$sort_key}) {
        &RSAT::error::FatalError("$sort_key is not supported as sort key");
      }

      #### clip upstream ORFs from upstream regions
    } elsif ($ARGV[$a] =~ /^-noorf/) {
      $noorf = "-noorf";

      #### use repeat masked version of the genome
    } elsif ($ARGV[$a] eq "-rm") {
      $repeat_masked = 1;

      #### do not clip upstream ORFs from upstream regions
    } elsif ($ARGV[$a] =~ /^-orfov/) {
      $noorf = "";

      #### Threshold on relative weight for compare-patterns
    } elsif ($ARGV[$a] eq '-rel_w') {
      $rel_w = $ARGV[$a+1];
      &FatalError("Relative weight should be a real value comprised between 0 and 1") unless ((&IsReal($rel_w)) && ($rel_w >= 0) && ($rel_w <= 1));

      ### Lower threshold (two values: field name, then threshold)
    } elsif ($ARGV[$a] eq "-lth") {
      my $thr_field = $ARGV[$a+1];
      my $thr_value =  $ARGV[$a+2];
      unless ($supported_threshold{$thr_field}) {
        &RSAT::error::FatalError("Invalid threshold field $thr_field. Supported: $supported_thresholds");
      }
      $lth{$thr_field} = $thr_value;
      #	    &RSAT::message::Debug("Lower threshold", $thr_field, $thr_value, $lth{$thr_field});

      ### Upper threshold (two values: field name, then threshold)
    } elsif ($ARGV[$a] eq "-uth") {
      my $thr_field = $ARGV[$a+1];
      my $thr_value =  $ARGV[$a+2];
      unless ($supported_threshold{$thr_field}) {
        &RSAT::error::FatalError("Invalid threshold field $thr_field. Supported: $supported_thresholds");
      }
      $uth{$thr_field} = $thr_value;

      #### threshold
    } elsif ($ARGV[$a] =~ /^-thosig/) {
      &RSAT::error::FatalError("Option -thosig is deprecated. Please use '-lth occ_sig' instead.");

      #### threshold on mseq frequency
    } elsif ($ARGV[$a] =~ /^-thmsf/) {
      &RSAT::error::FatalError("Option -thmsf is deprecated. Please use '-lth ms_freq' instead.");
#      $lth{ms_freq} = $ARGV[$a+1];

      #### threshold on mseq significance
    } elsif ($ARGV[$a] =~ /^-thmssig/) {
      &RSAT::error::FatalError("Option -thmssig is deprecated. Please use '-lth ms_sig' instead.");
#      $lth{ms_sig} = $ARGV[$a+1];

      #### dyad type
    } elsif ($ARGV[$a] =~ /^-type/) {
      $force{type} = $ARGV[$a+1];

      #### feature types (comma-separated list, case-insensitive)
    } elsif ($ARGV[$a] eq "-feattype") {
      my @types = split ",", $ARGV[$a+1];
      foreach my $type (@types) {
        if ($supported_feature_types{lc($type)}) {
          $accepted_feature_types{lc($type)}++;
        } else {
          &RSAT::error::FatalError("$type invalid feature type. Supported: $supported_feature_types");
        }
      }

      #### sequence types
    } elsif ($ARGV[$a] eq "-seq_type") {
      $seq_type = $ARGV[$a+1];

      #### expected frequency file for dyad-analysis
    } elsif ($ARGV[$a] =~ /^-dyad_exp_freq/) {
      $exp_freq_file{dyads} = $ARGV[$a+1];

      #### expected frequency file for oligo-analysis
    } elsif ($ARGV[$a] =~ /^-oligo_exp_freq/) {
      $exp_freq_file{oligos} = $ARGV[$a+1];

      #### Calibration directory (same directory for both calibration modes)
    } elsif ($ARGV[$a] =~ /^-calib_dir/) {
      $dir{calib1} = $ARGV[$a+1];
      $dir{calibN} = $ARGV[$a+1];

      #### Number of repetitions for the calibration
    } elsif ($ARGV[$a] =~ /^-calibN_repet/) {
      $calibN_repet = $ARGV[$a+1];
      &FatalError("Invalid number of repetitions for the calibrations: should be at least 2, and a much larger value (e.g.100) is highly recommended.")
        unless ((&IsNatural($calibN_repet)) && ($calibN_repet >= 2));

      #### background model for expected frequency.
      #### Supported: "upstream", "upstream-noorf", "intergenic", "mncf", "upstream-rm", "upstream-noorf-rm"
      #### Note: -bg_seq is handled by an earlier branch, so it never
      #### reaches this prefix match.
    } elsif (($ARGV[$a] =~ /^-exp/)
             || ($ARGV[$a] =~ /^-bg/)) {
      $background = $ARGV[$a+1];
      $background =~ s/^ncf$/intergenic/;

      unless ($supported_background{$background}) {
        &RSAT::error::FatalError("$background\tInvalid value for background. Supported : $supported_background");
      }

      #### Markov chain estimation of expected oligo frequencies
    } elsif ($ARGV[$a] =~ /^-markov/) {
      $markov_order = $ARGV[$a+1];
      $markov=1;

      #### prevent self-overlap
    } elsif ($ARGV[$a] eq "-noov") {
      $noov = "-noov";

      #### two-tail test for oligo-analysis and dyad-analysis
    } elsif ($ARGV[$a] eq "-two_tails") {
      $two_tails = 1;

      #### allow self-overlap
    } elsif ($ARGV[$a] eq "-ovlp") {
      $noov = "-ovlp";

      #### prevent feature-map drawing
    } elsif ($ARGV[$a] eq "-nomap") {
      $draw_maps = 0;

      #### draw htmaps
    } elsif ($ARGV[$a] =~ /^-htmap/) {
      $draw_maps = 1;
      $htmaps = 1;

      #### pseudo-frequency for oligoanalysis
    } elsif ($ARGV[$a] =~ /^-oligo_pseudo/) {
      $oligo_pseudo = $ARGV[$a+1];
      unless ((&IsReal($oligo_pseudo)) && ($oligo_pseudo > 0)) {
        &RSAT::error::FatalError("pseudo-weight must be a positive real number");
      }

      #### pseudo-weight for matrix conversion
    } elsif ($ARGV[$a] =~ /^-matrix_pseudo/) {
      $matrix_pseudo = $ARGV[$a+1];
      unless ((&IsReal($matrix_pseudo)) && ($matrix_pseudo > 0)) {
        &RSAT::error::FatalError("pseudo-weight must be a positive real number");
      }

      #### Options for the SQL scripts
    } elsif ($ARGV[$a] eq "-schema") {
      $schema = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-host") {
      $host = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-user") {
      $user = $ARGV[$a+1];
    } elsif ($ARGV[$a] eq "-password") {
      $password = $ARGV[$a+1];

      #### batch mode
    } elsif ($ARGV[$a] eq "-pc_cluster") {
      &RSAT::message::Warning("Option -pc_cluster is obsolete, use option -batch instead.");
      $batch = 1;

    } elsif ($ARGV[$a] eq "-batch") {
      $batch = 1;

      #### do not stop when a sub-process raises an error
    } elsif ($ARGV[$a] eq "-nodie") {
      $die_on_error = 0;

    }
  }
}



################################################################
# Verbosity
#
# Build a textual report of the analysis parameters, send it to STDERR
# (with warn) and return it as a string.
#
# The program version, the number of clusters and the per-family lines
# are printed directly with printf (on the currently selected output
# handle); they are not part of the returned string.
#
# Reads the package variables set by the argument parser ($family_file,
# $sequence_file_list, $organism_name, %task, $bg_seq_file,
# %bg_model_file, $background, $markov, $noorf, $noov, ...).
#
# Returns: the report string (each line starts with ";").
sub Verbose {
  my $verbose_message = "";
  $verbose_message .= "; gene-cluster-motifs ";
  $verbose_message .= &PrintArguments()."\n";
  printf "; %-22s\t%s\n", "Program version", $program_version;
  printf "; clusters     \t%d\n", $#families + 1;
  my $f = 0;
  foreach my $family_name (@families) {
    $f++;
    printf ";\tfamily\t%d\t%s\t%d elements\n", $f, $family_name, $family{$family_name}->get_size();
  }

  if ($family_file) {
    $verbose_message .= sprintf "; Family file\t%s\n", $family_file ;
    $verbose_message .= sprintf "; Upstream regions\n";
    ## $strands is a string ("-1str" or "-2str"): it must be printed
    ## with %s (with %d, "-2str" would be displayed as -2).
    $verbose_message .= sprintf ";\t%-22s\t%s\n", "Strands", $strands;
    $verbose_message .= sprintf ";\t%-22s\t%d\n", "From", $from;
    $verbose_message .= sprintf ";\t%-22s\t%d\n", "To", $to;
  } elsif ($sequence_file_list) {
    $verbose_message .= sprintf "; Sequence file list\t%s\n", $sequence_file_list;
  }
  $verbose_message .= sprintf "; %-13s\t%s\n", "Organism", $organism_name;

  if ($task{oligos}) {
    $verbose_message .= sprintf "; oligo-analysis\n";
    $verbose_message .= sprintf ";\t%-22s\t%d\n", "max oligo length", $max_oligo_len;
    $verbose_message .= sprintf ";\t%-22s\t%d\n", "min oligo length", $min_oligo_len;
  }

  if ($task{dyads}) {
    $verbose_message .= sprintf "; dyad-analysis\n";
    $verbose_message .= sprintf ";\t%-22s\t%d\n", "monad length", $monad_length;
    $verbose_message .= sprintf ";\t%-22s\t%d\n", "min spacing", $min_sp;
    $verbose_message .= sprintf ";\t%-22s\t%d\n", "max spacing", $max_sp;
    $verbose_message .= sprintf ";\t%-22s\t%s\n", "dyad type", $dyad_type;
    $verbose_message .= &PrintThresholdValues();
    #	$verbose_message .= sprintf ";\t%-22s\t%g\n", "sig threshold", $lth{occ_sig};
  }

  ## Background: a sequence file takes precedence over a named
  ## background model, which takes precedence over a Markov order.
  if ($bg_seq_file) {
    $verbose_message .= sprintf "; %-13s\t%s\n", "Background sequence file", $bg_seq_file;
    foreach my $model (sort keys %bg_model_file) {
      ## The hash must be indexed with the loop variable $model (the
      ## literal key "model" always gave an empty file name).
      $verbose_message .= sprintf ";\t%-12s\t%s\n", $model, $bg_model_file{$model};
    }
  } elsif ($background) {
    $verbose_message .= sprintf "; %-13s\t%s\n", "Background model", $background;
  } elsif ($markov) {
    $verbose_message .= sprintf "; %-13s\t%s\n", "Background Markov order", $markov_order;
  }

  $verbose_message .= "; No overlap with upstream ORFs\n" if ($noorf);
  if ($noov eq "-noov") {
    $verbose_message .= "; Overlapping matches are discarded for self-overlapping patterns\n" ;
  } else {
    $verbose_message .= "; Overlapping matches are allowed for self-overlapping patterns\n" ;
  }
  warn $verbose_message;
  return($verbose_message);
}


################################################################
### Initialize one directory for each family
##
## For each family name in @families:
##  - create the directory "<family_name>/" (relative to the current
##    working directory) if it does not exist yet;
##  - write the family file ($family{$family_name}->{family_file}) with
##    one line per member: member identifier, tab, name found in %name
##    for this identifier.
##
## A failure to create the directory or to open the family file is
## reported through &RSAT::error::FatalError().
sub MakeDirectories {
   foreach my $family_name (@families) {
        ## $dir is left as a package variable, as in the previous
        ## version of this sub (other parts of the script may read
        ## it -- TODO confirm).
        $dir = "${family_name}/";
        unless (-d $dir) {
            ## "or" rather than "||": "||" was bound to the mode 0755
            ## (always true), so that a failure of mkdir was never
            ## reported.
            mkdir($dir, 0755)
              or &RSAT::error::FatalError("Cannot create directory", $dir, $!);
        }

        ## Same precedence problem for open: "||" was bound to the
        ## file name string, hiding any failure to open the file.
        my $member_file = $family{$family_name}->{family_file};
        open(my $fam_handle, ">", $member_file)
          or &RSAT::error::FatalError("Cannot write family file", $member_file, $!);
        foreach my $id ($family{$family_name}->get_members()) {
          print $fam_handle "$id\t$name{$id}\n";
        }
        close $fam_handle;
   }
}


################################################################
### Generate tab files and SQL scripts for storing the results in a
### relational database
sub ExportSQL {
  &RSAT::message::TimeWarn("Generating SQL") if ($verbose >= 1);

  &RSAT::message::debug($current_analysis, $current_analysis->get_attribute("id")) if ($verbose >= 10);

  ## Set the parameters for the analysis as a whole
  $current_analysis->set_attribute("date", $run_date);
  $current_analysis->set_attribute("family_file", $family_file);
  $current_analysis->set_attribute("organism", $organism_name);
  $current_analysis->set_attribute("background", $background);
  $current_analysis->set_attribute("up_from", $from);
  $current_analysis->set_attribute("up_to", $to);
  $current_analysis->set_attribute("orf_ovlp", $noorf);
  $current_analysis->set_attribute("pattern_ovlp", $noov);
  $current_analysis->set_attribute("strands", $strands);
  $current_analysis->set_attribute("thosig", $lth{occ_sig});
  $current_analysis->set_attribute("suffix", $table_suffix);

  my %export_columns = ();
  my $f = 0;
  foreach my $family_name (@families) {
    $f++;
    &RSAT::message::TimeWarn("Generating SQL for family", $f."/".scalar(@families), $family_name) if ($verbose >= 2);

    #### general family attributes
    my $family_object = $family_factory->new_object(id=>$run_prefix."_".$family_name);
    local $family_id = $family_object->get_attribute("id");
    $current_analysis->push_attribute("clusters",$family_id);
    $family_object->set_attribute("analysis", $run_prefix);
    $family_object->set_attribute("name", $family_name);
    $family_object->set_attribute("size", scalar(@{$family{$family_name}->{members}}));
    $family_object->set_attribute("organism", $organism_name);
    &RSAT::message::Warning ("", "SQL export for family", $family_name, $organism_name) if ($verbose >= 2);

    #### family members
    my @members = @{$family{$family_name}->{members}};
    foreach my $m (@members) {
      $family_object->push_attribute("genes", $m);
    }

    #### discovered oligos
    my $oligo_file = $family{$family_name}->{oligo_file};
    if (-e $oligo_file) {
      my %export_columns = ();
      ($oligo_handle, $dir) = &OpenInputFile($oligo_file);
      while (<$oligo_handle>) {
	chomp;
	if (/^;/) {
	  if ((/;\s+(\d+)\t(\S+)/) && ($2 ne "id")) {
	    $export_columns{$2} = $1 - 1;
	  }
	  next;
	}
	next unless (/\S/);	## Skip empty lines
	next if (/^\#/);		## Skip header line
	my @fields = split;
	my $sequence = uc($fields[0]);
	my $oligo_object = $pattern_factory->new_object();
	&RSAT::message::Debug("Created object", $sequence, $oligo_object) if ($verbose >= 5);
	foreach $k (sort keys %export_columns) {
	  &RSAT::message::Debug ( "specifying attribute",
				  $k,
				  $export_columns{$k},
				  $fields[$export_columns{$k}],
				) if ($verbose >= 5);
	  $oligo_object->set_attribute($k, $fields[$export_columns{$k}]);
	}
	&RSAT::message::Debug ("specifying sequence", $sequence) if ($verbose >= 5);
	$oligo_object->force_attribute("sequence", $sequence);
	$oligo_object->force_attribute("type", "oligo");
	$oligo_object->set_attribute("family_id", $family_id);
	$oligo_object->set_attribute("family", $family_name);
	if ($strands eq "-2str") {
	  $oligo_object->force_attribute("rev_compl", &ReverseComplement($oligo_object->get_attribute("sequence")));
	}
      }
      close $oligo_handle;
    } else {
      my $pwd = `pwd`;
      chomp $pwd;
      &RSAT::message::Warning(join ("\t", "SQL export", $family_name, $pwd, "oligo file does not exist",  $oligo_file , "SKIPPED"));
    }

    #### discovered dyads
    my $dyad_file = $family{$family_name}->{dyad_file};
    if (-e $dyad_file) {
      my %export_columns = ();
      open DYADS, $dyad_file;
      while (<DYADS>) {
	chomp;
	if (/^;/) {
	  ## Temporary patch for dyad obs_occ -> occ (to be compatile with oligo-analysis
	  s/obs_occ/occ/;

	  ## Output fields
	  if ((/;\s+(\d+)\t(\S+)/) && ($2 ne "id")) {
	    $export_columns{$2} = $1 - 1;
	  }
	  next;
	}
	next unless (/\S/);
	next if (/^\#/);		## Skip header line
	my @fields = split;
	my $sequence = uc($fields[0]);
	$sequence =~ s/N/n/g;
	my $dyad_object = $pattern_factory->new_object();
	&RSAT::message::Debug("Created object", $sequence, $dyad_object) if ($verbose >= 5);
	foreach $k (sort keys %export_columns) {
	  &RSAT::message::Debug("specifying attribute", $k, $export_columns{$k},$fields[$export_columns{$k}])
	    if ($verbose >= 5);
	  $dyad_object->set_attribute($k, $fields[$export_columns{$k}]);
	}
	&RSAT::message::Debug ("specifying sequence", $sequence) if ($verbose >= 5);
	$dyad_object->force_attribute("sequence", $sequence);
	$dyad_object->force_attribute("type", "dyad");
	$dyad_object->set_attribute("family_id", $family_id);
	$dyad_object->set_attribute("family", $family_name);
	if ($strands eq "-2str") {
	  my $rc_sequence = uc( &ReverseComplement($dyad_object->get_attribute("sequence")));
	  $rc_sequence =~ s/N/n/g;
	  $dyad_object->force_attribute("rev_compl",$rc_sequence);
	}
      }
      close DYADS;
    } else {
      &RSAT::message::Warning(join ("\t", "SQL export", $family_name, $pwd, "dyad file does not exist",  $dyad_file , "SKIPPED"));
    }

    #### Matrices
    for my $program (qw (consensus gibbs AlignACE infogibbs meme MotifSampler)) {
      my $matrix_file = $family{$family_name}->{$program."_file"};
      if (-e $matrix_file) {
	&ReadMatrices($family_name, $matrix_file, program=>$program);
      } else {
	&RSAT::message::Warning(join ("\t", "SQL export", $family_name, $pwd, "matrix file does not exist",  $matrix_file , "SKIPPED"));
      }
    }
  }

  ## Export directory
  $dir{sql} = "sql_export";
  chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($dir{sql});

  ## Temporarily motify  output directory
  ## for table dumping
  $dir{output_bk} = $dir{output};
  $dir{output} = $dir{sql};
  my $pwd = `pwd`;
  chomp $pwd;
  &RSAT::message::Info ($pwd , $dir{output}, $dir{output_bk});
  #### export the result tables
  foreach my $factory ($analysis_factory, $family_factory, $pattern_factory, $matrix_factory) {
    $factory->dump_tables();
    $factory->generate_sql(schema=>$schema,
			   host=>$host,
			   password=>$password,
			   user=>$user);
  }
  &ExportMakefile("analysis", "family", "pattern");
  $dir{output} = $dir{output_bk};
}


## ##############################################################
## Read all sequences (option -all_seq)
##
## Reads every entry of $all_seq_file (format $all_seq_format), opened
## from the main directory, and indexes it by lowercased sequence ID in
## two package hashes:
##   %all_sequences          ID => sequence
##   %all_sequences_comment  ID => comments joined with "; "
## The hash %all_sequences is emptied first. Before returning, the
## working directory is changed to $dir{output}.
sub ReadAllSequences {
  &RSAT::message::TimeWarn("Reading ALL sequences from file", $all_seq_file, "format", $all_seq_format);

  ## The sequence file is opened from the main directory
  chdir($dir{main});
  our %all_sequences = ();
  ($in, $input_dir) = &OpenInputFile($all_seq_file);

  while (1) {
    ## A list assignment evaluated in scalar context gives the number of
    ## values returned by the reader; zero values ends the loop.
    my $returned = (($current_seq, $current_id, @comments)
                    = &ReadNextSequence($in, $all_seq_format, $input_dir, "", $mask, %args));
    last unless ($returned);

    ## An entry with neither sequence nor identifier also ends the loop
    last if (($current_seq eq "") && ($current_id eq ""));

    ## Progress report every 1000 sequences
    $seq_nb++;
    if (($main::verbose >= 0) && ($seq_nb % 1000 == 0)) {
      &RSAT::message::TimeWarn("Read sequence", $seq_nb);
    }

    ## Index sequence and comments by case-insensitive identifier
    my $key = lc($current_id);
    $all_sequences{$key} = $current_seq;
    $all_sequences_comment{$key} = join("; ", @comments);
  }
  close $in;

  chdir($dir{output});
}


## ##############################################################
## If the input is a list of sequence files instead of a family file,
## index the input sequence files.
##
## Each non-comment, non-empty row of $sequence_file_list starts with a
## sequence file name; the remaining words are kept as a comment. The
## family name is the short file name without the ".$seq_ext" extension.
## For each family, the attributes seq_file, seq_file_purged and comment
## are set, and the family members are then read from the sequence file
## (sequence IDs are used as member IDs).
##
## Package variables written: $sequence_dir, $short_sequence_file_list,
## %family, @families, @comments, $current_seq, $current_id.
sub ReadSequenceList {
    ($sequence_dir, $short_sequence_file_list) = &SplitFileName($sequence_file_list);

    ## Convert relative to absolute directory
    if ($dir{main} && ($sequence_dir !~ m|^/|)) {
        $sequence_dir = join("/", $dir{main}, $sequence_dir);
    }

    if ($verbose >= 1) {
        &RSAT::message::Info("Sequence dir", $sequence_dir);
        &RSAT::message::Info("Reading sequence file list", $sequence_file_list);
    }

    my ($files) = &OpenInputFile($sequence_file_list);
    my $f = 0;
    while (<$files>) {
        next if (/^;/);		## Skip comment lines
        next unless (/\S/);	## Skip empty lines
        chomp;
        $f++;

        ## The first word of each row is a file, the other words form the comment
        my ($sequence_file, @fields) = split /\s+/;
        my $comment = join("; ", @fields);

        ## The family name is the short file name without its extension
        my $short_file = &ShortFileName($sequence_file);
        (my $family_name = $short_file) =~ s|\.${seq_ext}$||;

        #### Create a new family if required
        my $fam = ($family{$family_name} ||= RSAT::Family->new(name=>$family_name));

        #### Sequence file (prefixed with the sequence directory if any)
        my $seq_path = $sequence_dir ? $sequence_dir."/".$sequence_file : $sequence_file;
        $seq_path =~ s|/+|/|g;	## Collapse repeated slashes
        $fam->{seq_file} = $seq_path;

        #### Purged sequence file: same path with "_purged" before the extension
        (my $purged_path = $seq_path) =~ s/\.${seq_ext}$//;
        $fam->{seq_file_purged} = $purged_path."_purged.".${seq_ext};

        #### Comment
        $fam->{comment} = $comment;

        if ($verbose >= 3) {
            &RSAT::message::Info (join ("\t", "File $f",
                                        $family_name,
                                        $fam->{seq_file},
                                        $fam->{seq_file_purged},
                                        $comment,
                                       ));
        }
        push @comments, $comment;
    }
    close $files;

    @families = sort keys %family;

    #### Family members are read from the sequence file (sequence IDs are used as member IDs)
    &RSAT::message::Info("Reading family members from input sequence files") if ($verbose >= 1);
    foreach my $family_name (@families) {
        my $fam = $family{$family_name};
        my ($in, $input_dir) = &OpenInputFile($fam->{seq_file});

        while (1) {
            ## Stop when the reader returns nothing, or an entry with
            ## neither sequence nor identifier.
            my $returned = (($current_seq, $current_id, @comments)
                            = &ReadNextSequence($in, $seq_format, $input_dir, 'dna', $mask));
            last unless ($returned);
            last unless ($current_seq || $current_id);

            $fam->new_member($current_id);
            if ($verbose >= 4) {
                &RSAT::message::Info (join ("\t",
                                            "", "File", $fam->{seq_file},
                                            "Family", $family_name,
                                            "member", $current_id));
            }
        }
        close $in;

        my $gene_nb = scalar(@{$fam->{members}});
        if ($verbose >= 2) {
            &RSAT::message::Info (join ("\t",
                                        $family_name,
                                        $fam->{seq_file},
                                        $gene_nb." members"));
        }
    }
}

################################################################
## Validate the motif discovery results by comparing discovered patterns to
## known sites.
##
## The pattern type (oligo, dyad or orm) is selected from the
## validate_* entries of %task. Two tab-delimited files are written:
## one row per family ($outfile{validation_per_family}) and one row per
## pattern ($outfile{validation_per_pattern}). Both start with a
## commented description of their columns, followed by a header row.
## For each family with at least $min_genes members, the known sites
## are exported to the family's known_sites file, then
## ValidateOneResult() is called to fill in the two validation files.
sub Validate {
    &RSAT::message::Info( ";\n; Validating discovered patterns") if ($verbose >= 1);

    ## Select the type of patterns to validate
    if ($task{validate_oligos}) {
        $pattern_type = 'oligo';
        $pattern_suffix = $oligo_suffix;
    } elsif ($task{validate_dyads}) {
        $pattern_type = 'dyad';
        $pattern_suffix = $dyad_suffix;
    } elsif ($task{validate_orm}) {
        $pattern_type = 'orm';
        $pattern_suffix = $orm_suffix;
    }

    if ($noorf eq "-noorf") {
        $noorf_status = $noorf;
    } else {
        $noorf_status = "-orfov";
    }

    ## Validation per family file
    $outfile{validation_per_family} = join ("", "validation_per_family", $pattern_suffix,$from,$noorf_status,"_",$feature_types,".tab");
    $valid_fam_handle = &OpenOutputFile($outfile{validation_per_family});

    ## Validation per pattern file
    $outfile{validation_per_pattern} = join ("", "validation_per_pattern", $pattern_suffix,$from,$noorf_status,"_",$feature_types,".tab");
    $valid_patt_handle = &OpenOutputFile($outfile{validation_per_pattern});

    ## Column descriptions, listed in the order of the output columns.
    ## They are kept in ordered lists (rather than printed from the keys
    ## of a hash) so that the description block is identical from run to
    ## run and follows the header row.
    my @fam_columns = (
        ["Fam",     "Family name"],
        ["Members", "Number of sequences"],
        ["Sites",   "number of annotated sites"],
        ["sig_max", "Maximal significance"],
        ["sig_sum", "Sum of significances for the discovered patterns"],
        ["sig_m",   "Mean of significance for the discovered patterns"],
        ["TPsites", "Known sites matched by at least one pattern"],
        ["nb_pat",  "Number of discovered patterns"],
        ["TP_pat",  "Number of discovered patterns matching at least one site"],
        ["PPV",     "Positive predictive value: PPV=TP_pat/nb_pat"],
        ["Sn",      "Sensitivity: Sn=TPsites/Sites"],
        ["Acc.a",   "Arithmetic accuracy: Acc.a = (Sn + PPV)/2"],
        ["Acc.g",   "Geometric accuracy: Acc.g = sqrt(Sn*PPV)"],
        ["Acc.h",   "Harmonic accuracy: Acc.h = 2*(Sn*PPV)/(Sn+PPV)"],
    );
    my @patt_columns = (
        ["Fam",     "Family name"],
        ["Members", "Number of sequences"],
        ["Sites",   "number of annotated sites"],
        ["Pattern", "Pattern sequence"],
        ["Sig",     "Significance of discovered pattern"],
        ["TPsites", "Known sites matched by discovered pattern"],
        ["TP_pat",  "Discovered pattern is matching at least one annotated site (0 = False, 1 = True)"],
        ["PPV",     "Positive predictive value: PPV=TP_pat"],
        ["Sn",      "Sensitivity: Sn=TPsites/Sites"],
        ["Acc.a",   "Arithmetic accuracy: Acc.a = (Sn + PPV)/2"],
        ["Acc.g",   "Geometric accuracy: Acc.g = sqrt(Sn*PPV)"],
        ["Acc.h",   "Harmonic accuracy: Acc.h = 2*(Sn*PPV)/(Sn+PPV)"],
    );

    ## Print the column descriptions as comments
    ## Validation per family
    print $valid_fam_handle "; Column contents\n";
    foreach my $column (@fam_columns) {
        my ($fam_key, $description) = @{$column};
        $valid_fam_keys{$fam_key} = $description; ## package hash still filled, as before
        print $valid_fam_handle ";\t", $fam_key, "\t", $description, "\n";
    }

    ## Print the column descriptions as comments
    ## Validation per pattern
    print $valid_patt_handle "; Column contents\n";
    foreach my $column (@patt_columns) {
        my ($patt_key, $description) = @{$column};
        $valid_patt_keys{$patt_key} = $description; ## package hash still filled, as before
        print $valid_patt_handle ";\t", $patt_key, "\t", $description, "\n";
    }

    ## Print header in validation files
    print $valid_fam_handle join("\t",
                                 "# Fam",
                                 "members",
                                 "sites",
                                 "sig_max",
                                 "sig_sum",
                                 "sig_m",
                                 "TPsites",
                                 "nb_pat",
                                 "TP_pat",
                                 "PPV",
                                 "Sn",
                                 "Acc.a",
                                 "Acc.g",
                                 "Acc.h"), "\n";

    print $valid_patt_handle join("\t",
                                  "# Fam",
                                  "Members",
                                  "Sites",
                                  "Pattern",
                                  "Sig",
                                  "TPsites",
                                  "TP_pat",
                                  "PPV",
                                  "Sn",
                                  "Acc.a",
                                  "Acc.g",
                                  "Acc.h"), "\n";

    my $f = 0;
    foreach my $family_name (@families) {
        $f++;

        my @members = @{$family{$family_name}->{members}};
        $gene_nb = scalar(@members);

        ## Check minimum number of genes
        if ($gene_nb < $min_genes) {
            &RSAT::message::Warning("Skipping family",
                                    $f, $family_name,
                                    $gene_nb." genes",
                                    "< min = ".$min_genes) if ($verbose >= 2);
            next;
        }

        ## Create a separate file for compare-patterns with the current family
        my @sites = @{$known_site{$family_name}};
        my @sources = @{$known_site_source{$family_name}};

        &RSAT::message::Info("Validating family ".$f."/".scalar(@families),
                             $family_name,
                             "known sites",
                             $family{$family_name}->{known_sites},
                             "Number of sites",
                             scalar(@sites)) if ($verbose >= 2);

        $known_handle = &OpenOutputFile($family{$family_name}->{known_sites});
        foreach my $s (0..$#sites) {
            ## Sites without annotated source get a generated identifier
            my $source = $sources[$s];
            unless ($source) {
                $source = $family_name."known".$s;
            }
            print $known_handle join("\t", $sites[$s], $source), "\n";
        }
        close $known_handle;

        &ValidateOneResult($family_name, scalar(@sites));
    }
    close $valid_fam_handle;
    &RSAT::message::Info(join ("\t", "Validation per family done", $outfile{validation_per_family}));
    close $valid_patt_handle;
    &RSAT::message::Info(join ("\t", "Validation per pattern done", $outfile{validation_per_pattern}));
}

################################################################
## Compare one result file with annotated motifs.
##
## Arguments:
##   $family_name   name of the family (key of %family)
##   $number_sites  number of annotated sites exported for this family
##
## Runs compare-patterns three times (match list, rel_w table, weight
## table) between the family's known sites and its discovered patterns
## of type $pattern_type. It then prints one row per discovered pattern
## to $valid_patt_handle and one summary row for the family to
## $valid_fam_handle. Statistics are reported as "NA" when the family
## has no annotated site; significance statistics are "NA" when no
## pattern was discovered.
sub ValidateOneResult {
    my ($family_name, $number_sites) = @_;

    my $pattern_file = $family{$family_name}->{$pattern_type."_file"};
    my $known_sites_file = $family{$family_name}->{known_sites};

    ## Column (1-based) containing the significance score in the pattern
    ## file. Checked before running any external command, so that an
    ## unsupported pattern type is reported immediately.
    ## $score_col is left as a package variable, as it was before.
    if (($pattern_type eq "oligo") || ($pattern_type eq "orm")) {
        $score_col = 9;
    } elsif ($pattern_type eq "dyad") {
        $score_col = 8;
    } else {
        &FatalError ("Pattern type $pattern_type is not supported yet");
    }

    ## Compare discovered patterns with known sites, in three output
    ## types: list of matches, relative weight table, weight table.
    ## Each entry gives the type-specific options and the family
    ## attribute holding the output file name.
    my @comparisons = (
        [" -return match,weight,rel_w,id,strand,seq -lth rel_w ".$rel_w, "_vs_known"],
        [" -table rel_w", "_vs_known_relw_table"],
        [" -table weight", "_vs_known_weight_table"],
    );
    foreach my $comparison (@comparisons) {
        my ($specific_options, $output_key) = @{$comparison};
        my $command = "compare-patterns";
        $command .= " -v 1";
        $command .= " ".$strands;
        $command .= " -file1 ".$known_sites_file;
        $command .= " -file2 ".$pattern_file;
        $command .= " -slide";
        $command .= $specific_options;
        $command .= " -o ".$family{$family_name}->{$pattern_type.$output_key};
        &doit($command, $dry_run, $die_on_error, $verbose);
    }

    my $match_file = $family{$family_name}->{$pattern_type."_vs_known"};
    my ($pattern_handle) = &OpenInputFile($pattern_file);

    ## Statistics per pattern
    my $nb_patterns = 0;
    my $max_sig;		## undefined until the first pattern, so that negative maxima are reported correctly
    my $sum_sig = 0;

    while (<$pattern_handle>) {
        next if (/^;/);		# Skip comment lines
        next if (/^\#/);	# Skip column header lines
        next unless (/\S/);	# Skip empty lines
        chomp();
        $nb_patterns++;
        my @fields = split "\t", $_;
        my $pattern = $fields[0];
        my $score = $fields[$score_col -1];
        if (!defined($max_sig) || ($score > $max_sig)) {
            $max_sig = $score;
        }
        $sum_sig = $sum_sig + $score;

        if ($number_sites == 0) {
            ## No annotated site: the per-pattern statistics are undefined
            $TPsites_patt = "NA";
            $TPpattern = "NA";
            $Sn_patt = "NA";
            $PPV_patt = "NA";
            $acc_a_patt = "NA";
            $acc_g_patt = "NA";
            $acc_h_patt = "NA";
        } else {
            ## Number of lines of the match file containing the pattern.
            ## NOTE(review): the pattern and file name are interpolated
            ## into a shell command; they come from result files of this
            ## pipeline, not from direct user input.
            chomp($TPsites_patt = `more $match_file | grep '$pattern' | wc -l`);
            if ($TPsites_patt == 0) {
                $TPpattern = 0;
            } elsif ($TPsites_patt >= 1) {
                $TPpattern = 1;
            }
            $Sn_patt = $TPsites_patt/$number_sites;
            $Sn_patt = sprintf("%.4f", $Sn_patt);
            $PPV_patt = $TPpattern;
            $PPV_patt = sprintf("%.4f", $PPV_patt);
            $acc_a_patt = ($Sn_patt+$PPV_patt)/2;
            $acc_a_patt = sprintf("%.4f", $acc_a_patt);
            $acc_g_patt = sqrt($Sn_patt*$PPV_patt);
            $acc_g_patt = sprintf("%.4f", $acc_g_patt);
            if (($Sn_patt + $PPV_patt) == 0) {
                ## Avoid a division by zero in the harmonic mean
                $acc_h_patt = 0;
            } else {
                $acc_h_patt = 2*($Sn_patt*$PPV_patt)/($Sn_patt+$PPV_patt);
                $acc_h_patt = sprintf("%.4f", $acc_h_patt);
            }
        }
        print $valid_patt_handle join("\t", $family_name,
                                      scalar(@{$family{$family_name}->{members}}),
                                      $number_sites,
                                      $pattern,
                                      $score,
                                      $TPsites_patt,
                                      $TPpattern,
                                      $PPV_patt,
                                      $Sn_patt,
                                      $acc_a_patt,
                                      $acc_g_patt,
                                      $acc_h_patt), "\n";
    }
    ## Release the handle: this routine is called once per family
    close $pattern_handle;

    ## Mean significance per family (over all the patterns of the pattern file)
    if ($nb_patterns > 0) {
        $mean_sig = sprintf("%.3f",  $sum_sig / $nb_patterns);
    } else {
        $max_sig = "NA";
        $sum_sig = "NA";
        $mean_sig = "NA";
    }

    ## Count number of annotated sites matching at least one discovered pattern
    $TPsites = `more $match_file | grep -v ';' | grep -v '#' | cut -f 6 | sort -u | wc -l`;
    chomp($TPsites);

    ## Count number of discovered patterns matching at least one annotated site
    $TPpatterns = `more $match_file | grep -v ';' | grep -v '#' | cut -f 7 | sort -u | wc -l`;
    chomp($TPpatterns);

    ## Calculate sensitivity, positive predictive value and accuracy
    if ($number_sites == 0) {
        $TPsites = "NA";
        $TPpatterns = "NA";
        $Sn = "NA";
        $PPV = "NA";
        $acc_a = "NA";
        $acc_g = "NA";
        $acc_h = "NA";
    } else {
        $Sn = $TPsites/$number_sites;
        $Sn = sprintf("%.4f", $Sn);
        if ($nb_patterns == 0) {
            $PPV = 0;
            $acc_a = 0;
            $acc_g = 0;
            $acc_h = 0;
        } else {
            $PPV = $TPpatterns/$nb_patterns;
            $PPV = sprintf("%.4f", $PPV);
            $acc_a = ($Sn+$PPV)/2;
            $acc_a = sprintf("%.4f", $acc_a);
            $acc_g = sqrt($Sn*$PPV);
            $acc_g = sprintf("%.4f", $acc_g);
            if ($Sn + $PPV == 0) {
                ## Avoid a division by zero in the harmonic mean
                $acc_h = 0;
            } else {
                $acc_h = 2*($Sn*$PPV)/($Sn+$PPV);
                $acc_h = sprintf("%.4f", $acc_h);
            }
        }
    }

    ## Print results
    print $valid_fam_handle join("\t", $family_name,
                                 scalar(@{$family{$family_name}->{members}}),
                                 $number_sites,
                                 $max_sig,
                                 $sum_sig,
                                 $mean_sig,
                                 $TPsites,
                                 $nb_patterns,
                                 $TPpatterns,
                                 $PPV,
                                 $Sn,
                                 $acc_a,
                                 $acc_g,
                                 $acc_h), "\n";

}

################################################################
## Database comparison.
##
## For each family, runs compare-patterns three times (match list,
## rel_w table, weight table) between the site file $db_site_file and
## the patterns discovered for the family. The pattern type (oligo,
## dyad or orm) is selected from the db_match_* entries of %task.
## Output files are taken from the family attributes
## <type>_vs_db, <type>_vs_db_relw_table and <type>_vs_db_weight_table.
sub DatabaseMatch {
    &RSAT::message::Info( ";\n; Comparison of discovered patterns to database of known sites") if ($verbose >= 1);

    ## Select the type of patterns to compare
    if ($task{db_match_oligos}) {
        $pattern_type = 'oligo';
        $pattern_suffix = $oligo_suffix;
    } elsif ($task{db_match_dyads}) {
        $pattern_type = 'dyad';
        $pattern_suffix = $dyad_suffix;
    } elsif ($task{db_match_orm}) {
        $pattern_type = 'orm';
        $pattern_suffix = $orm_suffix;
    }

    if ($noorf eq "-noorf") {
        $noorf_status = $noorf;
    } else {
        $noorf_status = "-orfov";
    }

    ## Database comparison synthetic file
#   $outfile{db_match} = join ("", "database_match", $pattern_suffix,$from,$noorf_status,"_",$feature_types,".tab");
#   $db_match_handle = &OpenOutputFile($outfile{db_match});

    ## The three comparisons only differ by their output type and by the
    ## family attribute holding the output file name.
    my @comparisons = (
        [" -return match,weight,rel_w,id,strand,seq -lth rel_w ".$rel_w, "_vs_db"],
        [" -table rel_w", "_vs_db_relw_table"],
        [" -table weight", "_vs_db_weight_table"],
    );

    ## Family counter for the progress messages: local to this call and
    ## starting from zero, so that the reported rank is always relative
    ## to the current list of families.
    my $f = 0;
    foreach my $family_name (@families) {
        $f++;
        &RSAT::message::Info("Matching patterns discovered for family ".$f."/".scalar(@families),
                             $family_name) if ($verbose >= 2);

        my $pattern_file = $family{$family_name}->{$pattern_type."_file"};

        ## Compare discovered patterns with database sites
        foreach my $comparison (@comparisons) {
            my ($specific_options, $output_key) = @{$comparison};
            my $command = "compare-patterns";
            $command .= " -v 1";
            $command .= " ".$strands;
            $command .= " -file1 ".$db_site_file;
            $command .= " -file2 ".$pattern_file;
            $command .= " -slide";
            $command .= $specific_options;
            $command .= " -o ".$family{$family_name}->{$pattern_type.$output_key};
            &doit($command, $dry_run, $die_on_error, $verbose);
        }

#       my $db_match_file = $family{$family_name}->{$pattern_type."_vs_db"};

        ## Print results in synthetic file
#       print $db_match_handle join("\t", $family_name,
#				      scalar(@{$family{$family_name}->{members}}),
#				      $number_sites,
#				      $pattern,
#				      $score,
#				      $site,
#				      $site_id), "\n";

    }
#   close $db_match_handle;
#   &RSAT::message::Info(join ("\t", "Database match done", $outfile{db_match}));
}

################################################################
## Distribution of significance.
##
## Collects the significance score of every pattern discovered in the
## families having at least $min_genes members, and the maximal score
## per family. Writes a table with, for each observed score s, the
## number of patterns with that score, the inverse cumulative counts
## (patterns and families with score >= s) and the expectations
## E_ppf = 10^(-s) and E_ffs = 1 - poisson(0, E_ppf). Finally calls
## XYgraph to draw the "patterns per family" and "fraction of clusters
## per score" curves, each with a linear and a logarithmic Y axis.
## The pattern type (oligo, dyad or orm) is selected from the
## sig_distrib_* entries of %task.
sub SigDistrib {
    &RSAT::message::Info( ";\n; Significance distribution of discovered patterns") if ($verbose >= 1);

    ## Select the type of patterns to analyze
    if ($task{sig_distrib_oligos}) {
        $pattern_type = 'oligo';
        $pattern_suffix = $oligo_suffix;
    } elsif ($task{sig_distrib_dyads}) {
        $pattern_type = 'dyad';
        $pattern_suffix = $dyad_suffix;
    } elsif ($task{sig_distrib_orm}) {
        $pattern_type = 'orm';
        $pattern_suffix = $orm_suffix;
    }

    if ($noorf eq "-noorf") {
        $noorf_status = $noorf;
    } else {
        $noorf_status = "-orfov";
    }

    ## Significance distribution file
    $dir{distrib} = "sig_distrib";
    chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($dir{distrib});
    $outfile{sig_distrib_file} = join ("", $dir{distrib},"/","sig_distrib", $pattern_suffix,$from,$noorf_status,"_",$feature_types);
    if ($min_genes > 1) {
        $outfile{sig_distrib_file} .= "_mingenes".$min_genes;
    }
    $sig_distrib_handle = &OpenOutputFile($outfile{sig_distrib_file}.".tab");

    ## Getting and printing patterns and their significance
    my %max_score = ();		## maximal score per family
    my %pps = ();		## number of patterns per score
    my %fps = ();		## number of families per maximal score

    $fc=0;
    foreach my $family_name (@families) {
        $fc++;

        my @members = @{$family{$family_name}->{members}};
        $gene_nb = scalar(@members);

        ## Check minimum number of genes
        if ($gene_nb < $min_genes) {
            &RSAT::message::Warning("Skipping family",
                                    $fc, $family_name,
                                    $gene_nb." genes",
                                    "< min = ".$min_genes) if ($verbose >= 2);
            next;
        }

        my $pattern_file = $family{$family_name}->{$pattern_type."_file"};
        my ($pattern_handle) = &OpenInputFile($pattern_file);

        ## Column (1-based) containing the significance score
        my $score_col;
        if ($pattern_type eq "oligo") {
            $score_col = 9;
        } elsif ($pattern_type eq "dyad") {
            $score_col = 8;
        } elsif ($pattern_type eq "orm") {
            $score_col = 9;
        } else {
            &FatalError ("Pattern type $pattern_type is not supported yet");
        }

        $max_score{$family_name} = "NA";
        while (<$pattern_handle>) {
            next if (/^;/);	# Skip comment lines
            next if (/^\#/);	# Skip column header lines
            next unless (/\S/);	# Skip empty lines
            chomp();
            my @fields = split "\t", $_;
            my $pattern = $fields[0];
            my $score = $fields[$score_col -1];
            $pps{$score}++;
            if (($max_score{$family_name} eq "NA") || ($score > $max_score{$family_name})) {
                $max_score{$family_name} = $score;
            }
#	    print $sig_distrib_handle join ("\t", $family_name, $pattern, $score), "\n";
        }
        ## Families without pattern are counted under the key "NA",
        ## which is never listed since only the keys of %pps are printed.
        $fps{$max_score{$family_name}}++;
        close $pattern_handle;
    }

    ## Output columns
    @sig_distrib_columns = qw(Sig pps pps_icum ppf_icum fps_icum ffs_icum E_ppf E_ffs);

    ## Print the column descriptions as comments
    $sig_distrib_keys{"Sig"} = "Significance of discovered patterns (s)";
    $sig_distrib_keys{"pps"} = "Patterns per score (number of patterns with score = s)";
    $sig_distrib_keys{"pps_icum"} = "Patterns per score (number of patterns with score >= s)";
    #    $sig_distrib_keys{"ppf"} = "Patterns per family (number of patterns with exactly this score per family)";
    $sig_distrib_keys{"ppf_icum"} = "Patterns per family (number of patterns per family with score >= s)";
    $sig_distrib_keys{"fps_icum"} = "Number of clusters with score >= s";
    $sig_distrib_keys{"ffs_icum"} = "Fraction of clusters with score >= s";
    $sig_distrib_keys{"E_ppf"} = "Expected number of patterns per family with score >= s";
    $sig_distrib_keys{"E_ffs"} = "Expected fraction of clusters with score >= s";
    print $sig_distrib_handle "; Column contents\n";
    foreach my $k (0..$#sig_distrib_columns) {
        my $key = $sig_distrib_columns[$k];
        print $sig_distrib_handle join ("\t", ";", $k+1, $key,  $sig_distrib_keys{$key}), "\n";
    }

    ## Compute inverse cumulative distributions (from the highest to the lowest score)
    my @sorted_scores_decr = sort {$b <=> $a} keys (%pps);
    my %pps_icum = ();
    my %fps_icum = ();
    my %E_ppf = ();
    my %E_ffs = ();
    my $nb_families = scalar(@families);

    my $pps_icum = 0;
    my $fps_icum = 0;
    foreach my $score (@sorted_scores_decr) {
        $pps_icum += $pps{$score}; $pps_icum{$score} =$pps_icum;
        $fps_icum += $fps{$score}; $fps_icum{$score} = $fps_icum;
        $E_ppf{$score} = 10**(-$score);

        &RSAT::message::Debug("distrib", $score, $pps{$score}, $pps_icum, $fps{$score}, $fps_icum, $E_ppf{$score})
            if ($main::verbose >= 5);
        $E_ffs{$score} = 1- &poisson(0, $E_ppf{$score});
    }

    ## Print header row for the distribution table
    print $sig_distrib_handle "#", join("\t", @sig_distrib_columns), "\n";

    ## Print distribution table (by increasing score)
    my @sorted_scores = sort {$a <=> $b} keys (%pps);
    foreach my $score (@sorted_scores) {
        print $sig_distrib_handle join("\t",
                                       $score,
                                       $pps{$score},
                                       $pps_icum{$score},
                                       sprintf("%.3f",$pps_icum{$score}/$nb_families),
                                       $fps_icum{$score},
                                       sprintf("%.3f",$fps_icum{$score}/$nb_families), ## ffs
                                       sprintf("%7.2g",$E_ppf{$score}),
                                       sprintf("%7.2g",$E_ffs{$score}),
                                      ), "\n";
    }

    close $sig_distrib_handle;
    &RSAT::message::Info(join ("\t", "Significance distribution done", $outfile{sig_distrib_file}.".tab"));

    ## Second title of the graphs. The organism name is concatenated
    ## inside the quoted shell argument.
    my $title2 = $organism_name." (".$nb_families." families; min ".$min_genes." genes)";

    foreach my $log ("", "-ylog") {
        ## The log option is recomputed for each graph type, so that the
        ## linear graphs never inherit "-ylog 10" from a previous pass.
        my $log_option = $log ? $log." 10" : "";
        my $sig_distrib_graph = $outfile{sig_distrib_file}.$log;

        ## Draw a graph with the number of patterns per family, as a function of the sig score
        my $ppf_command = "XYgraph ".$log_option;
        $ppf_command .= " -title1 '".$sig_distrib_graph."'";
        $ppf_command .= " -title2 '".$title2."'";
        $ppf_command .= " -ymin 0 -ysize 400";
        $ppf_command .= " -xgstep1 1";
        $ppf_command .= " -i ".$outfile{sig_distrib_file}.".tab";
        $ppf_command .= " -o ".$sig_distrib_graph."_ppf.jpg";
        $ppf_command .= " -xcol 1 -ycol 4,7 -lines -header -legend -xsize 800 -xleg1 'score' -yleg1 'patterns per family'";
        &doit($ppf_command, $dry_run, $die_on_error, $verbose);
        &RSAT::message::Info(join ("\t", "Patterns per family curve", $sig_distrib_graph."_ppf.jpg"));

        ## Draw a graph with the fraction of clusters per score, as a function of the sig score
        my $ffs_command = "XYgraph ".$log_option;
        $ffs_command .= " -title1 '".$sig_distrib_graph."'";
        $ffs_command .= " -title2 '".$title2."'";
        $ffs_command .= " -i ".$outfile{sig_distrib_file}.".tab";
        $ffs_command .= " -o ".$sig_distrib_graph."_ffs.jpg";
        $ffs_command .= " -ymin 0 -ymax 1 -ygstep1 0.1 -ygstep2 0.02 -ysize 400";
        $ffs_command .= " -xgstep1 1";
        $ffs_command .= " -xcol 1 -ycol 6,8 -lines -header -legend -xsize 800 -xleg1 'score' -yleg1 'fraction of clusters per score'";
        &doit($ffs_command, $dry_run, $die_on_error, $verbose);
        &RSAT::message::Info(join ("\t", "Clusters per score curve", $sig_distrib_graph."_ffs.jpg"));
    }
}

################################################################
## Index one sequence length per family.
##
## For each family, runs sequence-lengths on the family's seq_file and
## reads the resulting seq_len_file (the length is taken from the second
## tab-separated column). The lengths are summed into the family
## attribute total_length, and the attribute calib_length is set to the
## largest length seen. When the task "calibrate" is active, any
## sequence whose length differs from the family's calib_length raises
## a fatal error.
##
## Returns the list of distinct calibration lengths, sorted numerically.
sub CalcCalibrationLengths {
    my %calib_lengths = ();
    &RSAT::message::TimeWarn("Calculating sequence lengths") if ($verbose >= 1);

    foreach my $family_name (@families) {
        $fam_count++;

        ## Family record (created empty if missing, as a plain hash
        ## dereference would do)
        my $fam = ($family{$family_name} ||= {});

        ## Calculate sequence lengths
        if ($verbose >= 2) {
            &RSAT::message::Info( join("\t", "", "Calculating sequence lengths for family",
                                       $fam_count."/".scalar(@families),
                                       $family_name,
                                       $fam->{seq_file},
                                       $fam->{seq_len_file}));
        }
        my $command = join("", "sequence-lengths -i ", $fam->{seq_file},
                           " -o ", $fam->{seq_len_file});
        &doit($command, $dry_run, $die_on_error, $verbose);

        ## Index sequence lengths
        my ($in, $indir) = &OpenInputFile($fam->{seq_len_file});
        while (<$in>) {
            chomp;
            @fields = split "\t";
            my $current_len = $fields[1];
            $fam->{total_length} += $current_len;

            if (defined($fam->{calib_length}) && $task{calibrate}) {
                ## Check that all sequences of one family have the same length
                if ($fam->{calib_length} != $current_len) {
                    &RSAT::error::FatalError("Sequences of family $family_name have different lengths.\nThis is not compatible with the option calibrate");
                }
            } elsif ($current_len > $fam->{calib_length}) {
                ## Keep the largest sequence length
                $fam->{calib_length} = $current_len;
            }
        }
        close $in;

        $calib_lengths{$fam->{calib_length}}++;
        if ($verbose >= 2) {
            &RSAT::message::TimeWarn("Calibration length for family ", $fam_count."/".scalar(@families), $family_name,
                                     $fam->{total_length});
        }
    }

    my @calib_lengths = sort {$a <=> $b } keys %calib_lengths;
    &RSAT::message::Info("Calibration lengths", join( ";", @calib_lengths)) if ($verbose >= 1);
    return @calib_lengths;
}


## ##############################################################
## Calculate the prefix of the calibration file.
##
## Arguments:
##   $calib_length  sequence length used for the calibration
##   $oligo_len     oligonucleotide length
##   $N             number of sequences per set; a false value selects
##                  the single-gene-based calibration
##
## Returns the path prefix as a string. With a true $N, the prefix is
## located in $dir{main}/$dir{calibN}, and $dir{calibN} is set to
## "calibrations" if it had no value yet. Otherwise the prefix is
## located in $dir{calib1}.
sub CalibrationPrefix {
    my ($calib_length, $oligo_len, $N) = @_;

    ## Single-gene-based calibration (all upstream)
    unless ($N) {
        return join("",
                    $dir{calib1}, "/",
                    $oligo_len, "nt",
                    "_upstream_L", $calib_length,
                    "_", $organism_name,
                    $noov,
                    $strands);
    }

    ## Set-based calibration (simulations)

    ## Directory containing the calibration files
    $dir{calibN} ||= "calibrations";

    ## Directory, then file name
    return join("",
                $dir{main}, "/", $dir{calibN},
                "/",
                $organism_name, "_",
                $oligo_len, "nt_",
                $strands,
                $noov,
                "_n", $N,
                "_l", $calib_length,
                "_r", $calibN_repet);
}

################################################################
## Generate report files for the motif discovery competition 2004
sub MDCreport {
   ## Generate the result and parameter reports from the manually
   ## selected patterns of each family.
   ##
   ## No arguments, no return value. Works on the globals @families,
   ## %family, %dir, %outfile, $verbose, $dry_run and $die_on_error.
   ##
   ## For each family having an existing oligo selection file:
   ##  - the ">parameters" and ">postprocessing" sections of the selection
   ##    file are collected and written to $outfile{parameters};
   ##  - the lines of the ">parameters" section are also passed as
   ##    additional options to MatchPatterns;
   ##  - the selected patterns are matched and drawn as a feature map;
   ##  - the features are converted with MDCreport-from-dnapat, whose
   ##    output is appended to $outfile{results}.
   chdir($dir{main}); chdir($dir{output}); &RSAT::util::CheckOutDir($dir{mdc_report});
   &RSAT::message::TimeWarn("Generating MCD report\t".$outfile{results}) if ($verbose >= 1);

   ################################################################
   ## Open a file for the report of results. The handle is closed
   ## immediately: the file is filled afterwards by the shell redirection
   ## (>>) of the conversion command issued for each family.
   ($results) = &OpenOutputFile($outfile{results});
#      print $results join ("\n",
#  			 ">name of contact",
#  			 "Jacques van Helden",
#  			 ">email",
#  			 "Jacques.van-Helden\@univ-amu.fr",
#  			 ">program name",
#  			 "multiple-family-analyis"), "\n";
   close $results;

   ################################################################
   ## Open a file for the report of parameters
   ($parameters) = &OpenOutputFile($outfile{parameters});
#      print $parameters join ("\n",
#  			    ">name of contact",
#  			    "Jacques van Helden",
#  			    ">email",
#  			    "Jacques.van-Helden\@univ-amu.fr",
#  			    ">program name",
#  			    "multiple-family-analyis"), "\n";

   ################################################################
   ## Generate the report for each family
   my $command = "";
   foreach $family_name (@families) {
	$fam_count++;
#	if ($fam_count <= $skip) {
#	    warn "; Skipping family\t$fam_count\t$family_name\n" if ($verbose >= 3);
#	    next;
#	}

	## Families without a manual selection file are not reported
	my $selection_file = $family{$family_name}->{oligo_selection};
	unless (-e $selection_file) {
#	    $selection_file = $family{$family_name}->{oligo_file};
	    &RSAT::message::Warning("No manual selection: file does not exist $selection_file") if ($verbose >= 0);
	    next;
	}
	&RSAT::message::Info("Pattern file for the report",  $selection_file) if ($verbose >= 2);

	my $selection_ft_prefix =  $family{$family_name}->{oligo_selection};
	my $feature_file = $selection_ft_prefix.".ft";
	my $matching_options = " -N 0"; ## Make sure flanking sequences are not considered
	$matching_options .= " -merge"; ## merge overlapping matches

#  	## Threshold for reporting a match
#  	my $threshold = 0;

#  	### Sliding window options
#  	if ($sliding_window_size >= 1) {
#  	    my @patterns = &ReadPatterns(pattern_file=>$selection_file);
#  	    my @scores = ();
#  	    foreach my $pattern (@patterns) {
#  		push @scores, $pattern->get_attribute("score");
#  	    }
#  	    if (scalar(@patterns) >= 1) {
#  		$threshold = &max(@scores) + 0.01;
#  	    } elsif (scalar(@patterns) == 1) {
#  		$threshold = &max(@scores);
#  	    }
#  	    $matching_options .= " -window ".$sliding_window_size;
#  	}
#  	$matching_options .= " -th ".$threshold;



	################################################################
	## initialize postprocessing and parameter description
	my $postprocessing_text = ">postprocessing\n";
	my $parameters_text = ">parameters\n";
	my $multi_params = join " ", @ARGV;
	$multi_params =~ s/\ -/\n-/g; ## one command-line option per line
	$parameters_text .= $multi_params."\n";

	### Read the pattern file in order to select
	### manually-specified parameters threshold
	($in) = &OpenInputFile($selection_file);
	my $parameters_started = 0;
	my $postprocessing_started = 0;
	while ($line = <$in>) {
	    ## Everything before the ">parameters" tag is ignored
	    if ($line =~ />parameters/) {
		$parameters_started = 1;
		next;
	    }
	    next unless ($parameters_started);

	    ## Lines following the ">postprocessing" tag belong to the
	    ## postprocessing description rather than to the parameters
	    if ($line =~ />postprocessing/) {
		$postprocessing_started = 1;
		next;
	    }

	    $line =~ s/^;\s*//; ## suppress the leading comment character
	    if ($postprocessing_started) {
		$postprocessing_text .= $line;
	    } elsif ($parameters_started) {
		$parameters_text .= $line;
		$matching_options .= " ".$line;
		&RSAT::message::Info("manually specified parameter $line") if ($verbose >= 3);
	    }
	}
	close($in);

	## The options are passed on a single command line
	$matching_options =~ s/\n/ /g;
	$matching_options =~ s/\r/ /g;

	&RSAT::message::Info("Family $family_name\tParameters\t", $parameters_text) if ($verbose >= 2);
	&RSAT::message::Info("Family $family_name\tPostprocessing\t", $postprocessing_text) if ($verbose >= 2);
	&RSAT::message::Info("Matching options\t", $matching_options) if ($verbose >= 2);

	## Make sure there is exactly one carriage return after
	## parameters and postprocessing, even when the selection file
	## does not end with a newline.
	chomp($parameters_text);
	$parameters_text .= "\n";
	chomp($postprocessing_text);
	$postprocessing_text .= "\n";

	## Write parameters and postprocessing
	print $parameters ">data set\n";
	print $parameters $family_name, "\n";
	print $parameters $parameters_text;
	print $parameters $postprocessing_text;

	## Draw the feature map of the selected oligos
	&MatchPatterns($selection_file, $selection_ft_prefix,  $matching_options);
	&DrawFeatureMap($selection_ft_prefix, $selection_ft_prefix, " -minfthick 5");

	## Convert the features in MDC format, and append them to the
	## result file
	&RSAT::message::Info("\tReporting\t",$feature_file) if ($verbose >=2);
	$command = "MDCreport-from-dnapat -d ".$family_name;
	$command .= " -i ".$feature_file;
	$command .= ">> ".$outfile{results} if ($outfile{results});
	&doit($command, $dry_run, $die_on_error, $verbose);
   }

   close $parameters;

   &RSAT::message::TimeWarn("Generated report\t", $outfile{results}) if ($verbose >= 1);
}


## ##############################################################
## Extract contigs and isolated patterns from an assembly file for the
## synthetic table
sub ReadAssemblyFile {
  ## Parse an assembly file and collect the assembled patterns.
  ##
  ## Arguments:
  ##   $family_name    name of the family the assembly belongs to
  ##   $assembly_file  file to read
  ##   $pattern_type   key under which the maximal score is stored in
  ##                   %max_score, in addition to the per-family maximum
  ##
  ## Returns a list whose first element is an error message (empty string
  ## if no error), followed by one RSAT::pattern object per retained row.
  ## When the file reports "Too many patterns to assemble", the only
  ## returned element is the error message: the matching line followed by
  ## all the subsequent lines of the file.
  ##
  ## Side effect: updates the global %max_score.
  my ($family_name, $assembly_file, $pattern_type) = @_;
  my ($pattern_handle) = &OpenInputFile($assembly_file);
  my @assembled_patterns = ();
  my $error = "";

  while (<$pattern_handle>) {
    chomp;

    ## The assembly was not performed: report the rest of the file as
    ## error message
    if (/Too many patterns to assemble/i) {
      $error = $_;
      while (my $error_line = <$pattern_handle>) {
	chomp($error_line);
	$error .= "\n".$error_line;
      }
      close $pattern_handle;
      return($error);
    }

    ## Skip comment lines, header lines and empty lines
    next if (/^;/);
    next if (/^\#/);
    next unless (/\S/);

    ## Only the rows describing a contig, a consensus or an isolated
    ## pattern are retained
    if ((/contig/) || (/consensus/) || (/isol/)) {
      @fields = split "\t";

      ## Pattern sequence, without the leading and trailing dots
      my $pattern_seq = &RSAT::util::trim(shift (@fields));
      $pattern_seq =~ s/^\.+//g;
      $pattern_seq =~ s/\.+$//g;
      $pattern = RSAT::pattern->new();
      $pattern->set_attribute("sequence", $pattern_seq);
      push @assembled_patterns, $pattern;

      ## An additional column (reverse complement) precedes the score
      ## for strand-insensitive analyses
      if ($strands eq "-2str") {
	my $rc = &RSAT::util::trim(shift @fields);
	$pattern->set_attribute("rc", $rc);
      }

      my $score = shift @fields;
      $pattern->set_attribute("score", $score);

      my $type = shift @fields;
      $pattern->force_attribute("type", $type);

      ## Update the maximal scores (per family, and per pattern type and family)
      $max_score{$family_name} = &max($max_score{$family_name}, $score);
      $max_score{$pattern_type}{$family_name} = &max($max_score{$pattern_type}{$family_name}, $score);
    }
  }
  close $pattern_handle;
  return ($error, @assembled_patterns);
}



################################################################
## Calibrate oligonucleotide occurrences (mean and variance) for each
## sequence length
sub CalibrateOligos {
   ## Calibrate oligonucleotide occurrences on all upstream sequences.
   ##
   ## Arguments: the list of calibration lengths (sequence lengths).
   ##
   ## For each length (in increasing order), all upstream sequences are
   ## retrieved unless the file already exists. Then, for each oligo
   ## length between $min_oligo_len and $max_oligo_len, the occurrence
   ## distributions are computed (oligo-analysis), fitted
   ## (fit-distribution) and compressed (gzip). The sequence file is
   ## deleted afterwards. In batch mode, the commands of one calibration
   ## length are collected and submitted as a single script.
   my @calib_lengths = @_;
   &RSAT::message::TimeWarn("Calibrating oligonucleotide occurrences") if ($verbose >= 1);

   ## Either append a command to the batch script, or run it
   ## immediately, depending on the batch mode.
   my $run_or_queue = sub {
	my ($cmd, $die_flag) = @_;
	if ($batch) {
	    push @main::batch_commands, $cmd;
	    return;
	}
	&doit($cmd, $dry_run, $die_flag, $verbose);
   };

   ### Analyse upstream occurrence distributions for each sequence length
   chdir($dir{main});
   chdir($dir{output});
   &RSAT::util::CheckOutDir($dir{calib1});
   if ($verbose >= 1) {
	&RSAT::message::TimeWarn("Calculating oligonucleotide distributions in all upstream sequences");
   }

   my @sorted_lengths = sort {$a <=> $b} @calib_lengths;
   foreach my $calib_length (@sorted_lengths) {

	## Retrieve all upstream sequences, unless the file is already there.
	## The retrieval is never sent to the batch queue, since the
	## calibration commands distributed over the cluster nodes rely on
	## this file.
	my $allup_file = "$dir{calib1}/tmp_all_up_${calib_length}.fasta";
	unless (-e $allup_file) {
	    $command = "retrieve-seq -imp_pos -org ".$organism_name
	      ." -all -from -".$calib_length." -to -1 -o ".$allup_file;
	    &RSAT::message::TimeWarn("Retrieving all upstream sequences", $allup_file) if ($verbose >= 2);
	    &doit($command, $dry_run, $die_on_error, $verbose);
	} elsif ($verbose >= 2) {
	    &RSAT::message::Info("All upstream sequence file already exists. Skipping retrieval", $allup_file);
	}

	## Calibrate each oligonucleotide length
	for my $oligo_len ($min_oligo_len..$max_oligo_len) {

	    ## Names of the calibration files
	    my $calib_prefix = &CalibrationPrefix($calib_length, $oligo_len);
	    my $distrib_file = "${calib_prefix}_distrib.tab";
	    my $fitting_file = "${calib_prefix}_negbin.tab";

	    ## An existing calibration is reused, except if the calibration
	    ## is forced
	    my $reuse_calibration = (-e "${fitting_file}.gz") && !($force_calib);
	    if ($reuse_calibration) {
		&RSAT::message::Info("Calibration file already exists. Skipping calibration.\t${fitting_file}.gz");
		next;
	    }
	    &RSAT::message::TimeWarn("Calculating oligo calibration\t${fitting_file}.gz");

	    ## Occurrence distributions (errors never stop the analysis)
	    $command = "oligo-analysis -v 3 -l ".$oligo_len." ".$noov." ".$strands
	      ."  -i ".$allup_file." -return occ -distrib -o ".$distrib_file;
	    &RSAT::message::TimeWarn( "Analysing ",$oligo_len."nt distributions",$distrib_file) if ($verbose >= 2);
	    $run_or_queue->($command, 0);

	    ## Statistics and negative binomial fitting of the distributions
	    $command = "fit-distribution -v 1 -i ".$distrib_file." -distrib negbin -o ".$fitting_file;
	    $run_or_queue->($command, $die_on_error);

	    ## Compression of the distribution and fitting files
	    $command = "gzip -f ".$distrib_file." ".$fitting_file;
	    $run_or_queue->($command, $die_on_error);
	}

	## The file with all upstream sequences is not needed anymore
	$command = "rm -f ".$allup_file;
	&RSAT::message::Info ("Cleaning all upstream sequences", $allup_file) if ($verbose >= 1);
	$run_or_queue->($command, 0);

	## In batch mode, submit the commands collected for this length
	## as one script, and start a new collection
	next unless ($batch);
	my $batch_script = join("\n\n", @main::batch_commands);
	&doit($batch_script, $dry_run, $die_on_error, $verbose, $batch);
	@main::batch_commands = ();
   }

}



################################################################
## Calibrate oligo frequencies by selecting R random sets of N genes,
## counting the occurrences of each word in each set, and estimating
## the average and standard deviation of the occurrence number.
#sub CalibrateOligosN {
sub CalibrateOligosN {
  ## Run calibrate-oligos once per distinct combination of oligo length,
  ## number of genes and sequence length found among the families.
  ##
  ## No arguments, no return value. Works on the globals @families,
  ## %family, %dir, $min_oligo_len, $max_oligo_len, $calibN_repet,
  ## $strands, $noov, $organism_name, and on the options passed to doit
  ## ($dry_run, $die_on_error, $verbose, $batch).
  ##
  ## The combinations already treated are recorded in the global
  ## %calibN_done. The oligo length is part of the key: each calibration
  ## command is specific to one oligo length (option -ol), so a
  ## calibration done for one length must not prevent the calibration of
  ## the next lengths.
  &RSAT::message::Info("Calibrating oligo occurrences with random gene selections") if ($verbose >= 1);

  chdir($dir{main}); ## Calibration directory is specified relative to the main directory
  &RSAT::message::Info("Chdir to main directory\t".$dir{main});
  for my $oligo_len ($min_oligo_len..$max_oligo_len) {
    foreach my $family_name (@families) {
      my $gene_nb = scalar(@{$family{$family_name}->{members}});
      my $seq_len = $family{$family_name}->{calib_length};

      if ($calibN_done{$oligo_len}{$gene_nb}{$seq_len}) {
	&RSAT::message::Info($family_name, "Already done a calibration of oligo occurrences with ",$gene_nb,
			     "random gene selections","upstream length", $seq_len);
      } else {

	&RSAT::message::Info($family_name, "Calibrating oligo occurrences with ",$gene_nb,
			     "random gene selections","upstream length", $seq_len)
	  if ($verbose >= 2);
	my $command = "calibrate-oligos -v 1";
	$command  .= " -r ".$calibN_repet." -sn ".$gene_nb." -ol ".$oligo_len." -sl ".$seq_len;
	my $calibN_task = "all,clean_oligos";
	$command .= " -task ".$calibN_task;
	#		$command .= "-start ${START}";
	#		$command .= "${END}";
	$command .= " ".$strands;
	$command .= " ".$noov;
	$command .= " -outdir ".$dir{calibN};
	$command .= " -org ".$organism_name;

	## Register the calibration for this oligo length
	$calibN_done{$oligo_len}{$gene_nb}{$seq_len}++;
	## The counter irrespective of the oligo length is still updated,
	## in case other parts of the script consult it.
	$calib_done{$gene_nb}{$seq_len}++;

	&doit($command, $dry_run, $die_on_error, $verbose, $batch, "multi_calibN");
      }
    }
  }
  chdir($dir{output});
}


################################################################
#### display short help message
sub PrintOptions {
 ## Display the list of options (short help), then exit.
 ##
 ## The text is piped to the pager "more". If the pager cannot be
 ## started, the text is printed on the standard output instead, so
 ## that the help is displayed in any case.
 ##
 ## The default values shown in the text are interpolated from global
 ## variables.
 my $help_handle;
 my $paged = open($help_handle, "|-", "more");
 $help_handle = \*STDOUT unless ($paged);

 print $help_handle <<End_short_help;
gene-cluster-motifs options
---------------------------
## General options
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-v		verbose
-i		family file (incompatible with -seq)
-mask upper|lower	mask upper- or lowercases, respectively
-seq_list	sequence file list (incompatible with -i)
-all_seq	single file containing all the sequences
-all_seq_format	input sequence format.
-bg_seq		file containing sequences used to estimate background models
-maindir	main directory (by default, the working directory is used)
-outdir		output directory
-skip #		skip the # first data sets
-last #		stop after the # first data sets
-select		fam1[,fam2,fam3,...]. Perform the analysis of selected clusters only.
-task		selected task (supported: $supported_tasks)
-htmaps		draw html maps (dynamic map with pointers to the features)
-n      	dry run: print commands without executing them
-batch	send time-consuming jobs to a batch queue for a PC cluster
-nodie	        continue the analysis even if errors are raised by sub-processes

## Sequence options
-org		organism
-size_names     use the old naming system (obsolete, only for backward compatibility)
-org_fam	each family corresponds to one organism
-taxon		taxon (collect upstream sequences of orthologous genes)
-purge		use purged sequences for motif discovery (default)
-nopurge	use non-purged sequences for motif discovery
-purge_ml	min matching length for purge-sequence
-purge_mis	max number of mismatches for purge-sequence
-mingenes	minimal number of genes per family
-maxgenes	maximal number of genes per family
-from		upstream region left limit
-to		upstream region right limit
-noorf		exclude upstream ORFs from upstream sequences
-rm		use repeat masked version of the genome
-orfov		do not exclude upstream ORFs from upstream sequences
-feattype	feature type for sequence retrieval (e.g. CDS, mRNA)
-seq_type       sequence type (upstream, downstream, ORF)

## Parameters for oligo-analysis and dyad-analysis
-1str   	strand-sensitive analysis
-2str   	strand-insensitive analysis
-two_tails	perform a two-tails test (detect under- and over-represented motifs)
-noov		prevent overlapping matches for self-overlapping patterns
-ovlp		allow overlapping matches for self-overlapping patterns
-pseudo		pseudo-weight (see oligo-analysis manual)
-nomap		do not draw feature maps (for saving time and hard disk space)
-lth param \#	lower threshold on parameter. Supported: $supported_thresholds
-uth param \#	upper threshold on parameter. Supported: $supported_thresholds
-thosig		threshold on occurrence significance (obsolete)
-toppat # 	Maximal number of patterns to assemble (default $toppat).

## oligo-analysis specific parameters
-maxol		maximum oligonucleotide length
-minol		minimum oligonucleotide length
-exp		expected frequency file (obsolete)
-oligo_exp_freq	expected frequencies for oligo-analysis
-bg		background frequency model
-calib_dir	calibration directory
-calibN_repet #	Number of repetitions for the calibration.
-markov #	exp. freq. calculated with Markov chain model of order #
-thmsf		threshold on frequency of matching sequences (obsolete)
-thmssig	threshold on the significance of matching sequences (obsolete)

## dyad-analysis specific parameters
-monad		monad length for dyad-analysis
-minsp		minimum spacing for dyad-analysis
-maxsp		maximum spacing for dyad-analysis
-transp		transpose synthetic result table (columns become rows)
-sort		sort key (supported: $supported_sort_keys)
-dyad_exp_freq	expected frequencies for dyad-analysis

## feature-map options
-origin #       origin for dna-pattern, matrix-scan and feature-maps
-scalestep #	scale steps for the feature maps.

## General options for matrix-based motif discovery
-width		matrix width (default $matrix_width)
-sps		expected number of sites per sequence (default $expected_sites_per_seq)
-nmotifs	number of motifs to discover per sequence set (family)

## Gibbs sampler (Neuwald, 1995)
-seed		seed number for the random generator

## AlignACE (Roth, 1998)
-seed		seed number for the random generator

## MotifSampler (Thijs, 2001)
-MS_b		background file
-MS_p		prior probability of 1 motif copy (default $MS_p)
-MS_M   	Maximal number of motif instances per sequence. (default $MS_M; unset=0)
-MS_n		number of different motifs to search for (default $MS_n).
-MS_x		allowed overlap between different motifs. (default $MS_x)
-MS_r		number of times the MotifSampler should be repeated (default = $MS_r).

## MEME options (Bailey, 1994)
-MEME_minw	minimum motif width (default $MEME_minw)
-MEME_maxw	maximum motif width (default $MEME_maxw)
-MEME_bfile     name of background Markov model file
-MEME_XXX	any other MEME option (XXX) can be passed by appending it to the prefix -MEME_

## infogibbs options (Defrance, 2008)
-infogibbs_XXX	any other infogibbs option (XXX) can be passed by appending it to the prefix -infogibbs_

## Validation
-known		file containing a list of known sites/motifs
-known_max_len 	maximal length for using known sites (default: $known_site_max_len)

## Database comparison
-db             file containing a list of known sites (from a database)

## Synthetic table
-fam_link_pref  prefix to build a HTML link for the family names
-fam_link_suff  suffix to build a HTML link for the family names
-gene_link_pref  prefix to build a HTML link for the gene names
-gene_link_suff  suffix to build a HTML link for the gene names

## Database
-schema		database schema (default: $schema)
-host		database host (default: $host)
-user		database user (default: $user)
-password	database password (default: $password)

End_short_help

 ## Closing the pipe waits for the pager to terminate
 close $help_handle if ($paged);
 exit;
}
