#!/usr/bin/env perl
############################################################
#
# $Id: taxon-frequencies,v 1.48 2012/02/05 14:53:29 rsat Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

taxon-frequencies

=head1 DESCRIPTION

Calculate taxon-wide oligo and dyad frequencies.

=head1 USAGE

I<taxon-frequencies> has many options, we just describe here the usual
way to apply it.

=head2 compute all models (oligos, dyads) for a given taxon, and install them in the RSAT directory

This requires write access to the RSAT directory. It is important to
use the option -install, because otherwise all the models are simply
displayed on the terminal (STDOUT).

 taxon-frequencies -v 1 -all_models -install -taxon Mollicutes

=head2 compute a single model and store it in a local file

 taxon-frequencies -v 1 -type dyad -ml 3 -2str -taxon Mollicutes \
      > Mollicutes_dyads_3nt_sp0-20_freq.tab

=head2 Compute and install dyad frequencies for all the sub-taxa of a taxon of interest

 taxon-frequencies -v 1 -type dyad -ml 3 -2str -install -sub_taxa \
      -taxon Mollicutes 

Beware: this takes time. Sub-taxa exploration is not optimized yet, so
that all the descendants are collected for each taxon. When I find the
time, I (JvH) should re-implement this with a leaf-to-root traversal
of the taxonomic tree.

=head1 AUTHORS

Jacques.van-Helden@univ-amu.fr

=head1 CATEGORY

util

=head1 USAGE

taxon-frequencies -taxon my_taxon [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

Oligo and dyad frequency files are loaded from $RSAT genome
directories.

=head1 OUTPUT FORMAT

The output format is a tab-delimited file with one row per pattern
(oligo or dyad).

By default, the output file contains 4 columns:

=head2 pattern sequence

Pattern sequence is always a single-strand pattern description, even
though it can be used to scan sequences on both strands.

=head2 pattern ID

For single-strand frequencies, the ID is simply the pattern
sequence. For two-strand frequencies, the pattern ID includes the
direct sequence + its reverse complement, separated by a pipe
character.

=head2 frequency

Taxon-wide frequency of the pattern.

=head2 occurrences

Taxon-wide occurrences of the pattern in the considered sequence type
(upstream, upstream-noorf, intergenic).

=head2 organism-specific occurrences (optional)

When the option -occ_per_org is active, the program exports one
additional column per organism, indicating the organism-specific
number of occurrences of each pattern.

=head2 organism-specific frequencies (optional)

When the option -freq_per_org is active, the program exports one
additional column per organism, indicating the organism-specific
frequency of each pattern.

=cut


BEGIN {
    ## Locate the directory part of the path used to invoke this script
    ## (everything before its last path component) and append the "lib/"
    ## directory found there to the module search path, so that the
    ## libraries required below can be found.
    if ($0 =~ m{([^(/)]+)$}) {
        my $script_dir = substr($0, 0, $-[0]); ## text preceding the match
        push @INC, $script_dir . "lib/";
    }
}
require "RSA.lib";
require RSAT::Tree;
require "footprint.lib.pl";

################################################################
## Main package
package main;
{

  ################################################################
  ## Initialize parameters
  ##
  ## This script does not run under "use strict". The variables below
  ## are package globals, declared with "local" rather than "my"
  ## because the subroutines ReadArguments() and Verbose() (defined
  ## further down in this file) read and write them through their
  ## package names ($main::verbose, @main::taxa, %main::outfile, ...).
  local $start_time = &RSAT::util::StartScript();

  local $taxon;
  local @taxa = ();
  local @organisms = ();

  local %dir = ();
  local %outfile = ();
  local %infile = ();

  local $verbose = 0;
  local $out = STDOUT;

  local $all_taxa = 0;
  local $all_models = 0;
  local $pattern_type = "";     ## Supported= oligo, dyad
  local $oligo_length = 0;
  local $monad_length = 0;
  local $background_model = "upstream-noorf";
  local $str = "-1str";
  local $noov ="-noov";         ## Supported= -noov, -ovlp
  local $occ_col = 4; ## Column containing pattern occurrences in the frequency files

  local $decimals = 13;         ## Decimals for printing frequencies

  ## Background models accepted by the option -bg (see ReadArguments)
  %supported_bg = ('upstream'=>1,
                   'upstream-noorf'=>1,
                   'intergenic'=>1,
                   'upstream-rm'=>1,
                   'upstream-noorf-rm'=>1
                  );


  ## Lists for iterating on parameters
  my @pattern_types = ();
  my @oligo_lengths = ();
  my @monad_lengths = ();
  my @strands = ();
  my @noov = ();
  my @background_models = ();

  ## Pattern types accepted by the option -type. ReadArguments() reads
  ## the index %supported_pattern_type and the printable list
  ## $supported_pattern_types.
  @supported_pattern_types = qw (dyad oligo);
  %supported_pattern_type = ();
  foreach my $type (@supported_pattern_types) {
    $supported_pattern_type{$type} = 1;
  }
  $supported_pattern_types = join(",", @supported_pattern_types);

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## Options -org_list and -install are not compatible
  if ($orglist_file && $install) {
    &RSAT::error::FatalError("Options -org_list and -install are mutually incompatible.");
  }

  ## Background model parameters: either the full combination of
  ## models (-all_models), or the single model selected by the user.
  if ($all_models) {
    @pattern_types = qw(oligo dyad);
    @oligo_lengths = 1..8;
    @monad_lengths = 1..3;
    @strands = qw (-1str -2str);
    @noov = qw (-noov -ovlp);
    @background_models = qw (upstream-noorf upstream);
  } else {
    @pattern_types = $pattern_type;
    @oligo_lengths = $oligo_length;
    @monad_lengths = $monad_length;
    @strands = $str;
    @noov = $noov;
    @background_models = $background_model;
  }


  ################################################################
  ## Load the taxonomical tree of organisms supported in RSAT
  local $tree = RSAT::Tree->new();
  $tree->LoadSupportedTaxonomy("Organisms", \%main::supported_organism, 1);


  ################################################################
  ## Taxa

  ## Select the taxa to be analyzed
  if ($all_taxa) {
    @taxa = $tree->node_names();

  } elsif ($main::orglist_file) {
    ## A single pseudo-taxon stands for the user-specified organism list
    @taxa = ("org_list");

  } else {
    &RSAT::error::FatalError("You must specify at lest one taxon (option -taxon) or select all taxa (option -all_taxa)")
      if (scalar(@taxa) == 0);
    ## check taxon names
    foreach my $taxon (@taxa) {
      &RSAT::message::Debug("Checking taxon", $taxon) if ($main::verbose >= 4);
      &CheckTaxon($taxon);
    }
    ## Replace each query taxon by the list returned for it by the tree
    ## (option -sub_taxa)
    if ($sub_taxa) {
      my @sub_taxa = ();
      foreach my $taxon (@taxa) {
        push @sub_taxa, $tree->get_node_descendents_names($taxon, "DFS", "node");
      }
      @taxa = @sub_taxa;
    }
  }

  &RSAT::message::TimeWarn("Computing taxon-specific oligo/dyad frequencies for", scalar(@taxa),"taxa") if ($main::verbose >= 2);
  if ($main::verbose >= 3) {
    my $t = 0;
    foreach my $taxon (@taxa) {
      $t++;
      &RSAT::message::Info(join "\t", $t, $taxon);
    }
  }

  ################################################################
  ## For each query taxon (or organism list), collect the organisms
  ## and compute background models(s).

  my $t = 0;
  my $nb_taxa = scalar(@taxa);
  foreach  $taxon (@taxa) {
    $t++;

    ## Check taxon and collect organisms
    if ($main::orglist_file) {
      @organisms = &ReadOrganismsFromFile($main::orglist_file);
    } else {
      @organisms = $tree->get_node_descendents_names($taxon, "DFS", "leaf");
    }

    ## TEMPORARY FIX: the current implementation of
    ## &RSAT::Tree::get_node_descendents() includes the nodes in the
    ## returned list. I suppress it with a shift
    if ($organisms[0] eq $taxon) {
      shift @organisms;
    }

    ## Iterate background models
    my $m = 0;
    foreach $background_model (@background_models) {

      ## Iterate pattern types
      foreach $pattern_type (@pattern_types) {
        my @lengths = ();

        ## Check pattern type and type-specific lengths. Lengths that
        ## are not strictly positive (the default value 0 when -ol or
        ## -ml was not given) are discarded, so that a missing length
        ## is reported instead of being used as a 0-nt pattern size.
        if ($pattern_type eq "oligo") {
          @lengths = grep { $_ > 0 } @oligo_lengths;
          &RSAT::error::FatalError("You must define the oligo length (option -ol)")
            unless (scalar(@lengths));
        } elsif ($pattern_type eq "dyad") {
          @lengths = grep { $_ > 0 } @monad_lengths;
          &RSAT::error::FatalError("You must define the monad length (option -ml)")
            unless (scalar(@lengths));
        } else {
          &RSAT::error::FatalError("You must define pattern type (option -type dyad|oligo)")
            unless ($pattern_type);
        }

        ## Iterate oligo/monad lengths
        foreach my $length (@lengths) {

          ## Iterate strands
          foreach my $str (@strands) {

            ## Check strands: $sum_rc is 1 when reverse complements are
            ## grouped (-2str), 0 otherwise
            my $sum_rc = 0;
            if ($str eq "-1str") {
              $sum_rc = 0;
            } elsif ($str eq "-2str") {
              $sum_rc = 1;
            } else {
              &RSAT::error::FatalError($str, "Invalid value for the strands. Supported= -1str, -2str.");
            }

            ## Iterate overlap types
            foreach my $noov (@noov) {
              $m++;

              &RSAT::message::TimeWarn("Taxon ".$t."/".$nb_taxa, $taxon, scalar(@organisms)." organisms", "background model ".$m,   $background_model, $pattern_type, $length."nt", $str, $noov)
                if ($main::verbose >= 2);

              ## Initialize variables for one background model.
              ##
              ## The flags $occ_per_org and $freq_per_org are set by
              ## ReadArguments() and must NOT be re-initialized here,
              ## otherwise the options -occ_per_org and -freq_per_org
              ## are silently cancelled.
              local %occ_sum_per_pattern = ();
              local %occ_sum_per_organism = ();
              local %occ_sum_per_spacing = ();
              local %freq_sum_per_organism = ();
              local %pattern_freq;
              local %pattern_id;
              local %pattern_occ = (); ## organism-specific occurrences of the current model only
              local $pattern_freq_sum = 0;
              local $total_occ = 0;
              local %patterns_per_organism = ();
              local %invalid_patterns_per_organism = ();
              %infile = (); ## forget the input files of the previous model

              ## Automatic specification of the output file, either
              ## for the installation in the RSAT data folder
              ## (-install) or in a user-specified directory (-dir).
              ## The two options are mutually exclusive, so the
              ## directory option has to be tested here as well.
              if (($install) || ($dir{taxfreq})) {
                ## Output file name
                my $taxfreq_file = "";
                if (($pattern_type eq "oligo") ||
                    ($pattern_type eq "dyad")) {
                  $taxfreq_file = &ExpectedFreqFile($taxon, $length, $background_model,
                                                    type=>$pattern_type,
                                                    noov=>$noov,
                                                    str=>$str,
                                                    nowarn=>1,
                                                    taxon=>1);
                } else {
                  &RSAT::error::FatalError($pattern_type, "is not a valid pattern type. Supported: oligo,dyad");
                }
                $taxfreq_file .= ".gz"; ## Automatic compression of the background model files
                &RSAT::message::Info("Auto install file", $taxfreq_file) if ($main::verbose >= 3);

                ## Output directory
                my $current_out_dir  ="";
                if ($dir{taxfreq}) {
                  ## Replace automatic by user-specified taxfreq directory
                  my $short_file = &ShortFileName($taxfreq_file);
                  $outfile{taxfreq} = join ("/", $dir{taxfreq}, $short_file);
                  $current_out_dir = $dir{taxfreq};
                } else {
                  $outfile{taxfreq} = $taxfreq_file;
                  ($current_out_dir) = &SplitFileName($taxfreq_file);
                }
                $outfile{taxfreq} =~ s|//|/|g;
                &RSAT::util::CheckOutDir($current_out_dir);
                &RSAT::message::Info("Install dir", $current_out_dir) if ($main::verbose >= 3);
                &RSAT::message::Info("Install file", $outfile{taxfreq}) if ($main::verbose >= 2);
              }

              ################################################################
              ## Open output stream
              $out = &OpenOutputFile($outfile{taxfreq});

              ################################################################
              ## Read background pattern frequencies
              my $org_nb = scalar(@organisms);
              my $o = 0;
              foreach my $org (@organisms) {
                $o++;
                my $exp_freq_file;

                ## Initialize pattern counters
                $invalid_patterns_per_organism{$org} = 0;
                $patterns_per_organism{$org} = 0;
                $occ_sum_per_organism{$org} = 0;

                if (($pattern_type eq "oligo") ||
                    ($pattern_type eq "dyad")) {
                  $exp_freq_file = &ExpectedFreqFile($org, $length, $background_model,
                                                     type=>$pattern_type,
                                                     noov=>$noov, str=>$str,
                                                     warn=>1);
                } else {
                  &RSAT::error::FatalError($pattern_type, "is not a valid pattern type. Supported: oligo,dyad");
                }

                ## Include a warning for missing files in the output file
                unless ((-e $exp_freq_file) ||
                        (-e $exp_freq_file.".gz")) {
                  print $out join ("\t", "; WARNING", "Skipped missing file for org", $org, $exp_freq_file), "\n";
                  next;
                }
                $infile{$org} = $exp_freq_file;

                ## Count occurrences for each frequency file
                &RSAT::message::TimeWarn("Loading", $pattern_type, "occurrences for organism", $o."/".$org_nb, $org) if ($main::verbose >= 3);
                &RSAT::message::TimeWarn("\tFrequency file", $infile{$org}) if ($main::verbose >= 4);
                my ($in) = &OpenInputFile($infile{$org});
                my $line_nb = 0;
                while (my $line = <$in>) {
                  $line_nb++;
                  next if ($line =~ /^--/); ## Skip comment lines
                  next if ($line =~ /^;/);  ## Skip comment lines
                  next if ($line =~ /^#/);  ## Skip header line
                  next unless ($line =~ /\S/); ## Skip empty lines
                  my @fields = split /\s+/, $line;
                  my $pattern_seq = $fields[0];
                  my $pattern_id = $fields[1];
                  my $occ = $fields[$occ_col -1];
                  unless (&IsNatural($occ)) {
                    ## Temporary fix for dyad, because the frequency files contain occurrences in the 5th column
                    if ($pattern_type eq "dyad") {
                      $occ = $fields[$occ_col];
                    }
                    unless (&IsNatural($occ)) {
                      &RSAT::error::FatalError($occ, "Invalid value for occurrences. Should be a Natural number", $infile{$org}, "line", $line_nb);
                    }
                  }

                  ################################################################
                  ## Check validity of the pattern

                  ## Check oligonucleotide length
                  if ($pattern_type eq "oligo") {
                    unless (length($pattern_seq) == $length) {
                      &RSAT::message::Warning($org, "Skipped", $pattern_seq, "Invalid pattern for ".$length."nucleotide(s)");
                      $invalid_patterns_per_organism{$org}++; ## Increment pattern counter
                      next;
                    }
                  }

                  ## For strand-insensitive frequencies, the frequency files
                  ## should in principle contain only one pattern for each
                  ## pair of reverse complementary patterns.
                  if ($sum_rc) {
                    my $rc =  lc(&SmartRC($pattern_seq));
                    if (lc($pattern_seq) gt $rc) {
                      &RSAT::message::Warning($org, "Skipped", $pattern_seq, "Invalid pattern for 2str file");
                      $invalid_patterns_per_organism{$org}++; ## Increment pattern counter
                      next;
                    }
                  }

                  ## Statistics per organism
                  $patterns_per_organism{$org}++; ## Increment pattern counter
                  $occ_sum_per_organism{$org} += $occ;

                  ## store the organism-specific occurrences for the full table (can be memory costly)
                  if (($occ_per_org) || ($freq_per_org)) {
                    $pattern_occ{$pattern_seq}{$org} = $occ;
                  }

                  ## Taxon-wide occurrences, and consistency of the
                  ## pattern identifier between organisms
                  $occ_sum_per_pattern{$pattern_seq} += $occ;
                  if (defined($pattern_id{$pattern_seq})) {
                    if ($pattern_id{$pattern_seq} ne $pattern_id) {
                      &RSAT::error::FatalError("Inconsistency betwen pattern identifiers", $infile{$org}, $pattern_seq,
                                               $pattern_id, $pattern_id{$pattern_seq});
                    }
                  } else {
                    $pattern_id{$pattern_seq} = $pattern_id;
                  }
                }
                ## Release the input handle before opening the next
                ## organism (a taxon can contain many organisms).
                close $in if ($infile{$org});
                if ($invalid_patterns_per_organism{$org} > 0) {
                  print $out join ("\t", "; WARNING","skipped",
                                   $invalid_patterns_per_organism{$org},
                                   "invalid patterns for organism", $org, $exp_freq_file), "\n";
                }
              }

              ################################################################
              ## Compute total pattern occurrences
              foreach my $pattern_seq (keys %occ_sum_per_pattern) {
                $total_occ += $occ_sum_per_pattern{$pattern_seq};

                ## Specific treatment for dyads: there is a specific total for
                ## each spacing value, and the sum of frequencies is 1 for each
                ## spacing value
                if ($pattern_type eq "dyad") {
                  if ($pattern_seq =~ /n\{(\d+)\}/i) {
                    my $spacing = $1;
                    $occ_sum_per_spacing{$spacing} += $occ_sum_per_pattern{$pattern_seq};
                  } else {
                    &RSAT::error::FatalError($pattern_seq, "Invalid sequence for a dyad");
                  }
                }
              }

              ## Convert occurrences to frequencies. A null total (all
              ## the loaded occurrences are 0) gives a null frequency
              ## rather than a division by zero.
              foreach my $pattern_seq (sort keys %pattern_id) {
                if ($pattern_type eq "dyad") {
                  if ($pattern_seq =~ /n\{(\d+)\}/i) {
                    my $spacing_total = $occ_sum_per_spacing{$1};
                    $pattern_freq{$pattern_seq} = ($spacing_total > 0)
                      ? $occ_sum_per_pattern{$pattern_seq}/$spacing_total
                      : 0;
                  }
                } else {
                  $pattern_freq{$pattern_seq} = ($total_occ > 0)
                    ? $occ_sum_per_pattern{$pattern_seq}/$total_occ
                    : 0;
                }
                $pattern_freq_sum += $pattern_freq{$pattern_seq};
              }

              ################################################################
              ## Suppress RC patterns if option -2str has been activated
              if ($sum_rc) {
                &RSAT::message::TimeWarn("Suppressing redundant patterns (reverse complements)\n") if ($main::verbose >= 3);
                foreach my $pattern_seq (sort keys %pattern_id) {
                  my $rc_pattern_seq = &SmartRC($pattern_seq);
                  if ($rc_pattern_seq gt $pattern_seq) { ### only suppress one oligo from the pair
                    delete $pattern_id{$rc_pattern_seq};
                  }
                }
              }


              ################################################################
              ## Print verbose
              &Verbose() if ($main::verbose);

              ################################################################
              ## Print output

              ## Define output fields
              my @out_fields =  ("seq", "pattern_id", "frequency", "occ");
              $field_descr{seq} = "pattern sequence";
              $field_descr{pattern_id} = "pattern identifier";
              $field_descr{frequency} = "taxon-wide patter, frequency";
              $field_descr{occ} = "taxon-wide occurrences";

              if ($main::occ_per_org) {
                foreach my $org (@organisms) {
                  my $field = "N.".$org;
                  push @out_fields, $field;
                  $field_descr{$field} = "occurrences\t".$org;
                }
              }
              if ($main::freq_per_org) {
                foreach my $org (@organisms) {
                  my $field = "F.".$org;
                  push @out_fields, $field;
                  $field_descr{$field} = "pattern frequencies in ".$org;
                }
              }

              ## Print column content
              if ($main::verbose >= 1) {
                print $out "; Column content\n";
                my $c = 0;
                foreach my $field (@out_fields) {
                  $c++;
                  print $out join ("\t", ";", $c, sprintf("%-29s", $field), $field_descr{$field}), "\n";
                }
              }

              ## Print header
              print $out "#", join( "\t", @out_fields), "\n";

              ## Print pattern occurrences
              foreach my $pattern_seq (sort keys %pattern_id) {
                print $out join( "\t",
                                 $pattern_seq,
                                 $pattern_id{$pattern_seq},
                                 sprintf("%.${decimals}f", $pattern_freq{$pattern_seq}),
                                 $occ_sum_per_pattern{$pattern_seq},
                               );

                ## Print organism-specific occurrences
                if ($occ_per_org) {
                  foreach my $org (@organisms) {
                    print $out "\t", $pattern_occ{$pattern_seq}{$org} || 0;
                  }
                }

                ## Print organism-specific frequencies
                if ($freq_per_org) {
                  foreach my $org (@organisms) {
                    my $freq = 0;
                    if ((defined($pattern_occ{$pattern_seq}{$org}))
                        && (defined($occ_sum_per_organism{$org}))
                        && ($occ_sum_per_organism{$org} > 0)
                       ) {
                      $freq = $pattern_occ{$pattern_seq}{$org}/$occ_sum_per_organism{$org};
                    }
                    $freq_sum_per_organism{$org} += $freq;
                    printf $out "\t%.${decimals}f", $freq;
                  }
                }
                print $out "\n";
              }

              ################################################################
              ## Print totals per column
              if ($main::verbose >= 1) {
                my @totals = ("total","total",sprintf("%.${decimals}f", $pattern_freq_sum),$total_occ);
                if ($occ_per_org) {
                  foreach my $org (@organisms) {
                    push @totals, $occ_sum_per_organism{$org} || 0;
                  }
                }
                if ($freq_per_org) {
                  foreach my $org (@organisms) {
                    push @totals, sprintf("%.${decimals}f", $freq_sum_per_organism{$org});
                  }
                }
                print $out ";", join( "\t", @totals), "\n";
              }

              ################################################################
              ## Close output stream (only when writing to a file: the
              ## standard output must stay open for the next model)
              my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
              print $main::out $exec_time if ($main::verbose >= 1);
              close $main::out if ($main::outfile{taxfreq});
              &RSAT::message::TimeWarn("Saved taxon frequency file", $outfile{taxfreq}) if (($main::verbose >= 2) && ($outfile{taxfreq}));
            }
          }
        }
      }
    }
  }
  exit(0);
}


################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message 
sub PrintHelp {
    ## Render the POD documentation of this script with pod2text, then
    ## terminate the program.
    ##
    ## The list form of system() passes the script path as one single
    ## argument without going through a shell, so the help still works
    ## when the path contains spaces or shell metacharacters.
    system("pod2text", "-c", $0);
    exit();
}

################################################################
## Display short help message
sub PrintOptions {
    ## There is no separate short help for this script: the request is
    ## delegated to PrintHelp().
    PrintHelp();
}

################################################################
## Read arguments
sub ReadArguments {
    ## Parse the command-line options and store their values in the
    ## package variables of main ($main::verbose, @main::taxa,
    ## %main::dir, ...). Takes no parameter and returns nothing useful.
    ## Invalid options or values are reported with a fatal error.
    ## The POD blocks interleaved with the code document each option
    ## and are displayed by the options -h / -help.
    my $arg;
    my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
    while (scalar(@arguments) >= 1) {
      $arg = shift (@arguments);
	## Verbosity
=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
	if ($arg eq "-v") {
	    ## The level is optional: "-v" alone means level 1
	    if (&IsNatural($arguments[0])) {
		$main::verbose = shift(@arguments);
	    } else {
		$main::verbose = 1;
	    }

	    ## Help message
=pod

=item B<-h>

Display full help message

=cut
	} elsif ($arg eq "-h") {
	    &PrintHelp();

	    ## List of options
=pod

=item B<-help>

Same as -h

=cut
	} elsif ($arg eq "-help") {
	    &PrintOptions();

	    ## Taxon
=pod

=item B<-taxon taxon>

Specify the taxon for which frequencies have to be computed.

This option can be used iteratively to compute background models for
multiple taxa.

Example:

taxon-frequencies -v 1 -taxon Bacteria -taxon Fungi -sub_taxa [...]


Alternatively, reference organisms can be specified with the option
-org_list.


=cut
	} elsif ($arg eq "-taxon") {
	  if ($main::all_taxa) {
	    &RSAT::error::FatalError("Options -taxon and -all_taxa are mutually incompatible");
	  }
	  my $taxon_name = shift(@arguments);
	  &RSAT::error::FatalError("Option -taxon must be followed by a taxon name")
	    unless ((defined($taxon_name)) && ($taxon_name ne ""));
	  push @main::taxa, $taxon_name;
	  &RSAT::error::FatalError("Options -taxon and -org_list are mutually incompatible") if ($main::orglist_file);

=pod

=item B<-org_list>

Build background models from a user-specified file containing a list
of organisms.

File format: the first word of each line is used as organism ID. Any
subsequent text is ignored. The comment char is ";".

This option is incompatible with the option "-taxon".

=cut
	} elsif ($arg eq "-org_list") {
	  $main::orglist_file = shift(@arguments);
	  &RSAT::error::FatalError("Option -org_list must be followed by a file name")
	    unless ((defined($main::orglist_file)) && ($main::orglist_file ne ""));
	  &RSAT::error::FatalError("Options -taxon and -org_list are mutually incompatible") if (scalar(@main::taxa) >= 1);



	    ## Output directory
=pod

=item	B<-dir output_directory>

This option lets the program determine the name of the output file
(respecting the same nomenclature as with the option -install), but
you can specify the output directory.

This is the most convenient way to use taxon-frequencies when you do
not have write permissions in the RSAT directory.

Incompatible with the option -o and -install.

=cut
	} elsif ($arg eq "-dir") {
	  $main::dir{taxfreq} = shift(@arguments);
	  &RSAT::error::FatalError("Option -dir must be followed by a directory name")
	    unless ((defined($main::dir{taxfreq})) && ($main::dir{taxfreq} ne ""));
	  if ($main::install) {
	    &RSAT::error::FatalError("Options -dir and -install are mutually exclusive");
	  }
	  if ($main::outfile{taxfreq}) {
	    &RSAT::error::FatalError("Options -o and -dir are mutually exclusive");
	  }

	    ## Automatic installation 
=pod

=item	B<-install>

Automatically specify the name of the output file and save the result
in the $RSAT directory. This requires to have write permissions on the
$RSAT/public_html/data directory.

Incompatible with the option -o and -dir.

=cut
	} elsif ($arg eq "-install") {
	  $main::install = 1;
	  if ($main::outfile{taxfreq}) {
	    &RSAT::error::FatalError("Options -o and -install are mutually exclusive");
	  }
	  if ($main::dir{taxfreq}) {
	    &RSAT::error::FatalError("Options -install and -dir are mutually exclusive");
	  }

	    ### Background model
=pod

=item B<-bg background_model>

Specify a background model for expected frequencies.

  Supported: upstream-noorf, upstream, intergenic, upstream-rm,
             upstream-noorf-rm
  Default: upstream-noorf

=cut
	} elsif ($arg eq "-bg") {
	    $main::background_model = shift (@arguments);
	    ## "ncf" is accepted as a synonym of "intergenic".
	    $main::background_model =~ s/ncf/intergenic/;
	    ## NOTE(review): "input" is rewritten to "bernoulli", which
	    ## is then rejected by the check below unless "bernoulli" is
	    ## added to %supported_bg somewhere else - confirm intent.
	    $main::background_model =~ s/input/bernoulli/;
	    unless ($main::supported_bg{$main::background_model}) {
		## Build the list of supported values from the index
		## itself, so the message never shows an empty list.
		my $supported_bg_list = join(",", sort keys %main::supported_bg);
		&RSAT::error::FatalError("Invalid background model\t$main::background_model\tsupported: $supported_bg_list");
	    }

	    #### strands
=pod

=item B<-2str>

Collect frequencies computed on both strands.

=item B<-1str>

Collect frequencies computed on a single strand.

=cut
	} elsif ($arg eq "-1str") {
	  $main::str = "-1str";
	} elsif ($arg eq "-2str") {
	  $main::str = "-2str";

	  ## Overlap
=pod

=item B<-noov>

Do not allow mutual overlap between successive occurrences of
self-overlapping patterns (e.g. TATATATATA contains a single occurrence
of TATATA).

=item B<-ovlp>

Allow mutual overlap between successive occurrences of
self-overlapping patterns (e.g. TATATATATA contains three occurrences
of TATATA).

=cut
	} elsif ($arg eq "-ovlp") {
	  $main::noov = "-ovlp";
	} elsif ($arg eq "-noov") {
	  $main::noov = "-noov";


	  ## Pattern type
=pod

=item B<-type oligo|dyad>

Pattern type. Supported: oligo, dyad. 

=cut
	} elsif ($arg eq "-type") {
	  $main::pattern_type = shift @arguments;
	  &RSAT::error::FatalError($main::pattern_type, "Invalid pattern type. Supported: ".$main::supported_pattern_types)
	    unless ($main::supported_pattern_type{$main::pattern_type});

	  ## Oligonucleotide length
=pod

=item B<-ol #>

Oligonucleotide length.

This option is only valid when pattern type is set to 'oligo'.

=cut
	} elsif ($arg eq "-ol") {
	  $main::oligo_length = shift @arguments;
	  &RSAT::error::FatalError($main::oligo_length, 
				   "Invalid oligo length. Must be a strictly positive Natural number") 
	    unless ((&IsNatural($main::oligo_length)) && ($main::oligo_length > 0));

	  ## Monad length
=pod

=item B<-ml #>

Monad length for the dyads.

This option is only valid when pattern type is set to 'dyad'.

=cut
	} elsif ($arg eq "-ml") {
	  $main::monad_length = shift @arguments;
	  &RSAT::error::FatalError($main::monad_length, 
				   "Invalid monad length. Must be a strictly positive Natural number") 
	    unless ((&IsNatural($main::monad_length)) && ($main::monad_length > 0));

	  ## Calculate all background models
=pod

=item B<-all_models>

Compute all the background models for the specified taxon

=cut
	} elsif ($arg eq "-all_models") {
	  $main::all_models = 1;

	  ## Calculate background model(s) for all taxa below the specified taxon
=pod

=item B<-sub_taxa>

Compute the background model(s) for all taxa below the taxon specified
with the option -taxon.

=cut
	} elsif ($arg eq "-sub_taxa") {
	  $main::sub_taxa = 1;


	  ## Calculate background model(s) for all taxa
=pod

=item B<-all_taxa>

Compute the background model(s) for all taxa supported in RSAT.
Beware ! This can take some time. In principle, this should be done
only when many genomes are updated.

=cut
	} elsif ($arg eq "-all_taxa") {
	  $main::all_taxa = 1;
	  if (scalar(@main::taxa) > 0) {
	    &RSAT::error::FatalError("Options -taxon and -all_taxa are mutually incompatible");
	  }


	  ## Return occurrences per organism
=pod

=item B<-occ_per_org>

Return additional columns with the organism-specific pattern
occurrences (one column per organism).

=cut
	} elsif ($arg eq "-occ_per_org") {
	  $main::occ_per_org = 1;

	  ## Return frequencies per organism
=pod

=item B<-freq_per_org>

Return additional columns with the organism-specific pattern
frequencies (one column per organism).

=cut
	} elsif ($arg eq "-freq_per_org") {
	  $main::freq_per_org = 1;



	  ## Decimals for printing frequencies
=pod

=item B<-decimals #>

Number of decimal for printing frequencies (default 13)

=cut
	} elsif ($arg eq "-decimals") {
	  $main::decimals = shift(@arguments);
	  &RSAT::error::FatalError($main::decimals, "Invalid value for decimals. Must be a Natural number") 
	    unless (&IsNatural($main::decimals));

	} else {
	    ## Same error routine as for all the other options
	    &RSAT::error::FatalError(join("\t", "Invalid option", $arg));

	}
    }


=pod

=back

=cut

}

################################################################
#### verbose message
sub Verbose {
    ## Write the verbose header of one background model to the output
    ## stream $main::out: the command line, one row per organism for
    ## which an input file was registered in %infile, and the list of
    ## output files. Reads the package variables @organisms, %infile,
    ## %patterns_per_organism, %occ_sum_per_organism,
    ## %invalid_patterns_per_organism and %main::outfile.

    ## Command line
    print $main::out "; taxon-frequencies ";
    &PrintArguments($main::out);

    ## Width of the organism column: the longest organism name, with a
    ## minimum of 20 characters
    my $name_width = 20;
    foreach my $name (@organisms) {
      my $name_length = length($name);
      $name_width = $name_length if ($name_length > $name_width);
    }
    my $name_format = "%-${name_width}s";

    ## Table of input files: header row
    print $out "; Organisms\n";
    my @header = (";", "org_nb",
		  sprintf($name_format, "organism name"),
		  "nb_patt", "occ_sum", "invalid", "Input file");
    print $out join ("\t", @header), "\n";

    ## Table of input files: one row per organism having an input file
    my $rank = 0;
    foreach my $name (@organisms) {
      next unless ($infile{$name});
      $rank++;
      my @row = (";",
		 $rank,
		 sprintf($name_format, $name),
		 $patterns_per_organism{$name} || 0,
		 $occ_sum_per_organism{$name} || 0,
		 $invalid_patterns_per_organism{$name} || 0,
		 $infile{$name});
      print $out join ("\t", @row), "\n";
    }

    ## Output files
    if (%main::outfile) {
      print $main::out "; Output files\n";
      foreach my $key (keys %main::outfile) {
	my $value = $main::outfile{$key};
	print $main::out ";\t$key\t$value\n";
      }
    }
}


__END__

=pod

=head1 SEE ALSO

=head2 supported-organisms

To get a list of supported organisms + their respective taxa, type:

I<supported-organisms -format tab -return ID,taxonomy>

=head2 oligo-analysis

Taxon-wide oligo frequency files can be used as expected frequencies
for motif discovery with I<oligo-analysis>.

=head2 dyad-analysis

Taxon-wide dyad frequency files can be used as expected frequencies
for motif discovery with I<dyad-analysis>.

=head2 convert-background-model

Oligo and dyad frequencies can be converted into background model for
third parties pattern detection programs (MEME, MotifSampler).

=head1 WISH LIST

=head2 Taxon tree traversal

Optimize the taxonomic tree traversal for the option -sub_taxa. The
procedure should compute the frequencies at each branching node by
summing the occurrences on its direct children, rather than summing
occurrences for all the leaves. This should not be too complicated
with a recursion from the root, where each branching node calculates
its own occurrence table by requesting the occurrences of each of its
children.

=cut
