#!/usr/bin/perl -w
############################################################
#
# $Id: phylo-profiles,v 1.4 2013/09/30 12:53:15 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

phylo-profiles

=head1 VERSION

$program_version

=head1 DESCRIPTION

Compute phylogenetic profiles for all the genes of a query genome
across a user-specified taxon, and extract a co-occurrence network.

=head2 Phylogenetic profiles

The notion of phylogenetic profile was introduced by Pellegrini et
al. (1999). The method relies on the assumption that genes ensuring
necessary steps in a same biological process should be either present
together in organisms where this function is enabled, or both absent
in organisms where the function is not ensured.

Phylogenetic profiles are defined as Boolean profiles of
presence/absence of gene orthologs in a set of reference genomes
(e.g. all the Bacterial genomes).

=head2 Similarity between phylogenetic profiles

At the time of the original publication, Pellegrini and colleagues
had only 16 fully sequenced genomes at their disposal. Their idea was
illustrated by showing that groups of genes having identical
profiles of presence/absence across the 16 genomes were involved in
the same process.

As several thousand fully sequenced genomes are now available, the
power of the method has drastically increased. The simplistic
criterion of selecting identical profiles would give poor results.
Several metrics have been proposed to measure the similarity between
two phylogenetic profiles (see Ferrer et al., 2010, for an evaluation
of the alternative metrics and other parameters).

The program I<phylo-profiles> relies on the hypergeometric
distribution to compute a significance of the co-occurrence between
two genes across a set of reference genomes.

An important assumption of the hypergeometric distribution is that
the genomes should be independent from each other. This is however far
from being the case, since many closely related genomes are
available. To reduce this bias, we select non-redundant genomes by
retaining only one species per sub-taxon, at a given depth of the
taxonomic tree. This criterion is somewhat rough, but seems to give
good results (as far as we can judge by comparing the functions of
gene pairs detected as co-occurring).

=head2 Orthology criterion

Ideally, the method should rely on suitable criteria to infer the
presence/absence of orthologs in the reference genomes. This would
however require inferring, for each gene family, a phylogenetic tree,
and qualifying the relations of homology (orthologs vs paralogs vs
xenologs).

For the sake of simplicity (we could even say feasibility), we use an
approximate criterion to infer the presence/absence of orthologs, by
detecting bidirectional best hits (BBH) with the similarity search
program BLAST.


=head1 AUTHORS

Jacques.van-Helden@univ-amu.fr

=head1 CATEGORY

=over

=item comparative genomics

=back

=head1 USAGE

phylo-profiles -org query_organism -taxon reference_taxon [...]

See below for detailed description of the options.

=head1 INPUT DATA

=head1 OUTPUT FORMAT

The program produces a set of output files corresponding
to the successive steps of the analysis, and a synthetic HTML page
with links to the individual result files.

=head1 REFERENCES

Pellegrini, M., Marcotte, E. M., Thompson, M. J., Eisenberg, D. &
Yeates, T. O. (1999).
Assigning protein functions by comparative genome analysis: protein 
phylogenetic profiles.
Proc Natl Acad Sci U S A 96, 4285-8.

Ferrer, L., Dale, J. M. & Karp, P. D. (2010).
A systematic study of genome context methods: calibration,
normalization and combination.
BMC Bioinformatics 11, 493.

=head1 SEE ALSO

=over

=item I<genome-blast>

Prior to using the program I<phylo-profiles>, homology tables have to
be pre-computed between all proteins of the query genome and each
species of the reference taxon. This can be done with the program
I<genome-blast>.

 genome-blast -v 2 -q query_org -dbtaxon taxon -reciprocal

=back

=head1 WISH LIST

=head2 Computation of co-occurrence network

(Jacques van Helden just needs to add the command in this work flow,
it can be done easily)

In a co-occurrence network, nodes represent (protein-coding) genes and
edges are used to link pairs of co-occurring genes. Co-occurrence
means that the genes are generally either both present or both absent
in the genomes of the reference taxon. Various statistics can be used
to measure the level of co-occurrence (for a comparison of these
statistics, see Ferrer et al., 2010).

=head2 B<Sort organisms by taxonomy>

Sort the columns of the output file by taxonomical group (as Nishant
does).

=head2 B<-taxon_list>

The user can specify a tab-delimited text file containing a list of
taxa of interest.

The first word of each line must contain the name of a taxon supported
in RSAT. If this option is specified, the program exports a summary
table per taxa, with one row per gene, one column per taxon of
interest, and each cell indicates the number of species where the gene
has been found in the corresponding taxon.

The second column can optionally contain the keyword "absent" or
"present". This defines a taxonomic filter which will be applied to
each gene, in order to retain only the genes which are both (i) well
represented in the taxa labelled "present" and (ii) more or less
absent from the genomes of the taxa labelled "absent".

This enables knowledge-oriented searches for groups of genes
whose products are expected to be present in some taxa, and absent
from some other taxa.

=head2 B<-task heatmap>

Export a heatmap indicating with contrasted colors (red vs blue) the
genomes in which each gene is present/absent.

Nishant will define the appropriate heatmap options with a small
script in R, which can then be integrated in phylo-profiles.

=cut


## Compile-time setup of the library search path.
## When $0 ends with a run of characters other than "/", "(" and ")"
## (i.e. the script name), everything that precedes that run -- the
## directory part of the invocation path -- followed by "lib/" is
## appended to @INC.
BEGIN {
  push(@INC, $` . "lib/") if ($0 =~ m{([^(/)]+)$});
}
require "RSA.lib";
require "RSA2.cgi.lib";

################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();

  ## Program version, formatted from the digits found in the CVS
  ## revision keyword.
  our $program_version = do { my @r = (q$Revision: 1.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  our %infile = ();
  our %outfile = ();

  our @outfiles = ();

  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;

  our $taxon;
  our $organism;
  our $taxonomic_depth = 5;
  our $null = "<NA>"; ## String for null/undefined/not assigned values
  our $inf = "320"; ## String to display for infinite values in the sig table

  ## Supported tasks
  our @supported_tasks = qw(
                            species
                            bbh
                            profiles
                            network
                            index
                            all
                           );
  our $supported_tasks = join (",", @supported_tasks);
  our %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }
  our %task = ();

  ## Fields on which lower and upper thresholds can be imposed
  ## (options -lth and -uth)
  our @supported_threshold_fields = qw(
                                       ident
                                       ali_len
                                       e_value
                                       bit_sc
                                      );
  our %supported_threshold_field = ();
  foreach my $field (@supported_threshold_fields) {
    $supported_threshold_field{$field} = 1;
  }
  our $supported_threshold_fields = join (",", @supported_threshold_fields);

  ## Thresholds
  our %lth = ();
  our %uth = ();
  $lth{ident} = 10; ## Threshold on BLAST identity percentage
  $lth{ali_len} = 50; ## Threshold on BLAST alignment length
  $lth{bit_sc} = 30; ## Threshold on BLAST bit score
  $uth{expect} = 1e-5; ## Threshold on BLAST expect (e-value)

  ## Default thresholds on co-occurrence significance. These values
  ## are used as filename suffixes, so they are intentionally written
  ## with 2 digits. They must be stored as strings: numeric literals
  ## such as 01 would be read as numbers and lose their leading zero.
  our @cooc_sig_thresholds = qw(01 02 03 05 10 20 30);

  ################################################################
  ## Read argument values
  &ReadArguments();

  ## The options -lth/-uth store the e-value threshold under the field
  ## name "e_value", whereas the rest of this script reads it from
  ## $uth{expect}. Propagate a user-specified value, which would
  ## otherwise be silently ignored.
  $uth{expect} = $uth{e_value} if (defined($uth{e_value}));

  ################################################################
  ## Check argument values

  ## Run all the tasks if no task was requested, or if the task "all"
  ## was requested.
  if ((scalar(keys(%task)) == 0) || ($task{all})){
    %task = %supported_task;
  }

  ## Mandatory arguments
  unless ($organism) {
    &RSAT::error::FatalError("You must define a query organism (option I<-org>)");
  }

  unless (($taxon) || ($ref_org_file)) {
    &RSAT::error::FatalError("You must define a reference taxon (option I<-taxon>) or a file of reference organisms (option I<-ref_org_file>)");
  }

  ## DB organism file: the first word of each line is an organism name
  if ($ref_org_file) {
    ($orglist) = &OpenInputFile($main::ref_org_file);
    &RSAT::message::Info ("Reading list of db organisms from file", $main::ref_org_file ) if ($main::verbose > 1);
    while (<$orglist>) {
      chomp();
      s/\r//g;                  ## Suppress Windows-specific carriage return
      next if /^;/;             ## Comment line
      next if /^\#/;            ## Header line
      next if /^\--/;           ## SQL comment line
      next unless /\S/;         ## Empty line
      my ($org) = split(' ', $_); ## First word; split ' ' skips leading blanks
      $org = &trim($org); ## Remove leading and trailing spaces
      &RSAT::OrganismManager::check_name($org);
      push @main::ref_organisms, $org;
    }
    close $orglist;
  }


  ## Automatic definition of output directory if not specified by user.
  ## The suffix describing the reference genomes ($ref_suffix) is also
  ## used in the prefix of all the output files.
  $ref_suffix = "";
  unless ($dir{output}) {
    if ($ref_org_file) {
      ## The suffix must be stored in $ref_suffix: $dir{output} is
      ## (re)defined below from $ref_suffix.
      $ref_suffix = scalar(@ref_organisms)."_selected_organisms";
    } else {
      $ref_suffix = ${taxon};
    }
    if ($taxonomic_depth != 0) {
      $ref_suffix .= "_depth".${taxonomic_depth};
    }
    $dir{output} = "results/phylo_profiles/".${organism}."_vs_".$ref_suffix;
  }

  &RSAT::util::CheckOutDir($dir{output});

  ## Specify output files. Each key of %outfile is also appended to
  ## @outfiles, in order to keep track of the order of the files.
  $prefix{output} = $dir{output}."/".$organism."_".$ref_suffix;
  $outfile{html_index} = $prefix{output}."_index.html"; push @outfiles, "html_index";
  $outfile{species}=$prefix{output}."_selected_species.tab"; push @outfiles, "species";
  $outfile{species_html}=$prefix{output}."_selected_species.html"; push @outfiles, "species_html";
  $outfile{species_names}=$prefix{output}."_selected_species_names.tab"; push @outfiles, "species_names";
  $outfile{bbh} =  $prefix{output}."_bbh_len".$lth{ali_len}."_id".$lth{ident}."_bits".$lth{bit_sc}."_e".$uth{expect}.".tab"; push @outfiles, "bbh";

  ## Output file names for phylogenetic profiles
  $prefix{profiles} = $prefix{output}."_profiles_len".$lth{ali_len}."_id".$lth{ident}."_e".$uth{expect};
  $outfile{profiles_ids}= $prefix{profiles}."_ids.tab"; push @outfiles, "profiles_ids";
  $outfile{profiles_boolean}= $prefix{profiles}."_boolean.tab"; push @outfiles, "profiles_boolean";
  $outfile{profiles_boolean_names}= $prefix{profiles}."_boolean_names.tab"; push @outfiles, "profiles_boolean_names";
  $outfile{profiles_evalues}= $prefix{profiles}."_evalues.tab"; push @outfiles, "profiles_evalues";
  $outfile{profiles_evalues_names}= $prefix{profiles}."_evalues_names.tab"; push @outfiles, "profiles_evalues_names";
  $outfile{profiles_sig}= $prefix{profiles}."_sig.tab"; push @outfiles, "profiles_sig";
  $outfile{profiles_sig_names}= $prefix{profiles}."_sig_names.tab"; push @outfiles, "profiles_sig_names";

  ## Output file names for co-occurrence network
  $outfile{gene_names}= $prefix{output}."_gene_names.tab"; push @outfiles, "gene_names";
  $prefix{cooccurrence} = $prefix{profiles}."_network";
  $outfile{cooccurrence_table}= $prefix{cooccurrence}.".tab"; push @outfiles, "cooccurrence_table";
  $outfile{cooccurrence_table_html}= $prefix{cooccurrence}.".html"; push @outfiles, "cooccurrence_table_html";
  $outfile{cooccurrence_graph_dot}= $prefix{cooccurrence}.".dot"; push @outfiles, "cooccurrence_graph_dot";
  $outfile{cooccurrence_graph_gml}= $prefix{cooccurrence}.".gml"; push @outfiles, "cooccurrence_graph_gml";

  ################################################################
  ## Output streams

  ## Main output stream, as returned by OpenOutputFile() for
  ## $outfile{output}
  local $out = &OpenOutputFile($outfile{output});

  ## HTML index stream: only opened when the task "index" is active
  local $html_index;
  if ($task{index}) {
    $html_index = &OpenOutputFile($outfile{html_index});

    ## Page header
    my $index_header = &PrintHtmlResultHeader(program=>"phylo-profiles", refresh_time=>120);
    print $html_index $index_header;

    ## Echo the command and its arguments in the index page
    print $html_index "<p><b>Command:</b> <tt>phylo-profiles ";
    &PrintArguments($html_index, 1);
    print $html_index "</tt></p>\n";
  } elsif ($main::verbose >= 2) {
    &RSAT::message::TimeWarn("Skipping index task");
  }

  ################################################################
  ## Report the parameters when verbosity is requested
  if ($main::verbose >= 1) {
    &Verbose();
  }

  ################################################################
  ## Select a set of species in a way to reduce redundancy between
  ## closely related species. This is a bit tricky: we cut the organism
  ## tree at a given depth (e.g. 5) and select a single species of each
  ## taxon at this depth.
  if ($task{species}) {
    ## The command is declared before the branches, so that the string
    ## built in either branch is the one passed to one_command() below
    ## (a declaration inside a branch would not be visible there).
    my $species_cmd;
    if ($taxon) {
      &RSAT::message::TimeWarn("Selecting species", $taxon, "depth=".$taxonomic_depth) if ($main::verbose >= 2);

      ## Select the species and export them in a tab-delimited file
      $species_cmd = $SCRIPTS."/supported-organisms -v 1";
      $species_cmd .= " -return ID,nb,selected_taxon,taxon_depth,org_per_taxon,taxonomy";
      $species_cmd .= " -taxon ".$taxon;
      $species_cmd .= " -depth ".$taxonomic_depth;
      $species_cmd .= " -o ".$outfile{species};

      ## HTML version of the species table
      $species_cmd .= " ; ".$SCRIPTS."/text-to-html";
      $species_cmd .= " -i ".$outfile{species};
      $species_cmd .= " -o ".$outfile{species_html};

      ## File with the first column (species names) only
      $species_cmd .= " ; grep -v '^#' ".$outfile{species};
      $species_cmd .= " | cut -f 1";
      $species_cmd .= " > ".$outfile{species_names};
    } elsif ($ref_org_file) {
      ## User-provided list of organisms: copy it to the species names
      ## file, unless it is already that file.
      if ($ref_org_file ne $outfile{species_names}) {
        $species_cmd = "cp ".$ref_org_file." ".$outfile{species_names};
      } else {
        $species_cmd = "echo";
      }
    }
    &one_command($species_cmd, 1, 0, task=>"species");
    &RSAT::message::Info("species file (tab)", $outfile{species}) if ($main::verbose >= 2);
    &RSAT::message::Info("species file (html)", $outfile{species_html}) if ($main::verbose >= 2);
  } else {
    &RSAT::message::TimeWarn("Skipping species selection task") if ($main::verbose >= 2);
  }


  ## Report the number of selected species (lines of the species names
  ## file that start neither with ';' nor with '#')
  $species_nb = `grep -v '^;' $outfile{species_names} | grep -v '^#' | wc -l`;
  chomp $species_nb;
  &RSAT::message::Info("Selected", $species_nb, "species") if ($main::verbose >= 1);
  
  ################################################################
  ## Putative orthologs: bidirectional best hits (BBH) between the
  ## query organism and each selected species
  if ($task{bbh}) {
    if ($main::verbose >= 2) {
      &RSAT::message::TimeWarn("Detecting all orthologs (BBH)", $organism, $taxon);
    }

    ## Assemble the get-orthologs command from its successive options
    ## (the option -taxon is not used: species are passed as a list)
    my $bbh_cmd = join(" ",
                       $SCRIPTS."/get-orthologs -v 2",
                       "-all",
                       "-org", $organism,
                       "-org_list", $outfile{species_names},
                       "-uth rank 1",
                       "-lth ali_len", $lth{ali_len},
                       "-lth bit_sc", $lth{bit_sc},
                       "-lth ident", $lth{ident},
                       "-uth e_value", $uth{expect},
                       "-return e_value,bit_sc,ident,ali_len",
                       "-o", $outfile{bbh});
    &one_command($bbh_cmd, 1, 0, task=>"bbh");
  } elsif ($main::verbose >= 2) {
    &RSAT::message::TimeWarn("Skipping detection of all orthologs (BBH)", $organism, $taxon);
  }

  ## Count the BBH relationships (across all reference species): rows
  ## of the BBH file starting neither with ';' nor with '#'
  if (-e $outfile{bbh}) {
    if ($main::verbose >= 2) {
      &RSAT::message::Info("BBH file", $outfile{bbh});
    }
    my $bbh_count_cmd = "grep -v '^;' ".$outfile{bbh}." | grep -v '^#' | wc -l";
    $bbh_nb = `$bbh_count_cmd`;
    chomp $bbh_nb;
    if ($main::verbose >= 1) {
      &RSAT::message::Info("Selected", $bbh_nb, "BBH across", $species_nb, "species");
    }
  }

  ################################################################
  ## Convert the ortholog (BBH) table into a series of profile tables:
  ## IDs of the putative orthologs, Boolean, e-value and significance.
  if ($task{profiles}) {
    &RSAT::message::TimeWarn("Converting ortholog table to profiles") if ($main::verbose >= 2);

    ## Gene ID table
    &RSAT::message::TimeWarn("Computing phylogenetic profiles (gene IDs) from BBH") if ($main::verbose >= 2);
    my $command = $SCRIPTS."/convert-classes -v 2 -i ".$outfile{bbh};
    $command .= " -from tab -to profiles";
    $command .= " -ccol 2 -mcol 3 -scol 1 -null '".$null."'";
    $command .= "| grep -v '^;'";
    $command .= "> ".$outfile{profiles_ids};
    &RSAT::message::TimeWarn("ID profiles", $outfile{profiles_ids}) if ($main::verbose >= 2);
    &one_command($command, 1, 0, task=>"profiles");
    &RSAT::message::Info("Phylogenetic profiles by IDs", $outfile{profiles_ids}) if ($main::verbose >= 2);

    ## Convert ortholog table into a Boolean profile table, then add
    ## the gene names in a second file
    &RSAT::message::TimeWarn("Computing Boolean phylogenetic profiles from BBH") if ($main::verbose >= 2);
    $command = $SCRIPTS."/convert-classes -v 2 -i ".$outfile{bbh};
    $command .= " -from tab -to profiles";
    $command .= " -ccol 2 -mcol 3  -null 0"; ## For Boolean profiles, we want to have 0 for null values
    $command .= " | grep -v '^;'";
    $command .= " > ".$outfile{profiles_boolean};
    $command .= " ; add-gene-info -org ".${organism};
    $command .= " -before -i ".$outfile{profiles_boolean};
    $command .= " -info name -o ".$outfile{profiles_boolean_names};
    &RSAT::message::TimeWarn("Boolean profiles", $outfile{profiles_boolean}) if ($main::verbose >= 2);
    &RSAT::message::TimeWarn("Boolean profiles with names", $outfile{profiles_boolean_names}) if ($main::verbose >= 2);
    &one_command($command, 1, 0, task=>"profiles");

    ## Convert ortholog table into a e-value (BLAST expect) profile
    ## table, then add the gene names in a second file
    &RSAT::message::TimeWarn("Computing e-value (BLAST expect) phylogenetic profiles from BBH") if ($main::verbose >= 2);
    $command = $SCRIPTS."/convert-classes -v 2 -i ".$outfile{bbh};
    $command .= " -from tab -to profiles";
    $command .= " -ccol 2 -mcol 3 -scol 4 -null 0";
    $command .= " -inf ".$inf;
    $command .= " | grep -v '^;'";
    $command .= " > ".$outfile{profiles_evalues};
    $command .= " ; add-gene-info -org ".${organism};
    $command .= " -before -i ".$outfile{profiles_evalues};
    $command .= " -info name -o ".$outfile{profiles_evalues_names};
    &RSAT::message::TimeWarn("e-values profiles", $outfile{profiles_evalues}) if ($main::verbose >= 2);
    &RSAT::message::TimeWarn("e-values profiles with names", $outfile{profiles_evalues_names}) if ($main::verbose >= 2);
    &one_command($command, 1, 0, task=>"profiles");

    ## Convert ortholog table into a significance profile table, where
    ## sig = -log10(expect). The significance is appended by awk as a
    ## 5th column of the BBH table, which serves as score column.
    &RSAT::message::TimeWarn("Computing significance (-log10 of BLAST expect) phylogenetic profiles from BBH") if ($main::verbose >= 2);
    $command = 'awk -F\'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"(-log($4)/log(10))}\' '.$outfile{bbh};
    $command .= " | ".$SCRIPTS."/convert-classes -v 2";
    $command .= " -from tab -to profiles";
    $command .= " -ccol 2 -mcol 3 -scol 5 -null 0";
    $command .= " -inf ".$inf;
    $command .= " | grep -v '^;'";
    $command .= " > ".$outfile{profiles_sig};
    $command .= " ; add-gene-info -org ".${organism};
    $command .= " -before -i ".$outfile{profiles_sig};
    $command .= " -info name -o ".$outfile{profiles_sig_names};
    &RSAT::message::TimeWarn("sig profiles", $outfile{profiles_sig}) if ($main::verbose >= 2);
    &RSAT::message::TimeWarn("sig profiles with names", $outfile{profiles_sig_names}) if ($main::verbose >= 2);
    &one_command($command, 1, 0, task=>"profiles");

  } else {
    &RSAT::message::TimeWarn("Skipping conversion from bbh tables to profiles") if ($main::verbose >= 2);
  }

  ################################################################
  ## Co-occurrence network
  if ($task{network}) {

    ## Table of gene names: first column of the CDS table of the query
    ## organism, completed by add-gene-info
    if ($main::verbose >= 2) {
      &RSAT::message::TimeWarn("Getting gene names");
    }
    my $cds_table = $ENV{RSAT}."/public_html/data/genomes/".$organism."/genome/cds.tab";
    $command = join("",
                    "grep -v '^--' ", $cds_table,
                    " | cut -f 1 ",
                    " | add-gene-info  -info name -org ", $organism,
                    " -o ", $outfile{gene_names});
    if ($main::verbose >= 2) {
      &RSAT::message::TimeWarn("Gene names", $outfile{gene_names});
    }
    &one_command($command, 1, 0, task=>"network");

    ## Compare the profiles with compare-classes. The same lower
    ## threshold ($min_species) is imposed on Q, R and QR.
    if ($main::verbose >= 2) {
      &RSAT::message::TimeWarn("Extracting co-occurrence network from phylogenetic profiles");
    }
    $min_species = 5;
    $command = join("",
                    "grep -v '^;' ", $outfile{bbh},
                    " | grep -v '^#'",
                    " | awk '{print \$2\"\t\"\$3\"\t\"\$4}'",
                    " | compare-classes -v 2 -i /dev/stdin -sc 3",
                    " -return occ,freq,proba,entropy,jac_sim,rank -sort sig ",
                    " -triangle -distinct ",
                    " -lth sig 0 -lth Q ", $min_species,
                    " -lth R ", $min_species,
                    " -lth QR ", $min_species,
                    " -rnames ", $outfile{gene_names},
                    " -qnames ", $outfile{gene_names},
                    " -o ", $outfile{cooccurrence_table});
    &one_command($command, 1, 0, task=>"network");
    if ($main::verbose >= 2) {
      &RSAT::message::Info("Co-occurrence table", $outfile{cooccurrence_table});
    }

    ## HTML version of the tab-delimited co-occurrence table
    $command = join("",
                    "text-to-html -i ", $outfile{cooccurrence_table},
                    " -o ", $outfile{cooccurrence_table_html});
    &one_command($command, 1, 0, task=>"network");
    if ($main::verbose >= 2) {
      &RSAT::message::Info("Co-occurrence table", $outfile{cooccurrence_table_html});
    }

    ## Look for the number of the significance column in the comment
    ## lines of the co-occurrence table
    my $sig_col_cmd = "grep -P '^;\\s+\\d+\\s+sig' $outfile{cooccurrence_table} | cut -f 2";
    my $sig_col =`$sig_col_cmd`;
    chomp($sig_col);

    ## Safety net: when no valid column number was found in the file,
    ## fall back on the column supposed to contain the sig values
    if (!(&IsNatural($sig_col)) || !($sig_col > 0)) {
      $sig_col = 24;
    }

    ## Options shared by all the calls to convert-graph
    my $convert_graph_options = join("",
                                     " -from tab -scol 3 -tcol 4",
                                     " -wcol $sig_col -ewidth -ecolors fire ");

    ## Graph of the complete co-occurrence table
    if ($main::verbose >= 2) {
      &RSAT::message::TimeWarn("Generating network graph	SIG_COL=".$sig_col);
    }
    $command = join("",
                    "convert-graph ", $convert_graph_options,
                    " -i ", $outfile{cooccurrence_table},
                    " -to gml -o ", $outfile{cooccurrence_graph_gml});
    &one_command($command, 1, 0, task=>"network");
    if ($main::verbose >= 2) {
      &RSAT::message::Info("Co-occurrence graph (gml)", $outfile{cooccurrence_graph_gml});
    }

    ## One sub-network per threshold on co-occurrence significance:
    ## rows are filtered with awk on the sig column before conversion.
    ## Each sub-network file is registered in %outfile and @outfiles.
    foreach my $sig_threshold (@cooc_sig_thresholds) {
      my $subgraph_key = 'cooccurrence_graph_sig'.$sig_threshold.'_gml';
      $outfile{$subgraph_key} = $prefix{cooccurrence}."_sig".$sig_threshold.".gml";
      push @outfiles, $subgraph_key;
      if ($main::verbose >= 2) {
        &RSAT::message::TimeWarn("Selecting sub-network  with co-occurrence significance >= ".$sig_threshold);
      }
      my $subgraph_cmd = join("",
                              "awk '\$", $sig_col, " >= ", $sig_threshold, "'",
                              " ", $outfile{cooccurrence_table},
                              " | convert-graph ", $convert_graph_options,
                              " -to gml -o ", $outfile{$subgraph_key});
      &one_command($subgraph_cmd, 1, 0, task=>"network");
      if ($main::verbose >= 2) {
        &RSAT::message::Info("Co-occurrence subgraph (gml)", "hypergeometrix sig >=".$sig_threshold, $outfile{$subgraph_key});
      }
    }
  }

  ################################################################
  ## Generate the HTML index of input/output files, then close the
  ## HTML page
  if ($task{index}) {
    &IndexFiles();
    foreach my $closing_tag ("<hr>", "</body>", "</html>") {
      print $main::html_index $closing_tag;
    }
    close $main::html_index;
    &RSAT::message::TimeWarn("Index file", $outfile{html_index});
  }

  ################################################################
  ## Report execution time and close output stream

  ## The execution time report is computed in all cases, but only
  ## printed if verbosity is specified
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  if ($main::verbose >= 1) {
    print $out $exec_time;
  }
  if ($outfile{output}) {
    close $out;
  }

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message: the POD documentation of this script,
## formatted by pod2text. The program exits after printing the help.
sub PrintHelp {
  ## List form of system(): the script path ($0) is passed to pod2text
  ## as a single argument without going through the shell, so that
  ## paths containing spaces or shell metacharacters are supported.
  system("pod2text", "-c", $0);
  exit();
}

################################################################
## Display the help message for the option -help. There is no
## separate short message: this delegates to PrintHelp().
sub PrintOptions {
  return &PrintHelp();
}

################################################################
## Read arguments
##
## Parse a copy of @ARGV and store the option values in the variables
## of the main package ($main::verbose, $main::organism, $main::taxon,
## $main::ref_org_file, $main::taxonomic_depth, $dir{output}, %task,
## %lth, %uth, $main::null, $main::inf). The POD documentation of each
## option is interleaved with the code that parses it.
##
## An unknown option, an unsupported task, an unsupported threshold
## field or an invalid depth is reported with a fatal error.
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      ## The level is optional: -v alone means verbosity 1
      if (&IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();



=pod

=item B<-org query_organism>

Name of the query organism (see I<supported-organisms> to get a full
list).

=cut

    } elsif ($arg eq "-org") {
      $main::organism = shift (@arguments);


=pod

=item B<-taxon ref_taxon>

Reference taxon. Orthologs are returned for each supported organism
belonging to the reference taxon.

=cut

    } elsif ($arg eq "-taxon") {
      $main::taxon = shift (@arguments);

=pod

=item B<-ref_org_file ref_org_file>

User-provided list of reference species. Orthologs are returned for
each supported organism belonging to this list.

=cut

    } elsif ($arg eq "-ref_org_file") {
      $main::ref_org_file = shift (@arguments);


=pod

=item B<-depth taxonomic_depth>

Taxonomic depth, i.e. the depth to be traversed across the taxonomic
tree in order to select "non-redundant" species.

=cut

    } elsif ($arg eq "-depth") {
      $main::taxonomic_depth = shift (@arguments);
      &RSAT::error::FatalError($main::taxonomic_depth, "Invalid value for depth:  must be a Natural number.")
        unless (&IsNatural($main::taxonomic_depth));


=pod

=item	B<-o output_dir>


Output directory. I<phylo-profiles> exports several output files in
the output directory.

=cut
    } elsif ($arg eq "-o") {
      $dir{output} = shift(@arguments);

=pod

=item B<-task task1,task2,...>

Supported tasks:

=over

=item I<all>

Run all tasks. This is the default usage mode of I<phylo-profiles>.

=item I<species>

Select the list of species at a given depth of the taxonomic tree.

=item I<bbh>

Detect all bidirectional best hits (BBH) between each protein of the
query genome and each genome of the reference taxon.

The result is a table with one row per BBH, with different scores.

=item I<profiles>

Convert the BBH to phylogenetic profile tables.  A phylogenetic
profile table contains 1 row per protein-coding gene of the query
organism, and one column per genome of the reference taxon. This
program exports a series of separate phylogenetic profiles whose cell
values indicate various statistics.

=item I<network>

Generate a co-occurrence network from the phylogenetic profiles. The
network is built by computing the similarity between profiles of each
gene pair. The network contains one node per gene, and one edge is
instantiated for each gene pair passing a threshold on the selected
similarity metrics.

=item I<index>

Generate the HTML index file.

=back

=cut
    } elsif ($arg eq "-task") {
      ## Comma-separated list of tasks; empty items are ignored
      my @requested_tasks = split ",", shift (@arguments);
      foreach my $task (@requested_tasks) {
        next unless $task;
        if ($supported_task{$task}) {
          $task{$task} = 1;
        } else {
          &RSAT::error::FatalError("Task '$task' is not supported. \n\tSupported: $supported_tasks");
        }
      }


=pod

=item B<-lth field lower_threshold>

Lower threshold value on the specified field.

Supported threshold values: ali_len,ident,e_value,bit_sc

Example: 
    -lth ali_len 30 -uth e_value 1e-10 -lth bit_sc 50 -lth ali_len 50

=item B<-uth field upper_threshold>

Upper threshold value on the specified field. The supported fields are
the same as for the option I<-lth>.

=cut

      #### threshold values
    } elsif ($arg eq "-lth") {
      my $field = shift(@arguments);
      my $value = shift(@arguments);
      &RSAT::error::FatalError("Invalid threshold criterion\t".$field)
        unless ($supported_threshold_field{$field});
      $lth{$field} = $value;

    } elsif ($arg eq "-uth") {
      my $field = shift(@arguments);
      my $value = shift(@arguments);
      &RSAT::error::FatalError("Invalid threshold criterion\t".$field)
        unless ($supported_threshold_field{$field});
      $uth{$field} = $value;


=pod

=item B<-null>

Null string used in the profile table of ortholog IDs for the
undefined class memberships (default "<NA>"). The Boolean, e-value and
significance profiles always use 0 for undefined memberships.

=cut
    } elsif ($arg eq "-null") {
      $main::null = shift (@arguments);

=pod

=item B<-inf>

Value to display in the significance table as replacement for the
infinite values (obtained e.g. from log(0)).

Default: 320.

=cut
    } elsif ($arg eq "-inf") {
      $main::inf = shift (@arguments);

    } else {
      ## Same error reporting function as for the other invalid arguments
      &RSAT::error::FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message: print on the output stream ($out) the command
## line, the program version, the non-empty tables of directories,
## input files and output files, and finally the threshold values.
sub Verbose {
  print $out "; phylo-profiles ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;

  ## Tables to report, each with its title. Empty tables are skipped.
  my @reported_tables = (
                         ["Directories", \%main::dir],
                         ["Input files", \%main::infile],
                         ["Output files", \%main::outfile],
                        );
  foreach my $table (@reported_tables) {
    my ($title, $table_ref) = @{$table};
    next unless (%{$table_ref});
    print $out "; ".$title."\n";
    foreach my $key (keys %{$table_ref}) {
      printf $out ";\t%-13s\t%s\n", $key, $table_ref->{$key};
    }
  }

  print $out &PrintThresholdValues();
}

__END__
