#!/usr/bin/env perl
############################################################
#
# $Id: process-pathwayextractor-output,v 1.2 2012/03/27 23:21:56 rsat Exp $
#
############################################################

## use strict;

=pod

=head1 Extract Pathway from gene list

=head1 VERSION 1.0

=head1 DESCRIPTION

This code processes the subgraph file produced by the pathway inference:
1. Annotation of the inferred pathway: identify the EC numbers,
enzymes and genes associated to each reaction (seed + inferred
reactions) of the inferred pathway. This documentation relies on the
same NNN and GNN file as the gene to reaction mapping.

2. Create a dot file and an image file

This implementation requires no database or Internet connection and
works with local files only.  The PathwayInference tool wraps a number
of algorithms to infer pathways: k shortest paths (REA), kWalks and a
hybrid approach combining both (Faust, et al., 2010). In addition, two
Steiner tree algorithms are available (Takahashi-Matsuyama and
Klein-Ravi), each of them alone or in combination with kWalks.

=head1 AUTHORS

The Java PathwayInference tool was developed by Karoline Faust. This
Perl wrapper was developed by Didier Croes. The doc was written by
Didier Croes and Jacques van Helden.

=head1 REFERENCES

Faust, K., Croes, D. and van Helden, J. (2011). Prediction of
metabolic pathways from genome-scale metabolic networks. Biosystems.

Faust, K., Dupont, P., Callut, J. and van Helden, J. (2010). Pathway
discovery in metabolic networks by subgraph extraction. Bioinformatics
26, 1211-8.

Faust, K. and van Helden, J. (2011). Predicting metabolic pathways by
sub-network extraction.  Methods in Molecular Biology in press, 15.

=head1 CATEGORY

Graph tool

=head1 USAGE

process-pathwayextractor-output [-h] [-i inputfile] [-o output_directory] [-v verbosity] \
    [-gnn gene2nodename] -nnn nodename2nodeid [-d] [-show]

=head1 INPUT FORMAT



----------------------------------------------------------------

=head2 Seed mapping file

The seed mapping file makes the link between different types of seeds
(genes, EC numbers, proteins, compound names) and nodes of the network
(reactions or compounds depending on the seed type).


=head3 Network Node Names (nnn) file (option I<-nnn>)

Mandatory.

The NNN file makes the link between EC numbers / reaction names / compound names and node IDs in the network.

These files are used for the reaction ids/compound ids to gene annotation (backward).


=head3 Example of NNN file

 #query  id      qualifier       name
1.-.-.- RXN1G-1486      EC      3-oxo-C78-α-mycolate-reductase
1.-.-.- RXN1G-1527      EC      3-oxo-C85-cis-methoxy-mycolate reductase
1.-.-.- RXN1G-1528      EC      3-oxo-C86-trans-methoxy-mycolate-reductase
10-deoxysarpagine       10-DEOXYSARPAGINE       compounds       10-deoxysarpagine
10-DEOXYSARPAGINE       10-DEOXYSARPAGINE       compounds       10-deoxysarpagine
 ...

=head3 Gene to Node Names (gnn) file (option I<-gnn>)

Not mandatory. If it is omitted, the query file or STDIN must contain only node names.

The GNN file makes the link between external identifiers (for example gene name, RefSeq, LocusLink) and node names.


=head3 Example of GNN file

#query	id	qualifier	name	taxonomy_id	species
aas	2.3.1.40	GENE_NAME	aas	83333	Escherichia coli (strain K12)
aas	6.2.1.20	GENE_NAME	aas	83333	Escherichia coli (strain K12)
aat	2.3.2.6	GENE_NAME	aat	83333	Escherichia coli (strain K12)

=head1 EXAMPLES

=head2 With an input file

=head3 Motivation

Get methionine-related genes in Escherichia coli genome. This
generates a file containing one line per gene and one column per
attribute (ID, start, end, name, ...).


=head3 Commands

Extract all E.coli genes whose name starts with met
 gene-info -org Escherichia_coli_GCF_000005845.2_ASM584v2 -feattype CDS -full -q '^met.*' -o met_genes.tab

Select the first column, containing gene Ids.
 grep -v "^;" met_genes.tab | cut -f 1 > met_genes_IDs.txt

Extract a pathway connecting at best the reactions catalyzed by these gene products
  pathway-extractor -i met_genes_IDs.txt \
     -g data/networks/MetaCyc/MetaCyc_directed_141.txt \
     -ger ${RSAT}/data/metabolic_networks/GER_files/GPR_Uniprot_112011_Escherichia_coli_K12.tab \
     -o result_dir \
     -t temp_dir

----------------------------------------------------------------

=head2 Using standard input

=head3 The script pathway-extractor can also read its input from STDIN. This allows using it in a concatenation of commands. For example, all the commands above could be combined in a single pipeline as follows.

 gene-info -org Escherichia_coli_GCF_000005845.2_ASM584v2 -feattype CDS -q 'met.*' \
   | grep -v "^;" | cut -f 1 \
   | pathway-extractor -g data/networks/MetaCyc/MetaCyc_directed_141.txt \
       -ger data/networks/MetaCyc/METACYC_GPR_EC_20110620.txt \
       -o result_dir -t temp_dir

----------------------------------------------------------------

=head1 OUTPUT FILES:

*_converted_seeds.txt : pathway inference input file.

*_pred_pathways.txt : result graph file of the pathway inference

*_annot_pred_pathways.txt : result file of the pathway inference with gene and EC annotation

*_annot_pred_pathways.dot : result file in dot format, which can be converted to a figure using the
automatic layout program dot (included in the software suite graphviz).

*_annot_pred_pathways.png : png image file of the inferred pathway

----------------------------------------------------------------

=cut


BEGIN {
    ## Extend the module search path before RSA.lib and the RSAT
    ## modules are loaded. The match isolates the last path component
    ## of $0, so that the prematch is the directory part of the script
    ## path (empty when the script is called without a directory).
    if ($0 =~ /([^(\/)]+)$/) {
	my $script_dir = $`;
	## Two candidate locations: lib/ next to the script, and the
	## perl-scripts/lib/ directory of the RSAT installation.
	push(@INC, map { $_ . "lib/" } ($script_dir, "$ENV{RSAT}/perl-scripts/"));
    }
}
require "RSA.lib";

use RSAT::PathwayExtraction;
################################################################
## Main package
##
## Set the default parameter values, read the command-line arguments
## and hand the input file, output directory and annotation files over
## to &RSAT::PathwayExtraction::ProcessOutputFiles().
package main;
{

  ################################################################
  ## Initialise parameters
  #
  local $start_time = &RSAT::util::StartScript();
  $program_version = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  ## Input/output files
  our %infile = ();	     # Input file names, filled in by &ReadArguments()
  our %outfile = ();

  ## Directories
  our %dir = ();
  $dir{output} = "."; # output directory
  $dir{temp}= "";     # temporary directory

  our $verbose = "";

  ## Default streams, stored as glob references rather than as the
  ## barewords STDIN/STDOUT: a bareword is only a symbolic handle name
  ## and stops working as soon as "use strict" is enabled.
  our $in = \*STDIN;
  our $out = \*STDOUT;

  $infile{gnn} = ""; # Gene to node names (GNN) file; empty unless -gnn is given
  $infile{nnn} = ""; # Network node names (NNN) file; empty unless -nnn is given
  $show = 0;		# Set to 1 by -show (open png image in gwenview)
#   $group_descriptor= ""; # Unique name to differenciate output files

  our $directed = "";	# Set to "-directed" by the option -d

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Process the inferred pathway file.
  ## NOTE(review): the POD declares -nnn mandatory, but no check is made
  ## here; confirm whether ProcessOutputFiles validates it.
  &RSAT::message::TimeWarn("processing files:Start") if ($verbose >= 2);
  &RSAT::PathwayExtraction::ProcessOutputFiles(
      $infile{input},
      $dir{output},
      $infile{gnn},
      $infile{nnn},$directed,$verbose);
  &RSAT::message::TimeWarn("processing files:End") if ($verbose >= 2);
  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################



################################################################
## Display full help message
##
## Render the POD documentation of this script with pod2text, then
## terminate the script.
sub PrintHelp {
  ## List form of system(): the script path is handed to pod2text as a
  ## single argument without going through a shell, so a path that
  ## contains spaces or shell metacharacters is still passed intact.
  system("pod2text", "-c", $0);
  exit();
}

################################################################
## Display short help message
##
## This script has no separate short help: the call is simply
## delegated to PrintHelp().
sub PrintOptions {
  return PrintHelp();
}

################################################################
## Read arguments
##
## Parse a private copy of @ARGV and store the option values in the
## package variables $verbose, $show, $directed, %infile and %dir.
## The options -h and -help call the help routines. An unknown option,
## or an option given without its value, aborts through &FatalError().
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v>

Verbose mode. The option may be followed by a natural number giving
the verbosity level; without a number the level is set to 1.

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$verbose = shift(@arguments);
      } else {
	$verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-show>

execute gwenview to display the pathway results in png format

=cut
    } elsif ($arg eq "-show") {
      ## NOTE(review): $show is set here but is not passed to
      ## ProcessOutputFiles by the main block; confirm that it is used.
      $show = 1;

=pod

=item B<-i inputfile>

If no input file is specified, the standard input is used.  This
allows to use the command within a pipe.

=cut
    } elsif ($arg eq "-i") {
      $infile{input} = &_NextOptionValue($arg, \@arguments);

=pod

=item	B<-gnn Gene Node Name file>

NODE NAME to GENE NAME/ID annotation file.

=cut
    } elsif ($arg eq "-gnn") {
      $infile{gnn} = &_NextOptionValue($arg, \@arguments);

=pod

=item	B<-nnn Network Node Name file>

NODE_ID to NODE NAME (NNN) annotation file.

=cut
    } elsif ($arg eq "-nnn") {
      $infile{nnn} = &_NextOptionValue($arg, \@arguments);


=pod

=item	B<-o output Directory>

If no output directory is specified, the current directory is used.

=cut
    } elsif ($arg eq "-o") {
      $dir{output} = &_NextOptionValue($arg, \@arguments);

=pod

=item	B<-d>

Directed display: the flag C<-directed> is passed on to the processing
of the output files. By default this flag is not set.

=cut
    } elsif ($arg eq "-d") {
      $directed = "-directed";

    } else {
      &FatalError(join("\t", "Invalid pathway_extractor option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Return the value that follows a value-taking option.
##
## Arguments: the option name (used in the error message) and a
## reference to the list of arguments that remain to be parsed; the
## value is removed from that list. Aborts through &FatalError() when
## the option is the last word of the command line, instead of silently
## storing an undefined value.
sub _NextOptionValue {
  my ($option, $remaining) = @_;
  unless (scalar(@{$remaining}) >= 1) {
    &FatalError(join("\t", "Option", $option, "requires a value"));
  }
  return shift(@{$remaining});
}
################################################################
## Verbose message
##
## Write a commented (";"-prefixed) report to the handle $out: the
## command-line arguments (through &PrintArguments()), the program
## version, and the content of %infile and %outfile. A section is
## skipped when its hash is empty.
sub Verbose {
    print $out "; template ";
    &PrintArguments($out);
    printf $out "; %-22s\t%s\n", "Program version", $program_version;

    ## Both file tables are reported with the same layout.
    foreach my $section (["Input files", \%infile], ["Output files", \%outfile]) {
	my ($title, $files) = @{$section};
	next unless (%{$files});
	print $out "; $title\n";
	foreach my $label (keys %{$files}) {
	    printf $out ";\t%-13s\t%s\n", $label, $files->{$label};
	}
    }
}


__END__
