#!/usr/bin/perl -w
############################################################
#
# $Id: pathway_extractor.pl,v 1.17 2011/10/05 11:01:05 rsat Exp $
#
############################################################

## use strict;

=pod

=head1 Extract Pathway from gene list

=head1 VERSION 1.0

=head1 DESCRIPTION

This tool is a Perl Wrapper around the stand-alone application (PathwayInference) developed by
Karoline Faust.

The PathwayInference tool can also be used via the Web interface "Pathway Extraction"
(http://rsat.ulb.ac.be/neat).

The Perl wrapper performs several steps to enable the extraction of pathways from sets of
functionally related genes (e.g. co-expressed, co-regulated, members of the same operon, …).

1. Gene to reaction mapping. Identify the set of reactions ("seed reactions") catalyzed by the set of
input genes ("seed genes"). The mapping relies on a user-specified file describing the mapping of
genes to reactions (GPR, Gene-Protein-Reaction file).

2. Pathway extraction (=Pathway inference). PathwayInference takes as input a network (typically
a metabolic network made of compounds + reactions) and a set of "seed" nodes. The program
attempts to return a subnetwork that interconnects the seed nodes at a minimal "cost", where the
cost is a weighted sum of the intermediate compounds and reactions used to link the seed nodes
(Faust, et al., 2011; Faust and van Helden, 2011).

3. Annotation of the inferred pathway: identify the EC numbers, enzymes and genes associated to
each reaction (seed + inferred reactions) of the inferred pathway. This documentation relies on the
same GPR file as the gene to reaction mapping.

This implementation requires no database or Internet connection and works with local files only.
The PathwayInference tool wraps a number of algorithms to infer pathways: k shortest paths
(REA), kWalks and a hybrid approach combining both (Faust, et al., 2010). In addition, two Steiner
tree algorithms are available (Takahashi-Matsuyama and Klein-Ravi), each of them alone or in
combination with kWalks.

=head1 AUTHORS

The Java PathwayInference tool was developed by Karoline Faust. This Perl wrapper was developed by
Didier Croes. The doc was written by Didier Croes and Jacques van Helden.

=head1 REFERENCES

Faust, K., Croes, D. and van Helden, J. (2011). Prediction of metabolic pathways from genome-scale
metabolic networks. Biosystems.

Faust, K., Dupont, P., Callut, J. and van Helden, J. (2010). Pathway discovery in metabolic networks by
subgraph extraction. Bioinformatics 26, 1211-8.

Faust, K. and van Helden, J. (2011). Predicting metabolic pathways by sub-network extraction.
Methods in Molecular Biology in press, 15.

=head1 CATEGORY

Graph tool

=head1 USAGE

pathway_extractor [-h] [-hp] [-i inputfile] [-o outputdirectory] [-v] -g graphfile -a gene2ec2reactions [-d unique descriptor] [-t tempdir] [-show]

=head1 INPUT FORMAT

Warning: the same gene identifiers should be used in all input files.

=head2 1) List of seed genes (gene identifiers):

Warning: at least 2 of the seed genes must be mapped to the network described in the graph file
(see below), otherwise no pathway can be inferred. In this example we use gene IDs. Beware, the
gene IDs must be compatible with the genome version installed on RSAT. Depending on the organism
source, the IDs can correspond to RefSeq, EnsEMBL, or other references.

Example of seed gene file:

    NP_414739
    NP_414740
    NP_414741
    NP_416617
    NP_417417
    NP_417481
    NP_418232
    NP_418272
    NP_418273
    NP_418373
    NP_418374
    NP_418375
    NP_418376
    NP_418437
    NP_418443

----------------------------------------------------------------    

=head2 2) Graph file format:

See the help of the PathwayInference tool:

java graphtools.algorithms.Pathwayinference -h

The same result can be obtained by typing

pathway_extractor.pl -hp

---------------------------------------------------------------- 

=head2 3) GPR (Gene Protein Reaction) (option -a)

This file is used for the Gene to reaction mapping.
At least -a file is mandatory. This file maps Gene to reaction through EC numbers thus one gene can
be mapped to several reactions.
The three first columns are mandatory (gene_id, ec_number, reaction_id). Additional columns can be
provided for convenience, but are ignored.

=head3 Example of GPR file.

    gene_id ec_number       reaction_id     species_name    taxonomy_id     gene_name
    O22340  4.2.3.- RXN-10482       Abies grandis   46611   (4S)-limonene synthase
    O22340  4.2.3.- RXN-10483       Abies grandis   46611   (4S)-limonene synthase
    O22340  4.2.3.- RXN-10566       Abies grandis   46611   (4S)-limonene synthase
    O22340  4.2.3.- RXN-10567       Abies grandis   46611   (4S)-limonene synthase
    O22340  4.2.3.- RXN-10568       Abies grandis   46611   (4S)-limonene synthase
    O22340  4.2.3.- RXN-10600       Abies grandis   46611   (4S)-limonene synthase

---------------------------------------------------------------- 

=head1 EXAMPLES:

=head2 With an input file

=head3 Get methionine-related genes in Escherichia coli genome. This generates a file containing one line per gene and one column per attribute (ID, start, end, name, ...).

gene-info -org Escherichia_coli_K_12_substr__MG1655_uid57779 -feattype CDS -q met. -o met_genes.tab

=head3 Select the first column, containing gene Ids.

grep -v "^;" met_genes.tab | cut -f 1 > met_genes_IDs.txt

=head3 Extract a pathway connecting at best the reactions catalyzed by these gene products

pathway_extractor.pl -i met_genes_IDs.txt -g data/networks/MetaCyc/MetaCyc_directed_141.txt -a data/networks/MetaCyc/METACYC_GPR_EC_20110620.txt -o result_dir -t temp_dir

---------------------------------------------------------------- 

=head2 Using standard input

=head3 The script pathway_extractor.pl can also read its input from STDIN. This allows using it in a concatenation of commands. For example, all the commands above could be combined in a single pipeline as follows.

gene-info -org Escherichia_coli_K_12_substr__MG1655_uid57779 -feattype CDS -q met. | grep -v "^;" | cut -f 1 | pathway_extractor.pl -g data/networks/MetaCyc/MetaCyc_directed_141.txt -a data/networks/MetaCyc/METACYC_GPR_EC_20110620.txt -o result_dir -t temp_dir

---------------------------------------------------------------- 

=head1 OUTPUT FILES:

*_converted_seeds.txt : pathway inference input file.

*_pred_pathways.txt : result graph file of the pathway inference

*_annot_pred_pathways.txt : result file of the pathway inference with gene and EC annotation

*_annot_pred_pathways.dot : result file in dot format, which can be converted to a figure using the
automatic layout program dot (included in the software suite graphviz).

*_annot_pred_pathways.png : png image file of the inferred pathway

----------------------------------------------------------------

=cut


BEGIN {
    ## Extend the library search path before RSA.lib is required:
    ## first the lib/ directory located next to this script, then the
    ## library directory of the RSAT installation ($RSAT environment variable).
    if ($0 =~ /([^(\/)]+)$/) {
        ## Everything preceding the script name, i.e. its directory
        ## (with the trailing slash) or the empty string.
        my $script_dir = $`;
        push(@INC,
             $script_dir . "lib/",
             $ENV{RSAT} . "/perl-scripts/lib/");
    }
}
require "RSA.lib";



################################################################
## Main package
package main;
{

  ################################################################
  ## Main program flow
  ##   1. read the options and derive the names used in the output files
  ##   2. map the seed genes to reactions with the GPR (and optional GR) file
  ##   3. write the seed file and run PathwayInference (java) on it
  ##   4. annotate the inferred pathway, convert it to dot format and
  ##      draw it as a png image

  ################################################################
  ## Initialise parameters
  #
  local $start_time = &RSAT::util::StartScript();
  $program_version = do { my @r = (q$Revision: 1.17 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  %main::infile = ();	     # File name containing a list of genes ID
  $main::outdir = ".";				# output directory
  $main::tempdir= "";				# temporary directory

  $main::verbose = "";
  $main::in = STDIN;
  $main::out = STDOUT;

  $main::gprfile ="METACYC_GPR_EC.tab";	# GPR Gene -> EC -> REACTION annotation file path. Default (METACYC_GPR_EC.tab)
  $main::grfile="";		# GR Gene -> REACTION annotation
  $main::graph = "";		# Graph Name
  $main::graphfile="";		# File containing the graph
  $main::show = 0;			# Open png image in gwenview
  $main::groupdescriptor=""; # Unique name to differentiate output files

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## The group descriptor prefixes every output file. When it is not
  ## given with -d, it is derived from the input file name, or set to
  ## "stdin" when the genes are read from the standard input.
  if (defined $main::infile{input}) {
    ($main::in) = &OpenInputFile($main::infile{input});
    if (!$main::groupdescriptor) {
      $main::groupdescriptor = $main::infile{input};
      $main::groupdescriptor =~ s{.*/}{};     # removes path
      $main::groupdescriptor=~ s{\.[^.]+$}{}; # removes extension
    }
  } else {
    if (!$main::groupdescriptor) {
      $main::groupdescriptor ="stdin";
    }
  }
  $main::groupdescriptor=~s/(\s|\(|\))+/_/g;

  ## The graph name defaults to the graph file name, without path and extension
  if (!$main::graph) {
    $main::graph = $main::graphfile;
    $main::graph =~ s{.*/}{};	    # removes path
    $main::graph=~ s{\.[^.]+$}{};   # removes extension
  }
  $main::graph=~s/(\s|\(|\))+/_/g;

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose);

  ################################################################
  ## Execute the command

  ## Check the existence of the output directory and create it if
  ## required
  if (!($outdir=~m/\/$/)) {
    $outdir = $outdir."/";
  }
  &RSAT::util::CheckOutDir($outdir);

  ## Check the existence of the temp directory and create it if
  ## required. Without -t the temporary files go to the output
  ## directory: an empty value would become "/" once the trailing
  ## slash is appended below.
  if (!$tempdir) {
    $tempdir=$outdir;
  }
  if (!($tempdir=~m/\/$/)) {
    $tempdir = $tempdir."/";
  }
  &RSAT::util::CheckOutDir($tempdir);

  my $organism = "Unknown";
  my $organismid;
  my $genesid;
  my @genesidlist;
################################################################
# GPR Mapping
  @genesidlist = <$main::in>;
  close $main::in if ($main::infile{input});
  chomp(@genesidlist);

  ## Trim the identifiers and discard blank lines: an empty identifier
  ## would produce the awk pattern "^", which matches every line of the
  ## annotation file.
  foreach my $geneid (@genesidlist) {
    $geneid =~ s/^\s+//;
    $geneid =~ s/\s+$//;
  }
  @genesidlist = grep { length($_) > 0 } @genesidlist;
  unless (@genesidlist) {
    print STDERR "NOT ENOUGH SEEDS. Min 2. I stop here!!\n";
    exit(1);
  }

  ## NOTE(review): the gene identifiers and the file names are
  ## interpolated as is in shell/awk commands; identifiers containing
  ## quotes or regular expression metacharacters are not escaped.
  $genesid = (join "|^",@genesidlist );
  my @grconversiontable;
  my %genelist=();
  if ($grfile) {## if annotated gene to reaction file (-b)
    my $seed_converter_cmd = "awk -F'\\t+' '\$1~\"^".$genesid."\" {print \$2\"\\t\"\$1\"\\t\"\$3\"\\t\"\$4\"\\t\"\$5}' $grfile";
    @grconversiontable = qx ($seed_converter_cmd) ;
    chomp(@grconversiontable);
    @grconversiontable = sort(@grconversiontable);
  }

  ## Rows of the conversion table: column 2 of the GPR file, followed by its columns 1, 3, 4 and 5
  my $seed_converter_cmd = "awk -F'\\t+' '\$1~\"^".$genesid."\" {print \$2\"\\t\"\$1\"\\t\"\$3\"\\t\"\$4\"\\t\"\$5}' $gprfile";
  print $seed_converter_cmd."\n";

  my @conversiontable = qx ($seed_converter_cmd) ;
  chomp(@conversiontable);
  @conversiontable =sort(@conversiontable);

  # getting organism information from the first matching row, if any
  my @tempdata = @conversiontable ? split(/\t/,$conversiontable[0]) : ();
  $organism = $tempdata[3] if (defined $tempdata[3]);
  $organismid = defined $tempdata[4] ? $tempdata[4] : "";
  my $groupid=( join "-",$organism,$organismid);
  $groupid=~s/(\s|\(|\))+/_/g;

#  merging GR and GPR in one array
  if (@grconversiontable) {
    foreach my $val (@conversiontable) {
      @tempdata = split(/\t/,$val);
      next unless (defined $tempdata[0]);
      ## \Q...\E: the first column (EC number in the GPR example) contains
      ## dots and dashes that must be matched literally.
      if (!grep(/^\Q$tempdata[0]\E/, @grconversiontable)) {
        push(@grconversiontable, grep(/^\Q$tempdata[0]\E/, @conversiontable))
      }
    }
    @conversiontable=@grconversiontable;

  }
#  End of merging GR and GPR in one array

# End of GPR mapping
################################################################

################################################################
# Creating reaction file for pathway inference
  my $seed_converter_filename = $outdir.(join "_",$main::groupdescriptor,$groupid,$graph, "_converted_seeds.txt");
  open (my $seed_fh, '>', $seed_converter_filename)
    or die "Cannot write seed file $seed_converter_filename: $!";
  print $seed_fh "# Batch file created" . qx("date");
  print $seed_fh "# GENE groups parsed from file:". "\n";
  print $seed_fh "# Organism: ". $organism. "\n";
  print $seed_fh "# EC number grouping: true". "\n";
  my $seednum= 0;
  my @previousarray;
  foreach my $val (@conversiontable) {

    @tempdata = split(/\t/,$val);
    ## The table is sorted on its first column: a change of value closes
    ## the previous group, and each group counts as one seed.
    if (@previousarray && !($tempdata[0] eq $previousarray[0])) {
      print $seed_fh "$previousarray[0]\t$groupid\n";
      $seednum++;
    }
    print $seed_fh $tempdata[2] .">\t".$tempdata[2]. "\n";
    print $seed_fh $tempdata[2] ."<\t".$tempdata[2]. "\n";
    print $seed_fh $tempdata[2] ."\t".$tempdata[0]. "\n";
    @previousarray = @tempdata;
  }

  if (@conversiontable) {
    print $seed_fh "$previousarray[0]\t$groupid\n";
    $seednum++;
  }
  ## The seed file must be complete on disk before PathwayInference reads it
  close ($seed_fh) or die "Cannot close seed file $seed_converter_filename: $!";
# END OF  Creating reaction file for pathway inference
################################################################

################################################################
# Running pathwayinference
  if ($seednum > 1) {
    my $predicted_pathway_filename = $outdir.(join "_",$main::groupdescriptor, $groupid, $graph, "_pred_pathways.txt");
    my $pathway_infer_cmd = "java -Xmx1000M graphtools.algorithms.Pathwayinference -i $seed_converter_filename -m 5 -C -f flat -n -p $main::tempdir -E $outdir -b -d -g $graphfile -y con $main::verbose -o $predicted_pathway_filename";
    print $pathway_infer_cmd."\n";
    print $predicted_pathway_filename."\n";
    system $pathway_infer_cmd;

# END of Running pathwayinference
################################################################

################################################################
# Loading results graph reactions
    open (my $pathway_fh, '<', $predicted_pathway_filename)
      or die "Cannot read inferred pathway file $predicted_pathway_filename: $!";
    my $i = 0;
    my $stop = "";
    my $line;
    my $reactionquery="";
    my $cpdquery="";
    while ($line=<$pathway_fh>) {
      chomp  ($line );
      if (length($line)>0 && !($line=~m/^;/)) {
        my @tempdata = split(/\t/,$line);

        ## The 7th column gives the node type
        if ($tempdata[6] &&($tempdata[6] eq "Reaction")) {
          $tempdata[0]=~s/<$|>$//; # remove the direction suffix of the reaction node
          $i++;
          $reactionquery = $reactionquery."(\$3~\"^".$tempdata[0]."\"&&\$5~\"".$organismid."\")||";
        } elsif ($tempdata[6] &&($tempdata[6] eq "Compound")) {
          $cpdquery = $cpdquery."(\$1~\"^".$tempdata[0]."\"&&\$2~\"Compound\")||";
        }
      } elsif ($i>0) {
        ## first empty or comment line after at least one reaction: stop reading
        last;
      }

    }
    close ($pathway_fh);
# End of Loading results graph reactions
################################################################

################################################################
# Searching all reactions information for the reaction that are in  the inferred pathway graph
    $reactionquery =~s/\|+$//;
    $cpdquery =~s/\|+$//;
    my $command_ = "awk -F'\\t+' ' $reactionquery {print \$3\"\\t\"\$2\"\\t\"\$6}' $gprfile|sort|uniq";
    print "$command_\n";
    @conversiontable = qx ($command_);
    $command_ = "awk -F'\\t+' '$cpdquery {print \$1\"\\t\"\$4\"\\t\"\$1}' $graphfile";
    print "$command_\n";
    my @conversionreactiontable = qx ($command_);
    push (@conversiontable,@conversionreactiontable);

    chomp(@conversiontable);
    # Storing in a hash for faster search: node id => reference to the
    # list of rows (array references) starting with this id
    my %reactioninfos=();
    undef @previousarray;
    my @reacinfoarray=();
    foreach my $content (@conversiontable) {
      my @currentarray = split(/\t/,$content);
      if ( @previousarray && !($previousarray[0] eq $currentarray[0])) {
        my @truc = @reacinfoarray;
        $reactioninfos{$previousarray[0]}=\@truc;
        undef @reacinfoarray;
      }
      push (@reacinfoarray,\@currentarray);
      @previousarray = @currentarray;
    }
    ## Store the last group, unless no row was returned at all
    if (@previousarray) {
      $reactioninfos{$previousarray[0]}=\@reacinfoarray;
    }
# End of Searching all reactions information for the reaction that are in  the inferred pathway graph
################################################################

################################################################
# Adding description to the pathway graph
    open ($pathway_fh, '<', $predicted_pathway_filename)
      or die "Cannot read inferred pathway file $predicted_pathway_filename: $!";
    my $annot_graph_filename = $outdir.(join "_",$main::groupdescriptor, $groupid, $graph, "annot_pred_pathways.txt");
    open (my $annot_fh, '>', $annot_graph_filename)
      or die "Cannot write annotated pathway file $annot_graph_filename: $!";
    while (my $pathway_line = <$pathway_fh>) {
      chomp($pathway_line);
      my @tempdatab = split(/\t/,$pathway_line);
      if (length($pathway_line)==0 || $pathway_line=~ m/^;/) {
        ## empty and comment lines are copied as is
        print $annot_fh $pathway_line. "\n";
      } else {

        my $tempstring = $tempdatab[0];
        $tempstring=~s/<$|>$//;
        $tempdatab[0] = $tempstring;#remove directionality from reaction node id to merge the nodes
        my $values_ref = $reactioninfos{$tempstring};
        if (defined $values_ref) {
          my @values = @{$values_ref};
          if ($tempdatab[6] && ($tempdatab[6] eq "Reaction")) {

            ## Label of a reaction node: comma-terminated list of the
            ## third field of each annotated row, then the EC number and
            ## the reaction id of the first annotated row.
            my $label="";
            my $labelb;
            foreach my $info_ref (@values) {
              my @info = @{$info_ref};

              my( $reacid,$ec,$genename) = @info;
              if ($ec) {
                chomp($genename);
                $label=$label."$genename,";
                if (!defined $labelb) {
                  $labelb = "\\n$ec\\n($reacid)";
                }
              }
            }
            ## $labelb stays undefined when no row has an EC number
            $tempdatab[3] = $label.(defined $labelb ? $labelb : "");
          } elsif ($tempdatab[6] &&($tempdatab[6] eq "Compound")) {
            $tempdatab[3] =  $values[0][1];
          } else {
            ## neither Reaction nor Compound in column 7: the direction
            ## suffix of the second column is removed as well
            $tempdatab[1]=~s/<$|>$//;
          }
        }
        print $annot_fh join("\t",@tempdatab), "\n";
      }
    }
    close ($pathway_fh);
    ## The annotated graph must be complete on disk before convert-graph reads it
    close ($annot_fh) or die "Cannot close annotated pathway file $annot_graph_filename: $!";
# End of Adding description to the pathway graph
################################################################

################################################################
# Converting graph to dot graph format
    my $annot_graph_dot = $outdir.(join "_",$main::groupdescriptor, $groupid, $graph, "annot_pred_pathways.dot");
    my $convert_graph_cmd = "convert-graph -from path_extract -to dot -i $annot_graph_filename -o $annot_graph_dot";
    system  $convert_graph_cmd;
# End of Converting graph to dot graph format
################################################################

################################################################
# Converting dot graph to image with graphviz dot
    my $dpng_file = $outdir.(join "_",$main::groupdescriptor, $groupid, $graph, "annot_pred_pathways.png");
    my $graph_image_cmd = "dot -Tpng -Kdot -o $dpng_file $annot_graph_dot";
    system $graph_image_cmd;
    if ($main::show) {
      system "gwenview $dpng_file &";
    }
  } else {
    print STDERR "NOT ENOUGH SEEDS. Min 2. I stop here!!\n";
    exit(1);
  }
# End of Converting dot graph to image with graphviz dot
################################################################

  ################################################################
  ## Insert here output printing
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $main::out $exec_time if ($main::verbose); ## only report exec time if verbosity is specified
  close $main::out if ($main::outfile{output});

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################



################################################################
## Display full help message 
sub PrintHelp {
    ## Render the POD of this script with pod2text (-c: colored output)
    ## and terminate the program.
    ## The list form of system() bypasses the shell, so a script path
    ## containing spaces or shell metacharacters is passed intact.
    system("pod2text", "-c", $0);
    exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
    ## There is no separate short help: the full help is displayed,
    ## and PrintHelp terminates the program.
    PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()

  ## Return the value following an option. A missing value (option given
  ## as last word of the command line) is reported explicitly instead of
  ## silently storing an undefined value.
  my $option_value = sub {
    my ($option) = @_;
    unless (@arguments) {
      &FatalError(join("\t", "Missing value for pathway_extractor option", $option));
    }
    return shift(@arguments);
  };

  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v>

Verbose mode

=cut
    if ($arg eq "-v") {
      ## stored as the option string itself, because it is inserted as is
      ## in the PathwayInference command line
      $main::verbose = "-v";


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-hp>

Display full PathwayInference help message

=cut
    } elsif ($arg eq "-hp") {
      system("java graphtools.algorithms.Pathwayinference -h");
      print "\n";
      exit(0);

=pod

=item B<-show>

Execute gwenview to display the pathway results in png format

=cut
    } elsif ($arg eq "-show") {
      $main::show = 1;

=pod

=item B<-i inputfile>

If no input file is specified, the standard input is used.  This
allows using the command within a pipe.

=cut
    } elsif ($arg eq "-i") {
      $main::infile{input} = $option_value->($arg);

=pod

=item B<-a GPR Genes file Default (METACYC_GPR_EC.tab)>

GPR Gene -> EC -> REACTION annotation file path. Default (METACYC_GPR_EC.tab)

=cut
    } elsif ($arg eq "-a") {
      $main::gprfile = $option_value->($arg);

      ## Undocumented option (deliberately kept out of the POD):
      ##   -b GR Gene -> REACTION annotation
      ##   A gene annotation file with direct link gene to reaction.
      ##   Does not rely on the EC number annotation.
    } elsif ($arg eq "-b") {
      $main::grfile = $option_value->($arg);

=pod

=item B<-n Graph name>

Name of the Graph (default: Name of the graph file).
This option is currently disabled: the graph name is always derived
from the name of the graph file.

=item B<-d Unique descriptor>

Unique name to differentiate output files. If not set with -i, the name of the input file will be used.

=cut
#     } elsif ($arg eq "-n") {
#       $main::graph = shift(@arguments);
#
    } elsif ($arg eq "-d") {
      $main::groupdescriptor = $option_value->($arg);

=pod

=item B<-g Graph file>

File containing the graph, passed to PathwayInference.

=cut
    } elsif ($arg eq "-g") {
      $main::graphfile = $option_value->($arg);

=pod

=item B<-o output Directory>

Output directory. If not specified, the current directory is used.

=cut
    } elsif ($arg eq "-o") {
      $main::outdir = $option_value->($arg);

=pod

=item B<-t temp Directory>

Temporary directory passed to PathwayInference.

=cut
    } elsif ($arg eq "-t") {
      $main::tempdir = $option_value->($arg);

    } else {
      &FatalError(join("\t", "Invalid pathway_extractor option", $arg));

    }
  }

=pod

=back

=cut

}
################################################################
## Verbose message
sub Verbose {
    ## Report the command line, the program version and the input/output
    ## files on the output stream, as comment lines starting with ";".
    ## The header names this program (it previously printed the
    ## placeholder "template").
    print $main::out "; pathway_extractor ";
    &PrintArguments($main::out);
    printf $main::out "; %-22s\t%s\n", "Program version", $program_version;
    if (%main::infile) {
        print $main::out "; Input files\n";
        ## sorted keys: reproducible order from one run to the other
        foreach my $key (sort keys %main::infile) {
            printf $main::out ";\t%-13s\t%s\n", $key, $main::infile{$key};
        }
    }
    if (%main::outfile) {
        print $main::out "; Output files\n";
        foreach my $key (sort keys %main::outfile) {
            printf $main::out ";\t%-13s\t%s\n", $key, $main::outfile{$key};
        }
    }
}


__END__
