#!/usr/bin/perl -w
############################################################
#
# $Id: pathways-from-clusters,v 1.15 2012/09/12 08:19:47 rsat Exp $
#
############################################################


## use strict;

=pod

=head1 NAME

pathways-from-clusters

=head1 VERSION

$program_version

=head1 DESCRIPTION

Run pathway-extractor on clusters of elements (genes, proteins, EC
numbers or molecules).

The primary use of the tool is to extract metabolic pathways from
groups of functionally related genes (co-expression clusters, operons,
directons, clusters extracted from phylogenetic profiles, ...).

In the current version, pathways-from-clusters assumes that clusters
are made of genes only. Gene identifiers are mapped to reactions via a
GER file (Gene-EC-Reaction).

=head1 AUTHORS

Didier Croes (didier.croes@ulb.ac.be)

Revised by Jacques van Helden (van-helden.j@univmed.fr)

=head1 USAGE

pathways-from-clusters [-h] -i inputfile -o outputdirectory [-v #] -g graphfile -nnn networknodenames [-gnn gene2ec2reactions] [-ocol clustercolumn] [-ecg ECGenericity level]

=head1 INPUT FORMATS

=head2 Network file (option I<-g>)

Tab-delimited description of the graph. See pathway-extractor for
detailed description.

=head2 Seed mapping file

Tab-delimited description indicating the relationship between seed
names (e.g. gene names) and reaction IDs.  See pathway-extractor for
detailed description.

=head1 SEE ALSO

=over

=item I<pathway-extractor>

The program I<pathways-from-clusters> is a wrapper around
I<pathway-extractor>.

=back

=head1 WISH LIST


=cut


BEGIN {
    ## Append the "lib/" directory located next to this script to @INC,
    ## so that RSA.lib can be loaded. The pattern matches the trailing
    ## base name of $0; everything before the start of the match
    ## (offset $-[0]) is the directory prefix, which is empty when the
    ## script is invoked without any path.
    if ($0 =~ /([^(\/)]+)$/) {
        my $script_dir = substr($0, 0, $-[0]);
        push(@INC, $script_dir."lib/");
    }
}
require "RSA.lib";



################################################################
## Main package
##
## Workflow:
##  1. initialise the parameters and read the command-line options;
##  2. check the mandatory arguments and prepare the output directory;
##  3. read the cluster file and group the genes by cluster ID (the rows
##     do not need to be sorted by cluster);
##  4. for each cluster, write a seed file in the output directory and
##     run pathway-extractor once with that seed file.
package main;
{

  ################################################################
  ## Initialise parameters
  local $start_time = &RSAT::util::StartScript();
  $program_version = do { my @r = (q$Revision: 1.15 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

  ## Input files
  our %infile = ();
  $infile{gnn} = "METACYC_GPR_EC.tab"; # GPR Gene -> EC -> REACTION annotation file path. Default (METACYC_GPR_EC.tab)
  $infile{nnn} = "";		# Network node names file (option -nnn)
  $infile{graph} = "";		# File containing the graph
  our $operoncolnum = "3";	# Column containing the cluster name, starting from 0 (option -ocol)
  our %outfile = ();
  our %dir = ();
  our $ECgenericitylevel = "4";
  ## Default verbosity is 0 rather than an empty string: the value is
  ## appended after "-v" on the pathway-extractor command line, and an
  ## empty value would leave a bare "-v" followed by the next option.
  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;

  our $SCRIPTS = $ENV{RSAT}."/perl-scripts" unless ($SCRIPTS);

  ## Job management options
  local $job_prefix = "fpdisco";
  local $die_on_error = 1;
  local $batch = 0;
  local $dry = 0;

  ## Task management
  @supported_tasks = qw(clean);
  $supported_tasks = join ",", @supported_tasks;
  %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }
  %task = ();			## List of tasks to be executed

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## Mandatory input files. Some entries of %infile are initialised
  ## with an empty string, so testing defined() alone is not enough:
  ## an empty value also means that the option was not specified.
  my @mandatory_files = (
    [graph    => "You must define a graph file (option -g)"],
    [clusters => "You must define a cluster file (option -i)"],
    [gnn      => "You must define a gene node names file (option -gnn)"],
    [nnn      => "You must define a network node names file (option -nnn)"],
  );
  foreach my $mandatory (@mandatory_files) {
    my ($file_type, $message) = @{$mandatory};
    my $file = $infile{$file_type};
    unless (defined($file) && ($file ne "")) {
      &RSAT::error::FatalError($message);
    }
  }

  ## Output directory is a mandatory argument
  unless (defined($dir{output}) && ($dir{output} ne "")) {
    &RSAT::error::FatalError("You must define the output directory (option -o)");
  }

  ## output directory must be terminated by a /
  $dir{output} .= "/" unless ($dir{output} =~ /\/$/);

  ## Check if output directory exists. If not, create it.
  &RSAT::util::CheckOutDir($dir{output});

  ################################################################
  ## Open output stream
  $main::out = &OpenOutputFile($main::outfile{output});

  ################################################################
  ## Read input
  ($main::in) = &OpenInputFile($main::infile{clusters});

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose);
  &RSAT::message::Info("Output directory", $dir{output}) if ($main::verbose >= 2);

  ################################################################
  ## Parse the cluster file. Genes are grouped by cluster ID, so that
  ## the rows of the cluster file do not need to be sorted by cluster.
  ## Clusters are processed in the order of their first occurrence.
  my %genes_of_cluster = ();	## cluster ID -> reference to the list of genes
  my @cluster_ids = ();		## cluster IDs, in order of first occurrence
  while (my $line = <$main::in>) {
    chomp($line);
    next if ($line =~ /^#/); ## Skip header line
    next if ($line =~ /^;/); ## Skip comment lines
    next unless ($line =~ /\S/); ## Skip empty lines
    &RSAT::message::Debug($., $line) if ($main::verbose >= 10);

    my @fields = split(/\t/, $line);
    my $gene = $fields[0];
    my $cluster_id = $fields[$main::operoncolnum];

    ## A row without cluster column cannot be assigned to any cluster
    unless (defined($cluster_id) && ($cluster_id ne "")) {
      &RSAT::message::Warning("Skipping line", $., "no cluster name in column", $main::operoncolnum);
      next;
    }

    ## Use exists() rather than a truth test, since "0" is a valid cluster ID
    unless (exists($genes_of_cluster{$cluster_id})) {
      $genes_of_cluster{$cluster_id} = [];
      push(@cluster_ids, $cluster_id);
    }
    push(@{$genes_of_cluster{$cluster_id}}, $gene);
  }
  close $main::in if ($main::infile{clusters});

  ################################################################
  ## Launch pathway-extractor for each cluster separately
  foreach my $cluster_id (@cluster_ids) {
    my @genelist = @{$genes_of_cluster{$cluster_id}};

    ## The cluster ID is used in a file name and on a shell command
    ## line: replace spaces, semicolons, path separators and shell
    ## metacharacters by underscores.
    my $cluster_label = $cluster_id;
    $cluster_label =~ s/\s+/_/g;
    $cluster_label =~ s/\;+/_/g;
    $cluster_label =~ tr{/\\$`'"|&<>()*?}{_};

    ## Write the seed file (one gene per row) in the output directory
    my $seed_file = $main::dir{output}.$cluster_label."-seeds.tab";
    open(my $seed_handle, '>', $seed_file)
      or &RSAT::error::FatalError("Cannot write seed file", $seed_file, $!);
    print $seed_handle join("\n", @genelist), "\n";
    close($seed_handle)
      or &RSAT::error::FatalError("Cannot close seed file", $seed_file, $!);

    ## Run pathway-extractor with current seed set
    my $pathway_extractor_cmd = $SCRIPTS."/pathway-extractor";
    $pathway_extractor_cmd .= " -v ".$main::verbose;
    $pathway_extractor_cmd .= " -i ".$seed_file;
    $pathway_extractor_cmd .= " -g ".$main::infile{graph};
    $pathway_extractor_cmd .= " -gd ".$cluster_label;
    $pathway_extractor_cmd .= " -gnn ".$main::infile{gnn};
    $pathway_extractor_cmd .= " -nnn ".$main::infile{nnn};
    $pathway_extractor_cmd .= " -C -d -F 7 -T pathsUnion ";
    $pathway_extractor_cmd .= " -o ".$main::dir{output};
    $pathway_extractor_cmd .= " -ecg ".$ECgenericitylevel;
    $pathway_extractor_cmd .= " -process";

    ## TO DO WITH DIDIER: redirect STDERR from pathway-extractor to a
    ## log file in the output directory.

    &RSAT::message::Info("Pathway extractor command: ", $pathway_extractor_cmd);
    &doit($pathway_extractor_cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix);

    ## Clean temporary files if required (option -task clean). The seed
    ## file is kept in batch mode (the queued job still has to read it)
    ## and in dry mode (nothing has been executed).
    if ($task{clean} && !$batch && !$dry) {
      unlink($seed_file)
	or &RSAT::message::Warning("Cannot remove seed file", $seed_file, $!);
    }
  }

  ################################################################
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $main::out if ($main::outfile{output});
  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message
##
## Format the POD documentation of this script with pod2text, then exit.
## pod2text is called with a list of arguments (no shell involved), so
## that a script path containing spaces or shell metacharacters is
## passed unaltered.
sub PrintHelp {
    system("pod2text", "-c", $0);
    exit(0);
}

################################################################
## Display the help message for the option -help
##
## There is no separate short help: this sub delegates to PrintHelp(),
## which displays the full help message and exits.
sub PrintOptions {
    return PrintHelp();
}

################################################################
## Read arguments
##
## Parse the command-line options from a copy of @ARGV and store their
## values in the package variables of main (%main::infile, %main::dir,
## $main::verbose, $main::operoncolnum, $main::ECgenericitylevel,
## %main::task, $main::batch). An unknown option or an unsupported task
## triggers a fatal error. The POD blocks interleaved with the code
## document each option next to the code that handles it.
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Verbose mode. The verbosity level is optional: when I<-v> is not
followed by a natural number, the verbosity is set to 1.

=cut

    if ($arg eq "-v") {
      ## Only consume the next argument if it is a verbosity level,
      ## otherwise "-v" would swallow the following option.
      if (@arguments && ($arguments[0] =~ /^\d+$/)) {
        $main::verbose = shift (@arguments);
      } else {
        $main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut

    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut

    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-hp>

Display full PathwayInference help message

=cut

    } elsif ($arg eq "-hp") {
      system "java graphtools.algorithms.Pathwayinference -h";

=pod

=item B<-i inputfile>

Cluster file (tab-delimited, mandatory). The first column contains the
gene identifier, and the cluster name is read from the column specified
with the option I<-ocol>.

=cut

    } elsif ($arg eq "-i") {
      $main::infile{clusters} = shift(@arguments);

=pod

=item	B<-ocol #operon name col number>

Operon (cluster) name column number, starting from 0. Default: 3.

=cut

    } elsif ($arg eq "-ocol") {
      $main::operoncolnum = shift(@arguments);

=pod

=item	B<-gnn gene2nodenames file.tab>

GER file (Gene -> EC -> REACTION) annotation file path.
Default: METACYC_GPR_EC.tab.

=cut

    } elsif ($arg eq "-gnn") {
      $main::infile{gnn} = shift(@arguments);

=pod

=item	B<-nnn network node names file -> REACTION and compound annotation>

A gene annotation file with direct links from genes to reactions. Does
not rely on the EC number annotation.

=cut

    } elsif ($arg eq "-nnn") {
      $main::infile{nnn} = shift(@arguments);

=pod

=item	B<-g Graph file>

File containing the graph (network). See pathway-extractor for a
detailed description of the format.

=cut

    } elsif ($arg eq "-g") {
      $main::infile{graph} = shift(@arguments);

=pod

=item	B<-o output Directory>

Output directory (mandatory). The directory is created if it does not
exist.

=cut

    } elsif ($arg eq "-o") {
      $main::dir{output} = shift(@arguments);

=pod

=item	B<-ecg Max EC genericity>

EC max genericity level. Used to limit the level of genericity of EC
numbers when mapping.  The max genericity level may range from 0 to 4:
1.-.-.- (level 3) or 1.2.-.- (level 2) or 1.2.3.- (level 1) or 1.2.3.4
(level 0 = no generic EC).  A generic EC may contain a large number of
reaction nodes, which can drastically increase computational time while
giving little information.  Default is 4.

=cut

    } elsif ($arg eq "-ecg") {
      $main::ECgenericitylevel = shift(@arguments);

=pod

=item B<-task>

Specify a subset of tasks to be executed.

By default, the program runs all necessary tasks. However, in some
cases, it can be useful to select one or several tasks to be executed
separately.

Beware: task selection requires expertise, because most tasks depends
on the prior execution of some other tasks in the workflow. Selecting
tasks before their prerequisite tasks have been completed will provoke
fatal errors.

B<Default tasks>

=over

=item I<clean> (default)

Remove temporary files. This option is incompatible with the option
I<-batch>.

=back

=cut

    } elsif ($arg eq "-task") {
      my @requested_tasks = split ",", shift (@arguments);
      foreach my $task (@requested_tasks) {
        next unless $task;
        if ($supported_task{$task}) {
          $task{$task} = 1;
        } else {
          &RSAT::error::FatalError("Task '$task' is not supported. \n\tSupported: $supported_tasks");
        }
      }

=pod

=item B<-batch>

Generate one command per cluster, and post it on the queue of a PC
cluster.

=cut

    } elsif ($arg eq "-batch") {
      $main::batch = 1;

    } else {
      ## Same error routine as for the unsupported tasks above
      &RSAT::error::FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
##
## Print on the output stream ($main::out) the program name followed by
## the command-line arguments, the program version, and the lists of
## input and output files (one "type<TAB>file" row per entry). Each line
## starts with a semicolon. Entries are sorted by key, so that the
## report is identical from one run to the next.
sub Verbose {
    print $main::out "; pathways-from-clusters ";
    &PrintArguments($main::out);
    printf $main::out "; %-22s\t%s\n", "Program version", $program_version;

    my @sections = (
	["Input files", \%main::infile],
	["Output files", \%main::outfile],
    );
    foreach my $section (@sections) {
	my ($title, $files) = @{$section};
	next unless (%{$files});
	print $main::out "; ", $title, "\n";
	foreach my $key (sort keys %{$files}) {
	    ## Report an empty string for undefined entries
	    my $value = defined($files->{$key}) ? $files->{$key} : "";
	    printf $main::out ";\t%-13s\t%s\n", $key, $value;
	}
    }
}


__END__
