#!/usr/bin/perl

############################################################
#
# $Id: gene-info,v 1.18 2011/07/26 11:10:09 jvanheld Exp $
#
# Time-stamp: <2003-07-06 20:30:09 jvanheld>
#
############################################################

if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";
require RSAT::util;
require RSAT::OrganismManager;
require RSAT::organism;
require RSAT::GenomeFeature;
use Storable qw(nstore retrieve);

## Temporary directory. The RSAT root is derived from the path of this
## script by cutting everything from "/perl-scripts" onwards; the
## temporary directory is public_html/tmp below that root.
my $RSAT = $0;
$RSAT =~ s{/perl-scripts.*}{};
my $TMP = "${RSAT}/public_html/tmp";

## Start time of the script (used at the end of the script to report
## the execution time)
local $start_time = &RSAT::util::StartScript();
$date = $start_time;

## Default parameter values, set before the command line is parsed
($orf_col, $locus_col) = (0, 2);
($full, $no_query, $match_description) = (0, 0, 0);

## Parse the command-line arguments
&ReadArguments();

################################################################
## Check parameter values

## Organism: validate the name given with -org, then create the object
## that will hold the features.
&RSAT::OrganismManager::check_name($organism_name);
my $organism = RSAT::organism->new();
$organism->set_attribute("name", $organism_name);
#&CheckOrganism($organism_name);

## Accepted feature types (collected from the option -feattype)
my @accepted_types = sort(keys(%accepted_feature_types));
$organism->DefineAcceptedFeatureTypes(@accepted_types);
## NOTE(review): the test ">= 0" is true for any non-negative verbosity,
## so this message is issued on every run -- confirm this is intended.
&RSAT::message::Info("Accepted feature types", @accepted_types) if ($main::verbose >= 0);

### Open the output stream
$out = &RSAT::util::OpenOutputFile($outputfile);

### Report the parameters (only does something in verbose mode)
&Verbose();

### Read ORF information.
### The organism object is cached on disk with Storable. The cache is
### reused only if it exists, no annotation table was specified, it was
### modified less than 2 days ago, and it is writable by the current user.
### NOTE(review): the cache file name depends only on the organism name;
### if the accepted feature types influence LoadFeatures(), a cache built
### with another -feattype selection would be reused -- TODO confirm.
### NOTE(review): $annotation_table and $imp_pos are not assigned anywhere
### in this file; presumably they are undefined unless set by a library.
my $serial_file = $TMP."/".$organism_name."1syn.serial";
if (-e $serial_file && !$annotation_table && -M $serial_file < 2 && -w $serial_file) {
  $organism = retrieve($serial_file);
} else {
  ### Read feature attributes (position, description, ...) and synonyms
  $organism->LoadFeatures($annotation_table, $imp_pos);
  $organism->LoadSynonyms();
  nstore($organism, $serial_file);

  ### Make the cache readable/writable by every user. The builtin chmod
  ### is used rather than a shell command, so that a path containing
  ### spaces or shell metacharacters is handled correctly.
  ### NOTE(review): a world-writable file that is later passed to
  ### Storable::retrieve can be replaced by any local user -- consider
  ### restricting the permissions.
  chmod(0777, $serial_file)
    or warn "Cannot change permissions of $serial_file: $!\n";
}

################################################################
## Read the queries from the query file.
## The file is read when an input file was specified (-i), or when no
## query at all was given on the command line. In the latter case
## OpenInputFile() receives an undefined file name (presumably it then
## reads the standard input -- to be confirmed in the library).
if (($inputfile) || (scalar(@queries) == 0)) {
  &RSAT::message::TimeWarn("Reading query file") if ($main::verbose >= 2);
  my ($in, $input_dir) = &OpenInputFile($inputfile);
  while (my $line = <$in>) {
    ## Skip comment lines (starting with ";", "--" or "#") and lines
    ## containing only white space
    next if ($line =~ /^(?:;|--|\#)/);
    next unless ($line =~ /\S/);

    ## The first word of the line is the query
    my ($first_word) = ($line =~ /^\s*(\S+)/);
    push(@queries, $first_word) if (defined($first_word));
  }
  close $in if ($inputfile ne "");
}
&RSAT::message::Info("Number of queries", scalar(@queries)) if ($main::verbose >= 2);

## At least one query is required
if (scalar(@queries) == 0) {
  &RSAT::error::FatalError("You must define at least one query, with the options -q or -i.");
}

################################################################
## Search the gene information for each query, and print one row per
## matching feature.
&Header();
# if ($match_description) {
#     %descr = $organism->index_attribute_by_feature("descr");
# }

foreach $query (@queries) {
  my $nb_matches = 0;

  ## Compare the query to each feature of the organism
  foreach my $feature ($organism->get_attribute("features")) {
    next unless ($feature->matches($query, $match_description, $full));
    $nb_matches++;
    &PrintInfo($query, $feature);
  }

  ## Report the queries for which no feature was found
  &PrintWarning($query) if ($nb_matches == 0);
}


## Report the execution time (verbose mode only), and close the output
## stream if an output file was specified
my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
if ($main::verbose >= 1) {
  print $main::out $exec_time;
}
close $out if ($outputfile ne "");


exit(0);

################################################################
################### subroutine definition ######################
################################################################

################################################################
## Print the header of the result table on the output stream ($out).
## The header is a single line starting with ";", with tab-separated
## column names. A last column "query" is added unless $no_query is set.
sub Header {
    my @columns = (
        "id",
        "name",
        "chr",
        "strand",
        "left",
        "right",
        "description [synonyms]",
    );
    push(@columns, "query") unless ($no_query);
    print $out ";", join("\t", @columns), "\n";
}

################################################################
## Print the information about one feature on the output stream ($out).
##
## Arguments:
##   $query    the query that matched the feature; printed as last
##             column unless $no_query is set
##   $feature  the matching feature; only its method get_attribute()
##             is used here
##
## The row contains the attributes id, name, ctg, strand, left, right
## and descr, separated by tabs. If the attribute "names" returns at
## least one value, the names are appended to the description, between
## square brackets and separated by semicolons.
sub PrintInfo {
    my ($query, $feature) = @_;

    ## Attributes, in the order of the output columns
    my @fields = qw(id name ctg strand left right descr);
    my $row = join("\t", map { $feature->get_attribute($_) } @fields);

    ## Names (synonyms) of the feature
    my @names = $feature->get_attribute("names");
    if (@names) {
	$row .= " [" . join(";", @names) . "]";
    }

    ## Query
    $row .= "\t$query" unless ($no_query);

    print $out $row, "\n";
}

################################################################
## Print a warning line on the output stream ($out) for a query that
## did not match any feature. The line has three tab-separated fields:
## ";WARNING", the query, and "no info".
sub PrintWarning {
  my $query = shift;
  my @fields = (";WARNING", $query, "no info");
  print $out join("\t", @fields), "\n";
}

################################################################
## Print the full help message, then exit with status 0.
##
## The message is piped to the pager "more". If the pager cannot be
## started, the message is printed directly on the standard output
## rather than being lost.
##
## The list of supported feature types is taken from the global
## variable $supported_feature_types, which is not defined in this file
## (presumably provided by a library -- to be confirmed).
sub PrintHelp {
  my $help;
  unless (open($help, "|-", "more")) {
    $help = \*STDOUT;
  }
  print $help <<EndOfHelp;
NAME
	gene-info

USAGE
	gene-info -org organism -q query
	gene-info -org organism -q query1 -q query2 -q query3 ...
	gene-info -org organism -i query_file

DESCRIPTION
	Returns the information about genes (CDS, mRNA, ...) specified
	either by their identifier, name, or by any supported synonym.

	Searches can also be done by specifying a sub-string of the
	gene descriptions. Regular expressions are supported. 

CATEGORY
	genomics

OPTIONS
	-h	display full help message
	-help	display options
	-v	verbose
	-org	organism
	-q	query. can be reiterated within the same command line
	-i	query file
	-o	output file
	-full	full match only (no substring matching)
	-noquery	
		do not print the query at the beginning of each line
	-descr	match queries against the description 
		(in addition to gene ID and  names)
	-feattype
		Feature type.
		Supported: $supported_feature_types
QUERY
	The simplest way to enter a query is with the -q option :
	    the query can be a ORF identifier 
		gene-info -org yeast -q YML115C
	    or a gene name
		gene-info -org yeast -q arg81
		
	Regular expressions can be used in the query. In this case, it is safer to 
	quote the query
		gene-info -org yeast -q 'arg.*'
		gene-info -org yeast -q '.*4'
		
	The search is case insensitive.
	
	Several queries can be combined on the command line by
	iterative use of the -q command :
		gene-info -org yeast -q pho5 -q pho81 -q pho11

	Alternatively, a list of queries can be provided in a text
	file. The first word of each line is considered as a new
	query.
		gene-info -org yeast -i query_file.txt

SYNONYMS
	It often happens that a gene bears several names. 
	gene-info will look for all synonyms of the query.
		gene-info -org yeast -q nil1 
	will provide the information about its synonym GAT1.

WEB VERSION
	http://www.bigre.ulb.ac.be/bioinformatics/rsa-tools/
EndOfHelp
  close $help;
  exit(0);
}

################################################################
## Print the short help message (list of options), then exit with
## status 0.
##
## The message is piped to the pager "more". If the pager cannot be
## started, the message is printed directly on the standard output
## rather than being lost.
##
## The list of supported feature types is taken from the global
## variable $supported_feature_types, which is not defined in this file
## (presumably provided by a library -- to be confirmed).
sub PrintOptions {
  my $help;
  unless (open($help, "|-", "more")) {
    $help = \*STDOUT;
  }
  print $help <<End_short_help;
gene-info options
----------------
-h		display full help message
-help		display options
-v		verbose
-org		organism
-i		query file
-o		output file
-q		query (can be reiterated on the same command line)
-full		full match only (no substring matching)
-noquery	do not print the query at the beginning of each line
-descr		match queries against the description 
-feattype	accepted feature types. Supported: $supported_feature_types
End_short_help
  close $help;
  exit(0);
}

################################################################
## Read the command-line arguments (@ARGV) and set the corresponding
## global parameters:
##   -v         $verbose (the next argument if it is a natural number,
##              1 otherwise)
##   -h         full help message (PrintHelp)
##   -help      list of options (PrintOptions)
##   -org       $organism_name
##   -full      $full
##   -feattype  %accepted_feature_types (comma-separated list, keys in
##              lower case)
##   -desc*     $match_description (any argument starting with "-desc")
##   -noquery   $no_query
##   -i         $inputfile
##   -q         one more element in @queries
##   -o         $outputfile
##
## Every position of @ARGV is inspected, including the values that
## follow an option; unrecognised arguments are silently ignored.
##
## A fatal error is raised for an unsupported feature type, and for an
## option -q which is not followed by a value.
##
## %supported_feature_types and $supported_feature_types are not
## defined in this file (presumably provided by a library -- to be
## confirmed).
sub ReadArguments {
  ## A lexical index is used rather than the package variable $a, which
  ## is reserved for sort blocks.
  foreach my $i (0..$#ARGV) {
    my $arg = $ARGV[$i];
    my $value = $ARGV[$i+1]; ## undefined after the last argument

    ### verbose ###
    if ($arg eq "-v") {
      if (&IsNatural($value)) {
	$verbose = $value;
      } else {
	$verbose = 1;
      }

      ### detailed help
    } elsif ($arg eq "-h") {
      &PrintHelp();

      ### list of options
    } elsif ($arg eq "-help") {
      &PrintOptions();

      #### organism
    } elsif ($arg eq "-org") {
      $organism_name = $value;

      #### full match only
    } elsif ($arg eq "-full") {
      $full = 1;

      #### feature types
    } elsif ($arg eq "-feattype") {
      my @feature_types = split(",", $value);
      foreach my $feature_type (@feature_types) {
	if ($supported_feature_types{lc($feature_type)}) {
	  $accepted_feature_types{lc($feature_type)}++;
	} else {
	  &RSAT::error::FatalError("$feature_type invalid feature type. Supported: $supported_feature_types");
	}
      }

      #### match query against description
    } elsif ($arg =~ /^-desc/) {
      $match_description = 1;

      #### don't print the query in output
    } elsif ($arg eq "-noquery") {
      $no_query = 1;

      ### input file ###
    } elsif ($arg eq "-i") {
      $inputfile = $value;

      ### query ###
    } elsif ($arg eq "-q") {
      ## Without this check, a trailing -q would add an undefined
      ## query to the list.
      unless (defined($value)) {
	&RSAT::error::FatalError("The option -q must be followed by a query.");
      }
      push(@queries, $value);

      ### output file ###
    } elsif ($arg eq "-o") {
      $outputfile = $value;
    }
  }
}

################################################################
## Report the parameters of the analysis on the output stream ($out),
## as comment lines starting with ";". Nothing is printed unless
## $verbose is set.
##
## The feature types reported are the keys of %accepted_feature_types
## (filled by the option -feattype), sorted and separated by commas.
sub Verbose {
    return unless ($verbose);

    ## Command name, followed by the arguments (printed by the external
    ## routine PrintArguments)
    print $out "; gene-info ";
    &PrintArguments($out);

    printf $out "; %-14s\t%s\n", "organism", $organism_name;
    print $out "; Output file\t$outputfile\n" if ($outputfile);
    print $out "; Query file\t$inputfile\n" if ($inputfile);
    printf $out "; %-14s\t%s\n", "feature type", join(",", sort(keys(%accepted_feature_types)));
    print $out "; Queries\n;\t";
    print $out join ("\n;\t",@queries), "\n";
    print $out "; Match queries against description\n" if ($match_description);
    print $out "; Full match\n" if ($full);
}
