#!/usr/bin/perl
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";
require RSAT::OrganismManager;

use Storable qw(nstore retrieve);

## Initialization
## Record the start time first, then set the default value of each option.
$start_time = &RSAT::util::StartScript();
$calc_neighbours = 0;  ## set to 1 when a neighbour-related info type is requested
$info_before = 0;      ## 1 => info columns are printed before the input line
$header_line = 0;      ## 1 => the first input row is a header
$null = "<NULL>";      ## string printed when no info is found for a query
@ID_column = ();       ## column number(s) containing the gene IDs


## Information types that can be requested with the option -info
@supported_info_types = qw(
                           id ctg strand left right name descr names
                           upstr_neighb_name upstr_neighb_id upstr_limit upstr_size
                           downstr_neighb_name downstr_neighb_id downstr_limit downstr_size
                           right_neighb_name right_neighb_id right_limit right_size
                           left_neighb_name left_neighb_id left_limit left_size
                          );

## Lookup table indexed by lowercase type name; "all" is accepted as well
%supported_info_types = map { (lc($_) => 1) } (@supported_info_types, "all");

## Comma-separated list of the supported types, used in help and error messages
$supported_info_types = join(",", @supported_info_types);

## Parse the command line; the options are stored in package variables
&ReadArguments();

#### output file ####
$out = &OpenOutputFile($outputfile);

#### organism
&RSAT::OrganismManager::CheckOrganism($organism_name);


#### Information types
## When no info type was requested, report the gene description only.
if (scalar(@info_types) < 1) {
  @info_types = ("descr");
}

@null = ();

## The keyword "all" stands for the complete list of supported info types
if (grep { $_ eq "all" } @info_types) {
  @info_types = @supported_info_types;
}

## Check that the column containing the gene ID is specified.
## If no column was specified, the gene IDs are read from column 1.
if (scalar(@ID_column) == 0) {
  push @ID_column, 1;
}


## Check the requested info types
##
## For each ID column, validate every requested info type and append one
## entry to @header (column names of the added fields) and one to @null
## (placeholder values). Requesting any neighbour-related field switches on
## $calc_neighbours, which triggers the neighbour computation once the
## organism is loaded.
foreach my $col (@ID_column) {
  ## With several ID columns, the header names are suffixed with the column
  ## number in order to distinguish them.
  my $suffix = "";
  if (scalar(@ID_column) > 1) {
    $suffix = "_".$col;
  }
  foreach my $type (@info_types) {
    &RSAT::error::FatalError("Invalid information type $type. Supported: $supported_info_types") 
      unless ($supported_info_types{$type});

    ## Neighbour-related fields. The downstream types are prefixed with
    ## "downstr_" (not "down_"), so the prefix must be matched in full.
    if ($type =~ /^(?:upstr|downstr|right|left)_/) {
      $calc_neighbours = 1;
    }

    ## Shorten the type name for the column header
    my $type_header = $type;
    $type_header =~ s/upstr_neighb_/up_/;
    $type_header =~ s/upstr_/up_/;
    $type_header =~ s/downstr_neighb_/dn_/;
    $type_header =~ s/downstr_/dn_/;
    $type_header =~ s/up_limit/up_lim/;
    $type_header =~ s/^descr$/description/;
    push @null, $null;
    push @header, $type_header.$suffix;
  }
}

################################################################
## Instantiate organism
##
## The arrow form of the constructor call is used rather than the indirect
## object syntax ("new Class()"), whose parsing depends on which subroutines
## are already known to the compiler.
$organism = RSAT::organism->new();
$organism->check_name($organism_name);
$organism->set_attribute("name", $organism_name);
$organism->OpenContigs($organism_name, $annotation_table);

## Accepted feature types (collected from the option -feattype)
#$organism->DefineAcceptedFeatureTypes(sort keys %supported_feature_types);
@accepted_feature_types = sort(keys(%accepted_feature_types));
$organism->DefineAcceptedFeatureTypes(@accepted_feature_types);

### Load the organism
my $imp_pos = 0;
my $synonyms = 1;
if ($organism->is_serialized($imp_pos, $synonyms)) {
  ## Load organism from serialized file
  my $serial_file = $organism->serial_file_name($imp_pos, $synonyms);
  $organism = retrieve($serial_file);
  &RSAT::message::TimeWarn("Retrieved organism", $organism_name, "from serialized file", $serial_file)
    if ($main::verbose >= 3);
} else {
  ### Load organism from flat files (slower)
  $organism->load_and_serialize($imp_pos, $synonyms);
}

## Neighbour limits are only computed when a neighbour-related info type was requested
$organism->CalcNeighbourLimits() if ($calc_neighbours);

#### verbose ####
## Report the arguments as comment lines (prefixed with ";") in the output
if ($verbose) {
  print $out "; add-gene-info ";
  &PrintArguments($out);
  print $out ("; Feature type(s)	", join(",", @accepted_feature_types),"\n") if (scalar(@accepted_feature_types) > 0);
  print $out ("; Input file	$inputfile\n") if ($inputfile);
  print $out ("; Output file	$outputfile\n") if ($outputfile);
}

#### input file ####
($in) = &OpenInputFile($inputfile);


## Read header line (option -header): the first row of the input is printed
## together with the names of the added info columns.
if ($header_line) {
  $current_line = <$in>;
  $current_line = "" unless (defined($current_line)); ## empty input
  chomp($current_line);
  ## Suppress Windows-specific carriage return, as done for the data lines.
  ## Otherwise it would end up in the middle of the output header row.
  $current_line =~ s/\r//g;
  if ($info_before) {
    $current_line =~ s/^#//;
    print $out "#", join ("\t", @header, $current_line), "\n";
  } else {
    print $out join ("\t", $current_line, @header), "\n";
  }
}

## Read the file and add info
##
## Each data line is printed with, for each ID column, one field per
## requested info type. Empty lines and comment lines (starting with ";")
## are printed as such; lines starting with "#" are treated as header lines
## and receive the names of the added columns.
my $line_nb = 0;
while ($current_line = <$in>) {
  $line_nb++;
  chomp($current_line);
  $current_line =~ s/\r//g; ## Suppress Windows-specific carriage return

  ## Skip empty lines
  unless ($current_line =~ /\S/) {
    print $out $current_line, "\n";
    next;
  }

  ## Print header line + info column names
  if ($current_line =~ /^#/) {
    if ($info_before) {
      $current_line =~ s/^#//;
      print $out "#", join ("\t", @header, $current_line), "\n";
    } else {
      print $out join ("\t", $current_line, @header), "\n";
    }
    next;
  }

  ## Print comment line as such
  if ($current_line =~ /^;/) {
    print $out $current_line, "\n";
    next;
  }

  ## Split the line once, whatever the number of ID columns
  @columns = split("\t", $current_line);

  my @info = ();
  foreach my $col (@ID_column) {
    next unless ($col > 0);

    ## A line may have fewer fields than the requested column number
    my $field = $columns[$col - 1];
    $field = "" unless (defined($field));
    $query = uc($field);
    $query = &RSAT::util::trim($query);

    my $feature = $organism->get_feature_for_name($query);

    &RSAT::message::Debug("identified feature", $query, $feature) if ($main::verbose >= 3);
    if ($feature) {
      ## Collect the requested fields for this feature
      my @feature_info = ();
      foreach my $type (@info_types) {
        my $info = join( ";", $feature->get_attribute($type));
        &RSAT::message::Warning(join("\t", "query",$query, $type, $info)) if ($main::verbose >= 5);
        push @feature_info, $info;
      }
      push @info, @feature_info;
      &RSAT::message::Info(join ("\t", "Line", $line_nb, "found", $query, join(",", @info_types), join("\t", @feature_info))) if ($main::verbose >= 3);
    } else {
      ## Append one null value per requested info type for this column only,
      ## so that the fields already collected for the other ID columns are
      ## preserved and the output columns stay aligned with the header.
      push @info, ($null) x scalar(@info_types);
      &RSAT::message::Warning(join ("\t", "Line", $line_nb, "No info found for query", $query)) if ($main::verbose >= 1);
    }
  }

  if ($info_before) {
    print $out join ("\t", @info, $current_line), "\n";
  } else {
    print $out join ("\t", $current_line, @info), "\n";
  }
}



## Close the input stream, but only when an input file name was given
if ($inputfile) {
  close $in;
}


################################################################
## Report execution time and close output stream
## (the call to ReportExecutionTime has to be executed by all scripts)
my $exec_time = &RSAT::util::ReportExecutionTime($start_time);

## The execution time is only reported if verbosity is specified
if ($main::verbose >= 1) {
  print {$main::out} $exec_time;
}

## The output stream is closed only when an output file name was given
if ($outputfile) {
  close $main::out;
}

exit(0);


########################## subroutine definitions ############################

#### display full help message #####
##
## Print the full manual of the program, then exit. The text is sent
## through the pager "more"; if the pager cannot be started, it is printed
## on the standard output instead of being silently lost.
## Takes no argument and never returns.
sub PrintHelp {
    my $help;
    my $paged = open($help, "|-", "more");
    $help = \*STDOUT unless ($paged);
    print {$help} <<End_of_help;
NAME
	add-gene-info

DESCRIPTION

	Takes as input a tab-delimited file with one or more columns
	containing gene IDs, and adds columns with information about
	the corresponding genes.

CATEGORY genomics

USAGE
        add-gene-info [-i inputfile] [-o outputfile] [-v] 
        [-col id_column_1] [-col id_column_2] -org organism
	
OPTIONS
	-h	display full help message

	-help	display options

	-v	verbose
	
	-col	column containing gene IDs. 

	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.

	-org organism

	-info	information type (supported: $supported_info_types)

	-before add the information before the input line (by default,
         	the info is added at the end of each input line).
	
	-null String to display for undefined values (default: $null).

	-header
		The first row of the input file contains column
		headers. The names of the added columns are added to
		this row.

	-feattype
		Feature type.
		Supported: $supported_feature_types

INPUT FORMAT

	Any text file containing a column with gene identifiers or
	gene names.
	
OUTPUT FORMAT

	A text file with tab-separated values reproducing the content
	of the input file with added columns containing the requested
	information fields.

End_of_help
    ## Closing the pipe waits for the pager to terminate
    close $help if ($paged);
    exit;
}


#### display short help message #####
##
## Print the one-line summary of each option, then exit. The text is sent
## through the pager "more"; if the pager cannot be started, it is printed
## on the standard output instead of being silently lost.
## Takes no argument and never returns.
sub PrintOptions {
    my $help;
    my $paged = open($help, "|-", "more");
    $help = \*STDOUT unless ($paged);
    print {$help} <<End_short_help;
add-gene-info options
---------------------
-h          display full help message
-help       display options
-i          input file
-o          output file
-v          verbose
-col #      gene ID column (can be used repeatedly)
-org	    organism
-info	    information type (supported: $supported_info_types)
-before	    add info before rather than after the input line
-null       string to display for undefined values (default: $null).
-header     the first row of the input file contains column headers
-feattype   accepted feature types. Supported: $supported_feature_types
End_short_help
    ## Closing the pipe waits for the pager to terminate
    close $help if ($paged);
    exit;
}



################################################################
#### Read arguments 
##
## Scan @ARGV and store the value of each recognized option in the
## corresponding package variable ($verbose, $inputfile, $outputfile,
## @ID_column, $organism_name, @info_types, $info_before, $null,
## $header_line, %accepted_feature_types). Unrecognized elements are
## ignored. The options -h and -help print the help and exit.
##
## Every element of @ARGV is tested as a potential option name, including
## the elements used as values for the preceding option.
##
## The sub takes no argument. It is declared without prototype, and it uses
## a lexical loop index rather than the global $a, which is reserved for
## sort blocks.
sub ReadArguments {
    foreach my $i (0..$#ARGV) {
        my $option = $ARGV[$i];
        my $value = $ARGV[$i + 1]; ## undefined if the option is the last element

        ### verbose: the level is optional (default 1)
        if ($option eq "-v") {
            if (&IsNatural($value)) {
                $verbose = $value;
            } else {
                $verbose = 1;
            }

            ### detailed help
        } elsif ($option eq "-h") {
            &PrintHelp();

            ### list of options
        } elsif ($option eq "-help") {
            &PrintOptions();

            ### input file
        } elsif ($option eq "-i") {
            $inputfile = $value;

            ### output file
        } elsif ($option eq "-o") {
            $outputfile = $value;

            ### column containing the gene ID (the option can be repeated)
        } elsif ($option eq "-col") {
            if (&RSAT::util::IsNatural($value) && ($value > 0)) {
                push @ID_column, $value;
            } else {
                &RSAT::error::FatalError("Invalid column specification\n");
            }

            #### organism
        } elsif ($option eq "-org") {
            $organism_name = $value;

            #### information type(s), comma-separated, case-insensitive
        } elsif ($option eq "-info") {
            @info_types = split(",", lc($value));

            #### add info before the input line
        } elsif ($option eq "-before") {
            $info_before = 1;

            ## String to display for null values
        } elsif ($option eq "-null") {
            $null = $value;

            ## First row contains header
        } elsif ($option eq "-header") {
            $header_line = 1;

            #### feature type(s), comma-separated, case-insensitive
        } elsif ($option eq "-feattype") {
            foreach my $feature_type (split(",", $value)) {
                if ($supported_feature_types{lc($feature_type)}) {
                    $accepted_feature_types{lc($feature_type)}++;
                } else {
                    &RSAT::error::FatalError("$feature_type invalid feature type. Supported: $supported_feature_types");
                }
            }
        }
    }
}
