#!/usr/bin/perl -w
############################################################
#
# $Id: retrieve-variation-seq,v 1.18 2013/08/18 10:00:18 jvanheld Exp $
#
############################################################

=pod

=head1 NAME

retrieve-variation-seq

=head1 VERSION

$program_version

=head1 DESCRIPTION

Given a set of IDs for polymorphic variations, retrieve the
corresponding variants and their flanking sequences, in order to scan
them with the tool I<variation-scan>.

=head1 AUTHORS

=over

=item B<Jeremy Delerce> (M2 thesis project 2013)

=item B<Alejandra Medina Rivera> <amedina@lcg.unam.mx>

=item B<Jacques van Helden> <Jacques.van-Helden@univ-amu.fr>

=back

=head1 CATEGORY

=over

=item util

=back

=head1 USAGE

 retrieve-variation-seq -species species_name (-release # | -assembly assembly)  \
   [-i #inputfile] [-format variation_format] \
   [-col ID_column] [-mml #] [-o outputfile] [-v #] [...]

=head2 Example

  Get variation sequence of Homo_sapiens from a bed file

    retrieve-variation-seq -v 2 \
      -species Homo_sapiens -release 84 -format bed -assembly GRCh37 \
      -i $RSAT/public_html/demo_files/sample_regions_for_variations_hg19.bed \
      -mml 30 \
      -o variations.varSeq

=head1 INPUT FORMAT

=head2 Genomic coordinate file

The option I<-i> can be used to specify a genomic coordinate file in bed
format. The program only takes into account the first 3 columns of the
bed file, which specify the genomic coordinates.

B<Note> (from Jacques van Helden): the UCSC genome browser adopts a
somewhat inconsistent convention for start and end coordinates: the
start position is zero-based (first nucleotide of a
chromosome/scaffold has coordinate 0), but the end position is
considered not included in the selection. This is equivalent to having a
zero-based coordinate for the start, and a one-based coordinate for the
end.

=head2 Example of bed file

 chr1	3473041	3473370
 chr1	4380371	4380650
 chr1	4845581	4845781
 chr1	4845801	4846260

The definition of the BED format is provided on the UCSC Genome
Browser web site (http://genome.ucsc.edu/FAQ/FAQformat#format1).

This program only takes into account the 3 first columns, which
specify the genomic coordinates.

=over

=item 1. chrom

The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold
(e.g. scaffold10671).

=item 2. chromStart

The starting position of the feature in the chromosome or
scaffold. For RSAT programs, the first base in a chromosome is
numbered 1 (this differs from the UCSC-specific zero-based notation
for the start).

B<Note> from Jacques van Helden: the UCSC genome browser adopts a
somewhat inconsistent convention for start and end coordinates: the
start position is zero-based (first nucleotide of a
chromosome/scaffold has coordinate 0), and the end position is
considered not included in the selection. This is equivalent to having a
zero-based coordinate for the start, and a one-based coordinate for the
end. We find this representation completely counter-intuitive, and we
therefore decided to adopt a "normal" convention, where:

=over

=item start and end position represent the first and last positions
I<included> in the region of interest.

=item start and end positions are provided in one-based notation
(first base of a chromosome or contig has coordinate 1).

=back

=item 3. chromEnd

The ending position of the feature in the chromosome or scaffold.

=back

=head2 Variation file

See the output format of I<download-ensembl-variations>.

=head2 Variation ID list

A tab-delimited file with the variation IDs in one column (see option I<-col>).

=head1 OUTPUT FORMAT

A tab delimited file with the following column content.

=over

=item 1. chrom

The name of the chromosome (e.g. 1, X, 8...)

=item 2. chromStart

The starting position of the feature in the chromosome

=item 3. chromEnd

The ending position of the feature in the chromosome

=item 4. chromStrand

The strand of the feature in the chromosome

=item 5. variation id

ID of the variation

=item 6. SO term

Sequence Ontology (SO) term describing the variation type

=item 7. ref variant

Allele of the variation in the reference sequence

=item 8. variant

Allele of the variation in the sequence

=item 9. allele_frequency

Allele frequency

=item 10. sequence

Sequence of length L centered on the variation (the variant plus its
flanks, see option I<-mml>)

=back

=head1 SEE ALSO

=head2 download-ensembl-genome

I<retrieve-variation-seq> uses the sequences downloaded
from Ensembl using the tool I<download-ensembl-genome>.

=head2 download-ensembl-variations

I<retrieve-variation-seq> uses variation coordinates downloaded
from Ensembl using the tool I<download-ensembl-variations>.

=head2 variation-scan

Scan variation sequences with one or several position-specific scoring
matrices.

=head1 WISH LIST

=cut


## Make the RSAT libraries reachable before the "require" statements are
## executed: the part of $0 that precedes the script name (the regex
## prematch) is used as base directory, and its lib/ sub-directory is
## appended to @INC.
BEGIN {
  push(@INC, "$`lib/") if ($0 =~ /([^(\/)]+)$/);
}

require "RSA.lib";
require "RSAT_to_ensembl.lib.pl";

################################################################
## Main package
package	main;
{

  ###############################################################
  ## Initialise parameters
  ##
  ## The variables below are declared with "our": they are package
  ## variables of main, which makes them accessible from the
  ## subroutines defined after the main block.
  our $start_time = &RSAT::util::StartScript();

  ## Program version, built from the numbers of the "Revision" keyword
  our $program_version = do { my @r = (q$Revision: 1.18 $ =~ /\d+/g); sprintf"%d."."%02d" x $#r, @r };

  ## Input and output file names, indexed by role
  our %infile = ();
  our %outfile = ();

  our $verbose = 0;

  ## Default input/output streams. Glob references are used instead of
  ## the barewords STDIN/STDOUT, which Perl silently turns into plain
  ## strings (and rejects as soon as "strict subs" is enabled).
  our $in = \*STDIN;
  our $out = \*STDOUT;

  ## Query organism: species name, Ensembl release and/or assembly
  our $species = '';
  our $ensembl_release = "";
  our $assembly = "";

  ## Optional suffix, which remains undefined unless the option
  ## -species_suffix is used
  our $species_suffix;

  our $ref_seq = "";

  ## Length of the flanks extracted on each side of a variant (can be
  ## modified with the option -mml)
  our $flank_len = 29;

  ## Column containing the variation IDs (option -col)
  our $col = 1;

  our $get_available_species = 0;
  our $validate = 0;

  ## Supported input formats.
  ## Note: in spite of its name, %supported_output_format indexes the
  ## supported *input* formats. The name is kept because the hash is
  ## used by &ReadArguments() to check the value of the option -format.
  our @supported_input_formats = qw (varBed id bed);
  our $supported_input_formats = join ",", @supported_input_formats;
  our %supported_output_format = map { $_ => 1 } @supported_input_formats;
  our $format = "";

  ## Note: the arguments have not been read yet at this stage, so the
  ## verbosity still has its default value (0).
  &RSAT::message::Debug("Scripts path", $SCRIPTS ) if ($main::verbose >= 10);

  ## Counters printed at the end of the output in verbose mode
  our $total_nb_variation = 0;
  our $total_nb_variant = 0;

  ## Parameters for the &doit() command
  our $dry = 0;
  our $die_on_error = 1;
  our $job_prefix = "get_variations";
  our $batch = 0;

  ################################################################
  ## Read argument values
  &ReadArguments();
  our $genomes_dir = &Get_genomes_dir();

  ################################################################
  ## Check argument values

  ## Mandatory arguments
  &RSAT::error::FatalError("No species specified. Use -species") unless ($species);
  &RSAT::error::FatalError("No assembly and ensembl version specified. Use at least one of these options: -release -assembly")
      unless ($ensembl_release || $assembly);

  ## An input file cannot be interpreted without knowing its format
  if ($main::infile{'input'}) {
    &RSAT::error::FatalError("No input format specified. Use -format") unless ($format);
  }

  ## Check that the genome and variations have been installed
  my $genome_dir = &Get_genome_dir($species, $assembly, $ensembl_release,$species_suffix);
  my $variation_dir = &Get_variation_dir($species, $assembly, $ensembl_release, $species_suffix);

  &RSAT::error::FatalError("Genome directory", $genome_dir, "does not exist. Use download-ensembl-genome before retrieve-variation-seq.") unless (-d $genome_dir);
  &RSAT::error::FatalError("Variation directory", $variation_dir, "does not exist. Use download-ensembl-variation before retrieve-variation-seq.") unless (-d $variation_dir);

  ## Check that no sequence file is missing.
  ## %chr_file associates a sequence file name to each chromosome name.
  my %chr_file = &Get_file_seq_name($genome_dir);

  foreach my $chr_name (keys(%chr_file)) {
      &RSAT::message::Debug("Chr ", "++".$chr_name."++", "File",  $chr_file{$chr_name} )  if ($main::verbose >= 10);
      my $raw_seq_file = $genome_dir."/".$chr_file{$chr_name};
      &RSAT::error::FatalError($raw_seq_file," is missing.") unless (-f $raw_seq_file);
  }

  ## Check the input.
  ##
  ## The previous test, ((-f file) || !($main::in)), could never accept
  ## the standard input, because $main::in is always true: running the
  ## program without -i died with an undefined file name. The standard
  ## input is only handled for the varBed format (variation-info is
  ## called with -i for the other formats), so an input file remains
  ## mandatory in all the other cases.
  if (defined($main::infile{'input'})) {
      unless (-f $main::infile{'input'}) {
          &RSAT::error::FatalError("Input file not found", $main::infile{'input'});
      }
  } elsif ($main::format ne "varBed") {
      &RSAT::error::FatalError("No input file specified. Use -i (the standard input is only supported with -format varBed).");
  }

  ################################################################
  ## Open the output stream and print the verbose information
  $out = &OpenOutputFile($main::outfile{output});
  if ($main::verbose >= 1) {
      &Verbose();
  }

  ## Output columns (in printing order) and their description
  @header_fields = qw(chr start end strand id soterm ref_var alt_var allele_freq seq);
  %field_description = (
      chr         => "Chromosome",
      start       => "Start position",
      end         => "End position",
      strand      => "Strand",
      id          => "identifier",
      soterm      => "Sequence ontology (SO) term describing variation type.",
      ref_var     => "Reference variant",
      alt_var     => "Alternative variant",
      allele_freq => "Frequency of the current allele",
      seq         => "Sequence of the current variant, flanked by a user-specified neighbouring region",
  );

  ################################################################
  ## In verbose mode, describe each column on a comment line, with
  ## its 1-based rank
  if ($main::verbose >= 1) {
      print $out "; Column contents\n";
      my $column_nb = 0;
      foreach my $field (@header_fields) {
          $column_nb++;
          printf $out ";\t%d\t%-14s\t%s\n", $column_nb, $field, $field_description{$field};
      }
  }

  ## Header line with the column names
  print $out "#",join("\t",@header_fields),"\n";

  ################################################################
  ## Get the list of variation files (*.varBed) installed in the RSAT
  ## data directory.
  ##
  ## The directory is read with opendir/readdir rather than by parsing
  ## the output of the shell command "ls": this works whatever the
  ## characters found in the directory name, and does not depend on an
  ## external command.
  ##
  ## NOTE(review): the former attempt with glob() concatenated
  ## $variation_dir and "*.varBed" without "/" separator, which is
  ## presumably why it did not return any file.
  my @variation_files = ();
  if (opendir(my $variation_dh, $variation_dir)) {
      @variation_files =
          map { $variation_dir."/".$_ }
          sort
          grep { /^[^.].*\.varBed\z/ }   ## like the shell pattern *.varBed, ignore hidden files
          readdir($variation_dh);
      closedir($variation_dh);
  } else {
      &RSAT::message::Warning("Cannot read variation directory", $variation_dir, $!);
  }

  ################################################################
  ## Initialize the variation-info command, used when the input
  ## format is bed or id. The format-specific options are appended
  ## further down, when the command is run.
  my $get_variations_cmd = "";
  if (($format eq "id") || ($format eq "bed")) {
      $get_variations_cmd = $ENV{RSAT}."/perl-scripts/variation-info ";
      $get_variations_cmd .= " -species ".$main::species;
      $get_variations_cmd .= " -release ".$main::ensembl_release if ($main::ensembl_release);
      $get_variations_cmd .= " -assembly ".$main::assembly if ($main::assembly);
      $get_variations_cmd .= " -i ".$main::infile{input};
  }

  ################################################################
  ## Collect the query variations in a single varBed file, whatever
  ## the input format. The path of this file is stored in
  ## $main::infile{'input_variation'}.
  if ($format eq "varBed") {

      if ($main::infile{'input'}) {
          ## The input file is already in varBed format: use it as such
          $main::infile{'input_variation'} = $main::infile{'input'};

      } else {
          ## No input file: copy the data rows read from the input
          ## stream to a temporary varBed file. Comment lines (# or ;)
          ## and lines without any tabulation are discarded.
          $main::outfile{'variation_rsat'} = &RSAT::util::make_temp_file("","variation_rsat", 1).".varBed";
          my $tmp_varbed = &OpenOutputFile($main::outfile{'variation_rsat'});

          while (my $row = <$main::in>) {
              next if ($row =~ /^[#;]/);
              next unless ($row =~ /\t/);
              print $tmp_varbed $row;
          }
          close $tmp_varbed;

          $main::infile{'input_variation'} = $main::outfile{'variation_rsat'};
      }

  } elsif (($format eq "id") || ($format eq "bed")) {
      ################################################################
      ## Variation IDs or genomic coordinates: variation-info is run
      ## to export the corresponding variations in varBed format to a
      ## temporary file. The two formats only differ by the progress
      ## message, the prefix of the temporary file and the value of
      ## the option -format.
      ##
      ## NOTE(review): the value of the option -col ($main::col) is
      ## not transmitted to variation-info here; confirm whether it
      ## should be.
      my $progress_message;
      my $tmp_prefix;
      if ($format eq "id") {
          $progress_message = "Retrieving variations from ID(s) using variation-info";
          $tmp_prefix = "variation_rsat_fromIDs";
      } else {
          $progress_message = "Retrieve variation from coordinate using variation-info";
          $tmp_prefix = "variation_rsat_from_bed";
      }
      &RSAT::message::TimeWarn($progress_message) if ($main::verbose >= 2);

      ## Temporary file for the variations returned by variation-info
      $main::outfile{'variation_rsat'} = &RSAT::util::make_temp_file("", $tmp_prefix, 1).".varBed";

      ## Complete and run the variation-info command
      $get_variations_cmd .= " -format ".$format." ";
      $get_variations_cmd .= " -o ".$main::outfile{'variation_rsat'};
      &RSAT::message::TimeWarn("variation-info command:", $get_variations_cmd) if ($main::verbose >= 2);
      &doit($get_variations_cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix);

      ## The output of variation-info is the input for the sequence retrieval
      $main::infile{'input_variation'} = $main::outfile{'variation_rsat'};
  }

  ################################################################
  ## When query variations have been collected, they replace the
  ## default list (all the variation files of the installation
  ## directory).
  if ($main::infile{'input_variation'}) {
      @variation_files = ($main::infile{'input_variation'});
  }
  
  ################################################################
  ## Get sequence
  ##
  ## The variations are read from the varBed file(s) listed in
  ## @variation_files. For each allele (alternative alleles, then
  ## reference allele) of each variation, one row is printed with the
  ## allele flanked on both sides by the neighbouring bases of the
  ## reference sequence. All sequences are reported on the direct
  ## strand.
  &RSAT::message::TimeWarn("Retrieving sequence for each allele of a variation") if ($main::verbose >= 2);
  my $last_id = "";
  my $nb_var = 0;
  my $last_nb = 1000000;

  ## A negative flank length (e.g. resulting from -mml 0) would be
  ## interpreted by substr() as "leave that many characters off the
  ## end": treat it as an empty flank.
  my $flank_size = ($flank_len > 0) ? $flank_len : 0;

  ## Chromosomes without known sequence file (reported only once)
  my %unknown_chr = ();

  foreach my $var_file (@variation_files) {
      my $last_chr = "";
      my $ref_seq = "";
      my $ref_len = 0;

      &RSAT::message::Debug("Reading in variation information file ", $var_file) if ($main::verbose >= 10);

      my ($var_handle) = &OpenInputFile($var_file);

      while (my $line = <$var_handle>) {
          next if ($line =~ /^[#;]/);
          chomp($line);

          my ($chr,$start,$end,$strand,$id,$ref,$var,$type,$valide,$minor_allele_freq,$suvar,$invar) = split("\t", $line);

          ## Skip the rows that cannot be interpreted (empty lines,
          ## missing columns, non-numeric coordinates).
          unless (defined($var) && ($start =~ /^\d+$/) && ($end =~ /^\d+$/)) {
              &RSAT::message::Warning("Skipping malformed variation row", $line) if ($line =~ /\S/);
              next;
          }

          next if ($invar && !$validate);
          &RSAT::message::Debug("Line content", join("*", map { defined($_) ? $_ : "" } ($chr,$start,$end,$strand,$id,$ref,$var,$type,$valide,$minor_allele_freq,$suvar,$invar))) if ($main::verbose >= 10);

          ## Strip the "chr" prefix that users sometimes leave in the
          ## chromosome name. The pattern is anchored in order to leave
          ## untouched the names that merely contain "chr".
          $chr =~ s/^chr//;

          ## Without sequence file the flanks cannot be extracted
          unless (defined($chr_file{$chr})) {
              unless ($unknown_chr{$chr}) {
                  &RSAT::message::Warning("No sequence file for chromosome", $chr, "its variations are skipped");
                  $unknown_chr{$chr} = 1;
              }
              next;
          }

          $total_nb_variation++;

          ## Load the raw sequence when moving to another chromosome
          if ($chr ne $last_chr) {
              my $raw_seq_file = $genome_dir."/".$chr_file{$chr};
              &RSAT::message::Debug("raw seq file  value", $chr, $raw_seq_file) if ($main::verbose >= 10);
              $ref_seq = qx($SCRIPTS/sub-sequence -i $raw_seq_file -from 1 -to 250000000 -format raw);
              $ref_seq = "" unless (defined($ref_seq));
              chomp($ref_seq); ## a trailing newline must not end up in the right flank
              $ref_len = length($ref_seq);
              &RSAT::message::Debug("ref_seq  length",  $ref_len) if ($main::verbose >= 10);
              $last_chr = $chr;
              &RSAT::message::Debug("last_chr value",  $last_chr) if ($main::verbose >= 10);
          }
          &RSAT::message::Debug("id  value",  $id) if ($main::verbose >= 10);
          &RSAT::message::Debug("last_id  value",  $last_id) if ($main::verbose >= 10);

          ## Count the distinct variations (successive rows can share the same ID)
          if ($id ne $last_id) {
              $nb_var++;
              if ($nb_var >= $last_nb + 1000000) {
                  &RSAT::message::TimeWarn("Getting sequence for the variation",$nb_var,"to",$last_nb) if ($main::verbose >= 10);
                  $last_nb += 1000000;
              }
              $last_id = $id;
          }

          ## Flanks, identical for all the alleles of the variation.
          ## Coordinates are 1-based and inclusive, substr() offsets are
          ## 0-based. The left flank is truncated for the variations
          ## located at less than $flank_size bases from the chromosome
          ## start: a negative offset would make substr() count from the
          ## end of the chromosome.
          my $left_from = $start - $flank_size - 1;
          my $left_len = $flank_size;
          if ($left_from < 0) {
              $left_len += $left_from;
              $left_from = 0;
          }
          my $left_flank = "";
          if (($left_len > 0) && ($left_from < $ref_len)) {
              $left_flank = lc(substr($ref_seq, $left_from, $left_len));
          }
          my $right_flank = "";
          if ($end < $ref_len) {
              $right_flank = lc(substr($ref_seq, $end, $flank_size));
          }

          ## Alternative allele(s), followed by the reference allele
          my @variants = split(",", $var.",".$ref);

          foreach my $variant_aux (@variants) {
              ## Suppress the "-" symbol, which is used to denote
              ## deletions, but would not make sense in the sequence of
              ## the variant extended on both flanks.
              $variant_aux =~ s/-//g;

              ## Take the strand into account: the flanks come from the
              ## direct strand, so an allele described on the reverse
              ## strand has to be reverse-complemented (complementing
              ## alone is only correct for single-base alleles).
              my $direct_variant = $variant_aux;
              if ($strand eq "-") {
                  $direct_variant =~ tr/ACGTacgt/TGCAtgca/;
                  $direct_variant = scalar(reverse($direct_variant));
                  &RSAT::message::TimeWarn("Transforming negative stranded variant", $id,$variant_aux," to positive ",$direct_variant) if ($main::verbose >= 2);

              } elsif ($strand ne "+") {
                  &RSAT::message::TimeWarn("Strand information does not match any known annotation, variant", $id," will be retrieved as direct") if ($main::verbose >= 1);
              }
              &RSAT::message::Debug( "variant ", $variant_aux ."\n",
                                     "strand", $strand."\n",
                                     "start ",$start ."\n" ,
                                     "flank_len ", $flank_len ."\n" ) if ($main::verbose >= 10);

              my $variant_seq = $left_flank.$direct_variant.$right_flank;

              ## Select the output fields
              my @out_fields = ($chr, $start, $end, $strand, $id, $type, $ref,  $variant_aux,  $minor_allele_freq, $variant_seq);
              &RSAT::message::Debug("Variant retrieved fields\n", join("\t",@out_fields )) if ($main::verbose >= 10);
              print $out join("\t", @out_fields)."\n";
              $total_nb_variant++;
          }
      }
      close($var_handle);
  }

  ## Report the execution time (this has to be executed by all scripts)
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  if ($main::verbose >= 1) {
      print $out "; Total variations\t",$total_nb_variation,"\n";
      print $out "; Total variants\t",$total_nb_variant,"\n";
      print $out $exec_time;
  }

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

################################################################
## Display full help message
##
## The manual is the POD documentation embedded in this script, which
## is formatted with pod2text. The program exits after displaying it.
sub PrintHelp {
  ## List form of system(): the path of the script is transmitted as
  ## a single argument without being parsed by a shell, so that it may
  ## contain spaces or shell metacharacters.
  system("pod2text", "-c", $0);
  exit(0);
}

################################################################
## Display the help message for the option -help.
##
## There is no separate short help for this program: the full help is
## displayed by &PrintHelp(), which also exits.
sub PrintOptions {
  return &PrintHelp();
}

################################################################
## Read arguments
##
## Parse the command-line arguments and store their values in the
## package variables of main ($main::verbose, $main::species,
## %main::infile, %main::outfile, ...). The program dies on an invalid
## option or on an invalid option value.
##
## The description of each option is written in POD next to the code
## that handles it; these sections make up the OPTIONS chapter of the
## manual.
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      ## The verbosity level is optional (default: 1)
      if (&IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-species species_name>

Species name. This name must correspond to the species of the
variation/bed/id file if provided.

=cut
    } elsif ($arg eq "-species") {
      $main::species = lc(shift(@arguments));

=pod

=item B<-species_suffix suffix>

Optional suffix, used together with the species name, the assembly and
the release to locate the genome and variation directories.

=cut
    } elsif ($arg eq "-species_suffix") {
      $main::species_suffix = lc(shift(@arguments));

=pod

=item B<-release #>

The version of ensembl database (e.g. 84).

Note: each Ensembl release contains a specific assembly for
each species. When the option -release is used, the option
-assembly should thus in principle not be used.

=cut
    } elsif (($arg eq "-release") || ($arg eq "-version") || ($arg eq "-e_version")) {
      ## Obsolete synonyms are still accepted, with a warning that
      ## names the option actually used
      if ($arg ne "-release") {
        &RSAT::message::Warning("Option ".$arg." is obsolete, has been replaced by -release.");
      }
      $main::ensembl_release = shift(@arguments);

=pod

=item B<-assembly #>

Assembly (e.g. GRCh37 for the assembly 37 of the Human genome).

Note: genome assemblies can cover several successive ensemble
versions. In case of ambiguity, the latest corresponding ensembl
version is used.

=cut
    } elsif (($arg eq "-assembly") || ($arg eq "-a_version")) {
      if ($arg ne "-assembly") {
        &RSAT::message::Warning("Option -a_version is obsolete, has been replaced by -assembly.");
      }
      $main::assembly = shift(@arguments);

      ## Disabled option, kept for reference. It is written as Perl
      ## comments (outside of the POD sections) so that it does not
      ## show up in the help message.
      ##
      ##   -available_species
      ##   Get the list of all locally supported species and genome assemblies.
      ##
      ##   } elsif ($arg eq "-available_species") {
      ##     $main::available = 1;

=pod

=item B<-i input_file>

Input File.

The input file specifies a list of query variations.
Each row corresponds to one query.

The variations can be provided in various formats (see option -format
below).

=cut
    } elsif ($arg eq "-i") {
      $main::infile{input} = shift(@arguments);

=pod

=item B<-format variation_format>

Format of the input file

Supported formats:

=over

=item I<varBed>

Format of variation files used by all RSAT scripts.

=item I<id>

tab-delimited file with all variation IDs in a given column, which can
be specified by the option I<-col>.

=item I<bed>

General format for the description of genomic features
(see https://genome.ucsc.edu/FAQ/FAQformat.html#format1).

=back

=cut
    } elsif ($arg eq "-format") {
      $main::format = shift(@arguments);
      ## %supported_output_format indexes the supported input formats
      unless ($main::supported_output_format{$main::format}) {
        &RSAT::error::FatalError($main::format, "Invalid input format. Supported: ". $main::supported_input_formats);
      }

=pod

=item B<-mml #>

Length of the longest matrix.

The program will adapt the length of the flanks to be extracted on
each side of the variants, in order to be able to align the longest
matrix on both sides.

The flanking size on each side will be

    n = mml -1

For example, if the longest matrix of the database contains 30
columns, the flanking size will be 29 base pairs, which is sufficient
to align the SNP at all the positions of the matrix.

=cut
    } elsif ($arg eq "-mml") {
      ## The matrix length must be at least 1, otherwise the flank
      ## length (mml - 1) would be negative
      if (&IsNatural($arguments[0]) && ($arguments[0] >= 1)) {
        $main::flank_len = shift(@arguments)-1;
      } else {
        &RSAT::error::FatalError("-mml argument : ",shift(@arguments)," is not a strictly positive natural number");
      }

=pod

=item B<-col #>

Column containing the variation IDs with the input format "id".

Default : 1

=cut
    } elsif ($arg eq "-col") {
      if (&IsNatural($arguments[0])) {
        $main::col = shift(@arguments);
      } else {
        &RSAT::error::FatalError("-col argument : ",shift(@arguments)," is not natural");
      }

=pod

=item	B<-o outputfile>

The output file is a tab-delimited text file (see section OUTPUT
FORMAT).

If no output file is specified, the standard output is used.  This
allows using the command within a pipe.

=cut
    } elsif ($arg eq "-o") {
      $main::outfile{output} = shift(@arguments);

=pod

=back

=cut

    } else {
      &FatalError(join("\t", "Invalid option", $arg));
    }
  }
}

################################################################
## Verbose message
##
## Print on the output stream ($out), as comment lines starting with
## ";", the command line arguments, the program version, and the list
## of input and output files.
sub Verbose {
  print $out "; retrieve-variation-seq ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;

  ## The input and output files are reported in the same way: a title
  ## line followed by one line per file. Empty tables are not reported.
  my @file_tables = (["Input files", \%main::infile],
                     ["Output files", \%main::outfile]);
  foreach my $file_table (@file_tables) {
    my ($title, $files) = @{$file_table};
    next unless (%{$files});
    print $out "; ".$title."\n";
    foreach my $role (keys(%{$files})) {
      printf $out ";\t%-13s\t%s\n", $role, $files->{$role};
    }
  }
}
