#!/usr/bin/perl -w

############################################################
#
# $Id: download-ensembl-variations,v 1.22 2013/08/12 10:22:28 rsat Exp $
#
############################################################

use warnings;

=pod

=head1 NAME

download-ensembl-variations

=head1 VERSION

$program_version

=head1 DESCRIPTION

Download GVF file of variations from Ensembl, decompress the file,
remove variations that fail to pass the quality check.

Create "combinatory variations" from sets of overlapping variations.
Write new variation file with one file per chromosome plus a separate
file for the removed variations.

/!\ Before using I<download-ensembl-variations>, genomic sequences
need to be installed in raw format for the species of interest. To
download genomic sequences, run the command
I<download-ensembl-genome>.

=head1 AUTHORS

=over

=item I<Jeremy Delerce> (Master 2 thesis 2013)

=item I<Alejandra Medina-Rivera> (amedina@lcg.unam.mx)

=item I<Jacques van Helden> (Jacques.van-Helden\@univ-amu.fr)

=back

=head1 CATEGORY

=over

=item util

=back

=head1 USAGE

 download-ensembl-variations -species # [-dir #] [-available_species] [-v #]

=head2 Example

Get all variations for Homo sapiens

 download-ensembl-variations -species Homo_sapiens

=head1 OUTPUT FORMAT

A tab delimited file containing one row per variation, with the
following column content.

=head2 VARIATIONS PASSING QC

=over

=item 1. chrom

The name of the chromosome (e.g. 1, X, 8...)

=item 2. chromStart

The starting position of the feature in the chromosome

=item 3. chromEnd

The ending position of the feature in the chromosome

=item 4. chromStrand

The strand of the feature in the chromosome

=item 5. varId

The id of the variation(s)

=item 6. refSeq

Reference sequence of the variation

=item 7. varSeq

Sequence of all the variant

=item 8. type

Type of the variation

=item 9. validate

If the variation is validate.
Go to the following link to see all validation state :
http://www.ncbi.nlm.nih.gov/projects/SNP/snp_legend.cgi?legend=validation

=item 10. minor_allele_freq

Minor allele frequency

=item 11. isSpVar

If the variation is a combinatory variation

=item 12. inSpVar

If the variation is in a combinatory variation

=back

=head2 VARIATIONS FAILING QC

=over

=item 1. chrom

The name of the chromosome (e.g. 1, X, 8...)

=item 2. chromStart

The starting position of the feature in the chromosome

=item 3. chromEnd

The ending position of the feature in the chromosome

=item 4. chromStrand

The strand of the feature in the chromosome

=item 5. varId

The id of the variation(s)

=item 6. description

Why the variation is remove

=back

=head1 SEE ALSO

=head2 install-ensembl-genome

I<install-ensembl-genome> is a tools that allow to install all Ensembl genome, feature and variation.

=head1 WISH LIST

=cut

BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}

require "RSA.lib";
require "RSAT_to_ensembl.lib.pl";

################################################################
## Main package
package	main;
{

  ###############################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf"%d."."%02d" x $#r, @r };

  our %outfile = ();

  our $verbose = 2;
  our $out = STDOUT;
  our $out_var = "";
  our $db = "ensembl";
  our $taxid = "";
  our $species = "";
  our $assembly_version = "";
  our $species_suffix = "";
  our $full_species_ID = "";
#  our $data_dir = &Get_data_dir();
  our $registry = 'Bio::EnsEMBL::Registry';
  our $ensembl_version_safe = &get_ensembl_version_safe($db);
  our $ensembl_version_latest = &get_ensembl_version_latest($db);
  our $ensembl_version = &get_ensembl_version($db);
  our $null = "<NULL>";

  our $get_available_species = 0;
  our $ref_seq = "";
  
  ################################################################
  ## Tasks
  
  ## Default tasks are executed if no task is specified
  local @default_tasks = (
      "download_gvf", ## Fetch gvf file from ensembl 
      "report_snps", ## Read downloaded file from ensembl and report the processed data 
      "clean_gvf", ## Delete the uncompressed downloaded GVF files after processing
      );
  
  ## These optional tasks are NOT executed by default, either because
  ## they are particularly time-consuming, or because we don't want to
  ## loose the raw data
  local @optional_tasks = (
      "check_ref_seq", ## Check the sequence on the reference genome
      "super_variations", ## Compute super-variations
      "clean_gvf_gz", ## Delete the downloaded compressed GVF files after processing
      );
  local @supported_tasks = (
      @default_tasks,
      @optional_tasks,
      "default", ## Run default tasks (avoid eavy tasks such as check_ref_seq)
      "all", ## Run all other tasks
      );
  local $supported_tasks = join (",", @supported_tasks);
  local %supported_tasks = ();
  foreach my $task (@supported_tasks) {
      $supported_tasks{$task} = 1;
  }
  local %task = (); ## List of tasks to be executed


  ################################################################
  ## Read argument values
  &ReadArguments();

  
  ################################################################
  ## Check the selected options


  ################################################################
  ## Check selected tasks

  ## If no tasks has been specified, execute them all
  if ($task{all}) {
      %task = %supported_tasks;
      delete $task{'all'};
      delete $task{'default'};
      
  } elsif ((scalar(keys(%task))==0) || ($task{default})) {
      ## Activate default tasks, i.e. all tasks except some highly
      ## time-consuming tasks
      foreach my $task (@default_tasks) {
	  $task{$task} = 1;
      }
  }

  &RSAT::message::Info("Selected tasks", sort(keys(%task))) if ($main::verbose >= 2);

  ## Check the consistency between reference sequence in the GVF file
  ## and corresponding genomic sequence. Note that this verification
  ## costs memory (each chromosome is laoded in RAM) and time (the
  ## control of each variation requires a substring extraction).
  if ($task{check_ref_seq}) {
      &RSAT::message::Warning("Beware: task check_ref_seq is activated.", 
			      "This may take a while for large genomes with high frequencies of variations") 
	  if ($main::verbose >= 1);
  }


  ################################################################
  ## Check ensembl version
  &check_ensembl_version($db, $ensembl_version);

  ## Change Ensembl version to EnsemblGenomes version if required
  if (lc($db) eq "ensemblgenomes") {
    use Bio::EnsEMBL::Registry;
    &RSAT::message::TimeWarn("download-ensembl-variations", "Getting the list of available species", "db=".$db) if ($main::verbose >= 1);
    &LoadRegistry($registry, $db, $ensembl_version);
    
    my @dbas = @{ $registry->get_all_DBAdaptors() };
    foreach my $dba (@dbas) {
      if ($dba->species() eq "multi") {
        @fields = split("_",$dba->dbc()->dbname());
        $ensembl_version = $fields[-2];
      }
    }
  }


  

  ################################################################
  ## Print verbose
  $out = &OpenOutputFile($outfile{output});
  &Verbose() if ($main::verbose >= 1);

  ################################################################

   ## Get available species in the ensembl variation ftp server

#   my @variation_ftp = &Get_variation_ftp($db, $ensembl_version);
#   &RSAT::error::FatalError("$db version : $ensembl_version not supported. No variation available for this version.") if (scalar(@variation_ftp) == 0);

#   ## If several addresses are available loop thrhough them and list them 
#   ## Usually recent version of ensembl won't have multiple folders
#   foreach (@variation_ftp) {
#     &RSAT::message::Info("Variation URL",$_) if ($main::verbose >= 2);
#   }

#   ## Retrieve the subfolders in the variations folder
#   ## in the ensembl server
# >>>>>>> b078097db7f4286a2036f24e8aa99ceccb0912a4
  my @available_species_dir = ();
  my @available_species = ();
  my %variation_ftp = ();
  if ($db eq "ensemblgenomes") {
      ## Get ensembl variation ftp path
      %variation_ftp = &Get_variation_ftp($db, $ensembl_version);
      @available_species = sort keys %variation_ftp;
    
  } else {
    my @variation_ftp = &Get_variation_ftp($db, $ensembl_version);
    if (scalar(@variation_ftp) == 0) {
      &RSAT::message::Warning("$db version : $ensembl_version not supported. No variation available for this version.");
      exit(0);
    }


    ## If several addresses are available loop thrhough them and list them.
    ## Usually recent version of ensembl won't have multiple folders.
    foreach my $ftp (@variation_ftp) {
      &RSAT::message::Info("Variation URL",$ftp) if ($main::verbose >= 2);
      push (@available_species_dir, qx{wget -S --spider $ftp."/" 2>&1})
    }
    
    ## push the subfolder names into a variable
    ## subfolder names correspond to species
    foreach (@available_species_dir) {
      next unless (/^d/);
      my @fields = split(" ");
      next if ($fields[-1] =~ /\./);
      push (@available_species, $fields[-1]);
    }
    
  
#   &RSAT::message::Debug("Species with viariation data available \n", join("\n",@available_species_dir)) if  ($main::verbose >= 10) ;
  
  }

  ################################################################
  ## Print available species
  if ($get_available_species) {

    foreach (sort {$a cmp $b} @available_species) {
       print $out ucfirst($_),"\n";
    }

    my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
    print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
    close $out;
    exit(0);
  }

  ################################################################
  ## Check argument values

  &RSAT::error::FatalError("You must specify a species (option -species).") unless ($species);
  &RSAT::error::FatalError("No variation avalaible for $species on $db") unless ( grep($_ eq $species, @available_species ));

  ## Get genome version
  $assembly_version = &Get_assembly_version($species,$ensembl_version, $species_suffix);
  #die "BOOM";

  &RSAT::error::FatalError("No assembly version found for $species and $db version $ensembl_version. Use download-ensembl-genome before download-ensembl-variations.") unless ($assembly_version);

  
  ## Check if genome version has been installed
  $full_species_ID = &Get_full_species_ID($species, $assembly_version,$ensembl_version, $species_suffix);

  &RSAT::message::Info("Full species ID", $full_species_ID) if ($main::verbose >= 2);
  $genome_dir = &Get_genome_dir($species, $assembly_version,$ensembl_version, $species_suffix);
  &RSAT::message::Info("Genome directory", $genome_dir) if ($main::verbose >= 2);
  printf $out ("; %-22s\t%s\n", "Full species ID", $full_species_ID);
  printf $out ("; %-22s\t%s\n", "Genome directory", $genome_dir);
  &RSAT::error::FatalError($genome_dir, "does not exist. Use download-ensembl-genome before download-ensembl-variations.") 
      unless (-d $genome_dir);
  # Check if sequence file are not missing
  my %chr_file = &Get_file_seq_name($genome_dir);
  foreach my $file (keys(%chr_file)) {
      my $chr_path = $genome_dir."/".$chr_file{$file};
      unless (-f $chr_path) {
	  &RSAT::error::FatalError("Missing chromosome sequence file", $chr_path);
      }
  }


  ################################################################
  ## Retrieve GVF file from ensembl

  ## Get URL
  my $species_variation_ftp = &Get_variation_species_ftp($db,$species,$ensembl_version);

  ## Define the basename of the GVF file
  my $gvf_file = ucfirst($species).".gvf";
  
  ## Define the path of the GVF file in the variation directory
  my $variation_dir = &Get_variation_dir($species, $assembly_version,$ensembl_version, $species_suffix);
  my $gvf_file_local_path = $variation_dir."/".$gvf_file;
  my $gvf_file_gz = $gvf_file_local_path.".gz";
  
  ## Define the URL of the GVF file on Ensembl FTP server
  my $gvf_file_ftp = &Get_gvf_ftp($db,$species,$ensembl_version);
  
  &RSAT::message::Info("Species variation URL --> $species_variation_ftp") if ($main::verbose >= 2);
  &RSAT::message::Info("Species gvf URL --> $gvf_file_ftp") if ($main::verbose >= 2);
  
  if ($task{download_gvf}){
      
      ## JvH: THIS SHOULD BE REVISED: currently, the GVF files are
      ## systematically downloaded, even if they are already there,
      ## because the files are decompressed locally -> the original
      ## .gz files are systematically missing, so they are
      ## re-downloaded and re-decompressed and so on.
      
      ###Dowload variation file
      &RSAT::message::TimeWarn("Downloading GVF files from", $gvf_file_ftp) if ($main::verbose >= 2);
      &RSAT::util::CheckOutDir($variation_dir);
      system(join(" ", "wget --no-verbose --timestamping -L", $gvf_file_ftp, "-P", $variation_dir));
      
      ## Decompress variation files only if their timestamps differ
      my $time_stamp_gvf_file = `stat -c %Y $gvf_file_local_path`;
      my $time_stamp_gvf_file_gz = `stat -c %Y $gvf_file_gz`;
      if ((&IsReal($time_stamp_gvf_file)) 
	  &&(&IsReal($time_stamp_gvf_file_gz)) 
	  && ($time_stamp_gvf_file >= $time_stamp_gvf_file_gz)) {
	  &RSAT::message::Info("Skipping uncompression, since gz file is not newer than uncompressed file", $gvf_file_local_path) 
	      if ($main::verbose >= 0);
      }  else {
	  &RSAT::message::TimeWarn("Decompressing GVF file", $gvf_file_gz) if ($main::verbose >= 2);
	  my $cmd = "gunzip -vcf ".$gvf_file_gz." > ".$gvf_file_local_path; ## Uncompress the gz GVF file
	  $cmd .= "; touch -r ".$gvf_file_gz." ".$gvf_file_local_path; ## Assign the time stamp of the gzip file to the uncompressed file
	  system($cmd);
      }
  }

  ################################################################
  ## Treat the variations
  &RSAT::message::TimeWarn("Filtering variation and computing combinatorial variation") if ($main::verbose >= 2);
  my $out_rm = &OpenOutputFile($variation_dir."/Failed.tab");
  my %out_vars = ();

  ## Intialize output file per chromosome
  foreach my $chr (keys(%chr_file)) {
    $out_vars{$chr} = &OpenOutputFile($variation_dir."/".$chr.".varBed");
    $out_var = $out_vars{$chr};

    ## Print header (field names)
    print $out_var "#", join("\t",
			     "chr",
			     "start",
			     "end",
			     "strand",
			     "id",
			     "ref",
			     "alt",
			     "so_term",
			     "validate",
			     "minor_allele_freq",
			     "is_supvar",
			     "in_supvar"), "\n";
  }

  ################################################################
  ## Process/open GVF file
  if ($task{report_snps}){
      my $gvf_file = ucfirst($species).".gvf";
      &RSAT::message::TimeWarn("Starting task 'report_snps' for GVF file", $gvf_file) if ($main::verbose >= 2); 
#      my $gvf_file = $gvf_file_ftp;
#      $gvf_file =~ s/\.gz//;
#      $gvf_file =~ s/$species_variation_ftp/$variation_dir\//;
      
      ## Intialize variables 
      my @super_variation = ();
      my $last_chr = "";
      my $last_end = 0;
      my $last_id = "";
      &RSAT::message::Info("Analyzing GVF file", $gvf_file, "Only SNPs with accurate information will be kept.") 
	  if ($main::verbose >= 2);

      ## Check if the files is ordered while analyzing it
      ## if one chromose is done being analyzed it will get flagged
      ## if further down the file the chromosome appears again this means
      ## the file is not sorted.
      ## Send a warnning if this is the case and  die.

      my %analyzed_chr=(); 

      ## Open GVF file and report SNP information and sequence to be used by retrieve-variation
      my ($file) = &OpenInputFile($gvf_file_local_path);
      while (<$file>) {
	  next if (/^#/); ## Skip comments
	  chomp();
	  
	  ## Get variation info
	  my ($chr,$source,$so_term,$start,$end,$score,$strand,$phase,$attributes) = split("\t");

	  ## variants ares expected to be sorted by chromosome 
	  ## if the chromosome was fralged as completed die on error
	  if ($analyzed_chr{$chr}){
	      &RSAT::error::FatalError("Format Error: Input file is not sorted correctly");
	  }

	  my %info = ();
	  foreach my $fields (split(";",$attributes)) { ## Split and stor attribute values
	      my ($attributeID,$value) = split("=",$fields);
	      $info{$attributeID} = $value;
	  }
	  
	  ## Skip Variants that lack mandatory attributes (Reference
	  ## sequence, variant sequence and ID).
	  next unless ($info{'Reference_seq'} && $info{'Variant_seq'} && $info{'ID'});
	  
	  ## Retrive ID for the variant, which is stored in attribute
	  ## Dbxref Ej:Dbxref=dbSNP_137:rs186434315
	  my $id = ""; 
	  if ($info{'Dbxref'}) {
	      my @fields = split(":",$info{'Dbxref'});
	      $id = $fields[-1];
	  } else {
	      $id = $info{'ID'}; ## If information is not avialable use ID attribute (variant number in the table)
	  }
	  
	  ## Retrieve validation status from either "validation_status"
	  ## or "evidence" attributes.
	  if ($info{'validation_status'} || $info{'evidence'}) {
	      $info{'validate'} = 1;
	  } elsif ($info{'validation_states'}) {
	      if ($info{'validation_states'} eq "-") {
		  $info{'validate'} = 0;
	      } else {
		  $info{'validate'} = 1;
	      }
	  } else {
	      $info{'validate'} = 0; ## If information is not abailable store 0
	  }
	  
	  ## Retrive global minor allele frequency if
	  ## available. Ej:global_minor_allele_frequency=1|0.000915751|2
	  my $m_allele_freq; 
	  if ($info{'global_minor_allele_frequency'}) {
	      my @gmaf = split("\\|",$info{'global_minor_allele_frequency'});
	      $m_allele_freq = $gmaf[1];
	  } else {
	      $m_allele_freq ="NA"; ## If minor allele freq is not avialable mark as NA
	  }
	  

	  ## JVH: I SHOULD REALLY REVISE THIS.  It is really not clean
	  ## to load the chromosome sequence at the end of the
	  ## loop. There is a conceptual problem here.

	  ## Get reference sequence and change output file.  Note: the
	  ## whole script assumes that variations are sorted by
	  ## chromosome.
	  ## ALE: IF THE FILE IS NOT SORTED NOW THE PROGRAM WILL DIE ON ERROR, SAME AS BEDTOOLS

	  if ($last_chr ne $chr) {

	      ## Flag analyzed chromosome as done
	      $analyzed_chr{$last_chr}=1;

	      &Get_super_variation($last_chr, $last_end,@super_variation) unless ( scalar(@super_variation) == 0);
	      @super_variation = (); ## Empty super_variations array
	      $last_end = 0;
	      $last_id = "";
	      
	      &RSAT::message::TimeWarn("Analyzing variations on chromosome : $chr",
				       "\n\tOutfile:",$variation_dir."/".$chr.".varBed")
		  if ($main::verbose >= 2);
	      
	      $last_chr = $chr;
	      $out_var = $out_vars{$chr};
	      
	      ## Extracts the full sequence of the chromosome, in
	      ## order to check all the reference sequences.
	      ##
	      ## QUESTION FROM JVH: should we really do this
	      ## verification ? We should check how much time the
	      ## whole procedure takes with, and without it.
	      if ($task{check_ref_seq}) {
		  my $raw_file = $genome_dir."/".$chr_file{$chr};
		  &RSAT::message::TimeWarn("Reading sequence for chromosome", $chr, $raw_file) if ($main::verbose >= 0);
		  $ref_seq = qx(cat $raw_file);
#		  $ref_seq = qx($ENV{'RSAT'}/perl-scripts/sub-sequence -i $raw_file -from 1 -to 500000000 -format raw);
	      }
	      
	     
	  }

	  ################################################################
	  ## Remove bug line 
	  ##
	  ## !!!!!!!!!!!!!!!!
	  ##
	  ## QUESTION_FROM_ALE=Are there repeat lines in the gvf file?
	  ## Not sure what this line is supposed to do.
	  next if ($end < $last_end);
	  next if ($last_id eq $id);
	  
	  ################################################################
	  ## Remove variations that do not compy the information
	  ## quality expectation.
	  
	  ## Remove variations whith unkonw strand, since the strand is
	  ## required to get the correct sequence.
	  if ( $strand eq "-" ) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tVariation must be indicate on '+' strand\n";
	      next;
	  }
	  
	  ## Remove variations where Reference sequence is unkown.
	  if ( $info{'Reference_seq'} =~ /[^ACGT\-]/) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tReference variant $info{'Reference_seq'} does not only contain A,C,G,T,-\n";
	      next;
	  }
	  
	  ## Remove variations for which alternative variation
	  ## sequence is unkown.
	  if ( $info{'Variant_seq'} =~ /[^ACGT\-,]/) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tAlternative variant $info{'Variant_seq'} does not only contain A,C,G,T,-\n";
	      next;
	  }
	  
	  ## Length of variation does not correspond to the end and
	  ## start information in the table.
	  if ( $end-$start+1 != length($info{'Reference_seq'}) ) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tLength of the variation ".($end-$start+1)." ($start-$end) not identical to the length of reference seq ".(length($info{'Reference_seq'}))." ($info{'Reference_seq'})\n";
	      next;
	  }
	  
	  ## Check that sequence provided as reference in the GVF file
	  ## matches the sequence at corresponding position in the 
	  ## genome installed on RSAT.
	  if ($task{check_ref_seq}) {
	      $variation_ref_seq = substr($ref_seq,$start-1,$end-$start+1);
	      if (( $info{'Reference_seq'} ne "-") && ($info{'Reference_seq'} ne $variation_ref_seq)) {
		  print $out_rm join ("\t", 
				      $chr,
				      $start,
				      $end,
				      $strand,
				      $id,
				      "Reference sequence",
				      $info{'Reference_seq'},
				      "differs from genome sequence",
				      $variation_ref_seq), "\n";
		  next;
	      }
	  }
	  
	  
	  ################################################################
	  ## Check if the variation is not a part of a
	  ## super_variation.
	  ## 
	  ## !!!!!!!!!!!!!!!!
	  ##
	  ## Note for JvH and Alejandra: we should check what are
	  ## these "super-variations" (seem to be sets of mutually
	  ## oevrlapping variations), and if we want to maintain
	  ## them. In particular, check if they do not provoke a
	  ## combinatorial explosion of processing time and/or storage
	  ## space.
	  ## ALE: THIS PART ALSO ASSUMES VARIANTS ARE SORTED BY CHROMOSOME AND THEN BY POSSITION

	  if ( $start <= $last_end ) {
	      $last_end=$end if ($last_end < $end);
	  } else {
	      
	      &Get_super_variation($chr, $last_end,@super_variation) unless ( scalar(@super_variation) == 0);
	      @super_variation = ();
	      $last_end = $end;
	      $last_id = "";
	  }
	  
	  ## ALE: This could be causing the insertion crazyness where start was being marked
	  ## as bigger than the end. I'm not sure what was the aim of the line.
	  ## $start ++ if ($info{'Reference_seq'} eq "-");
	  push (@super_variation, join("\t",
				       $chr,
				       $start,
				       $end,
				       $strand,
				       $id,
				       $info{'Reference_seq'},
				       $info{'Variant_seq'},
				       $so_term,
				       $info{'validate'},
				       $m_allele_freq));
	  ##push (@super_variation,$chr."\t".$start."\t".$end."\t".$strand."\t".$id."\t".$info{'Reference_seq'}."\t".$info{'Variant_seq'}."\t".$so_term."\t".$info{'validate'}."t");
	  $last_id = $id;
      }
      
      &Get_super_variation($last_chr, $last_end,@super_variation);
      
      
      close $file;
  } ## close task{report_snps}

  ################################################################
  ## Delete original GVF file if requested
  if ($task{clean_gvf}) {
      unlink($gvf_file_local_path);
  } 
  if ($task{clean_gvf_gz}) {
      unlink($gvf_file_gz);
  } 

  
  &RSAT::message::Info("Variations installed in dir", $variation_dir) if ($main::verbose >= 1);
  
  ################################################################
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
  print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $out;
  
  exit(0);
}


################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

################################################################
## Display full help message
sub PrintHelp {
  system "pod2text -c $0";
  exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
  &PrintHelp();
}

################################################################
## Get super variation
sub Get_super_variation {
  my ($chr, $last_end, @super_variation) = @_;
  &RSAT::message::Debug("Super variation array content\n", join("\n",@super_variation)) 
      if ( ($main::verbose >= 10) && (scalar(@super_variation)>1));


  ################################################################
  ## Group variations.
  ##
  ## Groups are All var, All validate var, All outside insert, All
  ## outside valide insert.  Outside insert are insert just after the
  ## end of the combinatory variation.

  my @super_variation_validate = ();
  my @super_insert = ();
  my @super_insert_validate = ();
  my $nb_validate = 0;

  if ( scalar(@super_variation) >1 ) {
    my @super_validate = ();

    my @fields = split("\t",$super_variation[0]);
    my $super_validate_end = $fields[2];

    for (my $i=0; $i<scalar(@super_variation) ; $i++) {
      my @var_info = split("\t",$super_variation[$i]);

      ## Group outside insert and outside valide insert
      if ( $var_info[5] eq "-" && $var_info[1] > $last_end) {
         push (@super_insert, $super_variation[$i]);
         push (@super_insert_validate, $super_variation[$i]) if ($var_info[8]==1);
         splice(@super_variation, $i, 1);
         $i--;

      ## Group validate variation
      } elsif ($var_info[8] == 1) {
        my $nb_validate ++;

        if ( $var_info[1] > $super_validate_end) {
          push (@super_variation_validate, \@super_validate);
          @super_validate = ();
        }

        push (@super_validate,$super_variation[$i]);
        $super_validate_end = $var_info[2] if ($super_validate_end < $var_info[2]);
      }
    }
    push (@super_variation_validate, \@super_validate) unless ( @super_validate );
  }

  #&RSAT::message::Debug("PFFFFFF\n", join("\n",@super_variation)) if ( ($main::verbose >= 10) && (scalar(@super_variation)>1));

  ################################################################
  ## Generate output line for super-variation
  %output_line = ();

  if ( scalar(@super_variation) == 1) {
    my @info = split("\t",$super_variation[0]); ## output line hash based on chr and start
    push (@{$output_line{$info[1]}{$info[2]}}, $super_variation[0]."\t0\t0\n");
  }

  elsif ( scalar(@super_variation) > 1) {
      
      ##ORIGINAL CODE: Possible error in $super_variation[0] definition causes 
      ##repeating the first variant several times while ignoring all others, possibly this is not the desired behavior
      ##Variation in the super variation
      ##foreach my $line (@super_variation) {
	##  my @info = split("\t",$line);
	 ## push (@{$output_line{$info[1]}{$info[2]}}, $super_variation[0]."\t0\t1\n");
     ## }

      ## ALEJANDRA'S SUGGESTION
      ## Variation in the super variation
      ## in_supvar attribute is defined here
      foreach my $line (@super_variation) {
	  my @info = split("\t",$line);
	  #print      $line."\t0\t1"."BOOM"."\n";
	  push (@{$output_line{$info[1]}{$info[2]}}, $line."\t0\t1\n");
      }

      ################################################################
      ## Compute "super variations", i.e. combinations between
      ## variations. THIS MAY LEAD TO AN EXPLOSION OF THE NUMBER OF
      ## VARIATIONS.
      if ($task{super_variations}) {
	  if ( scalar(@super_variation_validate) == 1 && $nb_validate == scalar(@super_variation) ) {
	      
	      ## If all variations are valid
	      my @info = split("\t", &MakeSuperVar(@{$super_variation_validate[0]} ));
	      push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t0\n");
	      
	  } else {
	      
	      ## Validate variations
	      foreach (@super_variation_validate) {
		  next if ( scalar(@{$_}) < 1);
	      my @info = split("\t", &MakeSuperVar(@{$_} ));
		  push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t1\n");
	      }
	      
	      # All variation
	      my @info = split("\t",&MakeSuperVar(@super_variation));
	      push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t0\t1\t0\n");
	  }
      }
  }
  

  ## Outside insert
  if ( scalar(@super_insert) > 1) {
      
      if ($task{super_variations}) {
	  if (scalar(@super_insert) == scalar(@super_insert_validate) ) {
	      my @info = split("\t",&MakeSuperVar(@super_insert_validate));
	      push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t0\n");
	      
	  } else {
	      
	      my @info = split("\t",&MakeSuperVar(@super_insert));
	      push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t0\t1\t0\n");
	      
	      if ( scalar(@super_insert_validate) >= 1) {
		  my @info = split("\t",&MakeSuperVar(@super_insert_validate));
		  push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t1\n");
	      }
	      
	  }
      }

      foreach my $insert (@super_insert) {
	  my @info = split("\t",$insert);
	  push (@{$output_line{$info[1]}{$info[2]}}, $insert."\t0\t1\n");
      }
      
  } elsif ( scalar(@super_insert) == 1) {
      my @info = split("\t",$super_insert[0]);
      push (@{$output_line{$info[1]}{$info[2]}}, $super_insert[0]."\t0\t0\n");
  }
  
  ################################################################
  ## Print

  &RSAT::message::Debug("Output variations") if (($main::verbose >= 10) && (scalar(@super_variation)>1));

  my @sorted_start = sort{$a<=>$b} ( keys( %output_line ) );
  foreach my $start ( @sorted_start ) {
    my @sorted_end = sort{$a<=>$b} ( keys( %{$output_line{$start}} ) );
    foreach my $end ( @sorted_end ) {
      print $out_var @{$output_line{$start}{$end}};
      print @{$output_line{$start}{$end}} if (($main::verbose >= 10) && (scalar(@super_variation)>1));
    }
  }
  #<STDIN> if ( ($main::verbose >= 10) && (scalar(@super_variation)>1));  
}

################################################################
## Create all combinations between variations
##
## JvH: THIS SHOULD BE REVISED, it can lead to a combinatorial
## explosion of the data size. Do we really want to create all
## possible combinations, whereas only a subset of them are observed
## in populations ? An alternative would be to use te data about
## haplotypes, but this should also be evaluated because haplotype
## datasets are voluminous.
sub MakeSuperVar {
    my @lines = @_;
    &RSAT::message::TimeWarn("Computing combinations between", scalar(@lines), "variations") if ($main::verbose >= 0);
    
    my $super_start=0;
    my $super_end=0;
    my @list_id = ();
    
    foreach my $line (@lines) {
	my @var_info = split("\t",$line);
	
	## Get coord
	$super_end = $var_info[2] if ($var_info[2] > $super_end);
	$super_start = $var_info[1] if ($var_info[1] < $super_start || $super_start == 0);

	## Get id
	push (@list_id,$var_info[4]);
    }

    ## Get ref seq
    my $super_ref = substr($ref_seq,$super_start-1,$super_end-$super_start+1);
    $super_ref = "-" if (length($super_ref) == 0);


    ## Get variants
    my @list_variants = ();
    @list_variants = &Get_alternative_variant($super_ref,$super_start-2,$super_start,$super_end,\@_,\@list_variants);

    ## Get SO_Term
    my $so_term = "sequence_alteration";
    
    if ($super_ref eq "-" ) {
	$so_term = "insertion";

    } else {
	my $same_len = 1;
	my $is_del = 1;
	
	foreach $var (@list_variants) {
	    if (length($var) >=  length($super_ref)) {
		$is_del = 0;
	    }
	    if ( length($var) != length($super_ref) ) {
		$same_len = 0;
	    }
	}

	if ($same_len && length($super_ref) == 1) {
	    $so_term = "SNV";
	} elsif ($same_len) {
	    $so_term = "subsitution";
	} elsif ($is_del) {
	    $so_term = "deletion";
	}
    }

    return $super_start."\t".$super_end."\t"."+"."\t".join(',',@list_id)."\t".$super_ref."\t".join(',',@list_variants)."\t".$so_term;
}

################################################################
## Get alternative variant.
sub Get_alternative_variant {
  my ($ref_variant,$last_end,$super_start,$super_end,$list_variations,$list_variants) = @_;

  @list_variations = @{$list_variations};
  @list_variants = @{$list_variants};

  for (my $i = 0; $i < scalar(@list_variations);$i++ ) {
    my @var_info = split("\t",$list_variations[$i]);
    my $start = $var_info[1];
    $start -- if ($var_info[5] eq "-");
    
    if ( $start > $last_end) {

      foreach my $variant (split(",", $var_info[6])) {
        $var = substr($ref_variant,0, length($ref_variant) - ($super_end-$var_info[1]+1) ).$variant.substr( $ref_variant, length($ref_variant) - ($super_end-$var_info[2]) );
        $var =~ s/\-//g if (length($var)>1);

        push (@list_variants, $var ) unless (grep ($_ eq $var, @list_variants));
        @list_variants = &Get_alternative_variant($var,$var_info[2],$super_start,$super_end,\@list_variations,\@list_variants);
      }

      @list_variants = &Get_alternative_variant($ref_variant,$var_info[2],$super_start,$super_end,\@list_variations,\@list_variants);
    }
  }
  return (@list_variants);
}


################################################################
## Read arguments
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-species species_name>
=item B<-org species_name> (equivalent)

Name of the species for which you want to download the variations (e.g
Homo_sapiens, Mus_musculus).

Help: to get the list of species available at Ensembl, run the
command:

   I<download-ensembl-variations -available_species> 

=cut
    } elsif (($arg eq "-species") || ($arg eq "-org")) {
      $main::species = lc(shift(@arguments));

################################################################
## THIS OPTION HAS BEEN INACTIVATED BY JvH on 2014-10-28
# =pod
#
# =item B<-dir #>
#
# The directory in wich RSAT genomes must be installed. The selected
# species will be installed in a sub-directory composed of Species name
# and Ensembl genome version.
#
# Default : $RSAT/data/
#
# =cut
#     } elsif ($arg eq "-dir") {
#       $main::data_dir = shift(@arguments);

=pod

=item   B<-o outputfile>

The output file is used to hold a trace of the transfers (verbosity),
and to store the list of species when the option -available_species is
activated.

If no output file is specified, the standard output is used.

=cut
    } elsif ($arg eq "-o") {
      $outfile{output} = shift(@arguments);

=pod

=item B<-db ensembl|ensemblgenomes>

Default: Ensembl

Select the source database. 

=over

=item I<ensembl>

The "historical" Ensembl database (L<http://ensembl.org/>), restricted
to a series of genomes from model organisms (69 supported species on
Oct 30, 2014).

=item I<ensemblgenomes>

The exended EnsemblGenomes database (L<http://ensemblgenomes.org/>),
which comprises repositories for the following taxa: Bacteria
(actually includes Archaea), Fungi, Metazoa, Plants, Protists.

In Oct 2014, EnsemblGenomes supports >15,000 species.

=item I<ensemblall>

Load both Ensembl and Ensembl Genomes.

=back

=cut
   } elsif ($arg eq "-db") {
    $main::db = lc(shift(@arguments));
    unless (($main::db eq "ensembl")
	    || ($main::db eq "ensemblgenomes")
	    || ($main::db eq "ensemblall")
            ) {
	&RSAT::error::FatalError($main::db, "Invalid value for the option -db. Supported: ensembl,ensemblgenomes,ensemblall");
    }

=pod

=item B<-task>

Tasks to be performed by download-ensembl-variations

Supported tasks:

=over

=item B<download_gvf>

Download the GVF (Genome variant file) from Ensembl.

=item B<report_snps>

Calculate the theoretical distribution

=back

=cut
    } elsif ($arg eq "-task") {
	$arg = shift (@arguments);
	chomp($arg);
	my @tasks = split ",", $arg;
	foreach my $task (@tasks) {
	    $task = lc($task);
	    if ($supported_tasks{$task}) {
		$task{$task} = 1;
	    } else {
		&RSAT::error::FatalError($task, "Invalid tasks. Supported:", $supported_tasks);
	    }
	}
	
=pod

=item B<-available_species>

Get all available species on Ensembl

=cut
    } elsif ($arg eq "-available_species") {
      $main::get_available_species = 1;

=pod

=item B<-version #>

The release version of ensEMBL.

Supported versions: version numbers (e.g. 70, 72, ...), safe, latest

Default : I<safe>

=over

=item I<safe>

The file locations and/or formats of the Ensembl rsync distribution
may change between two Ensembl release. For this reason, we defined
the "safe" version, which corresponds to the earliest version of
ensembl which has been checked to work with this script.

=item I<latest>

This corresponds to the lastest available version of Ensembl. Beware:
this version is not guaranteed to be compatible with RSAT, in case
Ensembl would change their file formats or locations.

=back

=cut
    } elsif ($arg eq "-version") {
      my $version = shift(@arguments);
      if (&IsNatural($version)) {
	  $main::ensembl_version = $version;
	  &RSAT::error::FatalError($version , 
				   "Invalid Ensembl version: cannot be lower than the \"safe\" version", 
				   $ensembl_version_safe) 
	      if ($main::ensembl_version < $ensembl_version_safe);
	  &RSAT::error::FatalError($version, 
				   "Invalid Ensembl version: cannot be higher than the latest available version",
				   $ensembl_version_latest) 
	      if ($main::ensembl_version > $ensembl_version_latest);
      } elsif ($version eq "safe") {
	  $main::ensembl_version = $ensembl_version_safe;
      } elsif ($version eq "latest" ) {
	  $main::ensembl_version = $ensembl_version_latest;
      } else {
        &RSAT::error::FatalError($version, "Invalid value for Ensembl version.");
      }

=pod

=item B<-species_suffix>

Suffix to append to the full species ID.

By default, the full species ID is composed by concatenating the
Ensembl species and assembly version. The option I<-species_suffix>
allows to specify a string (e.g. _ensembl76, _for_testing, ...) that
will be appended to the full species ID.

=cut
    } elsif ($arg eq "-species_suffix") {
    $species_suffix = shift(@arguments);


    } else {
      &FatalError($arg, "Invalid option");
    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
  print $out "; download-ensembl-variations ";
  &PrintArguments($out);

  if (%main::outfile) {
    print $out "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
    printf $out ("; %-22s\t%s\n", "Ensembl safe version", $ensembl_version_safe);
    printf $out ("; %-22s\t%s\n", "Ensembl version",$ensembl_version);
    printf $out ("; %-22s\t%s\n", "Species suffix", $species_suffix) if ($species_suffix);
  }
}
