#!/usr/bin/perl -w

############################################################
#
# $Id: download-ensembl-variations,v 1.22 2013/08/12 10:22:28 rsat Exp $
#
############################################################

use warnings;

=pod

=head1 NAME

download-ensembl-variations

=head1 VERSION

$program_version

=head1 DESCRIPTION

Download a GVF/VCF file of variations from Ensembl [Genomes], decompress the file &
remove variations that fail to pass the quality check.

Create "combinatory variations" from sets of overlapping variations.
Write new variation file with one file per chromosome plus a separate
file for the removed variations.

/!\ Before using I<download-ensembl-variations>, genomic sequences
need to be installed in raw format for the species of interest. To
download genomic sequences, run the command
I<download-ensembl-genome>.

=head1 AUTHORS

=over

=item I<Jeremy Delerce> (Master 2 thesis 2013)

=item I<Aníbal Rivera-Gonzalez>

=item I<Walter Santana-Garcia> (wsantana@lcg.unam.mx)

=item I<Alejandra Medina-Rivera> (amedina@lcg.unam.mx)

=item I<Jacques van Helden> (Jacques.van-Helden@univ-amu.fr)

=back

=head1 CATEGORY

=over

=item util

=back

=head1 USAGE

 download-ensembl-variations -species # -assembly # -release #

=head2 Example

Get all variations for Homo sapiens

 download-ensembl-variations -species Homo_sapiens -assembly GRCh38 -release 90

=head1 OUTPUT FORMAT

A tab delimited file containing one row per variation, with the
following column content.

=head2 VARIATIONS PASSING QC

=over

=item 1. chrom

The name of the chromosome (e.g. 1, X, 8...)

=item 2. chromStart

The starting position of the feature in the chromosome

=item 3. chromEnd

The ending position of the feature in the chromosome

=item 4. chromStrand

The strand of the feature in the chromosome

=item 5. varId

The id of the variation(s)

=item 6. refSeq

Reference sequence of the variation

=item 7. varSeq

Sequences of all the variant alleles (comma-separated)

=item 8. type

Type of the variation

=item 9. validate

Whether the variation has been validated (1) or not (0).
See the following link for the list of all validation states:
http://www.ncbi.nlm.nih.gov/projects/SNP/snp_legend.cgi?legend=validation

=item 10. minor_allele_freq

Minor allele frequency

=back

=head2 VARIATIONS FAILING QC

=over

=item 1. chrom

The name of the chromosome (e.g. 1, X, 8...)

=item 2. chromStart

The starting position of the feature in the chromosome

=item 3. chromEnd

The ending position of the feature in the chromosome

=item 4. chromStrand

The strand of the feature in the chromosome

=item 5. varId

The id of the variation(s)

=item 6. description

The reason why the variation was removed

=back

=head1 SEE ALSO

=head2 install-ensembl-genome

I<install-ensembl-genome> is a tool that installs Ensembl genomes, features and variations.

=head1 WISH LIST: -genome_dir, missing task: clean_vcf

=cut

BEGIN {
  ## Make the RSAT libraries reachable. The pattern matches the trailing
  ## run of characters of $0 that contains no '/' (nor parenthesis),
  ## i.e. normally the script basename; the prematch variable then holds
  ## what precedes it, to which "lib/" is appended before being added to
  ## the module search path.
  push(@INC, "$`lib/") if ($0 =~ /([^(\/)]+)$/);
}

#require "RSA.lib";
#require "RSAT_to_ensembl.lib.pl";
require "RSA.lib";
require "RSAT_to_ensembl.lib.pl";
use Bio::EnsEMBL::Registry;

################################################################
## Main package
package	main;
{

  ###############################################################
  ## Initialise parameters
  ##
  ## All variables declared with "our" are package variables of main,
  ## shared with the subroutines of this script.
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf"%d."."%02d" x $#r, @r };

  our %outfile = ();

  our $verbose = 2;
  our $out = STDOUT;
  our $out_var = "";
  our $db = "ensembl";
  our $taxid = "";
  our $species = "";
  our $assembly = "";
  our $species_suffix = "";
  our $full_species_ID = "";
#  our $data_dir = &Get_data_dir();
  our $registry = 'Bio::EnsEMBL::Registry';

  ## This default MUST be assigned before the arguments are read: an
  ## "our $var = value" statement is executed where it stands, so when
  ## placed after &ReadArguments() it silently resets whatever the
  ## argument parser stored in the variable.
  ## NOTE(review): presumably set by ReadArguments() (not visible from
  ## here) -- confirm against the option list.
  our $get_available_species = 0;

  ################################################################
  ## Tasks

  ## Default tasks are executed if no task is specified
  local @default_tasks = (
    "download_gvf", ## Fetch gvf file from ensembl
    "report_snps", ## Read downloaded file from ensembl and report the processed data
    "clean_gvf", ## Delete the uncompressed downloaded GVF files after processing
  );

  ## These optional tasks are NOT executed by default, either because
  ## they are particularly time-consuming, or because we don't want to
  ## lose the raw data
  local @optional_tasks = (
    "check_ref_seq", ## Check the sequence on the reference genome
    "super_variations", ## Compute super-variations
    "clean_gvf_gz", ## Delete the downloaded compressed GVF files after processing
  );

  ## The extra tasks are exclusively used for the VCF download
  local @extra_tasks = (
    "fromvcf", ## Download the vcf file from ensembl
  );

  local @supported_tasks = (
    @default_tasks,
    @optional_tasks,
    @extra_tasks,
    "default", ## Run default tasks (avoid heavy tasks such as check_ref_seq)
    "all", ## Run all other tasks
  );
  local $supported_tasks = join (",", @supported_tasks);

  ## Index of the supported task names
  local %supported_tasks = ();
  foreach my $task (@supported_tasks) {
    $supported_tasks{$task} = 1;
  }
  local %task = (); ## List of tasks to be executed

  ################################################################
  ## Read argument values
  &ReadArguments();

  ## The release numbers are computed after the arguments have been
  ## read because they depend on the selected database ($db).
  ## NOTE(review): if ReadArguments() stores a user-specified release
  ## in $ensembl_release, the assignment below overrides it -- confirm.
  our $ensembl_release_safe = &get_ensembl_release_safe($db);
  our $ensembl_release_latest = &get_ensembl_release_latest($db);
  our $ensembl_release = &get_ensembl_release($db);
  our $null = "<NULL>";

  our $ref_seq = ""; ## Raw sequence of the chromosome currently processed

  our $group_lc; ## Just used with the fromvcf task ## ANRG_2
  our $personal_release; ## Just used with the fromvcf task ## ANRG_2
  

  ################################################################
  ## The VCF download process is started when the task fromvcf has
  ## been requested. It has to run before the release validation. ##ANRG_2
  if (defined($task{"fromvcf"})) {
    &Download_from_vcf();
  }

  ################################################################
  ## Routine validating the release ##ANRG_2
  &Validate_release();

  ################################################################
  ## Check selected tasks
  if ($task{all}) {
    ## Task "all": activate every supported task, except the two
    ## meta-tasks themselves.
    %task = %supported_tasks;
    delete @task{'all', 'default'};

  } elsif (!%task || $task{default}) {
    ## No task specified, or task "default": activate the default
    ## tasks, i.e. all tasks except the highly time-consuming ones.
    @task{@default_tasks} = (1) x scalar(@default_tasks);
  }

  if ($main::verbose >= 2) {
    my @selected_tasks = sort(keys(%task));
    &RSAT::message::Info("Selected tasks", @selected_tasks);
  }

  ## Task check_ref_seq compares the reference sequence of each
  ## variation of the GVF file with the corresponding genomic
  ## sequence. This verification costs memory (each chromosome is
  ## loaded in RAM) and time (one substring extraction per variation),
  ## so the user is warned.
  if ($task{check_ref_seq} && ($main::verbose >= 1)) {
    &RSAT::message::Warning("Beware: task check_ref_seq is activated.",
                            "This may take a while for large genomes with high frequencies of variations");
  }


  ################################################################
  ## Check ensembl release
  &check_ensembl_release($db, $ensembl_release);

  ## For EnsemblGenomes, replace the release number by the one found in
  ## the name of the "multi" database of the registry (second-to-last
  ## underscore-separated field of the database name).
  if (lc($db) eq "ensemblgenomes") {

    if ($main::verbose >= 1) {
      &RSAT::message::TimeWarn("download-ensembl-variations", "Getting the list of available species", "db=".$db);
    }
    &LoadRegistry($registry, $db, $ensembl_release);

    foreach my $dba (@{ $registry->get_all_DBAdaptors() }) {
      next unless ($dba->species() eq "multi");
      @fields = split("_",$dba->dbc()->dbname());
      $ensembl_release = $fields[-2];
    }
  }

  ################################################################
  ## Open the output stream and print the verbosity header
  $out = &OpenOutputFile($outfile{output});
  if ($main::verbose >= 1) {
    &Verbose();
  }

  ################################################################
  ## Get the species for which variations are available for the
  ## selected database and release.
  my @available_species_dir = ();
  my @available_species = ();
  my %variation_ftp = ();
  if ($db eq "ensemblgenomes") {
    ## EnsemblGenomes: the species names are the keys of the hash
    ## returned by Get_variation_ftp().
    %variation_ftp = &Get_variation_ftp($db, $ensembl_release);
    @available_species = sort keys %variation_ftp;

  } else {
    ## Ensembl: Get_variation_ftp() returns a list of URLs, whose
    ## directory listings are parsed to collect the species names.
    my @variation_ftp = &Get_variation_ftp($db, $ensembl_release);
    unless (@variation_ftp) {
      &RSAT::message::Warning("$db release : $ensembl_release not supported. No variation available for this release.");
      exit(0);
    }

    ## Collect the listing of each variation URL (recent releases
    ## usually have a single one).
    ## NOTE(review): inside qx{} the characters ."/" are passed literally
    ## to the shell, they are not a Perl concatenation -- confirm the
    ## resulting URL is the intended one.
    foreach my $ftp (@variation_ftp) {
      &RSAT::message::Info("Variation URL",$ftp) if ($main::verbose >= 2);
      push (@available_species_dir, qx{wget -S --spider $ftp."/" 2>&1});
    }

    ## Keep the last field of the listing lines starting with "d",
    ## skipping the names that contain a dot.
    foreach my $listing_line (@available_species_dir) {
      next unless ($listing_line =~ /^d/);
      my @fields = split(" ", $listing_line);
      next if ($fields[-1] =~ /\./);
      push (@available_species, $fields[-1]);
    }
  }

  ################################################################
  ## Print the available species and stop, if requested
  if ($get_available_species) {

    foreach my $species_name (sort @available_species) {
      print $out ucfirst($species_name),"\n";
    }

    my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
    print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
    close $out;
    exit(0);
  }

  ################################################################
  ## Check argument values

  &RSAT::error::FatalError("You must specify a species (option -species).") unless ($species);
  &RSAT::error::FatalError("No variation avalaible for $species on $db") unless ( grep($_ eq $species, @available_species ));

  ## Get genome assembly
  $assembly = &Get_assembly($species,$ensembl_release, $species_suffix);

  ## Report the release in use, with a label and only on request
  ## (a bare release number printed as a warning carries no information).
  &RSAT::message::Info("Ensembl release", $ensembl_release) if ($main::verbose >= 2);

  &RSAT::error::FatalError("No assembly found for $species and $db release $ensembl_release. Use download-ensembl-genome before download-ensembl-variations.") unless ($assembly);

  ## Check if genome release has been installed
  $full_species_ID = &Get_full_species_ID($species, $assembly,$ensembl_release, $species_suffix);
  &RSAT::message::Info("Full species ID", $full_species_ID) if ($main::verbose >= 2);

  ## $genome_dir is deliberately left as a package variable, as in the
  ## rest of the script.
  $genome_dir = &Get_genome_dir($species, $assembly,$ensembl_release, $species_suffix);
  &RSAT::message::Info("Genome directory", $genome_dir) if ($main::verbose >= 2);
  printf $out ("; %-22s\t%s\n", "Full species ID", $full_species_ID);
  printf $out ("; %-22s\t%s\n", "Genome directory", $genome_dir);
  &RSAT::error::FatalError($genome_dir, "does not exist. Use download-ensembl-genome before download-ensembl-variations.")
      unless (-d $genome_dir);

  ## Check that no chromosome sequence file is missing. %chr_file is
  ## indexed by chromosome name, the values are file names relative to
  ## the genome directory.
  my %chr_file = &Get_file_seq_name($genome_dir);
  foreach my $file (keys(%chr_file)) {
    my $chr_path = $genome_dir."/".$chr_file{$file};
    unless (-f $chr_path) {
      &RSAT::error::FatalError("Missing chromosome sequence file", $chr_path);
    }
  }


  ################################################################
  ## Locate the GVF file, on the FTP server and on this machine

  ## URL of the variation directory of the species. The release number
  ## is obtained from get_ensembl_release(), with arguments that depend
  ## on the database; any other database leaves the URL empty.
  if ($main::verbose >= 8) {
    &RSAT::message::Info("SEE. We are here, this is the number: ",&get_ensembl_release("ensemblgenomes",1));
  }
  my $species_variation_ftp =
      ($db eq "ensemblgenomes") ? &Get_variation_species_ftp($db,$species,&get_ensembl_release("ensemblgenomes",1))
    : ($db eq "ensembl")        ? &Get_variation_species_ftp($db,$species,&get_ensembl_release())
    :                             "";

  ## Basename of the GVF file: lowercase species name
  my $gvf_file = lc($species).".gvf";

  ## Paths of the uncompressed and compressed GVF files in the
  ## variation directory
  my $variation_dir = &Get_variation_dir($species, $assembly,$ensembl_release, $species_suffix);
  my $gvf_file_local_path = $variation_dir."/".$gvf_file;
  my $gvf_file_gz = $gvf_file_local_path.".gz";

  ## URL of the GVF file on the FTP server, built with the same
  ## database-dependent release number as above
  if ($main::verbose >= 8) {
    &RSAT::message::Info("SEE. We are here, this is the number: ",$ensembl_release);
  }
  my $gvf_file_ftp =
      ($db eq "ensemblgenomes") ? &Get_gvf_ftp($db,$species,&get_ensembl_release("ensemblgenomes",1))
    : ($db eq "ensembl")        ? &Get_gvf_ftp($db,$species,&get_ensembl_release())
    :                             "";

  if ($main::verbose >= 2) {
    &RSAT::message::Info("Species variation URL --> $species_variation_ftp");
    &RSAT::message::Info("Species gvf URL --> $gvf_file_ftp");
  }

  if ($task{download_gvf}){

      ## Download the compressed GVF file into the variation directory.
      ## With --timestamping, wget skips the transfer when the local
      ## copy is not older than the remote file. The compressed file is
      ## kept after decompression (gunzip -c), so that it is not
      ## downloaded again at each run.
      &RSAT::message::TimeWarn("Downloading GVF files from", $gvf_file_ftp) if ($main::verbose >= 2);
      &RSAT::util::CheckOutDir($variation_dir);
      my $wget_status = system(join(" ", "wget --no-verbose --timestamping -L", $gvf_file_ftp, "-P", $variation_dir));
      if ($wget_status != 0) {
          ## Not fatal: a previously downloaded copy may still be usable
          &RSAT::message::Warning("wget returned a non-zero status ($wget_status) for", $gvf_file_ftp);
      }

      ## Modification times of the two files, obtained with the Perl
      ## built-in stat (portable, and undef when the file is missing).
      my $time_stamp_gvf_file = (stat($gvf_file_local_path))[9];
      my $time_stamp_gvf_file_gz = (stat($gvf_file_gz))[9];

      if (!defined($time_stamp_gvf_file_gz)) {
          ## No compressed file: never run gunzip, since its output
          ## redirection would truncate an existing uncompressed file.
          &RSAT::error::FatalError("GVF file could not be downloaded and no local copy is available", $gvf_file_ftp)
              unless (defined($time_stamp_gvf_file));
          &RSAT::message::Warning("Compressed GVF file not found, using the existing uncompressed file", $gvf_file_local_path);

      } elsif (defined($time_stamp_gvf_file)
               && ($time_stamp_gvf_file >= $time_stamp_gvf_file_gz)) {

          &RSAT::message::Info("Skipping uncompression, since gz file is not newer than uncompressed file", $gvf_file_local_path)
              if ($main::verbose >= 0);

      } else {

          &RSAT::message::TimeWarn("Decompressing GVF file", $gvf_file_gz) if ($main::verbose >= 2);
          my $cmd = "gunzip -vcf ".$gvf_file_gz." > ".$gvf_file_local_path; ## Uncompress the gz GVF file
          $cmd .= " && touch -r ".$gvf_file_gz." ".$gvf_file_local_path; ## Assign the time stamp of the gzip file to the uncompressed file
          if (system($cmd) != 0) {
              ## Remove the partial output, which would otherwise look
              ## up to date at the next run.
              unlink($gvf_file_local_path);
              &RSAT::error::FatalError("Decompression of GVF file failed", $gvf_file_gz);
          }
      }
  }

  ################################################################
  ## Treat the variations
  ##
  ## The output files (Failed.tab for the removed variations, plus one
  ## varBed file per chromosome) are only written by task report_snps.
  ## They are thus only opened when this task is selected: opening them
  ## unconditionally would overwrite the results of a previous run each
  ## time another task is run alone.
  my $out_rm;          ## Handle of the file of removed variations
  my %out_vars = ();   ## Handles of the varBed files, indexed by chromosome

  if ($task{report_snps}) {
    &RSAT::message::TimeWarn("Filtering variation and computing combinatorial variation") if ($main::verbose >= 2);
    $out_rm = &OpenOutputFile($variation_dir."/Failed.tab");

    ## Initialize output file per chromosome
    foreach my $chr (keys(%chr_file)) {
      $out_vars{$chr} = &OpenOutputFile($variation_dir."/".$chr.".varBed");
      $out_var = $out_vars{$chr};

      ## Print header (field names)
      print $out_var "#", join("\t",
                               "chr",
                               "start",
                               "end",
                               "strand",
                               "id",
                               "ref",
                               "alt",
                               "so_term",
                               "validate",
                               "minor_allele_freq"), "\n";
    }
  }

  ################################################################
  ## Process/open GVF file
  ##
  ## Task report_snps: read the uncompressed GVF file line by line,
  ## write the variations failing one of the checks below to $out_rm,
  ## and print the others, after conversion of the coordinates and of
  ## the allele representation, to the output file of their chromosome.
  if ($task{report_snps}){
      ## This lexical shadows the $gvf_file of the enclosing block and is
      ## only used in the messages below; the file actually read is
      ## $gvf_file_local_path.
      my $gvf_file = ucfirst($species).".gvf";
      &RSAT::message::TimeWarn("Starting task 'report_snps' for GVF file", $gvf_file) if ($main::verbose >= 2);
#      my $gvf_file = $gvf_file_ftp;
#      $gvf_file =~ s/\.gz//;
#      $gvf_file =~ s/$species_variation_ftp/$variation_dir\//;

      ## Initialize variables
      ## (@super_variation is only referenced by the commented-out
      ## super-variation code below)
      my @super_variation = ();
      my $last_chr = "";
      #my $last_end = 0;
      #my $last_id = "";
      &RSAT::message::Info("Analyzing GVF file", $gvf_file, "Only SNPs with accurate information will be kept.")
	  if ($main::verbose >= 2);

      ## Check that the file is sorted by chromosome while analyzing it:
      ## when the chromosome changes, the previous one is flagged as
      ## done. If a flagged chromosome appears again further down the
      ## file, the file is not sorted and the program dies on error.

      my %analyzed_chr=();

      ## Open GVF file and report SNP information and sequence to be used by retrieve-variation
      my ($file) = &OpenInputFile($gvf_file_local_path);
      while (<$file>) {
	       next if (/^#/); ## Skip comments
	        chomp();

	  ## Get variation info (the nine tab-separated GVF columns)
	  my ($chr,$source,$so_term,$start,$end,$score,$strand,$phase,$attributes) = split("\t");

	  ## Variants are expected to be sorted by chromosome:
	  ## if the chromosome was flagged as completed, die on error
	  if ($analyzed_chr{$chr}){
	      &RSAT::error::FatalError("Format Error: Input file is not sorted correctly");
	  }
    ## JVH: I SHOULD REALLY REVISE THIS.  It is really not clean
    ## to load the chromosome sequence at the end of the
    ## loop. There is a conceptual problem here.

    ## Get reference sequence and change output file.  Note: the
    ## whole script assumes that variations are sorted by
    ## chromosome.
    ## ALE: IF THE FILE IS NOT SORTED NOW THE PROGRAM WILL DIE ON ERROR, SAME AS BEDTOOLS

    if ($last_chr ne $chr) {

        ## Flag the previous chromosome as done (on the first data
        ## line this flags the empty initial value of $last_chr)
        $analyzed_chr{$last_chr}=1;

        #&Get_super_variation($last_chr, $last_end,@super_variation) unless ( scalar(@super_variation) == 0);
        #@super_variation = (); ## Empty super_variations array
        #$last_end = 0;
        #$last_id = "";

        &RSAT::message::TimeWarn("Analyzing variations on chromosome : $chr",
               "\n\tOutfile:",$variation_dir."/".$chr.".varBed")
                if ($main::verbose >= 2);

        ## NOTE(review): if the GVF file contains a sequence name that
        ## has no entry in %chr_file, $out_vars{$chr} and
        ## $chr_file{$chr} are undefined here -- confirm this cannot
        ## happen, or skip such lines.
        $last_chr = $chr;
        $out_var = $out_vars{$chr};

        ## Load the raw reference sequence of the new chromosome. It is
        ## loaded for every chromosome, whatever the selected tasks,
        ## because the indel handling below reads it as well.
        my $raw_file = $genome_dir."/".$chr_file{$chr};
        $ref_seq = qx(cat $raw_file);
        #$main::last_chr = $chr;


        ## Extracts the full sequence of the chromosome, in
        ## order to check all the reference sequences.
        ##
        ## QUESTION FROM JVH: should we really do this
        ## verification ? We should check how much time the
        ## whole procedure takes with, and without it.
        #if ($task{check_ref_seq}) {
        #    my $raw_file = $genome_dir."/".$chr_file{$chr};
        #    &RSAT::message::TimeWarn("Reading sequence for chromosome", $chr, $raw_file) if ($main::verbose >= 0);
        #    $ref_seq = qx(cat $raw_file);
        #    #$ref_seq = qx($ENV{'RSAT'}/perl-scripts/sub-sequence -i $raw_file -from 1 -to 500000000 -format raw);
        #}


    }

	  ## Split the attribute column (semicolon-separated key=value
	  ## pairs) and store the values in a hash indexed by attribute.
	  ## NOTE(review): split("=") has no limit, so a value that itself
	  ## contains "=" would be cut at its first "=" -- confirm this
	  ## never occurs in the input.
	  my %info = ();
	  foreach my $fields (split(";",$attributes)) { ## Split and store attribute values
	      my ($attributeID,$value) = split("=",$fields);
	      $info{$attributeID} = $value;
	  }

	  ## Skip variants that lack mandatory attributes (Reference
	  ## sequence, variant sequence and ID).
	  next unless ($info{'Reference_seq'} && $info{'Variant_seq'} && $info{'ID'});

	  ## Retrieve the ID of the variant, which is the last
	  ## colon-separated field of attribute Dbxref.
	  ## E.g.: Dbxref=dbSNP_137:rs186434315
	  my $id = "";
	  if ($info{'Dbxref'}) {
	      my @fields = split(":",$info{'Dbxref'});
	      $id = $fields[-1];
	  } else {
	      $id = $info{'ID'}; ## If this information is not available, use the ID attribute (variant number in the table)
	  }

	  ## Derive the validation flag: 1 if attribute
	  ## "validation_status" or "evidence" is set, or if attribute
	  ## "validation_states" is set to anything but "-"; 0 otherwise.
	  if ($info{'validation_status'} || $info{'evidence'}) {
	      $info{'validate'} = 1;
	  } elsif ($info{'validation_states'}) {
	      if ($info{'validation_states'} eq "-") {
		  $info{'validate'} = 0;
	      } else {
		  $info{'validate'} = 1;
	      }
	  } else {
	      $info{'validate'} = 0; ## If the information is not available, store 0
	  }

	  ## Retrieve the global minor allele frequency if available
	  ## (second pipe-separated field of the attribute).
	  ## E.g.: global_minor_allele_frequency=1|0.000915751|2
	  my $m_allele_freq;
	  if ($info{'global_minor_allele_frequency'}) {
	      my @gmaf = split("\\|",$info{'global_minor_allele_frequency'});
	      $m_allele_freq = $gmaf[1];
	  } else {
	      $m_allele_freq ="NA"; ## If the minor allele frequency is not available, mark it as NA
	  }




	  ################################################################
	  ## Remove bug line
	  ##
	  ## !!!!!!!!!!!!!!!!
	  ##
	  ## QUESTION_FROM_ALE=Are there repeat lines in the gvf file?
	  ## Not sure what this line is supposed to do.
	  #next if ($end < $last_end);
	  #next if ($last_id eq $id);

	  ################################################################
	  ## Remove the variations that do not comply with the
	  ## information quality expectations. Each removed variation is
	  ## written to $out_rm with the reason of its removal.

	  ## Remove variations annotated on the "-" strand (only this
	  ## value is tested; any other strand value is accepted).
	  if ( $strand eq "-" ) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tVariation must be indicate on '+' strand\n";
	      next;
	  }

	  ## Remove variations whose reference sequence contains
	  ## characters other than A, C, G, T and "-".
	  if ( $info{'Reference_seq'} =~ /[^ACGT\-]/) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tReference variant $info{'Reference_seq'} does not only contain A,C,G,T,-\n";
	      next;
	  }

	  ## Remove variations whose alternative sequences contain
	  ## characters other than A, C, G, T, "-" and the comma that
	  ## separates the alleles.
	  if ( $info{'Variant_seq'} =~ /[^ACGT\-,]/) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tAlternative variant $info{'Variant_seq'} does not only contain A,C,G,T,-\n";
	      next;
	  }

	  ## Remove variations whose length (end - start + 1) differs
	  ## from the length of the reference sequence. Since "-" has
	  ## length 1, variations with a "-" reference only pass this
	  ## check when start equals end.
	  if ( $end-$start+1 != length($info{'Reference_seq'}) ) {
	      print $out_rm $chr."\t".$start."\t".$end."\t".$strand."\t".$id;
	      print $out_rm "\tLength of the variation ".($end-$start+1)." ($start-$end) not identical to the length of reference seq ".(length($info{'Reference_seq'}))." ($info{'Reference_seq'})\n";
	      next;
	  }

	  ## Task check_ref_seq: check that the sequence provided as
	  ## reference in the GVF file matches the sequence found at the
	  ## corresponding position of the loaded chromosome sequence.
	  ## Variations with a "-" reference are not checked.
	  ## ($variation_ref_seq is not declared with "my": it is a
	  ## package variable.)
	  if ($task{check_ref_seq}) {
	      $variation_ref_seq = substr($ref_seq,$start-1,$end-$start+1);
	      if (( $info{'Reference_seq'} ne "-") && ($info{'Reference_seq'} ne $variation_ref_seq)) {
		        print $out_rm join ("\t",
				      $chr,
				      $start,
				      $end,
				      $strand,
				      $id,
				      "Reference sequence",
				      $info{'Reference_seq'},
				      "differs from genome sequence",
				      $variation_ref_seq), "\n";
		        next;
	      }
	  }
    ###############################################################
	  ### Update start coordinate due to 0-based start
    $start = $start -1;

    ##################################################################
    ### Start assessing if the variant contains an insertion/deletion
    ### WSG: The start of the insertion is really the end
    if($info{'Reference_seq'} eq "-") {
      ################################################################################
      ### WSG: Change start and end, because when insertion is found it is reported
      ### the end and the next base,i.e. it comprehends the end of the between
      ### coordinate and the next base of the genome sequence.
      $start = $end;


      ## Get the previous nucleotide, as stated for insertions in VCF.
      ## After the decrement, $start equals $end - 1, so that substr()
      ## returns the base at 1-based position $end. This base becomes
      ## the reference allele. ($prev_nucle is a package variable.)
      $start --;
      $prev_nucle = substr($ref_seq, $start,1);
      $info{'Reference_seq'} = $prev_nucle;

      ## Prepend the retrieved nucleotide to each inserted allele
      my @alts = ();
      foreach my $alt (split(",",$info{'Variant_seq'})) {
        push (@alts,$prev_nucle.$alt);
      }

      $info{'Variant_seq'} = join (",",@alts);

    } else {

      ## Get length of reference variant
      my $ref_size    = length($info{'Reference_seq'});
      my $indel_found = 0 ;
      my @valid_alts = ();
      #my @alts_size = ();
      foreach my $alt ( split(",", $info{'Variant_seq'} ) ) {

        ############################################################################################
        ### WSG: Check if allele is not the same as reference. This is mandatory when the Genotype
        ### is heterozygous as stated by GVF format. That is why I need to iterate through ALL
        ### the foreach loop.
        ### (If every allele equals the reference, @valid_alts stays
        ### empty and the line is printed with an empty allele field.)
        if($info{'Reference_seq'} ne $alt) {
          push (@valid_alts, $alt);
        }

        ## An allele whose length differs from the reference, or
        ## which is "-", marks the variation as an indel
        my $size = length($alt);
        if ($ref_size != $size || $alt eq "-") {
          $indel_found = 1;
        }

      }

      ## If an INDEL has been found, update both the reference and the
      ## alternative alleles with the previous nucleotide, as stated by
      ## the VCF format.
      if ($indel_found) {

        ## Get the nucleotide preceding the variation.
        ## NOTE(review): for a variation starting at position 1,
        ## $start becomes -1 here, and substr() with a negative offset
        ## counts from the end of the sequence -- confirm this case
        ## cannot occur.
        $start --;
        $prev_nucle = substr($ref_seq, $start,1);

        ## Prepend the retrieved nucleotide to each alternative allele;
        ## a "-" allele is replaced by this nucleotide alone
        my @alts = ();
        foreach my $alt ( @valid_alts ) {

          if($alt eq "-"){
            $alt = $prev_nucle;
          } else {
            $alt = $prev_nucle.$alt;
          }

          push (@alts,$alt);
        }

        #Update alternative alleles
        @valid_alts = @alts;

        #Update reference allele
        $info{'Reference_seq'} = $prev_nucle.$info{'Reference_seq'};

      }

      #Update alternative alleles
      $info{'Variant_seq'} = join (",",@valid_alts);


    }
    ## Get the SO term of each alternative allele (comma-separated),
    ## which replaces the term read from the third column of the GVF
    $so_term = &Get_SO(\$info{'Reference_seq'},\$info{'Variant_seq'});

    ###############################################################
    ### Print line
    print $out_var "$chr\t$start\t$end\t$strand\t$id\t$info{'Reference_seq'}\t$info{'Variant_seq'}\t$so_term\t$info{'validate'}\t$m_allele_freq\n";

	  ################################################################
	  ## Check if the variation is not a part of a
	  ## super_variation.
	  ##
	  ## !!!!!!!!!!!!!!!!
	  ##
	  ## Note for JvH and Alejandra: we should check what are
	  ## these "super-variations" (seem to be sets of mutually
	  ## overlapping variations), and if we want to maintain
	  ## them. In particular, check if they do not provoke a
	  ## combinatorial explosion of processing time and/or storage
	  ## space.
	  ## ALE: THIS PART ALSO ASSUMES VARIANTS ARE SORTED BY CHROMOSOME AND THEN BY POSITION

	  #if ( $start <= $last_end ) {
	  #    $last_end=$end if ($last_end < $end);
	  #} else {

	  #    &Get_super_variation($chr, $last_end,@super_variation) unless ( scalar(@super_variation) == 0);
	  #    @super_variation = ();
	  #    $last_end = $end;
	  #    $last_id = "";
	  #}

	  ## ALE: This could be causing the insertion craziness where start was being marked
	  ## as bigger than the end. I'm not sure what was the aim of the line.
	  ## $start ++ if ($info{'Reference_seq'} eq "-");
	  #push (@super_variation, join("\t",
		#		       $chr,
		#		       $start,
		#		       $end,
		#		       $strand,
		#		       $id,
		#		       $info{'Reference_seq'},
		#		       $info{'Variant_seq'},
		#		       $so_term,
		#		       $info{'validate'},
		#		       $m_allele_freq));
	  ##push (@super_variation,$chr."\t".$start."\t".$end."\t".$strand."\t".$id."\t".$info{'Reference_seq'}."\t".$info{'Variant_seq'}."\t".$so_term."\t".$info{'validate'}."t");
	  #$last_id = $id;
      }

      #&Get_super_variation($last_chr, $last_end,@super_variation);


      close $file;
  } ## close task{report_snps}

  ################################################################
  ## Delete the uncompressed and/or compressed GVF file if requested
  unlink($gvf_file_local_path) if ($task{clean_gvf});
  unlink($gvf_file_gz) if ($task{clean_gvf_gz});

  ################################################################
  ## Update the table of supported variations. The assembly ID is read
  ## from the 13th column of the lines of the supported organism file
  ## that contain the species name, after removal of all white spaces.
  ## NOTE(review): the database name is hard-coded to "ensemblgenomes"
  ## whatever the value of $db, and $main::personal_release is commented
  ## at its declaration as only used with the fromvcf task -- confirm
  ## both against UpdateVariationsSupported().
  my $supported_organism_file = &Get_supported_file();
  my $assembly_id=`grep -w $species $supported_organism_file | cut -f 13 | perl -lne \'s\/\\s+//g; print\'`;
  chomp($assembly_id);
  my $installed_release = $main::personal_release;
  &UpdateVariationsSupported($species,$assembly_id,"ensemblgenomes",$installed_release);

  if ($main::verbose >= 1) {
    &RSAT::message::Info("Variations installed in dir", $variation_dir);
  }

  ################################################################
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  if ($main::verbose >= 1) {
    print $out $exec_time; ## only report exec time if verbosity is specified
  }
  close $out;

  exit(0);
}


################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

################################################################
## Display full help message
sub PrintHelp {
  ## Render the POD documentation of this script with pod2text (-c:
  ## coloured output), then stop the program. The command is passed to
  ## system() as a list, so that the script path is handed over verbatim
  ## instead of being interpreted by a shell (a path containing spaces
  ## or shell metacharacters would otherwise break the command).
  system("pod2text", "-c", $0);
  exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
  ## The short help is the same as the full help: delegate to
  ## PrintHelp(), which prints the documentation and exits.
  return &PrintHelp();
}

################################################################
## Get SO term, multiallelic sites contains the sequence_alteration
## name; this is not good for downstream process.

sub Get_SO {
  ## Arguments: a reference to the reference allele, and a reference to
  ## the comma-separated list of alternative alleles.
  ## Returns the comma-separated list of terms, one per alternative
  ## allele, obtained by comparing the allele lengths:
  ##   - alternative longer than the reference   -> insertion
  ##   - alternative shorter than the reference  -> deletion
  ##   - same length, more than one character    -> substitution
  ##   - same length, single character           -> SNV, except for
  ##     the allele "." which is reported as a deletion
  my ($reference_allele, $alternative_alleles) = @_;
  my $ref_len = length(${$reference_allele});

  my @so_terms = map {
    my $alt_len = length($_);
      ($alt_len > $ref_len) ? "insertion"
    : ($alt_len < $ref_len) ? "deletion"
    : ($alt_len != 1)       ? "substitution"
    : ($_ eq ".")           ? "deletion"
    :                         "SNV";
  } split(",", ${$alternative_alleles});

  return join(",", @so_terms);
}

################################################################
## Get super variation
sub Get_super_variation {
  ## Write one cluster of overlapping variations to the $out_var handle,
  ## optionally followed by the "super variations" (combinations) built
  ## from that cluster.
  ##
  ## Arguments:
  ##   $chr              chromosome name, prepended to super-variation lines
  ##   $last_end         insertions (ref eq "-") starting after this position
  ##                     are treated as "outside inserts"
  ##   @super_variation  tab-separated variation lines; the fields used here
  ##                     are [1] start, [2] end, [5] reference sequence and
  ##                     [8] validation flag
  ##
  ## Side effects: resets and fills the package hash %output_line, prints
  ## to $out_var (and to STDOUT when verbosity >= 10). Reads %task.
  my ($chr, $last_end, @super_variation) = @_;
  &RSAT::message::Debug("Super variation array content\n", join("\n",@super_variation))
      if ( ($main::verbose >= 10) && (scalar(@super_variation)>1));

  ################################################################
  ## Group variations.
  ##
  ## Groups are: all variations, all validated variations, all outside
  ## inserts, all validated outside inserts. Outside inserts are inserts
  ## located just after the end of the combinatory variation.

  my @super_variation_validate = (); ## list of array refs, one per validated group
  my @super_insert = ();
  my @super_insert_validate = ();
  my $nb_validate = 0;

  if ( scalar(@super_variation) > 1 ) {
    my @super_validate = ();

    my @fields = split("\t",$super_variation[0]);
    my $super_validate_end = $fields[2];

    for (my $i=0; $i<scalar(@super_variation) ; $i++) {
      my @var_info = split("\t",$super_variation[$i]);

      ## Group outside insert and outside validated insert
      if ( $var_info[5] eq "-" && $var_info[1] > $last_end) {
        push (@super_insert, $super_variation[$i]);
        push (@super_insert_validate, $super_variation[$i]) if ($var_info[8]==1);
        ## Remove the insert from the main list; step back so that the
        ## element shifted into position $i is not skipped.
        splice(@super_variation, $i, 1);
        $i--;

      ## Group validated variations
      } elsif ($var_info[8] == 1) {
        ## Increment the counter declared above. (Re-declaring it here with
        ## "my" would leave the outer counter at 0 forever.)
        $nb_validate++;

        ## A gap closes the current group. Store a COPY of the group:
        ## storing a reference to @super_validate itself would make every
        ## stored group alias the same, subsequently emptied, array.
        if ( $var_info[1] > $super_validate_end) {
          push (@super_variation_validate, [@super_validate]) if (@super_validate);
          @super_validate = ();
        }

        push (@super_validate,$super_variation[$i]);
        $super_validate_end = $var_info[2] if ($super_validate_end < $var_info[2]);
      }
    }

    ## Store the last group, provided it is not empty
    push (@super_variation_validate, [@super_validate]) if ( @super_validate );
  }

  ################################################################
  ## Generate output lines, indexed by start then end position
  %output_line = ();

  if ( scalar(@super_variation) == 1) {
    ## Lone variation: suffix "0 0"
    my @info = split("\t",$super_variation[0]);
    push (@{$output_line{$info[1]}{$info[2]}}, $super_variation[0]."\t0\t0\n");
  }

  elsif ( scalar(@super_variation) > 1) {

    ## Each member of the cluster is reported with its own line (an earlier
    ## version repeated $super_variation[0] for every member). The
    ## in_supvar attribute is defined here: suffix "0 1".
    foreach my $line (@super_variation) {
      my @info = split("\t",$line);
      push (@{$output_line{$info[1]}{$info[2]}}, $line."\t0\t1\n");
    }

    ################################################################
    ## Compute "super variations", i.e. combinations between
    ## variations. THIS MAY LEAD TO AN EXPLOSION OF THE NUMBER OF
    ## VARIATIONS.
    if ($task{super_variations}) {
      if ( scalar(@super_variation_validate) == 1 && $nb_validate == scalar(@super_variation) ) {

        ## All variations of the cluster are validated
        my @info = split("\t", &MakeSuperVar(@{$super_variation_validate[0]} ));
        push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t0\n");

      } else {

        ## One super variation per group of validated variations
        foreach my $validated_group (@super_variation_validate) {
          next if ( scalar(@{$validated_group}) < 1);
          my @info = split("\t", &MakeSuperVar(@{$validated_group} ));
          push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t1\n");
        }

        ## One super variation for all the variations of the cluster
        my @info = split("\t",&MakeSuperVar(@super_variation));
        push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t0\t1\t0\n");
      }
    }
  }

  ## Outside inserts
  if ( scalar(@super_insert) > 1) {

    if ($task{super_variations}) {
      if (scalar(@super_insert) == scalar(@super_insert_validate) ) {
        my @info = split("\t",&MakeSuperVar(@super_insert_validate));
        push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t0\n");

      } else {

        my @info = split("\t",&MakeSuperVar(@super_insert));
        push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t0\t1\t0\n");

        if ( scalar(@super_insert_validate) >= 1) {
          my @info = split("\t",&MakeSuperVar(@super_insert_validate));
          push (@{$output_line{$info[0]}{$info[1]}}, "$chr\t".join("\t",@info)."\t1\t1\t1\n");
        }
      }
    }

    foreach my $insert (@super_insert) {
      my @info = split("\t",$insert);
      push (@{$output_line{$info[1]}{$info[2]}}, $insert."\t0\t1\n");
    }

  } elsif ( scalar(@super_insert) == 1) {
    my @info = split("\t",$super_insert[0]);
    push (@{$output_line{$info[1]}{$info[2]}}, $super_insert[0]."\t0\t0\n");
  }

  ################################################################
  ## Print the lines sorted by start, then by end position

  &RSAT::message::Debug("Output variations") if (($main::verbose >= 10) && (scalar(@super_variation)>1));

  my @sorted_start = sort{$a<=>$b} ( keys( %output_line ) );
  foreach my $start ( @sorted_start ) {
    my @sorted_end = sort{$a<=>$b} ( keys( %{$output_line{$start}} ) );
    foreach my $end ( @sorted_end ) {
      print $out_var @{$output_line{$start}{$end}};
      print @{$output_line{$start}{$end}} if (($main::verbose >= 10) && (scalar(@super_variation)>1));
    }
  }
}

################################################################
## Create all combinations between variations
##
## JvH: THIS SHOULD BE REVISED, it can lead to a combinatorial
## explosion of the data size. Do we really want to create all
## possible combinations, whereas only a subset of them are observed
## in populations? An alternative would be to use the data about
## haplotypes, but this should also be evaluated because haplotype
## datasets are voluminous.
sub MakeSuperVar {
    ## Build one "super variation" line from a set of variation lines.
    ##
    ## Arguments: tab-separated variation lines; fields used here are
    ## [1] start, [2] end and [4] id.
    ## Reads the package variable $ref_seq to extract the reference
    ## sequence spanning all the variations.
    ## Returns: a tab-separated string with start, end, strand ("+"),
    ## comma-separated ids, reference sequence, comma-separated variant
    ## sequences and SO term.
    my @variation_lines = @_;
    &RSAT::message::TimeWarn("Computing combinations between", scalar(@variation_lines), "variations") if ($main::verbose >= 0);

    ## Span and identifiers of the combined variation.
    ## A start of 0 means "not set yet".
    my ($super_start, $super_end) = (0, 0);
    my @ids = ();
    for my $entry (@variation_lines) {
	my @cols = split("\t", $entry);
	if ($cols[2] > $super_end) {
	    $super_end = $cols[2];
	}
	if ($cols[1] < $super_start || $super_start == 0) {
	    $super_start = $cols[1];
	}
	push(@ids, $cols[4]);
    }

    ## Reference sequence over the span ("-" when the span is empty)
    my $super_ref = substr($ref_seq, $super_start - 1, $super_end - $super_start + 1);
    if (length($super_ref) == 0) {
	$super_ref = "-";
    }

    ## All the alternative sequences obtained by combining the variations
    my @alleles = &Get_alternative_variant($super_ref, $super_start - 2, $super_start, $super_end, \@variation_lines, []);

    ## SO term, deduced from the allele lengths relative to the reference
    my $so_term = "sequence_alteration";
    if ($super_ref eq "-") {
	$so_term = "insertion";
    } else {
	my $ref_length = length($super_ref);
	my $all_same_length = !grep { length($_) != $ref_length } @alleles;
	my $all_shorter = !grep { length($_) >= $ref_length } @alleles;

	if ($all_same_length && $ref_length == 1) {
	    $so_term = "SNV";
	} elsif ($all_same_length) {
	    $so_term = "substitution";
	} elsif ($all_shorter) {
	    $so_term = "deletion";
	}
    }

    return join("\t",
		$super_start,
		$super_end,
		"+",
		join(',', @ids),
		$super_ref,
		join(',', @alleles),
		$so_term);
}

################################################################
## Get alternative variant.
sub Get_alternative_variant {
  ## Recursively enumerate the alternative sequences obtained by applying
  ## combinations of variations to a reference sequence.
  ##
  ## Arguments:
  ##   $ref_variant      sequence onto which variants are applied
  ##   $last_end         only variations starting after this position are
  ##                     applied at this recursion level
  ##   $super_start      start of the combined region (passed through)
  ##   $super_end        end of the combined region
  ##   $list_variations  reference to the tab-separated variation lines;
  ##                     fields used: [1] start, [2] end, [5] reference
  ##                     sequence, [6] comma-separated variants
  ##   $list_variants    reference to the variants collected so far
  ##
  ## Returns: the list of distinct variant sequences, in discovery order.
  ##
  ## All working variables are lexical: this sub is recursive, and the
  ## previous use of package globals (@list_variations, @list_variants,
  ## $var) only worked by coincidence while clobbering any global of the
  ## same name.
  my ($ref_variant,$last_end,$super_start,$super_end,$list_variations,$list_variants) = @_;

  ## Private copy, so that the caller's array is never modified
  my @list_variants = @{$list_variants};

  foreach my $variation (@{$list_variations}) {
    my @var_info = split("\t",$variation);
    my $start = $var_info[1];
    $start-- if ($var_info[5] eq "-");

    next unless ( $start > $last_end);

    foreach my $variant (split(",", $var_info[6])) {
      ## Replace the span of this variation, located relative to the end
      ## of the region, with the variant sequence.
      my $var = substr($ref_variant,0, length($ref_variant) - ($super_end-$var_info[1]+1) ).$variant.substr( $ref_variant, length($ref_variant) - ($super_end-$var_info[2]) );
      $var =~ s/\-//g if (length($var)>1);

      push (@list_variants, $var ) unless (grep { $_ eq $var } @list_variants);

      ## Combine this variant with the variations located further on
      @list_variants = &Get_alternative_variant($var,$var_info[2],$super_start,$super_end,$list_variations,\@list_variants);
    }

    ## Combinations that leave this variation unapplied
    @list_variants = &Get_alternative_variant($ref_variant,$var_info[2],$super_start,$super_end,$list_variations,\@list_variants);
  }
  return (@list_variants);
}

################################################################
## Downloads VCF files from Ensembl Genomes and produces BED files of valid variants
## 
## URLs as of Jan2020: 
## ftp://ftp.ensemblgenomes.org/pub/release-46/plants/variation/vcf/triticum_turgidum
## ftp://ftp.ensemblgenomes.org/pub/release-46/metazoa/variation/vcf/aedes_aegypti_lvpagwg
sub Download_from_vcf{
   ## Download the VCF file of one species and convert it to varBed files.
   ##
   ## Takes no arguments; reads the package variables $main::species /
   ## $species, $group_lc, $main::personal_release, $verbose, $out and
   ## $start_time.
   ##
   ## Steps, in order:
   ##  1. make sure the organism table of the group is installed locally;
   ##  2. resolve the collection, the assembly and the RSAT species ID;
   ##  3. download the .vcf.gz, .vcf.gz.tbi and README files with wget;
   ##  4. write source.txt, then decompress the VCF (unless a decompressed
   ##     copy is already present);
   ##  5. write the accepted variations to one <chr>.varBed file per
   ##     chromosome and the rejected lines to Failed.tab;
   ##  6. update the table of supported variations, report the execution
   ##     time, close $out and EXIT the program (this sub never returns).

   &RSAT::message::Info(" Downloading VCF file for species $main::species") if($main::verbose >= 1);


   ## set up connections and file names 
   $group_lc = lc($group_lc); # this is also called division 
   my $release=$main::personal_release;
   chomp($release);
   my $server_url   = "ftp://ftp.ensemblgenomes.org"; # TODO: this should be in RSAT_config.props
   my $database     = $server_url."/pub/release-".$release."/$group_lc/";
   my $organism_dir = $ENV{'RSAT'}."/data/ensemblgenomes/".$group_lc."/release-".$release;

   ## Download the organism table of the group if it is not installed yet
   my $organism_table=$organism_dir."/species_Ensembl".ucfirst($group_lc).".txt";
   &RSAT::message::Warning("The document ",$organism_table, " doesn't exist") unless(-e $organism_table||$main::verbose < 2);
   &Install_table_of_organisms_per_group($organism_dir,$organism_table,$database,$group_lc) unless(-e $organism_table); 

   # this affects bacteria
   my $collection=&Get_Collection($organism_table,$species,$group_lc,$release); 

   # build RSAT ID for this species
   my $species_ucfirst = ucfirst($species);
   # this how makefiles/ensemblgenomes_FTP_client.mk gets the assembly, 
   # there's another way I haven't tested at /var/www/html/rsat/perl-scripts/lib//RSAT_to_ensembl.lib.pl
   # NOTE(review): $species and $organism_table are interpolated into a
   # shell command line; the assembly is read from column 5 of every table
   # line matching the species as a word, with all whitespace removed.
   my $assembly_id     = `grep -w $species $organism_table | cut -f 5 | perl -lne \'s\/\\s+//g; print\'`; 
   chomp($assembly_id);
   #my $species_rsat_id=$species_ucfirst."_".join("",$assembly_id);
   my $species_rsat_id = Get_full_species_ID( $species, $assembly_id, $release );

   # compose local installation folder
   my $species_dir    = $ENV{'RSAT'}."/data/genomes/".$species_rsat_id;
   my $variations_dir = $species_dir."/variations";

   # compose remote FTP folder
   # (an empty $collection gives "vcf//<species>/")
   my $vcf_ftp_url = $database."variation/vcf/".$collection."/".$species."/";

   # remote FTP files 
   # ($vcf_ftp_url already ends with "/", so these URLs contain "//")
   my $vcf_server_gz     = $vcf_ftp_url."/".$species.".vcf.gz";
   my $vcf_server_tbi    = $vcf_ftp_url."/".$species.".vcf.gz.tbi";
   my $vcf_server_readme = $vcf_ftp_url."/README";

   # local variation files
   my $vcf_local_gz     = $variations_dir."/".$species_rsat_id.".vcf.gz";
   my $vcf_local_tbi    = $variations_dir."/".$species_rsat_id.".vcf.gz.tbi";
   my $vcf_local_readme = $variations_dir."/README";
   my $vcf_local_source = $variations_dir."/source.txt";

   ## actually download files
   ## The command is a multi-line shell script passed as one string to &doit()
   my $cmd="echo". ###
	"\nmkdir -p $variations_dir". ###
	"\necho \" VARIATIONS_DIR  $variations_dir\"". ###
	"\necho". ###
	"\necho \"Downloading VCF file for species $species\"". ###
	"\nwget -cnv $vcf_server_gz -O $vcf_local_gz; \\". ## Download the vcf.gz file
	"\necho \"  VCF_LOCAL_GZ  $vcf_local_gz\"; \\".
	"\nwget -cnv $vcf_server_tbi -O $vcf_local_tbi; \\". ## Download the vcf.gz.tbi file
	"\necho \"  VCF_LOCAL_TBI  $vcf_local_tbi\"; \\".
	"\nwget -cnv $vcf_server_readme -O $vcf_local_readme; ". ## Download the readme file
	"";
   &RSAT::message::Info("Downloading vcf.gz, vcf.gz.tbi and README files") if($main::verbose >= 2);
   &RSAT::message::Warning("The $vcf_local_gz file already exists") if($main::verbose >=2 &&-e $vcf_local_gz);
   &RSAT::message::Warning("The $vcf_local_tbi file already exists") if($main::verbose >=2 &&-e $vcf_local_tbi);
   &RSAT::message::Warning("The $vcf_local_readme file already exists") if($main::verbose >=2 &&-e $vcf_local_readme);
   &doit($cmd, 0, 1, $verbose);
   #system($cmd) unless(-e $vcf_local_gz&&-e $vcf_local_tbi&&-e $vcf_local_readme);

   ## Add source
   my $source_fh = &OpenOutputFile( $vcf_local_source );
   print $source_fh "ensemblgenomes\n";
   close($source_fh);

   ## Proceed with the extraction and conversion of vcf.gz to vcf and to varBed

   &RSAT::message::Info("Decompressing the file ",$vcf_local_gz) if($main::verbose >= 2);
   #TODO Remove .txt and make the file extension $organism.vcf
   ## An existing decompressed file is reused as is, even if the .vcf.gz
   ## has just been downloaded again.
   my $vcf_local_dec=$variations_dir."/".$species_rsat_id.".vcf";
   unless(-e $vcf_local_dec){
	if (-e $vcf_local_gz){
      &doit("gunzip -vcf $vcf_local_gz > $vcf_local_dec", 0, 1, $verbose);
  		#system("gunzip -vcf $vcf_local_gz > $vcf_local_dec");
	} else {
   		&RSAT::error::FatalError("The file $vcf_local_gz doesn't exist!\n");
	}
   }

   my ($tmp_lines) = &OpenInputFile($vcf_local_dec);
   my $out_rm = &OpenOutputFile($variations_dir."/Failed.tab"); ## rejected lines, with the reason appended
   my %chromosomes=();      ## chromosomes already seen
   my %chromosome_files=(); ## chromosome name -> path of its varBed file
   my %file_chromosomes=(); ## chromosome name -> open output handle
   my $file_chromosome="";
   my $line=0;              ## counts the non-header lines only
   while (<$tmp_lines>){
	chomp();
	## The line is split before the header test: for header lines the
	## other fields may be undefined, only $chr is inspected.
	my ($chr,$pos,$id,$ref,$alt,$qual,$filter,$attributes)=split("\t");
	next if($chr =~ /^\#/);
	$line++;

	## Code fragment adapted from $RSAT/perl_scripts/convert-variations

	################################################################
	## Remove variations that do not meet the information
	## quality expectation.

	## Reference allele: upper-case A, C, G, T only
	if($ref =~ /[^ATCG]/){
		&RSAT::message::Warning("Line ", $line, " Invalid VCF line: $ref don't have just ATCG", "Skipped") if($main::verbose >= 7);
		print $out_rm (join("\t",$chr,$pos,$id,$ref,$alt,$qual,$filter,$attributes)."\tReference sequence $ref does not only contain A,C,G,T\n");
		next;
	}

	## Alternative alleles: upper-case A, C, G, T and the "," separator only
	if($alt =~ /[^ATCG\,]/){
		&RSAT::message::Warning("Line ", $line, " Invalid VCF line: $alt don't have just ATCG", "Skipped") if($main::verbose >= 7);
		print $out_rm (join("\t",$chr,$pos,$id,$ref,$alt,$qual,$filter,$attributes)."\tAlternative sequence $alt does not only contain A,C,G,T,\",\"\n");
		next;
	}
  #TODO Find in which snippet of code this happening

	## End of the revision of invalid lines

	################################################################
	## Checking and opening varBed files
	## One output handle is opened per chromosome name, the first time
	## that name is met, and all handles stay open until the end of the
	## input. NOTE(review): inputs with very many sequence names may hit
	## the limit of open files - to be confirmed.
	unless(defined($chromosomes{$chr})){
		#print($chr); ##
		$chromosomes{$chr} = $chr;
		$chromosome_files{$chr} = $variations_dir."/".$chr.".varBed";
		$file_chromosomes{$chr} = "FILE".$chr; ## placeholder, overwritten by the real handle below
		&RSAT::message::Info("Creating the file ",$chromosome_files{$chr}) if($main::verbose >= 7);
		&RSAT::message::Warning("The document", $chromosome_files{$chr},"already exists, it will be rewritten") if($main::verbose >= 2 && -e $chromosome_files{$chr});
		$file_chromosomes{$chr} = &OpenOutputFile($chromosome_files{$chr});
		$file_chromosome = $file_chromosomes{$chr};
		print $file_chromosome (join("\t","#chr","start","end","strand","id","ref","alt","so_term","validate","minor_allele_freq\n"));
	}
	## End of cheking and opening

	## Parse the INFO column ("key=value" pairs separated by ";").
	## A key given without "=value" is stored with an undefined value.
	my %info = ();
	  foreach my $fields (split(";",$attributes)) { ## Split and store attribute values
	      my ($attributeID,$value) = split("=",$fields);
	      $info{$attributeID} = $value;
	  }

	## Coordinates: start is the VCF position minus one, end is start
	## plus the length of the reference allele.
	my $start = $pos-1;
  my $end = $pos-1+length($ref);
  my $strand = "+";
	## NOTE(review): if the TSA attribute is absent, $so_term is undefined
	## and lc() below emits a warning - confirm that TSA is always present.
	my $so_term = $info{"TSA"};
  if(lc($info{"TSA"}) eq "indel"){
    #my $alt_tmp = join(",",$alt);
    #my $ref_seq = qx(cat $chromosome_files{$chr});
    #$so_term = &Get_SO(substr($ref_seq, $start,1).$alt.$ref,$alt_tmp);
    ## Replace the generic "indel" by one term per alternative allele
    $so_term = &Get_SO(\$ref,\$alt);
  }
  ## Defaults used when the attributes are absent (or have no value)
  my $validate = 0;
  $validate = $info{"VALIDATED"} if(defined($info{"VALIDATED"}));
	my $minor_allele_freq = "NA";
	$minor_allele_freq = $info{"CAF"} if(defined($info{"CAF"}));

	$file_chromosome = $file_chromosomes{$chr};

	&RSAT::message::Info("Writing the line: ",$chr,$start,$end,$strand,$id,$ref,$alt,$so_term,$validate,$minor_allele_freq,"in the document ", $chromosome_files{$chr}) if ($main::verbose >=12);
	print $file_chromosome (join("\t",$chr,$start,$end,$strand,$id,$ref,$alt,$so_term,$validate,$minor_allele_freq)."\n"); ## Writing the line in the corresponding file
   }
   ## Close all the per-chromosome handles
   foreach $chromosome_file (keys(%file_chromosomes)){
	    $file_chromosome = $file_chromosomes{$chromosome_file};
	    close $file_chromosome;
   }
   close $out_rm;
   close $tmp_lines;
   #&doit("rm $vcf_local_dec", 0, 1, $verbose);
   #system("rm $vcf_local_dec");
   ## END of extraction and convertion

   ##########################################################
   # Update variations table
   &UpdateVariationsSupported($species_ucfirst,$assembly_id,"ensemblgenomes",$release);

   my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
   print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
   close $out;

   ## The program ends here: nothing after the call to this sub is executed
   exit(0);
}

################################################################
## Search the organism table according to the format of url ## ANRG_2
##
## Code fragment extracted and modified from ensemblgenomes_ftp_client.mk
sub Get_Collection{
   ## Return the collection (sub-folder of the FTP site) of a species, or
   ## an empty string when the species has no collection sub-folder.
   ##
   ## Arguments:
   ##   $organism_table  path of the tab-delimited species table of the group
   ##   $species         species name, compared with the 2nd column
   ##   $group_lc        group name in lower case; only "fungi" and
   ##                    "bacteria" can yield a non-empty collection
   ##   $release         release number, used to strip the
   ##                    "_core_<release>_..." suffix of the 13th column
   ##
   ## The table is parsed in Perl. The previous awk one-liner was copied
   ## from a makefile with its "$$2" / "$$13" escapes: in awk these mean
   ## "the field whose number is stored in field 2", so no row ever
   ## matched and the collection was always empty. Its output was also
   ## never chomped, so the comparison with the species name could not
   ## succeed either.
   my ($organism_table,$species,$group_lc,$release)=@_;
   my $collection="";

   ## The other groups have no collection sub-folder
   return($collection) unless ($group_lc eq "fungi" || $group_lc eq "bacteria");

   ## Look for the first row describing the species
   my $collection_from_table = "";
   if (open(my $table_fh, "<", $organism_table)) {
      while (my $row = <$table_fh>) {
	 $row =~ s/\r?\n\z//;
	 my @fields = split("\t", $row, -1);
	 next unless (defined($fields[1]) && $fields[1] eq $species);
	 $collection_from_table = $fields[12] if (defined($fields[12]));
	 $collection_from_table =~ s/_core_\Q$release\E_.*//;
	 last;
      }
      close($table_fh);
   } else {
      &RSAT::message::Warning("Cannot read the organism table", $organism_table, $!);
   }

   if ($group_lc eq "fungi" && $collection_from_table eq $species) {
      ## Some Fungi are in the root folder rather than in a collection
      ## sub-folder. These species however have a collection field in the
      ## species table, but its value equals the species name. The result
      ## is spliced into a URL path by the caller, so "no sub-folder" is
      ## reported as an empty string. This is also what was effectively
      ## returned before, since the "TRICK" placeholder was never reached.
      $collection = "";
   } else {
      ## Fungal and bacterial genomes stored in a collection sub-folder
      $collection = $collection_from_table;
   }
   return($collection);
}

################################################################
## Install group organism table ## ANRG_2
##
## Code fragment extracted and modified from ensemblgenomes_ftp_client.mk
sub Install_table_of_organisms_per_group{
  ## Download the species table of one group into a local directory.
  ##
  ## Arguments:
  ##   $organism_dir    local directory receiving the table (created if needed)
  ##   $organism_table  local path of the table, only echoed by the script
  ##   $database        base URL of the group on the server
  ##   $group_lc        group name, used to build the remote file name
  ##
  ## The shell script is run through &doit() with the package-level $verbose.
  my ($organism_dir,$organism_table,$database,$group_lc)=@_;

  if ($main::verbose >= 2) {
    &RSAT::message::Info("Downloading group organism table",$group_lc, "from ensemblgenomes");
  }

  ## Remote location of the table
  my $remote_table = $database."/species_Ensembl".ucfirst($group_lc).".txt";

  ## One shell command per line
  my @shell_lines = (
    "echo",
    "mkdir -p $organism_dir",
    "echo \"Getting list of organisms from $database\" ",
    "echo \"$organism_dir\" ",
    "wget -Ncnv $remote_table -P $organism_dir ",
    "echo ",
    "echo \"$organism_table\"",
  );

  &doit(join("\n", @shell_lines), 0, 1, $verbose);
}


################################################################
## Validate the Ensembl release ## ANRG_2
## NOTE WSG. TODO I need to come back and clean this.
## This part must be modified: the validation code is currently
## commented out, so this subroutine does nothing.
sub Validate_release {
  ## Currently a no-op: the release validation below is disabled and is
  ## kept only as a reference for a future clean-up.
  #
  # my $release = $main::personal_release;
  # if (&IsNatural($release)) {
  #     $main::ensembl_release = $release;
  #     &RSAT::error::FatalError($release ,
  #         "Invalid Ensembl release: cannot be lower than the \"safe\" release",
  #         $ensembl_release_safe)   if ($main::ensembl_release < $ensembl_release_safe);
  #     &RSAT::error::FatalError($release,
  #         "Invalid Ensembl release: cannot be higher than the latest available release",
  #         $ensembl_release_latest) if ($main::ensembl_release > $ensembl_release_latest);
  # } elsif ($release eq "safe") {
  #   $main::ensembl_release = $ensembl_release_safe;
  # } elsif ($release eq "latest" ) {
  #   $main::ensembl_release = $ensembl_release_latest;
  # } else {
  #   &RSAT::error::FatalError($release, "Invalid value for Ensembl release.");
  # }
  return;
}

################################################################
## Read arguments
sub ReadArguments {
  ## Parse the command-line options and store them in package variables.
  ##
  ## Works on a copy of @ARGV, consumed from left to right. Each option is
  ## handled by one branch of the if/elsif chain below; the POD block
  ## placed before each branch documents that option in the help message.
  ## An unknown option is reported through &FatalError().
  ##
  ## NOTE(review): options expecting a value take the next argument
  ## without checking that one is present.
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    ## The level is optional: -v alone means verbosity 1
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-species species_name>

=item B<-org species_name> (equivalent)

Name of the species for which you want to download the variations (e.g
Homo_sapiens, Mus_musculus).

Help: to get the list of species available at Ensembl, run the
command:

   I<download-ensembl-variations -available_species>

=cut
    ## The species name is converted to lower case
    } elsif (($arg eq "-species") || ($arg eq "-org")) {
      $main::species = lc(shift(@arguments));

################################################################
## THIS OPTION HAS BEEN INACTIVATED BY JvH on 2014-10-28
# =pod
#
# =item B<-dir #>
#
# The directory in which RSAT genomes must be installed. The selected
# species will be installed in a sub-directory composed of Species name
# and Ensembl genome release.
#
# Default : $RSAT/data/
#
# =cut
#     } elsif ($arg eq "-dir") {
#       $main::data_dir = shift(@arguments);

=pod

=item   B<-o outputfile>

The output file is used to hold a trace of the transfers (verbosity),
and to store the list of species when the option -available_species is
activated.

If no output file is specified, the standard output is used.

=cut
    } elsif ($arg eq "-o") {
      $outfile{output} = shift(@arguments);

=pod

=item B<-db ensembl|ensemblgenomes>

Default: Ensembl

Select the source database.

=over

=item I<ensembl>

The "historical" Ensembl database (L<http://ensembl.org/>), restricted
to a series of genomes from model organisms (69 supported species on
Oct 30, 2014).

=item I<ensemblgenomes>

The extended EnsemblGenomes database (L<http://ensemblgenomes.org/>),
which comprises repositories for the following taxa: Bacteria
(actually includes Archaea), Fungi, Metazoa, Plants, Protists.

In Oct 2014, EnsemblGenomes supports >15,000 species.

=item I<ensemblall>

Load both Ensembl and Ensembl Genomes.

=back

=cut
   ## The database name is converted to lower case before validation
   } elsif ($arg eq "-db") {
    $main::db = lc(shift(@arguments));
    unless (($main::db eq "ensembl")
	    || ($main::db eq "ensemblgenomes")
	    || ($main::db eq "ensemblall")
            ) {
	&RSAT::error::FatalError($main::db, "Invalid value for the option -db. Supported: ensembl,ensemblgenomes,ensemblall");
    }

=pod

=item B<-task>

Tasks to be performed by download-ensembl-variations

Supported tasks:

=over

=item B<download_gvf>

Download the GVF (Genome variant file) from Ensembl.

=item B<report_snps>

Process variant information and convert it to varBed.

=item B<fromvcf>

Used to retrieve and install variations from ftp ensemblgenomes. Must be used
with -group, -species and -release options. This task can not be used in
combination with the others. IMPORTANT: task must be declared before the release.

=back

=cut
    ## Comma-separated list of tasks; each one is converted to lower case
    ## and checked against %supported_tasks before being set in %task
    } elsif ($arg eq "-task") {
	$arg = shift (@arguments);
	chomp($arg);
	my @tasks = split ",", $arg;
	foreach my $task (@tasks) {
	    $task = lc($task);
	    if ($supported_tasks{$task}) {
		$task{$task} = 1;
	    } else {
		&RSAT::error::FatalError($task, "Invalid tasks. Supported:", $supported_tasks);
	    }
	}

=pod

=item B<-available_species>

Get all available species on Ensembl

=cut
    } elsif ($arg eq "-available_species") {
      $main::get_available_species = 1;

=pod

=item B<-release #>

The release of Ensembl.

Supported releases: release numbers (e.g. 70, 72, ...), safe, latest

Default : I<safe>

=over

=item I<safe>

The file locations and/or formats of the Ensembl rsync distribution
may change between two Ensembl release. For this reason, we defined
the "safe" release, which corresponds to the earliest release of
ensembl which has been checked to work with this script.

=item I<latest>

This corresponds to the latest available release of Ensembl. Beware:
this release is not guaranteed to be compatible with RSAT, in case
Ensembl would change their file formats or locations.

=back

=cut

  ## -version is still accepted as an obsolete synonym of -release.
  ## The value is stored as given, without validation.
  } elsif (($arg eq "-release") || ($arg eq "-version")) {
    if ($arg eq "-version") {
      &RSAT::message::Warning("option -version is obsolete, has been replaced by -release.");
    }
      my $release = shift(@arguments);
      $main::personal_release = $release;

=pod

=item B<-species_suffix>

Suffix to append to the full species ID.

By default, the full species ID is composed by concatenating the
Ensembl species and assembly. The option I<-species_suffix>
allows to specify a string (e.g. _ensembl76, _for_testing, ...) that
will be appended to the full species ID.

=cut
    } elsif ($arg eq "-species_suffix") {
    $species_suffix = shift(@arguments);

## ANRG_2 S

=pod

=item B<-group>

Group species as indicated in the RSAT manual

This option can only be used for the ensemblgenomes database.
and in combination with '-task fromvcf'.

=cut

    ## The group is validated case-insensitively but stored as given
    } elsif ($arg eq "-group") {
    chomp($group_lc = shift(@arguments));
    my @valid_groups=("plants","fungi","metazoa","protists","bacteria");
    my %valid_grps=();
    foreach $valid_group (@valid_groups){
	$valid_grps{$valid_group}=$valid_group;
    }
    &RSAT::error::FatalError($group_lc," Invalid group. Supported: ",join("\n",@valid_groups)) unless($valid_grps{lc($group_lc)});
## ANRG_2 E
    ## NOTE(review): the other branches call &RSAT::error::FatalError();
    ## confirm that a plain &FatalError() is defined for this script.
    } else {
      &FatalError($arg, "Invalid option");
    }
  }

=pod

=back

=cut


}

################################################################
## Verbose message
sub Verbose {
  ## Report the command line and, when at least one output file is
  ## defined, the output files and the release parameters on the $out
  ## handle.
  print $out "; download-ensembl-variations ";
  &PrintArguments($out);

  if (%main::outfile) {
    print $out "; Output files\n";
    foreach my $file_type (keys(%main::outfile)) {
      printf $out ";\t%-13s\t%s\n", $file_type, $main::outfile{$file_type};
    }

    ## Release parameters (only reported together with the output files)
    printf $out ("; %-22s\t%s\n", "Ensembl safe release", $ensembl_release_safe);
    printf $out ("; %-22s\t%s\n", "Ensembl release",$ensembl_release);
    if ($species_suffix) {
      printf $out ("; %-22s\t%s\n", "Species suffix", $species_suffix);
    }
  }
}
