#!/usr/bin/perl -w

############################################################
#
# $Id: download-ensembl-genome,v 1.49 2013/10/13 08:05:33 jvanheld Exp $
#
############################################################

use warnings;

=pod

=head1 NAME

download-ensembl-genome

=head1 VERSION

$program_version

=head1 DESCRIPTION

Download the genomic sequence of a user-specified organism from the
Ensembl server, and convert the original format (fasta) into raw files
required for RSAT sequence retrieval.

=head1 AUTHORS

=over

=item I<Jeremy Delerce> (Master 2 thesis 2013)

=item I<Alejandra Medina-Rivera> (amedina@lcg.unam.mx)

=item I<Jacques van Helden> (Jacques.van-Helden@univ-amu.fr)

=back

=head1 CATEGORY

=over

=item util

=back

=head1 USAGE

C<download-ensembl-genome -species # [-db ensembl|ensemblgenomes|ensemblall] [-no_rm] [-release #] [-species_suffix #] [-force] [-o outputfile] [-available_species] [-v #]>

=head2 Examples

Get a list of species available at Ensembl

C<download-ensembl-genome -available_species>

Get raw genome sequences for the human genome (Homo sapiens)

C<download-ensembl-genome -species Homo_sapiens>

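Get the raw and repeat-masked versions of the mouse genome (Mus
musculus). Repeat-masked sequences are downloaded by default; use the
option I<-no_rm> to skip them.

C<download-ensembl-genome -species Mus_musculus>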

=head1 OUTPUT FORMAT

The program returns 

Several sequence files (one file per chromosome/contig) in raw
format.

A tab-delimited file (contig.tab) providing the attributes of each
contig.

A text file (contigs.txt) indicating which sequence file corresponds
to which chromosome.

=head1 SEE ALSO

=head2 install-ensembl-genome

A wrapper managing all the tasks required to install a genome from
Ensembl on RSAT (including I<download-ensembl-genome>).

=head2 download-ensembl-features

Downloads tab-delimited files providing the positions and descriptions
of the genomic features (genes, transcripts, CDS, ...).

=head2 download-ensembl-variations

Downloads variation for a species.

=head2 retrieve-variation-seq

I<retrieve-variation-seq> uses the files produced by
I<download-ensembl-variations> to build the sequences of each variation.

=head1 WISH LIST

=cut

BEGIN {
  ## Append the lib/ directory located next to this script to the
  ## module search path. The script directory is the part of $0 that
  ## precedes the trailing run of characters other than "(", ")" and "/".
  if ($0 =~ /([^(\/)]+)$/) {
    my $script_dir = substr($0, 0, $-[0]); ## text before the match
    push(@INC, $script_dir."lib/");
  }
}

require "RSA.lib";
require "RSAT_to_ensembl.lib.pl";
use Bio::EnsEMBL::Registry;
#use Bio::EnsEMBL::LookUp;

################################################################
## Main package
package	main;
{
  ## Main program. Two modes:
  ## - with -available_species: print the list of species supported by
  ##   the selected Ensembl database, then exit;
  ## - otherwise: download each top-level sequence of the selected
  ##   species as a raw file (plus a repeat-masked version unless
  ##   -no_rm was given), then write contigs.txt and contig.tab.

  ###############################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.49 $ =~ /\d+/g); sprintf"%d."."%02d" x $#r, @r };

  ## Default output stream, replaced below by the handle returned by
  ## &OpenOutputFile(). The handle name used to be misspelled
  ## ("SDTOUT"), which left $out holding a meaningless string.
  our $out = \*STDOUT;
  our %outfile = ();

  our $verbose = 0;

  our $db = "ensembl"; ## Supported: ensembl, ensemblgenomes, ensemblall
  our $taxid = "";
  our $species = "";
  our $assembly = "";
  our $species_suffix = "";
  our $registry = 'Bio::EnsEMBL::Registry';

  ## NOTE(review): the three release values below are computed with the
  ## default value of $db, before the command line is parsed. Confirm
  ## that this is intended when -db is used without -release.
  our $safe_ensembl_release = &get_ensembl_release_safe($db);
  our $latest_ensembl_release = &get_ensembl_release_latest($db);
  our $ensembl_release = &get_ensembl_release($db);
  our $null = "<NULL>";

  our $rm = 1;
  our $get_available_species = 0;

  our $force = 0; ## Force installation even if the genome is already installed

  ################################################################
  ## Read argument values
  &ReadArguments();

  ## Check that the user-selected Ensembl release is supported.
  &check_ensembl_release($db,$ensembl_release);

#  my ($host, $port) = &Get_host_port($db);

  ################################################################
  ## Print verbose
  $out = &OpenOutputFile($outfile{output});
  &Verbose() if ($main::verbose >= 1);

  ###############################################################
  ## Print the list of available species
  if ($get_available_species) {
    &RSAT::message::TimeWarn("download-ensembl-genome", "Getting the list of available species", "db=".$db) if ($main::verbose >= 1);
    &LoadRegistry($registry, $db, $ensembl_release);

    my @db_adaptors = @{ $registry->get_all_DBAdaptors(-group => 'core') };

    ## Taxon information is only collected for EnsemblGenomes
    my %species_taxon = ();
    if (lc($db) eq "ensemblgenomes") {
      my $ens_release = &get_ensembl_release_safe($db);
      %species_taxon = &Get_species_taxon($db,$ens_release);
    }

    ## One line per species: capitalized species name and taxon (or
    ## $null when no taxon is known), sorted by species name.
    my @sorted_adaptors = sort {$a->species() cmp $b->species()} @db_adaptors;
    foreach my $db_adaptor (@sorted_adaptors) {
      my $ensembl_name = $db_adaptor->species();
      my $taxon = $null;
      if (defined($species_taxon{$ensembl_name})) {
	$taxon = $species_taxon{$ensembl_name};
      }
      print $out join("\t", ucfirst($ensembl_name), $taxon), "\n";
    }
    $registry->disconnect_all();

    my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
    print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
    close $out;
    exit(0);
  }

  ###############################################################  
  ## Check arguments
  ## Identify species by taxonomic ID
#   if ($taxid) {
#     my $lookup = Bio::EnsEMBL::LookUp->new(-URL=>"http://bacteria.ensembl.org/registry.json",-NO_CACHE=>1);
#     my @dbas = @{$lookup->get_all_by_taxon_id($taxid)};
#     if (scalar(@dbas) == 0) {
#       &RSAT::error::FatalError("There is no species in Ensembl corresponding to the taxonomic ID", $taxid);
#     } elsif (scalar(@dbas) > 1) {
#       &RSAT::message::Warning("The taxonomic ID", $taxid, "matches several species. The first one will be used");
#     }
#     my $dba = shift @dbas;
#     &RSAT::message::Debug("DBA", $dba);
#   }

  ## Check that the species has been specified
  unless ($species) {
    &RSAT::error::FatalError("You must specify a species to download. To get the list of supported species, type: download-ensembl-genome -available_species");
  }

  ################################################################
  ## Connecting to ensembl
  &LoadRegistry($registry, $db, $ensembl_release);

  ################################################################
  ## Get Adaptators
  &RSAT::message::TimeWarn("Getting species adaptor from Ensembl") if ($main::verbose >= 2);
  my $slice_adaptor = $registry->get_adaptor($species, 'core', 'slice');
  if ($slice_adaptor) {
    &RSAT::message::Debug("Slice adaptor", $slice_adaptor) if ($main::verbose >= 5);
  } else {
    &RSAT::error::FatalError("Cannot get slice adaptor for species", $species);
  }

  my $mca = Bio::EnsEMBL::Registry->get_adaptor( $species, "core",  "metacontainer" );
  if ($mca) {
      &RSAT::message::Debug("Coordinate system adaptor (mca)", $mca) if ($main::verbose >= 5);
  } else {
      &RSAT::error::FatalError("Cannot get coordinate system adaptor (mca) for species", $species);
  }

  ################################################################  
  # Get all Top Level slice
  my @slices = @{$slice_adaptor->fetch_all('toplevel')};

  ## The assembly name is read from the first slice, so at least one
  ## slice is required.
  unless (scalar(@slices) >= 1) {
    &RSAT::error::FatalError("No top-level sequence found at Ensembl for species", $species);
  }

  ## Correct Ensembl error with the two Y human chromosomes 
  ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  ## TO BE CHECKED (JvH): is this still required ?
  if ($species eq "homo_sapiens") {
    ## Replace all the slices of chromosome Y by a single slice covering
    ## the whole chromosome. The original slices are kept when the whole
    ## chromosome cannot be fetched, rather than storing an undefined
    ## slice in the list.
    my $chrom_y = $slice_adaptor->fetch_by_region( 'chromosome', 'Y' );
    if ($chrom_y) {
      my @slice_tmp = grep { $_->name() !~ /:Y:/ } @slices;
      push (@slice_tmp, $chrom_y);
      @slices = @slice_tmp;
    } else {
      &RSAT::message::Warning("Cannot fetch human chromosome Y as a single region, keeping the original top-level slices");
    }
  }

  ################################################################
  ## Get the list of sequences available at Ensembl and compare with
  ## those already installed on local server.

  ## Get genome_dir
  ## The assembly is the second colon-separated field of a slice name.
  my @fields = split(":",$slices[0]->name());
  $assembly = $fields[1];

  ## Both variables remain package globals, as in the original code
  ## (library subroutines may read them -- not verifiable from here).
  our $full_species_ID = &Get_full_species_ID($species, $assembly,$ensembl_release, $species_suffix);
  &RSAT::message::Info("Full species ID", $full_species_ID) if ($main::verbose >= 2);
  our $genome_dir = &Get_genome_dir($species, $assembly,$ensembl_release, $species_suffix);
  &RSAT::message::Info("Genome directory", $genome_dir) if ($main::verbose >= 2);
  printf $out ("; %-22s\t%s\n", "Full species ID", $full_species_ID);
  printf $out ("; %-22s\t%s\n", "Genome directory", $genome_dir);
  &RSAT::util::CheckOutDir($genome_dir);

  ## Get the list of sequences already installed in the local server.
  ## Installed files are indexed by file name (size in bytes as value).
  ## Slice names are converted to file names for the lookup, rather than
  ## the reverse: converting file names back to slice names is ambiguous
  ## when a sequence name contains an underscore.
  &RSAT::message::TimeWarn("Checking local versions of genomic sequences") if ($main::verbose >= 2);
  my %seq_installed = ();
  my @genome_dir_content =glob($genome_dir."/*");
  #my @genome_dir_content =`(ls $genome_dir/*)`; ## NOTE FROM ALE: please do not remove commented line
                                                ## I'm having problmes with my glob function

  &RSAT::message::Info("genome_dir_content", scalar(@genome_dir_content), "files") if ($main::verbose >= 2);

  foreach my $file (@genome_dir_content) {
    ##chomp;  ## NOTE FROM ALE: please do not remove commented line
            ## I'm having problmes with my glob function
    next unless ($file =~ /\.raw$/);

    my $file_size = -s $file;
    (my $basename = $file) =~ s{^.*/}{}; ## strip the directory without using it as a regex
    $seq_installed{$basename} = $file_size;
    &RSAT::message::Debug("seq_installed", $basename, $file_size) if ($main::verbose >= 3);
  }

  ## A sequence is considered as installed if a non-empty file with the
  ## expected name has exactly the length of the slice (raw files
  ## contain one byte per residue).
  my $is_installed = sub {
    my ($filename, $expected_length) = @_;
    my $installed_size = $seq_installed{$filename};
    return ($installed_size && ($installed_size == $expected_length)) ? 1 : 0;
  };

  ################################################################
  ## Download sequences
  &RSAT::message::TimeWarn("Downloading sequences") if ($main::verbose >= 2);
  foreach my $slice (@slices) {
    my $slice_name = $slice->name();
    my $slice_length = $slice->length();
    &RSAT::message::TimeWarn("\tTreating slice",$slice_name, $slice_length) if ($main::verbose >= 2);

    ## Raw sequence
    (my $raw_filename = $slice_name.".raw") =~ s/:/_/g;
    if ($is_installed->($raw_filename, $slice_length) && (!$force)) {
      ## No "next" here: the repeat-masked version is checked
      ## independently below, so that a missing repeat-masked file is
      ## downloaded even when the raw sequence is already installed.
      &RSAT::message::TimeWarn("\tSkipping download of DNA sequences for slice",$slice_name) if ($main::verbose >= 2);
    } else {
      &RSAT::message::TimeWarn("\tDownloading DNA sequences for slice",$slice_name) if ($main::verbose >= 2);
      my $out_seq_file = &OpenOutputFile($genome_dir."/".$raw_filename);
      print $out_seq_file $slice->seq();
      close ($out_seq_file);
    }

    ## Repeat-masked sequence
    (my $rm_filename = $slice_name."_repeatmasked.raw") =~ s/:/_/g;
    if ((!$rm) || ($is_installed->($rm_filename, $slice_length) && !$force)) {
      &RSAT::message::TimeWarn("\tSkipping dowload of repeat-masked DNA sequences for slice",$slice_name) if ($main::verbose >= 2);
    } else {
      &RSAT::message::TimeWarn("\tDownloading repeat-masked DNA sequences for slice",$slice_name) if ($main::verbose >= 2);
      my $out_seq_file = &OpenOutputFile($genome_dir."/".$rm_filename);
      print $out_seq_file $slice->get_repeatmasked_seq()->seq();
      close ($out_seq_file);
    }
  }

  ################################################################
  ## Create contig description files (contigs.txt and contig.tab)

  &RSAT::message::TimeWarn("Creating contig description files (contigs.txt and contig.tab).") if ($main::verbose >= 2);
  my $out_contigs = &OpenOutputFile($genome_dir."/contigs.txt");
  my $out_contig =  &OpenOutputFile($genome_dir."/contig.tab");

  ## Header of contig.tab
  print $out_contig "-- dump date","\t",&AlphaDate(),"\n";
  print $out_contig "-- class","\t","EnsEMBL::Contig\n";
  print $out_contig "-- table","\t","contig\n";
  print $out_contig "-- table","\t","main\n";
  print $out_contig "-- field 1","\t","id\n";
  print $out_contig "-- field 2","\t","accession\n";
  print $out_contig "-- field 3","\t","version\n"; ## a stray comma after "field 3" was removed
  print $out_contig "-- field 4","\t","type\n";
  print $out_contig "-- field 5","\t","length\n";
  print $out_contig "-- field 6","\t","description\n";
  print $out_contig "-- header\n";
  print $out_contig "-- id","\t","accession","\t","version","\t","type","\t","length","\t","description\n";

  foreach my $slice (@slices) {

    ## contigs.txt: sequence file name and slice name
    my $filename = $slice->name().".raw";
    $filename =~ s/:/_/g;
    print $out_contigs $filename,"\t",$slice->name(),"\n";

    ## contig.tab: the type is the first colon-separated field of the
    ## slice name
    @fields = split(":",$slice->name());
    print $out_contig $slice->seq_region_name(),"\t",$slice->name(),"\t",$assembly,"\t",$fields[0],"\t",$slice->length(),"\t",$fields[0]," ",$slice->seq_region_name(),"\n";
  }

  close $out_contig;
  close $out_contigs;

  &UpdateEnsemblSupported();

  ################################################################
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $out;

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

################################################################
## Display full help message
sub PrintHelp {
  ## Render the POD documentation of this script with pod2text, then
  ## exit. The command is run in list form, so that the script path is
  ## passed as a single argument without going through the shell (the
  ## string form broke on paths containing spaces or shell
  ## metacharacters).
  if (system("pod2text", "-c", $0) != 0) {
    warn "Could not display the help message with pod2text\n";
  }
  exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
  ## No separate short help exists for this script: the full help
  ## message is displayed (PrintHelp does not return, it exits).
  PrintHelp();
}

################################################################
## Read arguments
sub ReadArguments {
  ## Parse the command line (@ARGV) and store the option values in the
  ## package variables of main ($main::verbose, $main::species, ...).
  ## Takes no parameter. An invalid option, an invalid option value or
  ## an option lacking its value is reported as a fatal error.
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()

  ## Return the value following an option. A fatal error is raised if
  ## the command line ends right after the option, instead of silently
  ## storing an undefined value.
  my $next_value = sub {
    my ($option) = @_;
    unless (scalar(@arguments) >= 1) {
      &RSAT::error::FatalError("The option", $option, "requires a value.");
    }
    return shift(@arguments);
  };

  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      ## The level is optional: -v alone means level 1
      if ((scalar(@arguments) >= 1) && &IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-species species_name>

Name of the species whose genome has to be downloaded (e.g.
homo_sapiens, mus_musculus). The option I<-org> is accepted as a
synonym.

Use the option I<-available_species> to print a list of available
species at Ensembl, or options I<-available_species -db ensemblgenomes>
to get the species available at EnsemblGenomes (extended Ensembl).

=cut
    } elsif (($arg eq "-species") || ($arg eq "-org")) {
      $main::species = lc($next_value->($arg));

=pod

=item B<-taxid #>

Taxonomic ID of the species (organism), as defined in the NCBI
taxonomy database (http://www.ncbi.nlm.nih.gov/taxonomy).

=cut
    } elsif ($arg eq "-taxid") {
      $main::taxid = $next_value->($arg);

=pod

=item B<-no_rm>

Don't download repeat-masked genomic DNA.

Interspersed repeats and low
complexity regions are detected with the RepeatMasker tool and masked
by replacing repeats with 'N's.

=cut
    } elsif ($arg eq "-no_rm") {
      $main::rm = 0;

=pod

=item B<-db ensembl|ensemblgenomes|ensemblall>

Default: ensembl

Select the source database. 

=over

=item I<ensembl>

The "historical" Ensembl database (L<http://ensembl.org/>), restricted
to a series of genomes from model organisms (69 supported species on
Oct 30, 2014).

=item I<ensemblgenomes>

The extended EnsemblGenomes database (L<http://ensemblgenomes.org/>),
which comprises repositories for the following taxa: Bacteria
(actually includes Archaea), Fungi, Metazoa, Plants, Protists.

In Oct 2014, EnsemblGenomes supports >15,000 species.

=item I<ensemblall>

Load both Ensembl and Ensembl Genomes.

=back

=cut
    } elsif ($arg eq "-db") {
      $main::db = lc($next_value->($arg));
      unless (($main::db eq "ensembl")
	      || ($main::db eq "ensemblgenomes")
	      || ($main::db eq "ensemblall")
	  ) {
	&RSAT::error::FatalError($main::db, "Invalid value for the option -db. Supported: ensembl,ensemblgenomes,ensemblall");
      }

=pod

=item B<-available_species>

Get all available species on Ensembl

=cut
    } elsif ($arg eq "-available_species") {
      $main::get_available_species = 1;

=pod

=item B<-release #>

The Ensembl release. The obsolete option I<-version> is still accepted
as a synonym.

Supported releases: 70 to 72, safe, latest

Default : I<safe>

=over

=item I<safe>

The file locations and/or formats of the Ensembl rsync distribution
may change between two Ensembl releases.

For this reason, we defined the "safe" release, which corresponds to
the latest release of Ensembl which has been checked to work with this
script.

=item latest

This corresponds to the latest release of Ensembl. Beware: this
release is not guaranteed to be compatible with RSAT, in case Ensembl
would change their file formats or locations.

=back

=cut
    } elsif (($arg eq "-release") || ($arg eq "-version")) {
      if ($arg eq "-version") {
	&RSAT::message::Warning("option -version is obsolete, has been replaced by -release.");
      }
      $main::ensembl_release = $next_value->($arg);
      unless (&IsNatural($main::ensembl_release)
	      || ($main::ensembl_release eq "safe")
	      || ($main::ensembl_release eq "latest")) {
	&RSAT::error::FatalError($main::ensembl_release,"is not a valid value for Ensembl release. Supported values: Natural numbers (e.g. 72), safe, latest.");
      }

=pod

=item B<-species_suffix>

Suffix to append to the full species ID.

By default, the full species ID is composed by concatenating the
Ensembl species and assembly. The option I<-species_suffix>
allows to specify a string (e.g. _ensembl76, _for_testing, ...) that
will be appended to the full species ID.

=cut
    } elsif ($arg eq "-species_suffix") {
      $main::species_suffix = $next_value->($arg);


=pod

=item B<-force>

Force installation even if the genome is already installed.

=cut

    } elsif ($arg eq "-force") {
      $main::force = 1;

# = pod
#
# =item B<-dir #>
#
# The directory in wich RSAT data must be installed. The selected
# species will be installed in a sub-directory composed of Species name
# and Ensembl genome release.
#
# Default : $RSAT/data/
#
# =cut
#     } elsif ($arg eq "-dir") {
#       $main::data_dir = shift(@arguments);

=pod

=item	B<-o outputfile>

The output file is used to hold a trace of the transfers (verbosity),
and to store the list of species when the option -available_species is
activated.

If no output file is specified, the standard output is used.

=cut
    } elsif ($arg eq "-o") {
      $main::outfile{output} = $next_value->($arg);


    } else {
      ## Same error reporting routine as for the other options
      &RSAT::error::FatalError(join("\t", "Invalid option", $arg));
    }

  }

=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
  ## Print the execution parameters on the output stream ($main::out),
  ## as comment lines starting with a semicolon.
  if ($main::verbose >= 3) {
    &RSAT::message::TimeWarn("Printing verbose");
  }

  ## Command line and program version
  print $main::out "; download-ensembl-genome ";
  &PrintArguments($main::out);
  printf $main::out "; %-22s\t%s\n", "Program version", $main::program_version;

  ## Output files, if any was specified
  if (%main::outfile) {
    print $main::out "; Output files\n";
    foreach my $file_type (keys %main::outfile) {
      printf $main::out ";\t%-13s\t%s\n", $file_type, $main::outfile{$file_type};
    }
  }

  ## Ensembl releases
  my @release_info = (
    ["Ensembl safe release", $main::safe_ensembl_release],
    ["Ensembl release", $main::ensembl_release],
  );
  foreach my $info (@release_info) {
    printf $main::out "; %-22s\t%s\n", @{$info};
  }

  ## The species suffix is only reported when it is not empty
  if ($main::species_suffix) {
    printf $main::out ("; %-22s\t%s\n", "Species suffix", $main::species_suffix);
  }
}
