#!/usr/bin/perl -w

############################################################
#
# $Id: install-ensembl-genome,v 1.32 2013/10/13 12:25:26 rsat Exp $
#
############################################################

## use strict;

=pod
    
=head1 NAME

install-ensembl-genome

=head1 VERSION

$program_version

=head1 DESCRIPTION

Install in RSAT the genome sequence, genomic features and (optionally)
variation features of a genome from Ensembl
(L<http://www.ensembl.org/>).

The connection to ensembl is ensured by a combination of their Perl
API and their ftp site (some information can not be obtained directly
from the API, e.g. the taxonomy).

=head1 AUTHORS

=over

=item I<Jeremy Delerce> (Master 2 thesis 2013)

=item I<Alejandra Medina-Rivera> (amedina@lcg.unam.mx)

=item I<Jacques van Helden> (Jacques.van-Helden@univ-amu.fr)

=back

=head1 CATEGORY

=over

=item util

=back

=head1 USAGE

C<install-ensembl-genome [-version #] [-task #] [-o outputfile] [-v #] [...]>

=head2 Example

C<install-ensembl-genome -v 1 -species Saccharomyces_cerevisiae>

=head1 OUTPUT FORMAT

The program exports chromosome sequences and genomic features.

=head2 Chromosome sequence format

Chromosome sequences are exported in raw format (sequences only, no
space, no carriage return), which is indispensable in order to enable
direct access to any piece of sequences on the hard drive, rather than
loading the whole chromosome in memory.

=head2 Genomic features

Genomic features (genes, CDS, mRNAs, ...) are exported in gft format
(the RSAT specification of genomic features). For a description of
this format and conversions from/to other formats, type

C<convert-features -h>

=head2 Variation features

Variation features are exported in a tab-delimited format specific to
RSAT. This format can be generated from files in classical formats
(VCF, GVF) with the tool I<convert-variations>.

For a description of the variation formats, type

C<download-ensembl-variations -h>

C<convert-variations -h>

=head1 SEE ALSO

=head2 download-ensembl-genome

I<install-ensembl-genome> calls I<download-ensembl-genome> to download
genome sequences and install them in the appropriate data directory of
the RSAT package.

=head2 download-ensembl-features

I<install-ensembl-genome> calls I<download-ensembl-features> to
download genomic features from ensembl API and install them in the
RSAT data directory.

=head2 download-ensembl-variations

I<install-ensembl-genome> calls I<download-ensembl-variations> to
download variation features from ensembl ftp and install them in the
RSAT data directory.

=head1 WISH LIST

=over

=back

=cut

BEGIN {
  ## The pattern matches the trailing file name of $0, so everything
  ## that precedes the match is the directory prefix of the running
  ## script. The lib/ sub-directory of that prefix is appended to the
  ## module search path, which lets the "require" statements below
  ## find the RSAT libraries.
  if ($0 =~ /([^(\/)]+)$/) {
    my $script_dir = substr($0, 0, $-[0]); ## text before the match
    push(@INC, $script_dir."lib/");
  }
}
require "RSA.lib";
require "RSAT_to_ensembl.lib.pl";

################################################################
## Main package
##
## Workflow: initialise the parameters, read and check the
## command-line arguments, then either list the species available at
## Ensembl (option -available_species) or, for each requested species,
## run the selected tasks (genome, features, variations, config,
## install) by calling the corresponding RSAT scripts.
package main;
{

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

  our %infile = ();
  our %outfile = ();

  our $verbose = 0;
  our $dry_run = 0; ## Set by option -dry
  our $die_on_error = 1; ## Reset to 0 by option -nodie
  our $batch = 0; ## Set by option -batch
  our $out = STDOUT;

  our $db = "ensembl"; ## Alternative: ensemblgenomes

  ## NOTE(review): the three versions below are computed with the
  ## default database, before the option -db is read by
  ## ReadArguments(). Check whether they should be refreshed when -db
  ## selects another database.
  our $safe_ensembl_version = &get_ensembl_version_safe($db);
  our $latest_ensembl_version = &get_ensembl_version_latest($db);
  our $ensembl_version = &get_ensembl_version($db);
  our $species_suffix = "";

  our $get_available_species = 0;

  our @tasks = ();
  our %task = ();
  our @default_tasks = qw(genome features config install);
  our @extra_tasks = qw(variations );
  our @supported_tasks = (@default_tasks, @extra_tasks);
  our %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }

  ## Build the list from the array rather than from the hash keys, so
  ## that error messages always report the tasks in the same order.
  our $supported_tasks = join ",", @supported_tasks;
  our @species_names = ();
  our $only_api = 0;

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## Check that the user-selected Ensembl version is supported.
  &check_ensembl_version($db, $ensembl_version);

  ## Set default tasks if no task has been specified
  if (scalar(@tasks) < 1) {
    @tasks= @default_tasks;
  }

  ## Index tasks
  &RSAT::message::Info("Requested tasks:", join(",", @tasks)) if ($main::verbose >= 2);
  foreach my $task (@tasks) {
    next unless ($task);
    $task{$task} = 1;
  }

  ## Read list of species from a file
  if ($infile{species_list}) {
    &RSAT::message::TimeWarn("Reading species list from file", $infile{species_list}) if ($main::verbose >= 2);
    my ($file) = &OpenInputFile($infile{species_list});
    while (<$file>) {
      next unless (/\S/);	## Skip empty rows
      next if (/^;/);		## Skip comment lines
      next if (/^#/);		## Skip header line
      chomp();

      ## The species name is the first word of the row. Splitting on
      ## ' ' (rather than on /\s+/) discards leading white space, so
      ## that indented rows are not silently ignored.
      my ($species_name) = split(' ');
      if ($species_name) {
        push (@species_names,$species_name);
      }
    }
    close($file);
  }

  ## Check that at least one species name has been specified
  unless ($get_available_species) {
    if (scalar(@species_names) < 1) {
      &RSAT::error::FatalError("You must specify at least one species (option -species).");
    }
  }

  ################################################################
  ## Open output stream and print verbose
  $out = &OpenOutputFile($outfile{output});
  &Verbose() if ($main::verbose >= 1);

  ################################################################
  ## If requested, get the list of available species on the Ensembl
  ## server. In this case GetAvailableSpecies() prints the list and
  ## exits; otherwise it returns without doing anything.
  &GetAvailableSpecies();

  ################################################################
  ## Run the download and installation tasks
  my $s=0;
  my $nb_species = scalar(@species_names);

  ## If batch request, submit each species as a separate job
  if (($batch) && (scalar(@species_names) > 1)) {
    ## Collect all arguments except the organism list
    my $arg;
    my $passed_arguments = "";
    my @arguments = @ARGV; ## we use a copy of the list of arguments for this shift, because we need ARGV to report command line in &Verbose()
    while (scalar(@arguments) >= 1) {
      $arg = shift (@arguments);
      if (($arg eq "-species")
          || ($arg eq "-org")
          || ($arg eq "-species_file")) {
        ## Skip the option together with its value: each job receives
        ## its own -species option below.
        shift(@arguments);
      } elsif ($arg eq "-batch") {
        ## Skip batch to avoid multiplying the jobs
      } else {
        ## Quote the arguments containing white space, so that they
        ## are still passed as a single argument to the job.
        if ($arg =~ /\s/) {
          $passed_arguments .= " '".$arg."'";
        } else {
          $passed_arguments .= " ".$arg;
        }
      }
    }

    ## Run one install command per species
    foreach my $species (@species_names) {
      my $cmd = $ENV{RSAT}."/perl-scripts/install-ensembl-genome";
      $cmd .= " ".$passed_arguments;
      $cmd .= " -species ".$species;
      my $job_prefix = "install-ensembl-genome_".$species;
      &RSAT::message::TimeWarn("Installing species", $species, $job_prefix, $cmd) if ($main::verbose >= 0);
      &doit($cmd, $dry_run, $die_on_error, $verbose, $batch, $job_prefix);
    }

  } else {

    foreach my $species (@species_names) {
      $s++;

      ## Arguments shared by the three download scripts
      my $arg = " -v ".$verbose;
      $arg .= " -db ".$db;
      $arg .= " -version ".$ensembl_version;
      $arg .= " -species ".$species;
      $arg .= " -species_suffix ".$species_suffix if ($species_suffix);

      if ($task{"genome"}) {
        &RSAT::message::TimeWarn("Downloading genome for species", $s."/".$nb_species, $species) if ($main::verbose >= 1);
        my $command = $ENV{'RSAT'}."/perl-scripts/download-ensembl-genome ".$arg;
        &doit($command, $dry_run, $die_on_error, $verbose);
      }

      if ($task{"features"}) {
        &RSAT::message::TimeWarn("Downloading features for species", $s."/".$nb_species, $species) if ($main::verbose >= 1);
        my $command = $ENV{'RSAT'}."/perl-scripts/download-ensembl-features ".$arg;

        ## The option -only_api is documented as being passed to
        ## download-ensembl-features.
        $command .= " -only_api" if ($only_api);
        &doit($command, $dry_run, $die_on_error, $verbose);
      }

      if ($task{"variations"}) {
        &RSAT::message::TimeWarn("Downloading variations for species", $s."/".$nb_species, $species) if ($main::verbose >= 1);
        my $command = $ENV{'RSAT'}."/perl-scripts/download-ensembl-variations ".$arg;
        &doit($command, $dry_run, $die_on_error, $verbose);
      }

      ## Get the name of the organism as installed in RSAT. This
      ## differs from the Ensembl species name, because we add a
      ## suffix with the assembly version.
      my $full_species_id = &Get_full_species_ID($species, $assembly_version, $ensembl_version, $species_suffix);

      &RSAT::message::Info("Species", $species, "Full species ID", $full_species_id) if ($main::verbose >= 2);

      ## Declare the organism in RSAT
      if ($task{"config"}) {
        &RSAT::message::TimeWarn("Configuring genome on RSAT for species", $s."/".$nb_species, $species) if ($main::verbose >= 1);
        my $command = $ENV{'RSAT'}."/perl-scripts/install-organism -v 1  -source ".$db." -task config -org ".$full_species_id;
        &doit($command, $dry_run, $die_on_error, $verbose);
      }

      ## Perform the remaining installation procedure (check start and
      ## stop codon, compute oligo and dyad frequencies in upstream
      ## sequences, compute aa frequencies in protein sequences, ...)
      if ($task{"install"}) {
        &RSAT::message::TimeWarn("Installing genome on RSAT for species", $s."/".$nb_species, $species, $full_species_id) if ($main::verbose >= 1);
        my $command = $ENV{'RSAT'}."/perl-scripts/install-organism -v 1  -source ".$db;
        $command .= " -org ".$full_species_id;
        $command .= " -task config,allup,seq_len_distrib,genome_segments,upstream_freq,oligos,dyads,protein_freq,start_stop";

        ## This branch is reached with the batch option only when a
        ## single species was requested (see the condition above).
        $command .= " -batch" if ($batch);
        &doit($command, $dry_run, $die_on_error, $verbose);
      }
    }
  }

  ################################################################
  ## Report execution time
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $out;
  exit(0);
}


################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

################################################################
## Display full help message, then exit.
##
## The help is produced by running pod2text on this script ($0).
sub PrintHelp {
  ## The list form of system() bypasses the shell, so that a script
  ## path containing spaces or shell metacharacters is passed to
  ## pod2text as a single argument.
  system("pod2text", "-c", $0);
  exit(0);
}

################################################################
## Display the help message for the option -help.
##
## No separate summary is implemented: the call is delegated to
## PrintHelp.
sub PrintOptions {
  PrintHelp();
}

################################################################
## Read arguments
##
## Parse a copy of @ARGV and set the corresponding global parameters
## of the main package. The documentation of each option is written
## in POD format next to the code that handles it. An invalid option
## or an invalid option value raises a fatal error.
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      ## The verbosity level is optional: -v alone means level 1
      if (&IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-available_species>

List the species available on Ensembl, together with the data types
(genome, features, variations) available for each of them.

=cut
    } elsif ($arg eq "-available_species") {
      $main::get_available_species = 1;

=pod

=item B<-species #>

Name of the species (organism).

This option can be used iteratively to install multiple
genomes. Alternatively, a file containing a list of species names can
be entered with the option I<-species_file>.

Use the option I<-available_species> to print a list of available
species at Ensembl, or options I<-available_species -db ensemblgenomes>
to get the species available at EnsemblGenomes (extended Ensembl).

=cut
    } elsif (($arg eq "-species") || ($arg eq "-org")) {
      my $org =  shift(@arguments);

      ## Check that a value was provided before testing its format, and
      ## report the option actually typed by the user (-species or -org).
      unless (defined($org) && ($org =~ /_/)) {
        &RSAT::error::FatalError("The option ".$arg." requires a valid organism name (non-empty, and should contain a least one underscore).");
      }
      push @species_names, $org;

=pod

=item B<-species_file #>

File containing the list of species to install.  The file must be in
text format. The first word of each row (everything that precedes a
space or tab) is considered as a species name. Further information on
the line is ignored.

=cut
    } elsif ($arg eq "-species_file") {
      $main::infile{species_list} = shift(@arguments);

=pod

=item B<-version #>

The release version of Ensembl.

Supported versions: versions later than 70, safe, latest.  This script
was developed while Ensembl was at version 72.

Default : I<safe>

=over

=item I<safe>

The file locations and/or formats of the Ensembl ftp distribution
may change between two Ensembl releases.

For this reason, we defined the "safe" version, which corresponds to
the latest version of Ensembl which has been checked to work with this
script.

=item latest

This corresponds to the latest version of Ensembl. Beware: this
version is not guaranteed to be compatible with RSAT, in case Ensembl
would change their file formats or locations.

=back

=cut
    } elsif ($arg eq "-version") {
      $ensembl_version = shift(@arguments);
      &RSAT::error::FatalError($ensembl_version,"is not a valid value for Ensembl version. Supported values: Natural numbers (e.g. 72), safe, latest.")
        unless ( &IsNatural($ensembl_version) || $ensembl_version eq "safe" || $ensembl_version eq "latest" );

=pod

=item B<-assembly_version>

Choose a particular assembly version for the configuration. In
principle this option should not be used, since the assembly version
is automatically determined when the request is sent to Ensembl.

This option can be convenient to enable the option I<-task config>
when the genome has been synchronized from another RSAT server.

=cut
    } elsif ($arg eq "-assembly_version") {
      $main::assembly_version = shift(@arguments);

=pod

=item B<-species_suffix>

Suffix to append to the full species ID.

By default, the full species ID is composed by concatenating the
Ensembl species and assembly version. The option I<-species_suffix>
allows to specify a string (e.g. _ensembl76, _for_testing, ...) that
will be appended to the full species ID.

=cut
    } elsif ($arg eq "-species_suffix") {
      $species_suffix = shift(@arguments);

=pod

=item B<-task #>

What you want to install. Several tasks can be specified, either
separated by commas or by using the option iteratively.

Supported tasks : genome, features, variations, config, install

=over

=item B<-task genome>

Download all genomic sequences from Ensembl.
This task is executed by running I<download-ensembl-genome>.

=item B<-task features>

Download all genomic features from Ensembl.
This task is executed by running I<download-ensembl-features>.

=item B<-task variations>

Download all genomic variations from Ensembl.
This task is executed by running I<download-ensembl-variations>.

=item B<-task config>

Declare the organism to RSAT. This step is necessary for this organism
to become supported on the local RSAT installation.  This task is
executed by running I<install-organism>.

=item B<-task install>

Perform a series of installation tasks in order to have the organism
fully supported in RSAT. In particular, compute the frequencies of
oligonucleotides and dyads in all upstream sequences, in order to
calibrate background models for motif analysis.

This task is executed by running I<install-organism>.

=back

=cut
    } elsif ($arg eq "-task") {
      my $task_list = shift(@arguments);

      ## Without this check, a missing value would silently result in
      ## running the default tasks.
      unless (defined($task_list) && ($task_list ne "")) {
        &RSAT::error::FatalError("The option -task requires a value. Supported:", $supported_tasks);
      }
      foreach my $task (split(",", $task_list)) {
        &RSAT::error::FatalError($task, "Invalid task for install-ensembl-genomes. Supported:", $supported_tasks)
          unless ($supported_task{$task});
        push (@main::tasks, $task);
      }


# =pod
#
# =item B<-dir #>
#
# The directory in which RSAT data must be installed. The selected
# species will be installed in a sub-directory composed of Species name
# and Ensembl genome version.
#
# Default : $RSAT/data/
#
# =cut
#     } elsif ($arg eq "-dir") {
#       $main::data_dir = shift(@arguments);

=pod

=item B<-db ensembl|ensemblgenomes|ensemblall>

Default: ensembl

Select the source database.

=over

=item I<ensembl>

The "historical" Ensembl database (L<http://ensembl.org/>), restricted
to a series of genomes from model organisms (69 supported species on
Oct 30, 2014).

=item I<ensemblgenomes>

The extended EnsemblGenomes database (L<http://ensemblgenomes.org/>),
which comprises repositories for the following taxa: Bacteria
(actually includes Archaea), Fungi, Metazoa, Plants, Protists.

In Oct 2014, EnsemblGenomes supports >15,000 species.

=item I<ensemblall>

Load both Ensembl and Ensembl Genomes.

=back

=cut
    } elsif ($arg eq "-db") {
      ## The database name is case-insensitive
      $main::db = lc(shift(@arguments));
      unless (($main::db eq "ensembl")
              || ($main::db eq "ensemblgenomes")
              || ($main::db eq "ensemblall")
          ) {
        &RSAT::error::FatalError($main::db, "Invalid value for the option -db. Supported: ensembl,ensemblgenomes,ensemblall");
      }

=pod

=item B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
    } elsif ($arg eq "-o") {
      $outfile{output} = shift(@arguments);

=pod

=item B<-only_api>

Only use API to download. This argument is passed to
I<download-ensembl-features>.

=cut
    } elsif ($arg eq "-only_api") {
      $main::only_api = 1;

=pod

=item B<-nodie>

Do not die in case a sub-program returns an error.

The option -nodie allows you to circumvent problems with specific
sub-tasks, but this is not recommended because the results may be
incomplete.

=cut
    } elsif ($arg eq "-nodie") {
      $main::die_on_error = 0;

=pod

=item B<-dry>

Dry run: print the commands without running them (for testing and
debugging).

=cut
    } elsif ($arg eq "-dry") {
      $dry_run = 1;

=pod

=item B<-batch>

Run the last installation tasks (I<install-organism>) in batch. When
several species are specified, each species is submitted as a separate
job. This option requires a PC cluster, which must be properly
configured in the configuration file $RSAT/RSAT_config.props.

=cut
    } elsif ($arg eq "-batch") {
      $batch = 1;

    } else {
      ## Same error function as for the other argument errors
      &RSAT::error::FatalError(join("\t", "Invalid option", $arg));
    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
##
## Print on the output stream the command line, the main parameters
## and the input/output files, as comment rows starting with ";".
sub Verbose {
  ## Command line
  print $out "; install-ensembl-genome ";
  &PrintArguments($out);

  ## Main parameters, one row per parameter
  my @parameter_rows = (
    ["Program version", $program_version],
    ["Ensembl release", $ensembl_version],
    ["Species suffix", $species_suffix],
  );
  foreach my $row (@parameter_rows) {
    printf $out "; %-22s\t%s\n", @{$row};
  }

  ## Input and output files; a section is only printed if it contains
  ## at least one file.
  my @file_sections = (
    ["; Input files\n", \%main::infile],
    ["; Output files\n", \%main::outfile],
  );
  foreach my $section (@file_sections) {
    my ($title, $files) = @{$section};
    next unless (%{$files});
    print $out $title;
    foreach my $type (keys %{$files}) {
      printf $out ";\t%-13s\t%s\n", $type, $files->{$type};
    }
  }
}

################################################################
## Get the list of available species for the 3 components of the
## installation (genome, features and variations).
##
## Nothing is done unless the option -available_species was set.
## Otherwise, the three download scripts are run with the option
## -available_species, the result is printed on the output stream
## (one row per species: name, available components, and the taxon
## when it is known), and the program exits.
sub GetAvailableSpecies {
  return unless ($get_available_species);

  my %available_species = (); ## list of available components per species
  my %species_taxon = (); ## taxon per species (not collected for db ensembl)

  ## Arguments shared by the three download scripts
  my $arg = "-v ".$verbose;
  $arg .= " -available_species";
  $arg .= " -version $ensembl_version";
  $arg .= " -db ".$db;

  ## The option -only_api is documented as being passed to
  ## download-ensembl-features; it is thus kept separate from the
  ## arguments shared with the other scripts.
  my $features_arg = $arg;
  $features_arg .= " -only_api" if ($only_api);

  ## Get the list of available species from Ensembl via download-ensembl-genome
  &RSAT::message::TimeWarn("Getting the list of species with available genomes in", $db) if ($main::verbose >= 2);
  my @species = qx{$ENV{'RSAT'}/perl-scripts/download-ensembl-genome $arg};
  foreach (@species) {
    chomp();
    next unless (/\S/); ## Skip empty rows
    next if (/^[;#]/); ## Skip comment and header rows
    my ($species,$taxon) = split("\t");
    next unless (defined($species) && ($species ne ""));

    ## Only record a taxon when the row provides one, to avoid
    ## overwriting a known taxon with an undefined value.
    $species_taxon{$species} = $taxon if (($db ne "ensembl") && defined($taxon) && ($taxon ne ""));
    push (@{$available_species{$species}}, "genome");
  }

  ## Get the list of genomes for which features are available
  &RSAT::message::TimeWarn("Getting the list of species with available features in ", $db) if ($main::verbose >= 2);
  my $command = $ENV{'RSAT'}."/perl-scripts/download-ensembl-features ".$features_arg;
  @species = qx{$command};
  foreach (@species) {
    chomp();
    next unless (/\S/); ## Skip empty rows
    next if (/^[;#]/); ## Skip comment and header rows
    my ($species,$taxon) = split("\t");
    next unless (defined($species) && ($species ne ""));
    $species_taxon{$species} = $taxon if (($db ne "ensembl") && defined($taxon) && ($taxon ne ""));
    push (@{$available_species{$species}}, "features");
  }

  ## Get the list of genomes for which variations are available.
  ## Here the whole row is used as species name.
  &RSAT::message::TimeWarn("Getting the list of species with available variations in", $db) if ($main::verbose >= 2);
  @species = qx{$ENV{'RSAT'}/perl-scripts/download-ensembl-variations $arg};
  foreach (@species) {
    chomp();
    next unless (/\S/); ## Skip empty rows
    next if (/^[;#]/); ## Skip comment and header rows
    push (@{$available_species{$_}}, "variations");
  }

  ## Print out the result
  foreach my $species ( sort {$a cmp $b} keys(%available_species) ) {
    print $out $species,"\t",join(", ",@{$available_species{$species}});
    print $out "\t", $species_taxon{$species} if ($species_taxon{$species});
    print $out "\n";
  }

  ## Report execution time
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $out;
  exit(0);
}

__END__
