#!/usr/bin/perl -w
############################################################
#
# $Id: install-ensembl-genome,v 1.4 2013/03/29 17:46:12 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

install-ensembl-genome

=head1 VERSION

$program_version

=head1 DESCRIPTION

Downloads a genome and its annotations from Ensembl
(http://www.ensembl.org/) and installs them on RSAT.

=head1 AUTHORS

Jeremy.Delerce@etu.univ-amu.fr

Jacques.van-Helden@univ-amu.fr

=head1 CATEGORY

=over

=item util

=back

=head1 USAGE

install-ensembl-genome -org organism_name [-i inputfile] [-o outputfile] [-v #] [...]

=head2 Example

 install-ensembl-genome -v 1 -org Homo_sapiens

=head1 OUTPUT FORMAT

The program exports chromosome sequences and genomic features.

=head2 Chromosome sequence format

Chromosome sequences are exported in raw format (sequences only, no
space, no carriage return), which is indispensable in order to enable
direct access to any piece of sequences without having to load whole
chromosomes in memory.


=head2 Genomic features

Genomic features (genes, CDS, mRNAs, ...) are exported in gft format
(the RSAT specification of genomic features). For a description of
this format and conversions from/to other formats, type

  convert-features -h

=head1 SEE ALSO

=head2 supported-organisms-ensembl

Returns the list of all organisms supported at Ensembl.

=head2 ensembl-org-info

Provides basic information about genome content (version, chromosome
names) for a given organism.

=head1 WISH LIST

=over

=item B<-task sequences>

Download all chromosomal sequences from Ensembl.

=item B<-task features>

Download all chromosomal features from Ensembl.

=back

=cut


BEGIN {
  ## Locate the directory part of the path used to invoke this script:
  ## the pattern matches the trailing run of characters that are
  ## neither "/" nor parentheses, so the prematch ($`) is whatever
  ## precedes the script name (empty when invoked without a path).
  if ($0 =~ m{([^(/)]+)$}) {
    my $script_dir = $`;
    ## Make the "lib/" directory next to the script searchable, so
    ## that the require of RSA.lib below can be resolved.
    push @INC, $script_dir."lib/";
  }
}
require "RSA.lib";
use Bio::EnsEMBL::Registry; ## Required to connect to Ensembl API

################################################################
## Main package
##
## Workflow: initialise the global parameters, parse the command
## line, collect the organism names (options -org and -i), connect to
## the public Ensembl database server, then, for each organism, list
## its chromosome slices by decreasing length. The retrieval of the
## sequences themselves is not implemented yet (see the commented
## call to $slice->seq() in the slice loop).
package main;
{

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();

  ## Program version, derived from the CVS revision keyword
  our $program_version = do { my @r = (q$Revision: 1.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  our %infile = ();
  our %outfile = ();

  our $verbose = 0;

  ## Default streams. Glob references are used rather than barewords,
  ## so that the handles do not rely on symbolic references.
  our $in = \*STDIN;
  our $out = \*STDOUT;

  our @org_names = ();
  our $registry = "Bio::EnsEMBL::Registry";


  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Read organism names from the input file (option -i), and append
  ## them to those specified with the option -org.
  ##
  ## NOTE(review): the file format is assumed to be one organism name
  ## per line (first tab-separated column), with empty lines and lines
  ## starting with '#' or ';' ignored -- to be confirmed.
  if (defined($infile{input})) {
    open(my $org_list, '<', $infile{input})
      or &RSAT::error::FatalError(join("\t", "Cannot read organism list file", $infile{input}, $!));
    while (my $line = <$org_list>) {
      $line =~ s/\r?\n\z//; ## Suppress the end of line (Unix or DOS)
      next if ($line =~ /^\s*(?:[#;]|$)/); ## Skip comment and empty lines
      my ($org_name) = split(/\t/, $line);
      $org_name =~ s/^\s+//;
      $org_name =~ s/\s+$//;
      push @org_names, $org_name if ($org_name ne "");
    }
    close $org_list;
  }

  ################################################################
  ## Check argument values
  if (scalar(@org_names) < 1) {
    &RSAT::error::FatalError("You must specify at least one organism (option -org).");
  }

  ################################################################
  ## Open a connection to Ensembl
  &RSAT::message::TimeWarn("Opening connection to Ensembl registry", $registry) if ($main::verbose >= 1);
  $registry->load_registry_from_db(
    -host => 'ensembldb.ensembl.org',
    -user => 'anonymous'
  );


  my $org_nb = scalar(@org_names);
  my $i = 0;
  for my $org_name (@org_names) {
    $i++;

    ################################################################
    ## Open adaptors for selected organism
    &RSAT::message::TimeWarn("Opening adaptors for organism", $i."/".$org_nb,  $org_name) if ($main::verbose >= 0);
    my $slice_adaptor = $registry->get_adaptor($org_name, 'core', 'slice');

    ## Report an explicit error if no adaptor could be obtained,
    ## rather than crashing on a method call on an undefined value.
    unless ($slice_adaptor) {
      &RSAT::error::FatalError(join("\t", "Cannot get the Ensembl core slice adaptor for organism", $org_name));
    }

    ## Get the list of chromosomes with their names and sizes
    my @slices = @{$slice_adaptor->fetch_all('chromosome')};

    ## Sort the slices by decreasing length
    my @sorted_slices = sort {$b->length() <=> $a->length()} @slices;

    foreach my $slice (@sorted_slices) {
      my $name = $slice->name();
      my $seq_region_name = $slice->seq_region_name();
      my $start = $slice->start();
      my $end = $slice->end();
      my $len = $slice->length();
      &RSAT::message::TimeWarn("Retrieving sequence for chromosome slice", $seq_region_name, $name, $start, $end, $len) if ($main::verbose >= 1);
      #      my $seq = $slice->seq();
    }
  }

  ################################################################
  ## Open output stream
  $out = &OpenOutputFile($outfile{output});


  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose >= 1);

  ################################################################
  ## Execute the command

  ################################################################
  ## Insert here output printing

  ################################################################
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $out if ($outfile{output});

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message
##
## Formats the POD documentation embedded in this script with
## pod2text, then terminates the program.
sub PrintHelp {
  ## List form of system(): the script path is passed as a single
  ## argument without going through a shell, so that paths containing
  ## spaces or shell metacharacters are handled correctly.
  system("pod2text", "-c", $0);
  exit();
}

################################################################
## Display short help message
##
## No separate short help is defined for this script: the call is
## simply delegated to PrintHelp().
sub PrintOptions {
  return &PrintHelp();
}

################################################################
## Read arguments
##
## Parses a copy of @ARGV (which is left untouched) and stores the
## option values in the global variables of the main package:
## $main::verbose, @main::org_names, $main::infile{input} and
## $main::outfile{output}.
##
## An invalid option, or an option lacking its mandatory value,
## is reported with RSAT::error::FatalError.
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()

  ## Return the value following the current option, or report an error
  ## if the command line ends right after the option.
  my $option_value = sub {
    my ($option) = @_;
    unless ((scalar(@arguments) >= 1) && (defined($arguments[0]))) {
      &RSAT::error::FatalError(join("\t", "Option", $option, "requires a value"));
    }
    return shift(@arguments);
  };

  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      ## The verbosity level is optional: -v alone means level 1
      if ((scalar(@arguments) >= 1) && (&IsNatural($arguments[0]))) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-org organism_name>

Name of the species (organism).

The list of supported species in Ensembl can be obtained with
I<supported-organisms-ensembl>.

This option can be used iteratively to install multiple
genomes. Alternatively, a list of organisms can be entered with the
option I<-i>.

=cut
    } elsif ($arg eq "-org") {
      push @main::org_names, $option_value->($arg);


=pod

=item B<-i inputfile>

Specify an input file containing a list of organisms to be installed.

=cut
    } elsif ($arg eq "-i") {
      $main::infile{input} = $option_value->($arg);

=pod

=item B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
    } elsif ($arg eq "-o") {
      $main::outfile{output} = $option_value->($arg);

    } else {
      &RSAT::error::FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
##
## Prints a report on the output stream ($out): the program name
## followed by the output of PrintArguments(), the program version,
## then the input and output files. A file section is only printed
## when the corresponding hash is not empty.
sub Verbose {
  print $out "; install-ensembl-genome ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;

  ## Each section is described by its title line and by a reference
  ## to the hash holding the file names.
  my @sections = (
    ["; Input files\n", \%main::infile],
    ["; Output files\n", \%main::outfile],
  );
  foreach my $section (@sections) {
    my ($title, $files) = @{$section};
    next unless (%{$files});
    print $out $title;
    while (my ($label, $path) = each %{$files}) {
      printf $out ";\t%-13s\t%s\n", $label, $path;
    }
  }
}


__END__
