#!/usr/bin/perl -w
############################################################
#
# $Id: download-organism,v 1.32 2013/08/12 10:11:54 rsat Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

download-organism

=head1 VERSION

$program_version

=head1 DESCRIPTION

Download organism(s) from the main RSAT server in order to add support
on a local machine.

Organisms can be loaded one by one (option -org), or by taxon (option
-taxon).

=head1 AUTHORS

Jacques.van.Helden@ulb.ac.be

=head1 CATEGORY

Data management

=head1 REQUIREMENTS

This program relies on a Web aspirator to download genomes from the
remote server to the local machine. By default, we use the freeware
program I<wget>.

=head1 USAGE

Download one or several organisms by species identifiers.

 download-organism -org Species_id_1 -org Species_id_2 ...


Download all the organisms belonging to a given taxon.

 download-organism -taxon taxon

=head1 OUTPUT

By default, the downloaded organisms are stored in the $RSAT genome
repository ($RSAT/public_html/data/genomes).

=head1 SEE ALSO

=head2 supported-organisms-server

The program I<supported-organisms-server> can be used to obtain the
list of organisms supported on the remote RSAT server.

 supported-organisms-server

The query to the remote server can also be restricted to a given
taxon.

 supported-organisms-server -taxon Fungi


=head2 install-organism

The program I<install-organism> performs all the formatting and
calibration tasks for importing genomes from the reference databases
(NCBI, EMBL) to RSAT. 

This differs from I<download-organism>, which transfers the
RSAT-formatted genomes from a RSAT server.

If a genome is available on the RSAT server, it is recommended to use
I<download-organism> in order to obtain it immediately in the RSAT
format, rather than I<install-organism>.

=head1 WISH LIST

=over


=item B<-taxfreq>

Download the taxon-wise oligo and dyad frequencies. Those are required for
comparative genomics approaches (footprint-discovery, footprint-scan).

=item B<-get_command>

Use an alternative program for downloading the genome data. Default:
wget.

=item B<anonymous rsync>

Downloading with I<rsync> is currently reserved to the RSAT team
because it requires a password on the RSAT server. We have currently
no way to ensure an anonymous rsync, but this may be envisaged in the
future for the convenience of external users.

=back

=cut


BEGIN {
  ## Make the RSAT libraries loadable: when the script name can be
  ## isolated at the end of $0, the text preceding it (the directory
  ## holding this script, with its trailing slash) is used to build the
  ## path of the sibling "lib/" directory, which is appended to @INC.
  push(@INC, "$`lib/") if ($0 =~ /([^(\/)]+)$/);
}
require "RSA.lib";
require RSAT::util;
require RSAT::server;


################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  local $start_time = &RSAT::util::StartScript();
  $get_command = "wget";
  $program_version = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

  ## Default action is to download. This variable is used to display status
  $action = "Downloading";

  ## Parameters for downloading with rsync
  $rsat_server_dir = "";

  $ssh_login = "";
  $ssh_bashrc = "";

  ## Shell instruction prepended to the remote ssh commands in order to
  ## load the RSAT environment. Initialised to an empty string because
  ## it is concatenated to the remote commands even when no bashrc file
  ## has to be sourced.
  $source_bashrc_cmd = "";

  $verbose = 0;
  $dry = 0; ## Print the commands without executing them
  $die_on_error = 0;

  ## Options that are only assigned when the corresponding argument is
  ## found on the command line. They are initialised here to avoid
  ## "uninitialized value" warnings (the script runs with perl -w).
  $orglist_file = "";
  $upload = 0;
  $no_config = 0;

  ## Remote server from which genomes should be downloaded
#  $RSAT_SERVER="http://teaching.rsat.eu/";
  $RSAT_SERVER="http://rsat-tagc.univ-mrs.fr/rsat/";

  ## List of organisms to install
  @organisms = ();

  ## List of taxa to install
  @taxa = ();

  ## Local genomes directory
  $local_genomes_dir = $ENV{RSAT}."/public_html/data/genomes/";

  ## Indexes for organism attributes
  %last_update = ();
  %source = ();

  ## First and last organisms
  our $skip_org = 0;
  our $last_org = 0;
  our $no_blast = 1;

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check that the web aspirator is working
#  $which_aspirator = `which $get_command`;
  $which_aspirator = &RSAT::server::GetProgramPath($get_command);
  &RSAT::message::Debug($which_aspirator) if ($main::verbose >= 4);
  unless ($which_aspirator) {
    &RSAT::error::FatalError("The program ".$get_command." is not found in your path. This program is required for ".$action." data from the server. ");
  }

  ################################################################
  ## Check argument values
  if ((scalar(@organisms) == 0) && (scalar(@taxa) == 0) && ($orglist_file eq "")) {
    &RSAT::error::FatalError("You must specify at least one organism (-org) or taxon (-taxon), or a file containing organism names (-org_list).");
  }

  ## Upload option requires ssh login
  if ($main::upload) {
    $action = "Uploading";
    &RSAT::error::FatalError("Option -upload requires to specify an ssh login (option -ssh).") unless ($ssh_login);
  }

  ## The rsync transfers and the remote queries go through ssh, so an
  ## ssh login is mandatory. Without this check, ssh would be invoked
  ## with an empty login and the failure would only be reported later
  ## as an impossibility to get the RSAT path on the server.
  if (($get_command eq "rsync") && !($ssh_login)) {
    &RSAT::error::FatalError("Option -rsync requires to specify an ssh login (option -ssh).");
  }


  ## Output directory
  &RSAT::message::Info("Local genome directory", $local_genomes_dir) if ($main::verbose >= 2);
  &RSAT::util::CheckOutDir($local_genomes_dir);
#  chdir($local_genomes_dir);

  ################################################################
  ## Open output stream
  $main::out = &OpenOutputFile($main::outfile{output});


  ## Get RSAT path for rsync
  if ($get_command eq "rsync") {

    ## Check if the remote RSAT path (on the server) was specified
    if ($ssh_login =~ /(\S+):(\S+)/) {
      $ssh_login = $1;
      $rsat_server_dir = $2;
    }

    ## If RSAT server dir is provided with the option -ssh,
    ## automatically load the RSAT_config.bashrc file in this
    ## directory
    if ($rsat_server_dir) {
      unless ($ssh_bashrc) {
        $ssh_bashrc = $rsat_server_dir."/RSAT_config.bashrc";
      }
      $source_bashrc_cmd = "source ".$ssh_bashrc."; ";

    } else {
      ## Obtain the remote RSAT path by ssh if required
      if ($ssh_bashrc) {
        $source_bashrc_cmd = "source ".$ssh_bashrc."; ";
      }
      $rsat_server_dir_cmd = "ssh $ssh_login '";
      $rsat_server_dir_cmd .= $source_bashrc_cmd if ($source_bashrc_cmd);
      $rsat_server_dir_cmd .= " echo \$RSAT'";
      &RSAT::message::TimeWarn("Getting RSAT dir on server\n\t", $rsat_server_dir_cmd) if ($main::verbose >= 3);
      $rsat_server_dir = `$rsat_server_dir_cmd`;
      chomp($rsat_server_dir);
      &RSAT::message::Info("RSAT dir on remote server", $rsat_server_dir) if ($main::verbose >= 2);
      &RSAT::error::FatalError("Cannot get RSAT path on remote server", $ssh_login) unless ($rsat_server_dir);
    }
    &RSAT::message::Info("Remote server", $ssh_login, "RSAT env",  $rsat_server_dir) if ($main::verbose >= 2);
  }

  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose);

  ################################################################
  ## Obtain the lists of organisms for selected taxa. Depending on the
  ## transfer mode, the list is obtained from the local host (upload),
  ## from the server via ssh (rsync) or from the server via Web
  ## services (wget).
  foreach my $taxon (@taxa) {
    &RSAT::message::TimeWarn("Getting list of organisms from RSAT server for taxon", $taxon);
    my $orgs_for_taxa;
    if ($upload) {
      &RSAT::message::Info("Getting list of organisms from local host",$ENV{rsat_site},"to upload for taxon", $taxon);
      $cmd = "supported-organisms -return ID -taxon ".$taxon." | grep -v '^#'";
    } elsif ($get_command eq "rsync") {
      &RSAT::message::Info("Getting list of organisms to download for taxon", $taxon, "by ssh connection", $ssh_login);
      $cmd = "ssh $ssh_login '".$source_bashrc_cmd."supported-organisms -return ID -taxon ".$taxon." | grep -v \"^#\"'";
    } else {
      &RSAT::message::Info("Getting list of organisms to download for taxon", $taxon, " by web services");
      $cmd = "supported-organisms-server -url ".$RSAT_SERVER." -return ID -taxon ".$taxon." | grep -v '^#'";
    }
    &RSAT::message::TimeWarn("\n; ", $cmd) if ($main::verbose >= 3);
    $orgs_for_taxa = `$cmd`;

    chomp ($orgs_for_taxa);
    @orgs_for_taxa = sort(split(/\s+/, $orgs_for_taxa));
    push @organisms, @orgs_for_taxa;
  }

  ## Read list of organisms from a file
  if ($main::orglist_file) {
    &RSAT::message::Info ("Reading list of organisms form file", $main::orglist_file ) if ($main::verbose >= 1);
    my @orgs_from_file = ();
    ($main::orglist) = &OpenInputFile($main::orglist_file);
    while (<$main::orglist>) {
      chomp();
      s/\r/\n/g;                ## Turn Windows-specific carriage returns into white space, which is discarded by the split below
      next if /^;/;             ## Comment line
      next if /^\#/;            ## Header line
      next if /^\--/;           ## SQL comment line
      next unless /\S/;         ## Empty line
      my ($org) = split /\s/;   ## The organism ID is the first word of the line
      $org = &trim($org); ## Remove leading and trailing spaces
      push @orgs_from_file, $org;
    }
    close $main::orglist;
    if (scalar(@orgs_from_file) == 0) {
      &RSAT::error::FatalError("The organism file", $main::orglist_file, "should contain at least one valid organism name.\n",
                               "Use the command supported-organisms-server to obtain the list of organisms supported on the server.");
    } else {
      &RSAT::message::Info(scalar(@orgs_from_file), "organisms in file", $main::orglist_file) if ($main::verbose >= 1);
      push @organisms, @orgs_from_file;
    }
  }

  ## Report the numbered list of organisms to be transferred
  &RSAT::message::Info($action, scalar(@organisms), "organisms");
  my $i = 0;
  foreach my $org (@organisms) {
    $i++;
    warn join ("\t", ";", $i, $org) , "\n";
  }
#  &RSAT::message::Info(join("\n\t", "Organisms: ", @organisms));

  ################################################################
  ## Get the update dates + source from the server (in order to restrict the
  ## number of queries, get dates for all organisms and store them for
  ## the selected organisms only)
  &RSAT::message::TimeWarn("Getting information about supported organisms from server") if ($main::verbose >= 2);


  ################################################################
  ## Get organism-specific parameters (data source, last update date).
  my @all_organisms;
  if ($upload) {
    ## For upload, the update date and data sources are read locally
    $cmd = "supported-organisms -v 0 -return ID,last_update,source | grep -v '^#'";
  } elsif ($ssh_login) {
    $cmd = "ssh $ssh_login '".$source_bashrc_cmd."supported-organisms -v 0 -return ID,last_update,source | grep -v \"^#\"'";
  } else {
    $cmd = "supported-organisms-server -url ".$RSAT_SERVER." -v 0 -return ID,last_update,source | grep -v '^#'";
  }
  &RSAT::message::TimeWarn("Getting list of supported organisms from remote server\n; ", $cmd) if ($main::verbose >= 2);
  @all_organisms = `$cmd`;

  &RSAT::message::Info(scalar(@all_organisms), "supported organisms on server", $RSAT_SERVER) if ($main::verbose >= 2);

  ## Index the update date and the source by organism ID
  foreach my $org_line (@all_organisms) {
    next unless ($org_line); ## Skip empty line returned by RSAT Web service
    chomp($org_line);
    my ($org, $date, $source) = split /\t/, $org_line;
    if ($org) {
      $last_update{$org} = $date || '<NA>';
      $source{$org} = $source || '<NA>';
    }
    &RSAT::message::Debug("Server supported", $org_line, $org, $last_update{$org}, $source{$org}) if ($main::verbose >= 5);
  }


  ################################################################
  ## Download the genomes
  my $org_nb = scalar(@organisms);
  $i = 0;
  foreach my $org (@organisms) {
    $i++;

    ## Restrict the transfer to the range defined by -skip and -last
    next if (($skip_org > 0) && ($i <= $skip_org));
    next if (($last_org > 0) && ($i > $last_org));

    ## download one genome
    $cmd = $which_aspirator;
    if ($get_command eq "wget") {
      ## Path of the RSAT root on the server = server URL without the
      ## protocol and host name. Both http and https URLs are
      ## supported. If the URL contains no path, the root is empty.
      my $RSAT_SERVER_ROOT = $RSAT_SERVER;
      $RSAT_SERVER_ROOT =~ s|^https?://||i;
      if ($RSAT_SERVER_ROOT =~ /\//) {
        $RSAT_SERVER_ROOT = '/'.$'; #'
      } else {
        $RSAT_SERVER_ROOT = "";
      }

      &RSAT::message::TimeWarn($action, "genome", $i."/".$org_nb, $org,
                               "RSAT server", $RSAT_SERVER) if ($main::verbose >= 1);
      &RSAT::message::Debug("RSAT server root", $RSAT_SERVER_ROOT) if ($main::verbose >= 3);
      $cmd .= " --reject jobs --reject 'index.html*'";
      $cmd .= " -X ".${RSAT_SERVER_ROOT}."/data/genomes/".$org."/blast_hits" if ($no_blast);
      if ($main::verbose <= 3) {
        ## quiet mode for wget
        $cmd .= " --quiet";
      } else {
        ## "low-verbosity" mode for wget (the default mode is VERY
        ## verbosy, the "low-verbosity" mode is still quite verbosy)
        $cmd .= " -nv";
      }
      $cmd .= " --passive-ftp";
      $cmd .= " --no-parent";
      $cmd .= " --recursive";
      $cmd .= " --timestamping";
      $cmd .= " --relative";
      $cmd .= " --dont-remove-listing";
      $cmd .= " --convert-links";
      $cmd .= " --exclude-directories=jobs";
      $cmd .= " --directory-prefix=".$local_genomes_dir;
      $cmd .= " --no-host-directories --cut-dirs=4";
      ## NOTE(review): the URL is deliberately left as is (including
      ## the possible double slash after the server URL), since the
      ## number of directories removed by --cut-dirs presumably depends
      ## on it -- confirm before normalising.
      $cmd .= " ".${RSAT_SERVER}."/data/genomes/".$org."/";

    } elsif ($get_command eq "rsync") {
      my $rsync_opt = " -e ssh -ruptl -z ";
      $rsync_opt .= " -v" if ($main::verbose >= 4);
      $rsync_opt .= " --exclude jobs --exclude '*~'";
      $rsync_opt .= " --exclude blast_hits --exclude blastdb" if ($no_blast);

      if ($main::upload) {
        ## Upload: synchronize the local organism directory to the
        ## genomes directory of the server
#       my $remote_genome = $ssh_login.":".$rsat_server_dir."/public_html/data/genomes/".$org."/*";
        my $remote_genome = $ssh_login.":".$rsat_server_dir."/public_html/data/genomes/";
        &RSAT::message::TimeWarn($action, "genome", $i."/".$org_nb, $org, "RSAT server\n", $remote_genome)
            if ($main::verbose >= 1);
        $cmd .= $rsync_opt;
        $cmd .= " ".$local_genomes_dir."/".$org;
        $cmd .= " ".$remote_genome;
        $cmd =~ s/\/\//\//g;
#       &RSAT::error::FatalError("NOT YET IMPLEMENTED");
      } else {
        ## Download: synchronize the organism directory of the server
        ## to the local genomes directory
#       my $remote_genome = $ssh_login.":".$rsat_server_dir."/public_html/data/genomes/".$org."/'*'";
        my $remote_genome = $ssh_login.":".$rsat_server_dir."/public_html/data/genomes/".$org;
        &RSAT::message::TimeWarn($action, "genome", $i."/".$org_nb, $org, "RSAT server\n", $remote_genome)
            if ($main::verbose >= 1);
        $cmd .= $rsync_opt;
        $cmd .= " ".$remote_genome;
        $cmd .= " ".$local_genomes_dir."/";
        $cmd =~ s/\/\//\//g;
      }
    }
    &doit($cmd, $dry, $die_on_error, $verbose, 0, 0);


    ## update the organism in the RSAT configuration tables
    unless ($no_config) {
      my $source = $source{$org} || '<NA>';
      my $date = $last_update{$org} || '<NA>';
      &RSAT::message::TimeWarn("Updating organism",$org) if ($main::verbose >= 2);
      $cmd = "install-organism -v 1 -task config,phylogeny -org ".$org;
      $cmd .= " -date \"".$date."\"";
      $cmd .= " -source \"".$source."\"";
#    $cmd .= " >>& install-organisms_log.txt";
      if ($main::upload) {
        ## After an upload, the configuration has to be updated on the server
        $cmd = "ssh $ssh_login '".$source_bashrc_cmd."$cmd'";
      }
      &doit($cmd, $dry, $die_on_error, $verbose, 0, 0);
      &RSAT::message::TimeWarn("Updating config and phylogeny\n", $cmd) if ($main::verbose >= 2);
    }
  }

  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  print $main::out $exec_time if ($main::verbose >= 1);
  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message 
sub PrintHelp {
    ## Format the POD documentation embedded in this script with
    ## pod2text, then exit. The command is passed to system() as a list
    ## so that the script path ($0) is handed over verbatim to pod2text,
    ## without being interpreted by a shell (paths containing spaces or
    ## shell metacharacters would otherwise break the command).
    system("pod2text", "-c", $0);
    exit()
}

################################################################
## Display short help message
sub PrintOptions {
    ## There is no separate short help for this program: the full help
    ## message is displayed (and the program exits) via PrintHelp.
    PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
    my $arg;
    my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()

    ## Return the value that follows an option on the command line.
    ## Die with an explicit message if the command line ends right
    ## after the option: an undefined value would otherwise be stored
    ## silently (e.g. an empty organism name, which would lead to
    ## transfer the whole genomes directory).
    my $get_value = sub {
        my ($option) = @_;
        unless (scalar(@arguments) >= 1) {
            &FatalError(join("\t", "Option", $option, "requires a value"));
        }
        return shift(@arguments);
    };

    while (scalar(@arguments) >= 1) {
        $arg = shift (@arguments);

        ## Verbosity
=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
        if ($arg eq "-v") {
            ## The verbosity level is optional (default: 1)
            if ((scalar(@arguments) >= 1) && (&IsNatural($arguments[0]))) {
                $main::verbose = shift(@arguments);
            } else {
                $main::verbose = 1;
            }

            ## Help message
=pod

=item B<-h>

Display full help message

=cut
        } elsif ($arg eq "-h") {
            &PrintHelp();

            ## List of options
=pod

=item B<-help>

Same as -h

=cut
        } elsif ($arg eq "-help") {
            &PrintOptions();

            ## Selected organism(s)
=pod

=item B<-org organism>

ID of the organism to be downloaded. This ID is generally the full
name of the organism, where spaces and weird characters have been
replaced by an underscore (_).

Ex: download-organism -v 1 -org Escherichia_coli_K12

The option -org can be used iteratively on the command line to specify
multiple organisms.

 download-organism -v 1 -org Escherichia_coli_K12 \
    -org Mycoplasma_genitalium \
    -org Saccharomyces_cerevisiae

=cut
        } elsif ($arg eq "-org") {
            push @organisms, $get_value->($arg);

            ## File containing a list of organisms
=pod

=item B<-org_list organism_list_file>

This option gives the possibility to specify the organisms to
download in a file rather than on the command line.

File format: each row should contain the identifier of one organism
(only the first word of each row is considered). Lines starting with a
semicolon (;), a hash (#) or a double dash (--) are ignored.

=cut
        } elsif ($arg eq "-org_list") {
            $main::orglist_file = $get_value->($arg);

            ## Selected taxon (or taxa)
=pod

=item B<-taxon taxon>

Name of a taxon for which all organisms will be downloaded.

The option -taxon can be used iteratively on the command line to
specify multiple taxa.

 download-organism -v 1 -taxon Enterobacteriales \
    -taxon Saccharomycetales

=cut
        } elsif ($arg eq "-taxon") {
            push @taxa, $get_value->($arg);

            ## Cross-species blast files
=pod

=item B<-no_blast> |  B<-blast>

Options to specify whether the cross-species blast files should be
downloaded or not together with the genome.

Default: -no_blast

These files are used by several comparative genomics programs
(I<get-orthologs>, I<footprint-discovery>, I<footprint-scan>...). With
the increase of the number of sequenced genomes, the blast-hit folder
occupies a large disk space, and should be downloaded only if needed.

=cut
        } elsif ($arg eq "-no_blast") {
            $main::no_blast = 1;

        } elsif ($arg eq "-blast") {
            $main::no_blast = 0;

            ## Output directory
=pod

=item	B<-dir output directory>


Directory for exporting the genome(s) of the selected organism(s).

By default, genomes are exported in the standard RSAT genome folder
($RSAT/public_html/data/genomes/). This requires write permissions on
this folder. If you don't have those writing permissions, specifying
an alternative directory may be convenient for obtaining the data
files, but the downloaded genomes will not be supported on the local
machine unless the configuration file is adapted (and this anyways
requires the writing permission in the RSAT directory).

=cut
        } elsif ($arg eq "-dir") {
            $main::local_genomes_dir = $get_value->($arg);

            ## Output file
=pod

=item	B<-o outputfile>

The output file contains timing and information about downloaded
organisms (installation directories, ...).

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
        } elsif ($arg eq "-o") {
            $main::outfile{output} = $get_value->($arg);

            ## Alternative server
=pod

=item B<-server server/path>

Specify an alternative server.

Default: http://rsat-tagc.univ-mrs.fr/rsat/

=cut
        } elsif ($arg eq "-server") {
            $RSAT_SERVER = $get_value->($arg);

            ## Transfer with rsync
=pod

=item B<-rsync>

This option requires a ssh login on the main RSAT server, it is
reserved for the RSAT maintenance team.

When this option is activated, genomes are downloaded with the program
I<rsync> instead of I<wget>. This ensures a faster transfer (using the
powerful features of rsync such as on-the-fly compression, smart
updating, etc).

=cut
        } elsif ($arg eq "-rsync") {
            $main::get_command = "rsync";

            ## ssh login (implies a transfer with rsync)
=pod

=item B<-ssh your.login@some.rsat.server>

=item B<-ssh your.login@some.rsat.server:rsat_path_on_server>

This option requires a ssh login on the main RSAT server, it is
reserved for the RSAT maintenance team.

When this option is activated, genomes are downloaded with the program
I<rsync> instead of I<wget>. This ensures a faster transfer (using the
powerful features of rsync such as on-the-fly compression, smart
updating, etc).

The rsync connection however requires a user account enabled with ssh
access on the server, so it is reserved to the members of the RSAT
team.

The RSAT path on the remote server can be specified in the
argument. If not, the program will attempt to obtain the RSAT
environment variable by ssh connection to the server, but this might
fail (e.g. if the server issues some message at login).

Examples:

  download-organism -v 1 -org Saccharomyces_cerevisiae \
    -ssh your.login@your.rsat.server.org

  download-organism -v 1 -org Saccharomyces_cerevisiae \
    -ssh your.login@your.rsat.server.org:rsat_path_on_server

=cut
        } elsif ($arg eq "-ssh") {
            $main::get_command = "rsync";
            $main::ssh_login = $get_value->($arg);

            ## bashrc file to source on the server
=pod

=item B<-bashrc bashrc_file>

This option is only used with the option -ssh.

Specify the path of the RSAT bashrc file.  The ssh synchronization
mode requires for the client to load the RSAT_config.bashrc file.
Depending on server configuration, this file might not be loaded in
non-interactive secure shell mode. The file specified with the option
-bashrc will be sourced as the first instruction of the remote ssh
commands.

=cut
        } elsif ($arg eq "-bashrc") {
            $main::ssh_bashrc = $get_value->($arg);

            ## Upload rather than download
=pod

=item B<-upload>

Upload the organisms from the local machine to the server, instead of
downloading them from the server.

This option is only possible with the ssh option.

=cut
        } elsif ($arg eq "-upload") {
            $main::upload = 1;

            ## Skip the first organisms
=pod

=item B<-skip N>

Skip the N first organisms of the list. This option is useful for
resuming an interrupted download.

=cut
        } elsif ($arg eq "-skip") {
            $main::skip_org = $get_value->($arg);
            unless (&IsNatural($main::skip_org)) {
                &FatalError(join("\t", $main::skip_org, "Invalid value for option -skip. Must be a natural number."));
            }

            ## Stop after a given number of organisms
=pod

=item B<-last N>

Stop after the N first organisms of the list. This option is useful
for testing the options with taxon-wise queries.

=cut
        } elsif ($arg eq "-last") {
            $main::last_org = $get_value->($arg);
            unless (&IsNatural($main::last_org)) {
                &FatalError(join("\t", $main::last_org, "Invalid value for option -last. Must be a natural number."));
            }

            ## Skip the configuration task
=pod

=item B<-no_config>

Download (or upload) the files but do not run the configuration task
(install-organism -task config,phylogeny).

=cut
        } elsif ($arg eq "-no_config") {
            $main::no_config = 1;

            ## Dry run
=pod

=item B<-dry>

Dry run: print the commands but do not execute them (just for
testing).

=cut
        } elsif ($arg eq "-dry") {
            $main::dry = 1;

        } else {
            &FatalError(join("\t", "Invalid option", $arg));

        }
    }

=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
  ## Report the command line, the program version and the input/output
  ## files (when any were specified) on the output stream.
  my $out = $main::out;

  print $out "; download-organism ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;

  ## Each section is printed only if the corresponding hash is not empty
  my @sections = (
    ["; Input files\n", \%main::infile],
    ["; Output files\n", \%main::outfile],
  );
  foreach my $section (@sections) {
    my ($header, $files) = @{$section};
    next unless (%{$files});
    print $out $header;
    while (my ($file_type, $file_name) = each %{$files}) {
      printf $out ";\t%-13s\t%s\n", $file_type, $file_name;
    }
  }
}


__END__
