#!/usr/bin/env perl
############################################################
#
# $Id: install-organism,v 1.129 2013/10/13 12:24:00 jvanheld Exp $
#
# Time-stamp: <2017-03-17 10:53:06 eda>
#
############################################################

## TO DEBUG: with option -org_list all organisms get the same name
## (the first one of the list). JvH 2016-02-17.

# use strict;
BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";
require "RSAT_to_ensembl.lib.pl";
use RSAT::util;
use RSAT::Tree;
use RSAT::TreeNode;
use RSAT::organism;


if ($0 =~ /([^(\/)]+)$/) {
  push (@INC, "$`lib/");
}

## Main package
package main;
{

  ################################################################
  ## Initialize parameters
  local $start_time = &RSAT::util::StartScript();
  $org_table = $ENV{RSAT}."/public_html/data/supported_organisms.tab";
  @masking_modes = ("");	## by default, use no masking
  @oligo_lengths=(6,1,2,3,4,5,7,8);
  $verbose = 0;

  $seq_format = "fasta"; ## default format for exporting sequences
  $img_format = $ENV{rsat_img_format} || "png";
  $noov="";
  $strands="-1str";
  $purged_frequencies = 0; #### temporarily inactivated because for human genome it suppresses almost all sequences !!!!
  $supported_for_installation{"fasta"} = 1;
  $supported_for_installation{"raw"} = 1;
  $supported_for_installation{"filelist"} = 1;
  @default_tasks = qw(
			config
			phylogeny
			start_stop
			allup
			seq_len_distrib
			genome_segments
			upstream_freq
			genome_freq
			protein_freq
			protein_len
			oligos
			dyads
		       );

  @extra_tasks = qw(
                    fasta_genome
                    fasta_genome_rm
                    chrom_sizes
                    index_bedtools
                    uninstall
                    erase
		    distrib
		    ncf
		    intergenic_freq
		    ensembl_freq
                    clean
                    default
            rewrite
		    all
      );

  ## Tasks to handle download and parsing of genomes from NCBI ftp/rsync site.
  @species_tasks = qw(available
                      download
                      list
                      parse);

  @organism_tasks = (@default_tasks, @extra_tasks);
  @supported_tasks = (@species_tasks, @organism_tasks);
  %supported_task = ();
  foreach my $task (@supported_tasks) {
    $supported_task{$task} = 1;
  }
  foreach my $task (@species_tasks) {
    $species_task{$task} = 1;
  }
  foreach my $task (@organism_tasks) {
    $organism_task{$task} = 1;
  }
  $supported_tasks = join ",", @supported_tasks;
  %task = ();
  $parse_options = "";

  $null = "<NA>";		## NULL value
  $source = $null;
  $taxonomy = $null;
  $taxid = $null;
  @taxa = ();
  our %infile = ();

  ## File containing a list of organisms to install
  $infile{organisms} = "";

  ## File containing a list of species to install from NCBI/Refseq FTP site
  $infile{species} = "";

  ## List of organisms
  our @species = (); # for download and install
  our @organisms = (); # organisms are concatenation of species + strain

  ## Strain
  our $strain = "";
  our $max_strains = ""; # max number of strains per species for parsing task

  ## Group
  our $group = "";

  ## options for testing and debugging
  $skip = 0;  ## Skip the top organisms from the lsit
  $last = 0;  ## Stop after a given number of organisms from the list

  our $backup_org_table = 0;

  ## installation date
  local $install_date = $force_date || &AlphaDate();
  chomp $install_date;

  ## Store the starting directory
  $dir{main} = $ENV{PWD};

  ## Directory to download files from NCBI
  $dir{downloads} = $ENV{DOWNLOAD_DIR} || $ENV{RSAT}."/downloads/";
#  $dir{refseq} = $ENV{REFSEQ_DIR} || $ENV{RSAT}."/downloads/refseq";

  ## Options for the job scheduler
  $dry = 0;
  $die_on_error = 0;
  $batch = 0;
  $job_prefix = "install_org";
  $log_handle = "";
  $err_handle = "";
  $cluster_queue = $ENV{CLUSTER_QUEUE};
  $download_date="";
      
  ################################################################
  ## Read arguments
  &ReadArguments();

  ################################################################
  ## Check parameter values

  @task = keys %task;
  if ($#task == -1) {
    &RSAT::error::FatalError("You should specify at least one task.\nSupported tasks\n\t$supported_tasks\n");
  }

  ## Run all default tasks
  if ($task{default}) {
    foreach my $task (@default_tasks) {
      $task{$task} = 1;
    }
  }

  ## Run all the tasks for each organism (not the species-wide tasks
  ## like downloading and parsing).
  if ($task{all}) {
    %task = %organism_task;
  }


  ## Check compatibility between batch and task dependencies
  if ($batch) {
    if ($task{allup} && $task{upstream_freq}) {
      &RSAT::error::FatalError("Tasks allup and upstream_freq cannot be combined in batch mode. First run allup, then upstream_freq.");
    }

    if ($task{config}) {
	&RSAT::error::FatalError("Task config cannot be executed in batch mode. ");
    }
  }

  ## Consistency between tasks
  if ($task{oligos} || $task{dyads}) {
    unless (($task{upstream_freq}) ||
	    ($task{intergenic_freq}) ||
	    ($task{protein_freq}) ||
	    ($task{genome_freq}) ||
	    ($task{ensembl_freq})) {
      &RSAT::error::FatalError("The tasks 'oligos' and 'dyads' require to specify at least one sequence type among the following. ",
			       "\n\tupstream_freq,intergenic_freq,genome_freq,protein_freq,ensembl_freq");
    }
  }
  if (($task{protein_freq}) && !($task{oligos})) {
    &RSAT::error::FatalError("The task 'protein_freq' requires to activate the task 'oligos'");
  }

  ## Count the number of species-wide and organism-specific takss
  our $requested_species_tasks = 0;
  our $requested_organism_tasks = 0;
  &RSAT::message::TimeWarn("Checking tasks", join(";", keys(%task))) if ($main::verbose >= 2);
  foreach my $task (keys(%task)) {
      if ($species_task{$task}) {
	  $requested_species_tasks++;
      }
      if ($organism_task{$task}) {
	  $requested_organism_tasks++;
      }
  }
  &RSAT::message::Info("Species tasks", $requested_species_tasks) if ($main::verbose >= 2);
  &RSAT::message::Info("Organism tasks", $requested_organism_tasks) if ($main::verbose >= 2);
  

  ################################################################
  ## If a species file has been specified, read species list
  if ($infile{species}) {
    &RSAT::message::TimeWarn("Reading species file", $infile{species}) if ($main::verbose >= 1);
    my ($speciess) = &OpenInputFile($infile{species});
    while (<$speciess>) {
      next unless (/\S/); ## Skip empty rows
      next if (/^;/); ## Skip comment rows
      next if (/^#/); ## Skip header rows
      chomp();
      my ($species) = split /\s+/;
      push @species, $species;
    }
    &RSAT::message::Info(scalar(@species), "species read from file", $infile{species}) if ($main::verbose >= 1);
  }

  ################################################################
  ## Check consistency of arguments for download/list/parse tasks.
  if ($task{parse} || $task{download} || $task{list} || ($task{config} && ($group))) {
    #$source = "NCBI-Refseq";
      $source = "NCBI"; ## 2019 this format works better for the newest formating in NCBI

    ## Check the directory containing the Genbank-formatted organisms
    ## (which can be downloaded from NCBI/refseq).
    unless ($dir{refseq}) { # Refseq dir specified on the command line
	$dir{refseq} = $dir{downloads}."/refseq";

    }
    &RSAT::util::CheckOutDir($dir{refseq});

    ## Check that all required parameters are defined

    ## One taxonomic group
    unless ($group) {
      &RSAT::error::FatalError("Tasks download, list and parse require to specify a group (e.g. fungi, bacteria, ...) to which all the species to download should belong.");
    }

    ## At least one species is required
    unless (scalar(@species) >= 1) {
      &RSAT::error::FatalError("Tasks download, list, parse and config require to specify at least one species (e.g. Saccharomyces_cerevisiae, Escherichia_coli). To get a list of available species, use option '-task available'. ");
    }

    ## If strain is specified, check that only one species has been requested
    if ($strain) {
      &RSAT::error::FatalError("Option -strain is only compatible with a single species. ") unless (scalar(@species) == 1);
    }
  }

  ################################################################
  ## Open output stream and print arguments if required
  our $out = &OpenOutputFile($outputfile);
  &Verbose if ($main::verbose >= 1);

  ################################################################
  ## Get the list of available species for the selected group
  if ($task{available}) {
    &RSAT::message::TimeWarn("Getting available species for group", $group) if ($main::verbose >= 2);
    unless ($group) {
      &RSAT::error::FatalError("The task 'available' requires to specify a group (e.g. fungi, bacteria, ...).");
    }
    
    
    my $command = 'rsync --copy-links  --times  --list-only rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/'.$group.'/ | awk \'$1 ~ "drw"  && $5 ~ "^[A-Z]"  {print $5}\'';
    if ($main::verbose >= 3) {
      print "\nRunning command\n", $command, "\n";
    }
    my $available_species = `$command`;
    print $main::out $available_species;
#      &doit($command, $dry, $die_on_error, $main::verbose);
    
    my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
    print $main::out $exec_time if ($main::verbose >= 1);
    close $main::out if ($outputfile);
    exit(0); # Available species is not compatible with any other task
  }

    
    ############ Rewrite table
    if($task{rewrite}){
        &RewriteConfigTab();
        exit;
    }
    
  ################################################################
  ## Download the required files from NCBI FTP site
  if ($task{download}) {

    ## Download each species
    my $species_nb = 0;
    foreach my $species (@species) {
	$species_nb++;

	if (($last_species > 0) && ($species_nb > $last_species)) {
	    &RSAT::message::Warning("Stopped after", $last_species, "species") if ($main::verbose >= 1);
	    last;
	}
	if (($skip_species > 0) && ($species_nb <= $skip_species)) {
	    &RSAT::message::Warning("Skipping", $skip_species, "species") if (($species_nb == 1) && ($main::verbose >= 1));
	    next;
	}


	## Dowload the assembly file for the group

	my $assembly_summary_url=  join("/", "rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq", $group, "assembly_summary.txt");
	&RSAT::util::CheckOutDir($dir{downloads});
	my $command = "rsync --copy-links --recursive --times -R --verbose ".$assembly_summary_url." ".$dir{downloads};
	&doit($command, $dry, $die_on_error, $main::verbose);
	&RSAT::message::TimeWarn("Downloaded group assembly summary file", $dir{downloads}) if ($main::verbose >= 3);

	## Select the information for the species and assembly, retrieve the column with the FTP address for the genome
	my $space_species=$species ;
	$space_species=~s/_/ /g;

	my $command = join (" ","grep", '"'.$space_species.'"',$dir{downloads}."/refseq/".$group."/assembly_summary.txt | grep ". '"'.$strain.'"'. "| cut -f20 | head -n1" ); ## Some organisms already have complete versions from different places

	my $URL= `$command`;
	chomp($URL);

	$URL=~s/ftp:\/\///;

	chomp($URL);
	&RSAT::message::TimeWarn("Downloading genome from NCBI FTP site", $URL) if ($main::verbose >= 1);

	
	$dir{downloads} = $ENV{RSAT}."/downloads/refseq/".$strain;
	
	&RSAT::util::CheckOutDir($dir{downloads});
	$RSYNC_URL = $URL;
	$RSYNC_URL =~ s/https/rsync/;
	#my $command = "rsync --copy-links --recursive --times -R --verbose ".${RSYNC_URL}."/ ".$dir{downloads}."/";
	my $command = "rsync --copy-links --times --recursive --verbose ".$RSYNC_URL."/* ".$dir{downloads}."/";

	# print $command ; 	die "BOOM";

	&doit($command, $dry, $die_on_error, $main::verbose);


	&RSAT::message::TimeWarn("Downloaded genome in dir", $dir{downloads}) if ($main::verbose >= 3);
	$download_date =  &AlphaDate();

       
    }
  }

  ## For parsing task, build organism names from species names +
  ## strains (as sub-directories of the NCBI/Refseq
  ## latest_assembly_versions)
#  if (($task{parse}) || ($task{list})) {
  if ($task{parse} || $task{list} || ($task{config} && ($group))) {
   
	 
      ## Check refseq directory
      unless (-d $dir{refseq}) {
	  &RSAT::error::FatalError("Refseq directory ".$dir{refseq}." does not exists");
      }

      ## Header for species list
      if ($task{list}) {
	  print $main::out "#", join("\t", "organism_name", "org_nb",  "species",  "species_nb", "strain", "strain_nb"), "\n";
      }
      
      # Iterate over species
      my $org_nb = 0; # Initialize organism counter
      my $species_nb = 0; # Initialize species counter
      foreach my $species (@species) {
	  $species_nb++;
	  if (($last_species > 0) && ($species_nb > $last_species)) {
	      &RSAT::message::Warning("Stopped after", $last_species, "species") if ($main::verbose >= 1);
	      last;
	  }

	  my $latest_assembly_dir = $dir{refseq};
	#  my $latest_assembly_dir = $dir{refseq}."/".$group."/".$species."/all_assembly_versions/suppressed";


	  my @strain_folders;
	  if ($strain) {
	      @strain_folders = glob($latest_assembly_dir."/".$strain);
#	      die "HELLO\t", $strain, "\t", join("\n\t", @strain_folders);
	  } else {
	      @strain_folders = glob($latest_assembly_dir."/*");
	  }

	  my $strains_for_species = scalar(@strain_folders);

	  # Iterate over each strain of current species
	  my $strain_nb = 0; # Strain counter for current species
	  foreach my $strain_folder (@strain_folders) {
	      
	      my $current_strain = &ShortFileName($strain_folder);

	      ## User-selected strain
	      if ($strain) {
#		  &RSAT::message::Debug($strain, $current_strain) if ($main::verbose >= 10);
		  next unless ($strain eq $current_strain);
	      }

	      ## Strain
	      $strain_nb ++;

	      ## Restrinct the number of strains if requested
	      if (($max_strains > 0) && ($strain_nb > $max_strains)) {
		  my $remaining_strains = scalar(@strain_folders) - $strain_nb + 1;
		  &RSAT::message::Info("Species", $species, "ignoring ", $remaining_strains, "strains of", $strains_for_species) if ($main::verbose >= 2);
		  last;
	      }

	      ## Update organism counter
	      $org_nb++;

	      ## Skip first N species if requested
	      if (($skip_species > 0) && ($species_nb <= $skip_species)) {
		  &RSAT::message::Warning("Skipping", $skip_species, "species") if (($species_nb == 1) && ($main::verbose >= 1));
		  next;
	      }

	      ## Build organism name (concatenation of species + strain)
	      my $organism_name = $species."_".$current_strain;

	      ## Add organism to the list
	      push @organisms, $organism_name;
	      &RSAT::message::Info(scalar(@organisms), $species, $current_strain, $organism_name) if ($main::verbose >= 3);

	      ## Parse the selected organism if requested
	      $genome_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_name."/genome";

	      if ($task{list}) {
		  print $out join("\t", $organism_name, $org_nb, $species, $species_nb, $current_strain, $strain_nb), "\n";
	      }

	      if ($task{parse}) {
		 
		  &ParseGenome($species, $current_strain, $organism_name, $strain_folder, $genome_dir, $org_nb);
	      }
	  }
      }
  }

#### option  -all_organisms is not supported anymore
#   if ($main::all_organisms) {
#     @organisms_dirs = glob($dir{refseq}."/*_*");
# #    @organisms_dirs = &glob($dir{genbank}."/*_*");
#     foreach my $dir (@organisms_dirs) {
#       my $org = &RSAT::util::ShortFileName($dir);
#       next if ($org =~ /^readme/i); ## Skip readme file
#
#       ## The NCBI dir Bacteria contains some folders starting with
#       ## "_", I guess these genomes are problematic.
#       next if ($org =~ /^_/);  ## Skip directories that do not seem to correspond to gene_species
#       next if ($org =~ /uncultured/);  ## Skip uncultured genomes
#
#       ## The NCBI dir Bacteria contains some gzip archives. 
#       next if ($org =~ /\.gz$/); 
#
#       ## The Bacteria dir contains some folders with fuzzy names,
#       ## starting with a lowercase. I prefer to ignore these genomes
#       ## (e.g. secondary_endosymbiont_of_Ctenarytaina_eucalypti_uid172737)
#       next unless ($org =~ /^[A-Z]/); 
#       push @organisms, $org;
#     }
#     &RSAT::message::Info("Installing",scalar(@organisms),
# 			 "organisms found in the Refseq directory", 
# 			 $dir{refseq}) if ($main::verbose >= 1);
#     if ($main::verbose >= 3) {
#       for my $i (1..scalar(@organisms)) {
# 	&RSAT::message::Info("", $i, $organisms[$i-1]);
#       }
#     }
#   }

  ## If an organism file has been specified, read organism list
  if ($infile{organisms}) {
    my ($orgs) = &OpenInputFile($infile{organisms});
    while (<$orgs>) {
      next unless (/\S/); ## Skip empty rows
      next if (/^;/); ## Skip comment rows
      next if (/^#/); ## Skip header rows
      chomp();
      my ($org) = split /\s+/;
      push @organisms, $org;
    }
  }


  ## If a taxon has been specified, collect the list of organisms belonging to this taxon in the RSAT-supported organisms
  if (scalar(@taxa) > 0) {
    if ($task{parse}) {
      &RSAT::message::Warning("With the option -taxon, the parse option will only re-parse previously installed organisms from the selected taxon");
    }

    $tree = new RSAT::Tree();
    $tree->LoadSupportedTaxonomy("Organisms", \%supported_organism);

    foreach my $taxon (@taxa) {
      my $taxon_node = $tree->get_node_by_id($taxon);
      my @taxon_organisms = $taxon_node->get_all_descendents("DFS","leaf",undef,undef);
      &RSAT::message::TimeWarn("Collected", scalar(@taxon_organisms), "organisms for taxon", $taxon, $taxon_node) if ($main::verbose >= 1);
      foreach my $org_node (@taxon_organisms) {
	my $short_organism_name = $org_node->getid();
	push @organisms, $short_organism_name;
      }
    }

    @organisms = sort (@organisms);
  }

  &RSAT::message::Info("organisms", scalar(@organisms)) if ($main::verbose >= 2);

  ## Check that at least one organism has been specified
  &RSAT::error::FatalError("At least one organism should be specified (options -org, -taxon or -all_organisms)") 
      unless ((scalar(@organisms) > 0) || ($task{parse}));

  ## Some options are incompatible with multiple organisms
  if (scalar(@organisms) > 1) {
    &RSAT::message::FatalError("The option -organism (organism full name) is not valid when multiple organisms are selected.") if ($user_specified_orgname);
    &RSAT::message::FatalError("The option -dir is (installation directory) not valid when multiple organisms are selected.") if ($install_dir);
    &RSAT::message::FatalError("The option -syn (synonym table) is not valid when multiple organisms are selected.") if ($outfile{synonyms});
  }

  ################################################################
  ## Check if any task has to be done on organisms (and not just to
  ## NCBI species).
  if ($requested_organism_tasks == 0) {
      &RSAT::message::Warning("No organism-specific task has been requested. ") if ($main::verbose >= 1);
      exit(0);
  }

  ################################################################
  ## Iterate installation over selected organisms
  $i = 0;
  foreach our $organism_short_name (@organisms) {
    $i++;
    if (($skip > 0) && ($i <= $skip)) {
      &RSAT::message::TimeWarn("Skipping organism", $i."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1);
      next;
    }
    if (($last > 0) && ($i > $last)) {
      &RSAT::message::TimeWarn("Stopping after organism", $i."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1);
      last;
    }
    &RSAT::message::TimeWarn("Installing organism", $i."/".scalar(@organisms), $organism_short_name) if ($main::verbose >= 1);

    $backup_org_table = 0 if ($i > 1); ## When we treat multiple organisms, we don't want to store a backup for each one
    
    unless (($task{erase}) || ($task{uninstall})) {

      ## Create an object to hold the organism attribtues
      $organism = new RSAT::organism();
#    $organism->check_name($organism_short_name, 1);
      $organism->force_attribute("ID", $organism_short_name);
      $organism->set_attribute("feature_types", '*');
      
      ################################################################
      ## If the organism was previously supported, ensure that the
      ## serialized files are suppressed.
      &RSAT::message::Info("Cleaning previous serialized files") if ($main::verbose >= 3);
      $organism->delete_serial_files();

      ################################################################
      ## Automatic full name specification
      my $organism_full_name = "";
      unless ($organism_full_name{organism_short_name}) {
	if ($user_specified_orgname) {
	  ## Full name specified with option -organism on the command line
	  $organism_full_name = $user_specified_orgname;
	} else {
	  ## Build organism full name from the organism short name (replace _ by spaces)
	  $organism_full_name = $organism_short_name;
      if(($source eq 'ensemblgenomes') || ($source eq 'ensembl')){
          $organism_full_name = (defined $species[0] && !($species[0] eq '')) ? ucfirst($species[0]) : $organism_short_name;
          @names = split($organism_full_name."_", $organism_short_name);
          $organism->set_attribute("genome_assembly", $names[1]);
      }
	  $organism_full_name =~ s/\_/ /g;
	}
      }
#      $organism->set_attribute("name", $organism_short_name);
      $organism->set_attribute("name", $organism_full_name);
      
      
      
	################################################################
	## Installation directories and files
	umask 0002;

      ## TO DO (JvH, 2017-12-12): below and elsewhere in the script,
      ## replace the global hash tables by attributes set on the
      ## organism object;
      
	if ($install_dir) {
	    $dir{install} = $install_dir;
	    &RSAT::message::Info("Installing genome in directory specified on the command line", $dir{install}) if ($main::verbose >= 3);
	} elsif ($supported_organism{$organism_short_name}->{'data'}) {
	    $dir{install} = $supported_organism{$organism_short_name}->{'data'};
	    &RSAT::message::Info("Installing genome in directory previously specified in the config file", $dir{install}) if ($main::verbose >= 3);
	} else {
	    $dir{install} = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name;
	    &RSAT::message::Info("Installing genome in default directory", $dir{install}) if ($main::verbose >= 3);
	}
     

	$dir{genome} = $dir{install}."/genome";
	$dir{oligos} = $dir{install}."/oligo-frequencies";
	$outfile{features} = $dir{genome}."/feature.tab";
	$outfile{synonyms} = $dir{genome}."/feature_names.tab";
	$outfile{genome} = $dir{genome}."/contigs.txt";
	$outfile{fasta_genome} = $dir{genome}."/".$organism_short_name.".dna.genome.fa";
	$outfile{chrom_sizes} = $dir{genome}."/".$organism_short_name."_chrom_sizes.txt";
	$outfile{fasta_genome_rm} = $dir{genome}."/".$organism_short_name.".dna_rm.genome.fa";

	## Create output directoryies if required
	unless ($dry) {
	    &RSAT::util::CheckOutDir($dir{install});
	    &RSAT::util::CheckOutDir($dir{genome});
	    &RSAT::util::CheckOutDir($dir{oligos});
	}
    }
    
    ################################################################
    ###################### Installation tasks ######################
    ################################################################

    ## Open an output stream for messages
    &OrgParameters() if ((scalar(@organisms) ==1) || ($main::verbose >= 3));
    
    &Erase() if ($task{erase});

    &Uninstall() if ($task{uninstall});
    
    unless (($task{uninstall}) || ($task{erase})) {
	
	if ($task{config}) {
	    $config_ok = &UpdateConfig();
	    next unless ($config_ok);
	}
      
      &CreatePhylogeny() if ($task{phylogeny});
      
      &StartAndStopCodons() if ($task{start_stop});
      
      &IntergenicSegments() if ($task{genome_segments});
      
      &AllUpstream() if ($task{allup});

      &FastaGenomeAndBedtoolsIndex() if ($task{fasta_genome} || $task{chrom_sizes} || $task{index_bedtools});

      &CalcFrequencies() if (($task{oligos}) || ($task{distrib}) || ($task{dyads}));
      
      # New name for peptidic sequence file (Sept 2015). I suppress
      # organism name from sequence file name.
      my $peptidic_seq_file = "peptidic_sequences.fasta";
      unless (-e $dir{genome}."/".$peptidic_seq_file) {
	## Previous name for the peptidic sequence file
	$peptidic_seq_file = ${organism_short_name}."_aa.fasta";
      }

      if ($task{protein_len}) {
	  my $command = &SeqLengthDistributionCmd($peptidic_seq_file, "fasta", "protein", 50, 2000);
	  &doit($command, $dry, $die_on_error, $main::verbose);
      }



      &CleanUp() if ($task{clean});
      
      ## Touch the installation dir to indicate the last modification date
      $dir{main};
      chdir($dir{main});
      &RSAT::message::Info("Working dir",  $ENV{PWD}) if ($main::verbose >= 4);
      system "touch $dir{install}";
    }


  }

  my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
  print $main::out $exec_time if ($main::verbose >= 1);
  close $out if ($outputfile);
  exit(0);
}


################################################################
##################  SUBROUTINE DEFINITIONS  ####################
################################################################


################################################################
## Display verbosity message
sub Verbose {
  print $out "; install-organism ";
  &PrintArguments($out);

  print $out ";\n; Tasks:\n";
  foreach my $task (sort(keys(%task))) {
    print $out ";\t$task\n";
  }
  
  print $out ";\n; Supported organism table:\n";
  printf $out ";    %-25s\t%s\n", "RSAT config", $main::org_table;
  printf $out ";    %-25s\t%s\n", "\$RSA_LOCAL_CONFIG", $ENV{'RSA_LOCAL_CONFIG'} if ($ENV{'RSA_LOCAL_CONFIG'});
  
}


################################################################
## Print organism parameters
sub OrgParameters {
  print $out ";\n; Organism parameters:\n";
  if ($infile{organisms}) {
    printf $out ";    %-25s\t%s\n", "organism file", $infile{organisms};
  } else {
    printf $out ";    %-25s\t%s\n", "ID", $organism_short_name;
    printf $out ";    %-25s\t%s\n", "Name", $organism_full_name; ## THIS IS WRONG (JvH, 2017-12-12)
    printf $out ";    %-25s\t%s\n", "Update date", $install_date;
    printf $out ";    %-25s\t%s\n", "Data source", $source;
  }
  
  print $out ";\n; Directories and files:\n";
  if ($task{genome}) {
    printf $out ";    %-25s\t%s\n", "genome sequence format", $seq_format;
    printf $out ";    %-25s\t%s\n", "genome sequence file", $infile{genome};
  }
  printf $out ";    %-25s\t%s\n", "feature table", $infile{features} if ($infile{features} );;
  printf $out ";    %-25s\t%s\n", "feature file", $infile{ptt} if ($infile{ptt} );
  printf $out ";    %-25s\t%s\n", "synonyms", $infile{synonyms} if ($infile{synonyms});
  printf $out ";    %-25s\t%s\n", "Refseq dir", $dir{refseq};
  printf $out ";    %-25s\t%s\n", "Installation directory", $dir{install};
  printf $out ";    %-25s\t%s\n", "Genome directory", $dir{genome};
    printf $out ";    %-25s\t%s\n", "genome sequence file", $outfile{genome};
  printf $out ";    %-25s\t%s\n", "feature table", $outfile{features};
  printf $out ";    %-25s\t%s\n", "synonyms", $outfile{synonyms};
#    printf $out ";    %-25s\t%s\n", "file to update", $config_to_update;
}


################################################################
## Uninstall an organism by deleting its row from the configuration
## file. This does not clean the organism directory on the hard drive
## (which requires the task "erase").
sub Uninstall {
    foreach our $organism_short_name (@organisms) {
	&RSAT::OrganismManager::check_name($organism_short_name, 1);
	if (defined($supported_organism{$organism_short_name})) {
	    &RSAT::message::Warning("Uninstalling organism", $organism_short_name) if ($main::verbose >= 1);
	    delete($supported_organism{$organism_short_name});
	}
    }
    &RSAT::OrganismManager::export_supported_organisms(file=>$main::org_table, backup=>$main::backup_org_table);
}

################################################################
## Erase an organism by deleting its whole folder in
## RSAT/public_html/data/genomes directory.
sub Erase {
  foreach our $organism_short_name (@organisms) {
    &RSAT::message::Warning("Erasing organism", $organism_short_name) if ($main::verbose >= 1);
      
    ## Identify the data directory for the organism
    my $organism_data_dir = "";
    if ((defined($supported_organism{$organism_short_name})) &&
	(defined($main::supported_organism{$organism_short_name}->{'data'}))) {
      $organism_data_dir = $main::supported_organism{$organism_short_name}->{'data'};
    } else {
      $organism_data_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name;
    }
    
    ## Remove the directory
    if (-e $organism_data_dir) {
      &RSAT::message::Warning("Erasing directory", $organism_data_dir) if ($main::verbose >= 2);
      &doit("rm -rf ".$organism_data_dir, $dry, $die_on_error, $main::verbose);
    } else {
      &RSAT::message::Warning("No need to erase unexisting directory", $organism_data_dir);
    }

  }
}

################################################################
## Update configuration file
sub UpdateConfig {
  ## Read taxonomy from the parsing result
  my $organism_table = $dir{install}."/genome/organism.tab";
  my $id = $null;
  $taxid = $null;
  $taxonomy = $null;

  unless (-e $organism_table) {
    if ($main::die_on_error) {
      &RSAT::error::FatalError("Missing organism table", $organism_table);
    } else {
      &RSAT::message::Warning("Missing organism table", $organism_table);
      return(0);
    }
  }

  ($org_handle) = &OpenInputFile($organism_table);
  my $taxid_field = 1; ## Default field for the taxonomic ID
  my $taxonomy_field = 2; ## Default field for the taxonomy
  while (<$org_handle>) {
    chomp();
    next unless (/\S/);
    if (/^-- field\s+(\d)	taxonomy/) {
      $taxonomy_field = $1;
    } elsif (/^-- field\s+(\d)	taxid/) { 
      $taxid_field = $1;
    } elsif (/^--/ || /^;/ || /^#/) { #; are required to parse organism.tab created by parse_gtf
      next;
    } else {
      @fields = split "\t";
      $id = $fields[0];

      ## Get taxonomy
      $taxonomy = $fields[$taxonomy_field - 1];

      ## Check if taxid is conform (must be a number)
      $taxid = $fields[$taxid_field - 1];
      if ($taxid =~ /^\d+$/) {
	    $supported_organism{$organism_short_name}->{'taxid'} = $taxid;
	    &RSAT::message::Info($organism_short_name, "TAXID", $taxid) if ($main::verbose >= 3);
      } else {
	    &RSAT::message::Warning($taxid, "Invalid taxid in organism description table",  $organism_table);
	    $taxid = $null;
      }

      &RSAT::message::Info ("Parsed taxonomy from organism.tab", 
			    "id", $id, 
			    "taxid", $taxid, 
			    "taxonomy", $taxonomy) if ($main::verbose >= 3);
    }
  }
  close $org_handle;

  ## Default limits of upstream region for retrieve-seq
  unless (defined($up_from)) {
    ## If already defined in the file supported-organism.tab, keep
    ## previously defined value (might have been specifically tuned
    ## for some reason).
    if (defined($supported_organism{$organism_short_name}->{'up_from'})) {
      $up_from = $supported_organism{$organism_short_name}->{'up_from'};;
    } else {
      ## Taxon-specific default values for sequence lengths
      if (
	  ($taxonomy =~ /^Bacteria;/) || ($taxonomy =~ /^Archaea;/) ||
	  ($taxonomy =~ /;\s*Bacteria;/) || ($taxonomy =~ /;\s*Archaea;/)
	 ) {
	$up_from=-400;
      } elsif (($taxonomy =~ /^Viruses;/) ||
	       ($taxonomy =~ /;\s*Viruses;/)
	      ) {
	$up_from=-400;
      } elsif ($taxonomy =~ /;\s*Fungi;/) {
	$up_from=-800;
      } elsif ($taxonomy =~ /;\s*Nematoda;/) {
	$up_from=-3000;
      } elsif ($taxonomy =~ /;\s*Insecta;/) {
	$up_from=-3000;
      } elsif ($taxonomy =~ /;\s*Metazoa/) {
	$up_from=-3000;
      } elsif ($taxonomy =~ /;\s*Viridiplantae/) {
	$up_from=-3000;
      } else {
	$up_from=-1000;
      }
    }
  }
  unless (defined($up_to)) {
    if (defined($supported_organism{$organism_short_name}->{'up_to'})) {
      $up_to = $supported_organism{$organism_short_name}->{'up_to'};
    } else {
      $up_to = -1;
    }
  }
  &RSAT::message::Info("Upstream region limits from", $up_from, "to", $up_to) if ($main::verbose >= 3);

  &UpdateConfigTab() unless (($dry) || ($batch));
  return(1);
}

################################################################
## Update the perl config file
sub UpdateConfigPerl {
  my $comment_previous_config = 0;
  if ($local_config) {
    $config_to_update = $ENV{'RSA_LOCAL_CONFIG'} ;
  } else {
    $config_to_update = $ENV{RSAT}."/public_html/data/supported_organisms.pl";
  }

  ## Check if the organism was already installed before
  if (($main::verbose >= 4) && defined($supported_organism{$organism_short_name}->{'genome'})) {
      &RSAT::message::Warning($organism_short_name, "already defined in the config file", $config_to_update) ;
  }
  
  ## read previous config
  open CONFIG, $config_to_update;
  while ($line = <CONFIG>) {
    chomp $line;
    last if ($line =~ /return/);
    if (($line =~ /supported_organism\{\'$organism_short_name\'\}/) && ($line !~ /^\#/)) {
      ## comment the previous config
      if ($comment_previous_config) {
	$previous_config .= "# ${line} # reinstalled on\t${date}\n";
      }
    } else {
	$previous_config .= $line."\n";
      }
  }
  close CONFIG;

  ## Write updated config file
  &RSAT::message::Info ("Updating supported organisms", $config_to_update) if ($main::verbose >= 3);
  open CONFIG, ">$config_to_update" 
    || die "Error: cannot write config file $config_to_update\n";
  print CONFIG $previous_config;
  my $new_org_config = 	 "\n#### $organism_short_name\t$organism_full_name{$organism_short_name}\t$install_date\n";
  $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'name'} = \"$organism_full_name{$organism_short_name}\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'data'} = \"$dir{install}\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'last_update'} = \"".$install_date."\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'source'} = \"$source\";\n";
  ## OLIVIER SAND SHOULD CHECK IF THIS RESTRICTION FOR ensembl IS STILL VALID
  unless ($source =~ 'ensembl') {
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'features'} = \"$outfile{features}\";\n";
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'genome'} = \"$outfile{genome}\";\n";
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'seq_format'} = \"filelist\";\n";
  }
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'taxonomy'} = \"$taxonomy\";\n";
  if (defined($outfile{synonyms})) {
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'synonyms'} = \"$outfile{synonyms}\";\n";
  }
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'up_from'} = ".$up_from.";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'up_to'} = ".$up_to.";\n";

  ## Replace absolute paths by relative paths
  ##   $new_org_config =~ s|$ENV{RSAT}\/|\$ENV\{RSAT\}\/|g;
  $new_org_config =~ s|$ENV{RSAT}|\$ENV\{RSAT\}\/|g;
  $new_org_config =~ s|\/\/|/|g;

  #$new_org_config .= "\$supported_organism{'$organism_short_name'}->{'download_date'} = ".$download_date.";\n";

  
  print CONFIG $new_org_config;
  print CONFIG "\nreturn 1;\n";
  close CONFIG;
}

################################################################
## Update the tab-delimited file  describing the list of organisms
sub UpdateConfigTab {
  my (%args) = @_;
  &RSAT::message::Info ("Updating supported organisms", $main::org_table) if ($main::verbose >= 3);

  ## Update the hash variable for the new organism
  $supported_organism{$organism_short_name}->{'name'} = $args{name} || $organism->get_attribute("name");
  $supported_organism{$organism_short_name}->{'data'} = $args{data} || $dir{install};
  $supported_organism{$organism_short_name}->{'last_update'} = $args{last_update} ||  $install_date;
    $supported_organism{$organism_short_name}->{'genome_assembly'} = $organism->get_attribute("genome_assembly");

  #$supported_organism{$organism_short_name}->{'download_date'} =  $download_date if ($task{download});

#  &RSAT::message::Debug($organism,  $organism_short_name, $organism->get_attribute("name"), $install_date) if ($main::verbose >= 10);

  ## Change source if requested
  #$prev_source = $supported_organism{$organism_short_name}->{'source'} ;
  if (($main::source) && ($main::source ne $null)) {
    $supported_organism{$organism_short_name}->{'source'} = $main::source;
  } elsif (($args{source}) && ($args{source} ne $null)) {
    $supported_organism{$organism_short_name}->{'source'} = $args{source};
  }
 
  ## OLIVIER SAND SHOULD CHECK IF THIS RESTRICTION FOR ensembl IS STILL VALID
#  unless ($source eq 'ensembl') {
    $supported_organism{$organism_short_name}->{'features'} = $args{features} || $main::outfile{features};
    $supported_organism{$organism_short_name}->{'genome'} = $args{genome} || $main::outfile{genome};
    $supported_organism{$organism_short_name}->{'seq_format'} = $args{seq_format} || "filelist";
#  }
  $supported_organism{$organism_short_name}->{'taxonomy'} = $args{taxonomy} || $main::taxonomy;
  $supported_organism{$organism_short_name}->{'taxid'} = $args{taxid} || $main::taxid;
  if (defined($main::outfile{synonyms})) {
    $supported_organism{$organism_short_name}->{'synonyms'} = $args{synonyms} || $main::outfile{synonyms};
  }
  $supported_organism{$organism_short_name}->{'up_to'} = $args{up_to} || $main::up_to;

  if (($main::up_from) && ($main::up_from ne $null)) {
    $supported_organism{$organism_short_name}->{'up_from'} = $main::up_from;
  } elsif (($args{up_from}) && ($args{up_from} ne $null)) {
    $supported_organism{$organism_short_name}->{'up_from'} = $args{up_from};
  }
 
  ## Update blast column in config table if blast information is available

  my $blast_folder=$supported_organism{$organism_short_name}->{'data'}."/blast_hits" ;
  print "genome folder ". $genome_folder."\n";
  if ( -d $blast_folder ){
    $supported_organism{$organism_short_name}->{'blast_available'}= 1;
  } else {
    $supported_organism{$organism_short_name}->{'blast_available'}= 0;
  }
  ## Update variant columns in config table if variant information
  my $variants_folder=$supported_organism{$organism_short_name}->{'data'}."/variations" ;
 
  print "variants folder" . $variants_folder."\n"; 
  if ( -d $variants_folder ){
    $supported_organism{$organism_short_name}->{'variant_available'}= 1;
    my $variant_source = `cat $variants_folder/source.txt`;
    chomp($variant_source);
    $supported_organism{$organism_short_name}->{'variant_source'}= $variants_source || "<NA>";
    $supported_organism{$organism_short_name}->{'path_to_variant_files'}= $variants_folder;
  } else {
    $supported_organism{$organism_short_name}->{'variant_available'}= 0; 
    $supported_organism{$organism_short_name}->{'variant_source'}= "<NA>";
    $supported_organism{$organism_short_name}->{'path_to_variant_files'}= "<NA>";
  }


  &RSAT::message::Debug("&UpdateConfigTab()",
			$supported_organism{$organism_short_name}->{'source'},
			$supported_organism{$organism_short_name}->{'taxid'},
			$supported_organism{$organism_short_name}->{'ID'},
			$supported_organism{$organism_short_name}->{'last_update'},
			$supported_organism{$organism_short_name}->{'up_from'},
			$supported_organism{$organism_short_name}->{'name'},
			$supported_organism{$organism_short_name}->{'variant_available'},
			$supported_organism{$organism_short_name}->{'variant_source'},
			$supported_organism{$organism_short_name}->{'path_to_variant_files'}
      ) if ($main::verbose >= 3);

 

  ## Export the updated table of supported organisms
  &RSAT::OrganismManager::export_supported_organisms(file=>$main::org_table, backup=>$main::backup_org_table);
}

sub RewriteConfigTab{
    my $supported_organism_file = &Get_supported_file();
    my $new_file = '';
    if (-f $supported_organism_file) {
        my ($s_o_file) = &OpenInputFile($supported_organism_file);
        
        ## Read supported_organisms_ensembl.tab
        my $data_dir = &Get_data_dir();
        open ($fen, $data_dir."/supported_organisms_ensembl.tab");
        my %ensembl = ();
        while (my $row = <$fen>){
            chomp($row);
            my @fields = split("\t", $row);
            $ensembl{$fields[0]} = "\t".$fields[2]."\t".$fields[4]."\t".$fields[5];
            #print $fields[0];
        }
        close($fen);
        
        my %blast = ();
        my %variations = ();
        
        while(<$s_o_file>){
            my @fields = split("\t");
            my $key = $fields[0];
            if(-d $data_dir."/genomes/".$key."/blast_hits"){
                $blast{$key} = 1;
            }else{
                $blast{$key} = 0;
            }
            $variations{$key} = ();
            if(-d $data_dir."/genomes/".$key."/variations"){
                $variations{$key}{"variant_available"} = 1;
                my $variant_source = `cat $variants_folder/source.txt`;
                chomp($variant_source);
                $variations{$key}{"source"} = $variant_source || "<NA>";
                $variations{$key}{"path"} = &RSAT::util::hide_RSAT_path($data_dir."/genomes/".$key."/variations");
            }else{
                $variations{$key}{"variant_available"} = 0;
                $variations{$key}{"source"} = "<NA>";
                $variations{$key}{"path"} = "<NA>";
            }
        
        }
        close($s_o_file);
        my ($s_o_file) = &OpenInputFile($supported_organism_file);
        ## Read the whole file of supported organisms from ensembl,
        ## and store species description lines in a hash indexed by
        ## full species ID, in order to sort them after having changed
        ## the current species fields.
        open($fnew, ">", $data_dir."/supported_organisms_new.tab");
        while (my $row = <$s_o_file>) {
            next if (/^;/); ## Skip comment lines
            if($row =~ /^#/){
                my @fields = split("\t");
                print $fnew "#ID\tname\ttaxid\tsource\tlast_update\tnb\tseq_format\tup_from\tup_to\ttaxonomy\tdata\tgenome\tgenome_assembly\tgenome_version\tdownload_date\tvariant_available\tvariant_source\tpath_to_variant_files\tblast_available\n";
            }else{
                chomp($row);
                my @fields = split("\t", $row);
                my $full_species_id = $fields[0];
                #check if the species is already in the list
                my $newline = $row;
                if(defined $ensembl{$full_species_id}){
                    $newline .= $ensembl{$full_species_id};
                }else{
                    $newline .= ("\t<NA>")x3;
                }
                $newline .= "\t".$variations{$full_species_id}{"variant_available"}."\t".$variations{$full_species_id}{"source"}."\t".$variations{$full_species_id}{"path"}."\t".$blast{$full_species_id}."\n";
                print $fnew $newline;
            }
        }
        close $s_o_file;
    	system("mv " . $data_dir . "/supported_organisms_new.tab " . $data_dir . "/supported_organisms.tab");
    }
}

################################################################
## Create a directory for the taxonomic group of the organism
sub CreatePhylogeny {
    my $taxonomy = "";
    &RSAT::message::Warning("Phylogeny indexation has been disabled");
    return;

    ## read taxonomy from the parsing result
    my $organism_table = $dir{install}."/genome/organism.tab";

    ($org_handle) = &OpenInputFile($organism_table);
    while (<$org_handle>) {
        chomp;
        if (/^-- field (\d)	taxonomy/) {
            $taxonomy_field = $1;
        } elsif (/^--/) {
            next;
        } else {
            if (defined($taxonomy_field)) {
                @fields = split "\t";
                $taxonomy = $fields[$taxonomy_field - 1];
                &RSAT::message::Info ("Taxonomyyy\t$taxonomy\n") if ($main::verbose >= 3);
            } else {
                &Warning("Cannot read taxonomy in file $organism_table\n");
            }
        }
    }
    close $org_handle;

    if ($taxonomy) {
	$taxonomy = &trim($taxonomy);
	$taxonomy =~ s|\s*;\s*|/|g; ## Each taxonomic level becomes a subdirectorry
	$taxonomy =~ s|\s+|_|g; ## I prefer to avoid spaces in directory names
	$taxonomy =~ s|\(|_|g; ## this character cannot be used for a directory name
	$taxonomy =~ s|\)|_|g; ## this character cannot be used for a directory name
	$taxonomy =~ s|\,|.|g; ## Not fatal, but usually not found in folder names.
	$taxonomy =~ s|\:|.|g; ## Not fatal, but usually not found in folder names.
	$dir{taxonomy} = $ENV{RSAT}."/public_html/data/phylogeny/".$taxonomy;
	$dir{taxonomy} =~ s|//|/|g;
	my ($org_dir) = &ShortFileName($dir{install});
	&RSAT::util::CheckOutDir($dir{taxonomy}) unless ($dry);
	if ($main::verbose >= 3) {
	    &RSAT::message::Info("Taxonomy directory", $dir{taxonomy});
	    &RSAT::message::Info("Organism directory", $org_dir);
	    &RSAT::message::Info("Link to directory", $dir{install});
	}
	&doit("cd $dir{taxonomy}; rm $org_dir; ln -s $dir{install} .",0,0,$main::verbose);
    } else {
	&RSAT::message::Warning("Cannot identify taxonomy in table ".$organism_table);
    }

}

################################################################
### extract the non-redudant set of intergenic and gene sequences
sub IntergenicSegments {
  &RSAT::message::TimeWarn("&IntergenicSegments()") if ($main::verbose >= 3);

  chdir $dir{genome};

  ## Retrieve sequences for different subtypes of genome regions
  &RSAT::message::TimeWarn("coding-or-not: extracting sequences for different types of genomic regions") if ($main::verbose >= 2);
  my $command = "coding-or-not ";
  $command .= "-v 1 " if ($main::verbose >= 1);
  $command .= "-org $organism_short_name -return ncs,cs,pos,seq,stats";
#  &doit($command, $dry, $die_on_error, $main::verbose);

  my @types = ();
  push @types, "gene";
  push @types, "intergenic";

  foreach my $seq_type (@types) {
    my $seq_file = $dir{genome}."/".$organism_short_name."_".$seq_type."_segments.fasta";

    ## Draw sequence length distributions
    &RSAT::message::TimeWarn("Computing sequence length for genomic regions", $seq_type) if ($main::verbose >= 2);
    if ($task{seq_len_distrib}) {
      $command .= "; " unless ($command eq "");
      $command .= &SeqLengthDistributionCmd($seq_file, "fasta", $seq_type, 50, 2000);
    }

    ## Compress sequence file
    if (-e $seq_file) {
      &RSAT::message::TimeWarn("Compressing sequences for genomic regions", $seq_type) if ($main::verbose >= 2);
      $command .= "; " unless ($command eq "");
      $command .= "gzip -f $seq_file";

      ## Purge sequences
      if ($purged_frequencies) {
	&RSAT::message::TimeWarn("Purging sequences for genomic regions", $seq_type) if ($main::verbose >= 2);
	$command .= "; " unless ($command eq "");
	$command .= &PurgeSeqCommand($seq_file);
      }
#      &doit($command, $dry, $die_on_error, $main::verbose);
      &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
    }

  }

  chdir $dir{main};
}


################################################################
### extract the complete set of upstream sequences
sub AllUpstream {
  &RSAT::message::TimeWarn("Retrieving all upstream sequences") if ($main::verbose >= 2);

  foreach my $masking (@masking_modes) {
    foreach my $noorf ("", "-noorf") {
      my $seq_type = "upstream${noorf}${masking}";
      my $seq_file = $dir{genome}."/".${organism_short_name}."_".${seq_type};
      my @seq_formats = qw(fasta ft);
      #      my @seq_formats = qw(ft);
      my $command = "";

      ## Retrieve sequences in two formats: fasta is used for
      ## computing oligo and dyad frequencies, whereas ft is convenient for
      ## retrieve-seq-multigenome
      foreach $format (@seq_formats) {
	$command .= "; " unless ($command eq ""); 
	$command .= "retrieve-seq -all";
	$command .= " -type upstream -feattype gene -org ".$organism_short_name;
	$command .= " ".${noorf};
	$command .= " ".${masking};
	$command .= " -label ID";
	$command .= " -format ".$format;
	$command .= " -o ".$seq_file.".".$format;
#	&doit($command, $dry, $die_on_error, $main::verbose);
	&RSAT::message::Info("Exported sequence file", $seq_file.".".$format) if ($main::verbose >= 3);
      }

      ## Compute sequence length distributions and generate a frequency plot
      $command .= "; ".&SeqLengthDistributionCmd($seq_file.".fasta", "fasta", $seq_type, 50, 2000) if ($noorf);

      ## Compress sequence file
      $command .= "; gzip -f ".$seq_file.".fasta";

      ## Purge sequences
      if ($purged_frequencies) {
	$command .= "; ";
	$command .= &PurgeSeqCommand($seq_file.".fasta", "fasta");
      }

      &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
#      &doit($command, $dry, $die_on_error, $main::verbose);

    }
  }
  chdir $dir{main};
}


################################################################
## Convert genome sequence to fasta, required for retrieve-seq-bed and
## bedtools, and generate the fasta index file (.fai).
sub FastaGenomeAndBedtoolsIndex {
  &RSAT::message::TimeWarn("Generating fasta sequences and bedtools fasta index. ") if ($main::verbose >= 2);

  my $command = "";

  ## Generate fasta sequence
  if ($task{fasta_genome}) {
    $command .= "convert-seq";
    $command .= " -i ".$outfile{genome};
    $command .= " -from filelist -to fasta";
    $command .= " -o ".$outfile{fasta_genome};
#    &doit($command, $dry, $die_on_error, $main::verbose);
    &RSAT::message::Info("\tFasta genome", $outfile{fasta_genome}) if ($main::verbose >= 3);

    ## JvH (2017-11-02): I don't remember why I made a soft link to
    ## dna.genome.fa instead of generating the repeat-masked genome.
#    $command .= "; cd ".$dir{genome}. "; ln -f -s ".$organism_short_name.".dna.genome.fa ".$organism_short_name.".dna_rm.genome.fa; cd -";

#    &doit($command, $dry, $die_on_error, $main::verbose);
  }

  ## Convert soft-masked genome into hard-masked genome
  if ($task{fasta_genome_rm}) {
    $command .= "; convert-seq";
    $command .= " -i ".$outfile{fasta_genome};
    $command .= " -from fasta -to fasta -mask lower";
    $command .= " -o ".$outfile{fasta_genome_rm};
  }

  ## Compute chromosome sizes
  if ($task{chrom_sizes}) {
    $command .= "; " if ($command);
    $command .= "sequence-lengths -i ".$outfile{fasta_genome};
    $command .= " -o ".$outfile{chrom_sizes};
    &RSAT::message::Info("\tChromsome sizes", $outfile{chrom_sizes}) if ($main::verbose >= 3);
  }

  ################################################################
  ## Generate bedtool indexes. This must be done as RSAT user in order
  ## to enable retrieve-seq-bed, which relies on bedtools getfasta.
  ## 
  ## We use a trick to run bedtools getfasta with a single-nucleotide bed
  ## file, whihc creates the fasta index if it does not exist. .
  if ($task{index_bedtools}) {
    my $rand_fragment_coord = &RSAT::util::make_temp_file("","random-genome-fragments_coord", 1);
    my $rand_fragment_seq = &RSAT::util::make_temp_file("","random-genome-fragments_seq", 1);
    $command .= "; " if ($command);
    $command .= &RSAT::server::GetProgramPath("random-genome-fragments");
    $command .= " -n 1 -l 1";
    $command .= " -org ".$organism_short_name;
    $command .= " -o ".$rand_fragment_coord;
    $command .= " ; ";
    $command .= &RSAT::server::GetProgramPath("retrieve-seq-bed");
    $command .= " -i ".$rand_fragment_coord;
    $command .= " -org ".$organism_short_name;
    $command .= " -o ".$rand_fragment_seq;
    $command .= " ; ";
    $command .= &RSAT::server::GetProgramPath("retrieve-seq-bed");
    $command .= " -rm";
    $command .= " -i ".$rand_fragment_coord;
    $command .= " -org ".$organism_short_name;
    $command .= " -o ".$rand_fragment_seq;
    $command .= " ; rm -f ".$rand_fragment_seq; 
    $command .= " ; rm -f ".$rand_fragment_coord; 
    &RSAT::message::Info("\tFasta index", $outfile{fasta_genome}.".fai") if ($main::verbose >= 3);
    #&doit($command, $dry, $die_on_error, $main::verbose);
  }

  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
}


################################################################
## Draw an histogram of sequence lengths
sub SeqLengthDistributionCmd {
    my ($seq_file, $format, $seq_type, $ci, $xmax) = @_;
    &RSAT::message::TimeWarn("&SeqLengthDistributionCmd()", $seq_file, $format, $seq_type, $ci, $xmax) if ($main::verbose >= 2);
    $ci = 50 unless $ci;
    $format = "fasta" unless $format;
    my $classfreq_from = 0;
    chdir $dir{genome};

    if ($main::verbose >= 4) {
	my $pwd = $ENV{PWD};
#2D	my $pwd = `pwd`;
#	chomp $pwd;
	&RSAT::message::TimeWarn("&SeqLengthDistributionCmd()", $seq_file, $seq_type, "Working dir", $pwd);
    }

    my $len_prefix = $dir{genome}."/".$organism_short_name."_".$seq_type."_lengths";
    my $command = "sequence-lengths -i $seq_file -in_format ".$format;
    $command .= " -o ${len_prefix}.tab";
    $command .= "; classfreq -v 1 -i ${len_prefix}.tab ";
    $command .= " -col 2 -ci ".$ci." -from ".$classfreq_from;
    $command .= " -o ".$len_prefix."_distrib.tab";
    $command .= "; XYgraph -i ${len_prefix}_distrib.tab";
    $command .= " -xcol 3 -ycol 4,5,6 -lines -xmin $classfreq_from";
    $command .= " -xsize 800 -ysize 400 -legend";
    $command .= " -xgstep1 200 -xgstep2 50 -xmin 0";
    $command .= " -xmax ".$xmax;
    $command .= " -ymin 0 -ygstep2 1000";
    $command .= " -xlog 2" if( ($seq_type eq "intergenic") || ($seq_type eq "gene"));
    $command .= " -xleg1 'Sequence length (bp)'";
    $command .= " -yleg1 'Frequency'";
    $command .= " -title1 '".$organism_full_name{$organism_short_name}."'";
    $command .= " -title2 'length distribution of ${seq_type} sequences'";
    $command .= " -format ".$img_format;
    $command .= " -o ".$len_prefix."_distrib.".$img_format;
#    &doit($command, $dry, $die_on_error, $main::verbose);
    return($command);
}

################################################################
## purge sequences
sub PurgeSeqCommand {
  my ($seq_file, $format) = @_;
  &RSAT::message::Warning("Purging sequence", $seq_file, $format) if ($main::verbose >= 2);
  $format = "fasta" unless ($format);
  my $purged_seq_file = `basename $seq_file $format`;
  $purged_seq_file .= "_purged.$format";
  my $command = "purge-sequence -i $seq_file -format $format -ml 300 -mis 9 -2str -o $purged_seq_file";
#  &doit($command, $dry, $die_on_error, $main::verbose);
  ## compress purged sequence file
  $command .= "; gzip -f $purged_seq_file";
#  &doit($command, $dry, $die_on_error, $main::verbose);
  return($command);
}

################################################################
### calculate oligo and dyad frequencies in different sequence types :
### - intergenic
### - upstream
### - genomic
### 
sub CalcFrequencies { 
  #    my ($seq_file, $seq_format, $seq_type) = ();
  &RSAT::message::TimeWarn("&CalcFrequencies()", $seq_file, $seq_format, $seq_type) if ($main::verbose >= 2);
  chdir $dir{oligos};

  ################################################################
  ## Calculate oligonucleotide and dyad frequencies in all upstream
  ## sequences
  if ($task{upstream_freq}) {
    foreach my $masking (@masking_modes) {
      foreach my $noorf ("-noorf", "") {
	$seq_type = "upstream${noorf}${masking}";
	if ($purged_frequencies) {
	  $seq_file = "${organism_short_name}_${seq_type}_purged.fasta";
	  $seq_format = "fasta";
	} else {
	  $seq_file = "${organism_short_name}_${seq_type}.fasta";
	  $seq_format = "fasta";
	}
	&RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies", $noorf, $masking) if ($main::verbose >= 2);
	&CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos});
	&CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib} && !($noorf));
	&CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads});
      }
    }
  }

  if ($task{ensembl_freq}) {

    unless (defined($up_from)) {
      if (defined($supported_organism{$organism_short_name}->{'up_from'})) {
	$up_from = $supported_organism{$organism_short_name}->{'up_from'};;
      }
    }

    unless (defined($up_to)) {
      if (defined($supported_organism{$organism_short_name}->{'up_to'})) {
	$up_to = $supported_organism{$organism_short_name}->{'up_to'};
      }
    }

    foreach my $masking (@masking_modes) {
      #	foreach my $masking ("-rm") {
      #	    foreach my $noorf ("-noorf", "") {
      foreach my $maskcoding ("-maskcoding") {
	#		    foreach my $type ("upstream_mrna") {
	#		    foreach my $type ("intron") {
	#		    foreach my $type ("firstintron") {
	foreach my $type ("utr") {
	  #			$seq_type = "${type}${up_from}${up_to}${maskcoding}${masking}";
	  $seq_type = "${type}${maskcoding}${masking}";
	  #		    if ($purged_frequencies) {
	  #			$seq_file = "${organism_short_name}_${seq_type}_purged.fasta";
	  #			$seq_format = "fasta";
	  #		    } else {
	  $seq_file = "${organism_short_name}_${seq_type}.fasta";
	  $seq_format = "fasta";
	  #		    }
	  &RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies") if ($main::verbose >= 1);
	  &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos});
	  #		    &CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib} && !($noorf));
	  #		    &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads});
	}
	#	    }
      }
    }
  }


  ################################################################
  ## Calculate oligont frequencies in all intergenic sequences
  if ($task{intergenic_freq}) {
    $seq_type = "intergenic";
    &RSAT::message::TimeWarn("Calculating ${seq_type} oligo and dyad frequencies") if ($main::verbose >= 1);
    if ($purged_frequencies) {
      $seq_file = "${organism_short_name}_intergenic_segments_purged.fasta";
      $seq_format = "fasta";
    } else {
      $seq_file = "${organism_short_name}_intergenic_segments.fasta";
      $seq_format = "fasta";
    }
    &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos});
    #	&CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib});
    &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads});
  }
    
  ################################################################
  ## Calculate oligo-frequencies in full genome
  if ($task{genome_freq}) {
    $seq_type = "genomic";

    ## First check if a fasta file exists for the current genome
    ## (since 2015-11, fasta genomes are downloaded from
    ## ensemblgenomes). If yes, use it in priority because it allows
    ## to use the optiojn -quick for oligo-analysis and dyad-analysis.
    my $genome_dir = $ENV{RSAT}."/public_html/data/genomes/".$organism_short_name."/genome";
    my $fasta_genome = $genome_dir."/".$organism_short_name.".dna.genome.fa";
    if (-e $fasta_genome) {
	$seq_file = $fasta_genome;
	$seq_format = "fasta";
	&CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos});
	#	&CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib});
	&CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads});
    } else {
	&RSAT::message::Warning("Missing fasta genome file", $fasta_genome);
	&RSAT::message::Warning("Fasta genome file not found -> skipping whole genome oligo/dyad frequencies.");
	$seq_format = "filelist";
	$seq_file = $outfile{'genome'};
    }
    #&CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos});
    #	&CalcOligoDistrib($seq_file, $seq_format, $seq_type) if ($task{distrib});
    #&CalcDyadFreq($seq_file, $seq_format, $seq_type) if ($task{dyads});
    
  }

  ################################################################
  ## calculate oligont frequencies in all gene sequences 
  ##
  ## GENE FREQUENCIES ARE NOT WORKING ANYMORE, AND, BESIDES, THEY WERE NOT
  ## USEFUL
  if ($task{gene_freq}) {
    $seq_type = "gene";
    if ($purged_frequencies) {
      $seq_file = "${organism_short_name}_gene_segments_purged.fasta";
      $seq_format = "fasta";
    } else {
      $seq_file = "${organism_short_name}_gene_segments.fasta";
      $seq_format = "fasta";
    }
    &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{oligos});
    #	&CalcOligoDistrib($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{distrib});
    &CalcDyadFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type) if ($task{dyads});
  }

  ################################################################
  # Calculate oligopeptide frequencies in all protein sequences
  if ($task{protein_freq}) {
    $seq_type = "protein";
    &RSAT::message::TimeWarn("Calculating oligopeptide frequencies") if ($main::verbose >= 2);
    # New name for peptidic sequence file (Sept 2015). I suppress
    # organism name from sequence file name.
    my $seq_file = $dir{genome}."/peptidic_sequences.fasta";
    unless (-e $seq_file) {
      ## Previous name for the peptidic sequence file
      $seq_file = $dir{genome}."/".${organism_short_name}."_aa.fasta";
    }

    if (-e $seq_file) {
      $seq_format = "fasta";
      &CalcOligoFreq($seq_file, $seq_format, $seq_type) if ($task{oligos});
    } else {
      &RSAT::message::Warning("Skipping oligopeptide frequencies because peptidic sequence file is missing", $seq_file);
    }
  }

  chdir $dir{main};
}

################################################################
## Calculate oligonucleotide frequencies for a specified sequence
## file
sub CalcOligoFreq {
  my ($seq_file,$seq_format,$seq_type) = @_;
  &RSAT::message::TimeWarn("&CalcOligoFreq()", $seq_file,$seq_format,$seq_type) if ($main::verbose >= 2);

  my @current_oligo_lengths = @oligo_lengths;
#  my @current_oligo_lengths = (6);
  my $oligo_seq_type = "dna";
  my $residue_type = "nt";
  my $quick = "";
  my @batch_commands = ();

  my @strands = ("-1str", "-2str");
  if ($seq_type eq "protein") {
    @current_oligo_lengths = 1..3;
    $oligo_seq_type = "prot";
    $residue_type = "pept";
    @strands = ("");
  }

  ## Quick option only works with DNA sequences in fasta format
  unless (($seq_type eq "protein") || ($seq_format ne "fasta")) {
    $quick = "-quick";
  }
  
  ## Uncompress sequence file if required because the option -quick
  ## currently does not support compressed files
  my $uncompressed = 0;
  if (!(-e $seq_file) && (-e $seq_file.".gz")) {
      &RSAT::message::Warning("Uncompressing sequence file", $seq_file) if ($main::verbose >= 2);
      my $command = "gunzip ".$seq_file.".gz";
      my $job_prefix = ${organism_short_name}."_compress";
      $uncompressed = 1;
      &doit($command, $dry, $die_on_error, $main::verbose); #, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      $seq_file =~ s/\.gz$//;
  }

  my $job_prefix = ${organism_short_name}."_oligos";
  foreach my $noov ("-noov", "-ovlp") {
    foreach my $strands (@strands) {
      foreach my $oligo_length (@current_oligo_lengths) {
	&RSAT::message::TimeWarn("Calculating oligo frequencies",$seq_file, $seq_format, $seq_type, "l=".$oligo_length, $noov, $strands) if ($main::verbose >= 2);

	my $out_file = $dir{oligos}."/".${oligo_length}.${residue_type}."_".${seq_type}."_".${organism_short_name}.${noov}.${strands}.".freq";
	my $command = &RSAT::server::GetProgramPath("oligo-analysis");
	$command .= " ".$quick;
	$command .= " -v 1 ";
	$command .= " ".$strands;
	$command .= " -i ".$seq_file;
	$command .= " -format ".$seq_format;
	$command .= " -seqtype ".$oligo_seq_type;
	$command .= " ".$noov;
	$command .= " -l ".$oligo_length." -type dna ";
	$command .= " -return freq,occ";
	$command .= " -o ".$out_file;
	$command .= "; gzip -f ".$out_file;
	if ($batch) {
	    push @batch_commands, $command;
	} else {
	    &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
	}
      }
    }
  }

  ## Run all the oligo frequencies in one batch job
  if ($batch) {
      my $batch_command = join(" ;", @batch_commands);
      &doit($batch_command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
  }

  ## Recompress the sequence file if required
  if ($uncompressed) {
      my $command = "gzip ".$seq_file;
      my $job_prefix = ${organism_short_name}."_uncompress";
      if ($batch) {
	  &RSAT::message::Warning("Cannot recompress sequence in batch mode", $seq_file) if ($main::verbose >= 2);
      } else {
	  &RSAT::message::Info("Compressing sequence file", $seq_file) if ($main::verbose >= 2);
	  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      }
  }
}

################################################################
## Calculate oligonucleotide distribution for a specified sequence
## file
sub CalcOligoDistrib {
  my ($seq_file,$seq_format,$seq_type) = @_;
  &RSAT::message::TimeWarn("&CalcOligoDistrib()", $seq_file,$seq_format,$seq_type) if ($main::verbose >= 2);
  @strands = ("-1str", "-2str");
  foreach my $noov ("-noov", "-ovlp") {
    foreach my $strands (@strands) {
      foreach my $oligo_length (@oligo_lengths) {
	my $job_prefix = ${organism_short_name}."_oligo_".$oligo_length;

	#### Calculate occurrence distributions in the sequence file
	my $distrib_file = $dir{oligos}."/"."${oligo_length}nt_${seq_type}_${organism_short_name}${noov}${strands}_distrib.tab";
	my $command = &RSAT::server::GetProgramPath("oligo-analysis");
	$command .= " -v 1";
	$command .= " -i ".$seq_file;
	$command .= " -format ".$seq_format;
	$command .= " ".$strands;
	$command .= " ".$noov;
	$command .= " -l ".$oligo_length." -type dna";
	$command .= " -return occ -distrib";
	$command .= " -o ".$distrib_file;
	$command .= " ; gzip -f ".$distrib_file;
	&doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  

	#### Fit a Poisson and a negbin on occurrence distribution
	foreach my $theor ("negbin", "poisson") {
	  my $fitting_file = "$dir{oligos}/${oligo_length}nt_${seq_type}_${organism_short_name}${noov}${strands}_${theor}.tab";
	  $command = "fit-distribution -v 1 -i ".$distrib_file;
	  $command .= " -o ".$fitting_file;
	  $command .= " -distrib ".$theor;
	  $command .= " ; gzip -f ".$fitting_file;
	  &doit($command, $dry, $die_on_error, $main::verbose);  
	}
      }
    }
  }
}


################################################################
## calculate dyad frequencies for a specified sequence
## file
sub CalcDyadFreq {
  my ($seq_file,$seq_format,$seq_type) = @_;
  &RSAT::message::TimeWarn("&CalcDyadFreq()", $seq_file,$seq_format,$seq_type) if ($main::verbose >= 2);
  $min_spacing = 0;
  $max_spacing = 20;
  @monad_lengths = (3,2,1);
  @strands = ("-1str", "-2str");
  my $quick = "";
  my @batch_commands = ();

  ## Quick option only works with DNA sequences in fasta format
  unless (($seq_type eq "protein") || ($seq_format ne "fasta")) {
    $quick = "-quick";
  }

  ## uncompress sequence file if required because the option -quick
  ## currently does not support compressed files
  my $uncompressed = 0;
  if (!(-e $seq_file) && (-e $seq_file.".gz")) {
      &RSAT::message::Warning("Uncompressing sequence file", $seq_file) if ($main::verbose >= 2);
      my $command = "gunzip ".$seq_file.".gz";
      my $job_prefix = ${organism_short_name}."_uncompress";
      $uncompressed = 1;
      &doit($command, $dry, $die_on_error, $main::verbose); #, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      $seq_file =~ s/\.gz$//;
  }


  my $job_prefix = ${organism_short_name}."_dyads";
  foreach my $noov ("-noov", "-ovlp") {
    foreach my $strands (@strands) {
      foreach my $monad_length (@monad_lengths) {
	&RSAT::message::TimeWarn("Calculating dyad frequencies",$seq_file, $seq_format, $seq_type, "l=".$monad_length, $noov, $strands) if ($main::verbose >= 2);
	$dyad_file = $dir{oligos}."/"."dyads_${monad_length}nt_sp${min_spacing}-${max_spacing}_${seq_type}_${organism_short_name}${noov}${strands}";
	$dyad_file .= ".freq";
	my $command = &RSAT::server::GetProgramPath("dyad-analysis");
	$command .= " ".$quick;
	$command .= " -v 1";
	$command .= " -i ".$seq_file;
	$command .= " -format ".$seq_format;
	$command .= " -timeout 240000 ";
	$command .= " -type any -seqtype dna";
	$command .= " ".$strands;
	$command .= " ".$noov;
	$command .= " -sp ".$min_spacing."-".$max_spacing;
	$command .= " -l $monad_length";
	$command .= " -return freq,occ";
	$command .= " -o ".$dyad_file;
	$command .= "; gzip -f ".$dyad_file;
	if ($batch) {
	    push @batch_commands, $command;
	} else {
	    &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);
	}
      }
    }
  }

  ## Run all the oligo frequencies in one batch job
  if ($batch) {
      my $batch_command = join(" ;", @batch_commands);
      &doit($batch_command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
  }

  ## Recompress the sequence file if required
  if ($uncompressed) {
      my $command = "gzip ".$seq_file;
      my $job_prefix = ${organism_short_name}."_uncompress";
      if ($batch) {
	  &RSAT::message::Warning("Cannot recompress sequence in batch mode", $seq_file) if ($main::verbose >= 2);
      } else {
	  &RSAT::message::Info("Compressing sequence file", $seq_file) if ($main::verbose >= 2);
	  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
      }
  }
}


################################################################
## display full help message
sub PrintHelp {
    open HELP, "| more";
    print HELP <<End_of_help;
NAME
	install-organism

AUTHOR
        Jacques van Helden (Jacques.van-Helden\@univ-amu.fr)

USAGE
        install-organism -org organism_name

DESCRIPTION
	Install one or several organisms on the local RSAT instance.

	This script is a task manager, which (depending on the
	selected tasks) manages different steps necessary for the
	installation of an organism from the NCBI flat files:

	- Downloading genomes from the NCBI/Refseq ftp site via the
          rsync protocol
          (rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq).

	- Parsing the .gbff files (Genbank-formatted flat files).

	- Adding the organism in the config file.

	- Calculating trinucleotide frequencies in the start and stop
          codons (a way to check consistency of the gene locations).

	- Calculating oligonucleotide and dyad frequencies, which will
          be used as background models for motif discovery.

CATEGORY
	Data management.

OPTIONS
	-h	(must be first argument) display full help message
	-help	(must be first argument) display options
	-v	verbose

    MANDATORY ARGUMENTS

    	The only mandatory option is to specify one or more
    	organism(s) to be installed. This can be done with any of the
    	following options: -org, -org_file, -all_organisms, -taxon.

	-org	organism name without spaces (e.g. Saccharomyces_cerevisiae)

		The option -org can be used iteratively on the same
		command line to iterate the installation over multiple
		organisms.

        -org_file organism_file_name
		Text file containing a list of organisms to install.
		The first word of each row is taken as a query
		orgnanism. Further information of the same row is
		ignored.

	-all_organisms

		Install all the organisms found in the Refseq
		directory (see option -refseq).

      	-taxon  taxon name (mutually exclusive with -org)

		The installation will iterate over all organisms of
		the selected taxon. Note that the command will only
		apply to the organisms previously declared with the
		command install-organism -task config. The option
		-taxon is thus convenient for re-running installation
		tasks on previously installed organisms rather than
		for installing new genomes downloaded e.g. from NCBI.

		The option -taxon can be used iteratively on the same
		command line to iterate the installation over multiple
		taxa.

    SPECIFIC ARGUMENTS FOR THE DOWNLOAD AND PARSE TASK

        -group taxonomic group according to the NCBI/Refseq ftp/rsync
	       site (ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq). 

	       Examples: bacteria, fungi, protozoa, plant,
	       invertebrate, vertebrate_mammalian, vertebrate_other

	-species species name, with space replaced by underscore
	         (e.g. Saccharomyces_cerevisiae). 

		 Note that one species may correspond to several
	         organisms if several strains are available for this
	         species. In particular, for some bacteria there are
	         several thousands of sequenced strains
	         (e.g. Escherichia coli). 

		 The option -max_strains can be used to restrict the
	         number of installed strains per species. 

		 An alternative is to specify a custom list of species
		 + strains with the option -org_table (this is the
		 recommended way to install multiple genomes from
		 NCBI).

	-species_file file containing species names in the first
	        column.

	-max_strains #

	        Maximal number of strains per species to
		parse/install.  This option avoids filling disk with
		particular bacterial species for which thousands of
		strains have been sequenced (e.g. Escherichia coli).

	-strain restrict the download to a single strain or
                assembly. By default all strains of the selected
                species are downloaded, but this can be heavy for some
                bacterial species where thousands of strains have been
                sequenced (e.g. Escherichia coli).

	      Note: the RSAT organism name is automatically built by
	      concatenating the NCBI species name and strain
	      identifier.

	-assembly_table

	        Table extracted from the NCBI ftp site, indicating the
	        organisms to install. 

		Example: assembly table of fungi:

	        ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/assembly_summary.txt

		The table can be the full table for a given group, or
		a custom subset, which can be selected on some
		informative fields of the assembly table. 

                We recommend to select the subset of rows having
		either "Complete genome" or "Chromosome" in the 12th
		column ("assembly_level"), and to add a filter on
		"refseq_category" in order to select "representative
		genome" or "reference genome". 

		The assembly table can also be filtered out manually
		in order to select a custom subset of the available
		genomes.

	-refseq
		Local directory containing the mirror of the refseq
		genomes found on the NCBI site:
		       ftp://ftp.ncbi.nih.gov/genomes/refseq
		Normally, the refseq directory is specified by
		defining a global variable REFSEQ_DIR in the config
		file. The option -refseq allows to overwrite this
		value.


    OPTIONAL ARGUMENTS

	-skip #	Skip the first # organisms of the list.

	        This option is convenient to resume the installation
	        of a list of organisms, when it has been interrupted.

		It also allows to organize the installation by chunks
		of organisms.

	-skip_species # skip the first # of the list of NCBI species. 

	        Note that one species can correspond to several
	        organisms, if several strains have been sequenced.

	-last #	Stop after the first # organisms of the list.

		This option can be used to organize the installation
		by chunks of organisms.

	-last_species # Stop after the first # species of the NCBI
	        list.

	-organism Full name of the organism (e.g. 'Saccharomyces
		cerevisiae'). This option is only valid when a single
		organism is installed.

	-source	data source

	-dir dir{install}
		Absolute path of the installation directory. 
		BEWARE : you should provide the absolute path of the
		installation directory, not the relative path.

	-batch run some tasks (for example the calibration of oligos
		and dyads) in batch mode.  This options works on our
		lab cluster, but could be adapted for other
		configurations by adapting the parameters in
		RSAT_config.props.

	-queue  specify a queue to send the job with the option -batch. 

	        On this instance of RSAT, default queue is $ENV{CLUSTER_QUEUE}

                Type 'qstat -f' to get a list of existing queues.

	-config
		Specify an alternative organism configuration file for the
		genome to be installed.

		By default, the organism configuration file is 
	      	\$RSAT/public_html/data/genomes/supported_organisms.pl

        -backup_org_table
                Store a backup copy of the organism table before
                overwriting it.

	-local
		Absolute path of a RSA local config file.

		By default, the newly installed organism is added to
		the main RSA config file is changed (provided the user
		has write access to the RSA config file).

		In addition to the organisms installed by the RSAT
		system administrator (found in
		$ENV{RSAT}/public_html/data/supported_organisms.tab), users can
		install some organisms locally.

		For this, the user must first define an environment
		variable called RSA_LOCAL_CONFIG, and indicating the
		absolute path of the local config file.  
		E.g.  
		  export RSA_LOCAL_CONFIG=/home/fred/RSA.local.config

		When install-organism is called with the option
		-local, the new organism is added to the file
		indicated by the environment variable RSA_LOCAL_CONFIG
		rather than the main RSA config file.

	-syn	synonym table
		A tab-delimited file containing two coloumns. The
		forst column contains a gene ID, the second a gene
		name.

	-up_from distal limit of the upstream regions (e.g. -800 for yeast)

	-up_to	proximal limit of the upstream regions (e.g. -1)

	-genbank genbank directory (obsolete, see refseq)

	-prefid feattype idname
	        passed to parse-genbank.pl

	-date last_update

	        Force the 'last_update' attribute to a given date. 

		This option is used by download-organism to ensure
		that the local genome has the same installation date
		as the server, rather than using the date of download
		as update date.

	-ensembl
		ENSEMBL directory. Directory containing the ENSEMBL
		flat files in Genbank format (ext .dat)

		Example: 
		ftp.ensembl.org/pub/current_worm/data/flatfiles/genbank

	-task	specification of a single installation task
		    e.g.
			install-organism -task dyads
		supported tasks: $supported_tasks

		Description of the tasks
		------------------------

		available    list available species on the NCBI FTP site.
		             This task cannot be combine with any
		             other task.

		download     download genome from the NCBI FTP site
		             (ftp://ftp.ncbi.nih.gov/genomes/refseq/)
		             via the rsync protocol.

		list         print the list of downloaded strains for the
		             user-specified species.

		parse        parse genome data (sequence, features) from 
		             genbank-formatted flat files (gbff)
		             downloaded from NCBI. Parsing is ensured
		             by the program parse-genbank.pl.

		config	     update configuration file

		start_stop   
			     calculate start and stop codon
			     frequencies

		allup	     retrieve all upstream sequences

		genome_segments
			     retrieve sequences and limits of genome segments
			     (intergenic, genic)

		oligos	     calculate oligonucleotide frequencies

			     This task requires to specify, in
			     addition, the type(s) of sequences for
			     which oligo frequencies have to be
			     calculated (upstream_freq,
			     intergenic_freq, genome_freq).

		dyads	     calculate dyad frequencies

			     This task requires to specify, in
			     addition, the type(s) of sequences for
			     which dyad frequencies have to be
			     calculated (upstream_freq,
			     intergenic_freq, genome_freq).

		ncf	     calculate oligo and dyad frequencies in
			     intergenic segments

		upstream_freq
			     calculate oligo and dyad frequencies for
			     all upstream sequences

		intergenic_freq
			     calculate oligo and dyad frequencies for
			     all intergenic sequences

		genome_freq  
			     calculate oligo and dyad frequencies for
			     the whole genome sequence. This is not
			     recommended for higher organisms, where
			     the genome represents several Gigabases,
			     and the computation of all oligo and dyad
			     frequencies might take ages.

		clean	     remove unnecessary sequence files

	-rm	calibrate oligo and dyad frequncies on repeat masked
		sequences, in addition to the non-masked sequences.

	-img_format
		image format for the graphs of sequence length distribution

EXAMPLES

    Get the list of available genomes for one of the supported groups
    on the NCBI FTP/rsync site.  Supported values for option -group
    are the subdirectories of the FTP site
    (rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq).

	install-organism -v 1 -group [GROUP]  -task available \
            -o available_vertebrate.tab

    Download one species with rsync:

    	install-organism -v 1  -group [GROUP] -species [SPECIES] \
            -task download

    Parse the genome for a given species, and declare it in RSAT
    supported organism. NOTE: this installs all the strains for the
    selected species. For some species this can represent thousands of
    strains (e.g. Eschrichia coli). Strains can be restricted with the
    options -strain or -max_strains.

        install-organism -v 1  -group [GROUP] -species [SPECIES] \
            -task parse,config

    Run the default installation steps for a given species. Note: the
    option -list is required in order to collect the organism names,
    which are made by concatenating species and strain

        install-organism -v 1  -group [GROUP] -species [SPECIES] \
            -task list,default

    Installation can be automated and parallelized with a job
    scheduler (e.g. qsub) in order to install all the species of a
    given group.

    Example: install all species of the group "fungi" at NCBI.

    Step 1: get the list of species available at NCBI

        install-organism -v 1 -group fungi -task available \
	    -o fungi_available_species.txt

    Step 2: download genome for all the strains of these
    species. Beware, this takes space, there are several hundreds of
    species.

	install-organism -v 1 -group fungi -task download \
    	    -species_file fungi_available_species.txt

    Step 3: parse the genomes of all strains for each fungal species.

	install-organism -v 1 -group fungi -task parse,config \
    	    -species_file fungi_available_species.txt

    Step 4: collect the list of downloaded organisms. Organism names
    are built by concatenating species and strain names.

	install-organism -v 1 -group fungi \
    	    -species_file fungi_available_species.txt \
	    -task list \
	    -o fungi_downloaded_orgnanisms.txt

    Step 5: extract fasta sequences for different types of genomic
    regions, and run some control tests (e.g. oligonucleotide
    frequencies of start and stop codons). With the option -batch, the
    tasks are sent to a job scheuler (qsub) in order to be executed in
    parallel.

	install-organism -v 1 \
	    -org_file fungi_downloaded_orgnanisms.txt \
	    -task start_stop,allup,seq_len_distrib,genome_segments \
	    -task protein_len,fasta_genome,fasta_genome_rm \
	    -task chrom_sizes,index_bedtools \
	    -batch

    Step 6: compute the oligomer frequency tables. In batch mode, this
    task can be done only after the previous job list is finished,
    because of the dependencies between the parallelized tasks
    (upstream sequences have to be computed before computing their
    oligonucleotide and dyad frequencies).

	install-organism -v 1 \
	    -org_file fungi_downloaded_orgnanisms.txt \
	    -task upstream_freq,genome_freq,protein_freq,oligos,dyads \
	    -batch

SEE ALSO

    download-organisms

        The program I<install-organism> performs all the formatting
	and calibration tasks for importing genomes from the reference
	databases (NCBI, EMBL) to RSAT.

	The program I<download-organism> transfers the RSAT-formatted
	genomes from a RSAT server. 

 	If a genome is available on the RSAT server, it is recommended
	to use download-genomes in order to obtain it immediately in
	the RSAT format, rather than install-genomes.


End_of_help
    close HELP;
    exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
  open HELP, "| more";
  print HELP <<End_short_help;
install_organism options
------------------------
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-v		verbose
-n		dry run (print commands without executing them)
-org		organism name without spaces (e.g. Saccharomyces_cerevisiae); can be used iteratively
-organism	full organism name (e.g. Saccharomyces cerevisiae)
-taxon		iterate installation tasks over all the previously installed organisms of a taxon
-org_file	organism_file_name
-group          taxonomic group required for the URL of the NCBI/refseq FTP site.
-species        species name (for tasks download, list and parse)
-species_file   file containing species names in the first column
-strain         strain (for task parse)
-max_strains    max number of strains per species to be parsed / installed
-skip #         Skip the # first organisms (convenient to resume interrupted list) 
-skip_species # Skip the # first species of the NCBI list
-last #         Stop after the # first organisms (convenient to install by chunks) 
-last_species # Stop after the # first species of the NCBI list
-source		data source (e.g. refseq);
-dir		absolute path of the installation directory
-batch  	run some tasks (for examplethe calibration of oligos and dyads) in batch mode.
-queue          specify a queue to send the job with the option -batch (default: $ENV{CLUSTER_QUEUE}) 
-config		alternative organism configuration file
-backup_org_table  Store a backup copy of the organism table before overwriting it.
-local		update local config file 
		(specified by the environment variable RSA_LOCAL_CONFIG)
-refseq	        refseq directory
-ensembl	ensembl directory
-task		installation task ($supported_tasks)
-rm		calibrate oligo and dyad frequncies on repeat masked sequences
-syn		synonym table
-up_from       	distal limit of the upstream regions (e.g. -800 for yeast)
-up_to		proximal limit of the upstream regions (e.g. -1)
-prefid feattype idname     passed to parse-genbank.pl
-date 		 force last_update to a given date (for synchro between server and local installation)
-img_format	 image format for the graphs of sequence length distribution
End_short_help
  close HELP;
  exit;
}

################################################################
## Read arguments 
sub ReadArguments {
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

    #    foreach my $a (0..$#ARGV) {
    ### verbose ###
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }

      ## dry run
    } elsif ($arg eq "-n") {
      $dry = 1;
      $main::verbose = 1;

      ## detailed help
    } elsif ($arg eq "-h") {
      &PrintHelp();

      ## list of options
    } elsif ($arg eq "-help") {
      &PrintOptions();

      ## output file
    } elsif ($arg eq "-o") {
      $outputfile = shift(@arguments);


      ## Organism name in one word (spaces replaced  by underscores)
    } elsif ($arg eq "-org") {
      my $organism_short_name = shift(@arguments);
      &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: cannot contain spaces") if ($organism_short_name =~ /\s/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: should not be empty.") unless ($organism_short_name =~ /\S/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid organism name: should contain at least one underscore.") unless ($organism_short_name =~ /_/);
      push @organisms, $organism_short_name;

      ## Species, used to define the FTP site for the task "download"
    } elsif ($arg eq "-species") {
      my $species = shift(@arguments);
      &RSAT::error::FatalError( $organism_short_name, "Invalid species name: cannot contain spaces") if ($species =~ /\s/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid species name: should not be empty.") unless ($species =~ /\S/);
      &RSAT::error::FatalError( $organism_short_name, "Invalid species name: should contain at least one underscore.") unless ($species =~ /_/);
      push @species, $species;


      ## Input file with organism list
    } elsif ($arg eq "-species_file") {
      $infile{species} = shift(@arguments);


      ## Strain, used to define the FTP site for the task "download"
    } elsif ($arg eq "-strain") {
      $strain = shift(@arguments);

      ## Strain, used to define the FTP site for the task "download"
    } elsif ($arg eq "-max_strains") {
      $max_strains = shift(@arguments);
      &RSAT::error::FatalError($max_strains, "is not a valid value for option -max_strains. Should be a Natural number. ") unless (&RSAT::util::IsNatural($max_strains));

      ## Group, used to define the FTP site for the task "download"
    } elsif ($arg eq "-group") {
      $group = shift(@arguments);

      ## Input file with organism list
    } elsif ($arg eq "-org_file") {
      $infile{organisms} = shift(@arguments);

      ## Taxon
    } elsif ($arg eq "-taxon") {
      push @taxa, shift(@arguments);

      ## All organisms found in the Refseq dir
    } elsif ($arg eq "-all_organisms") {
      $main::all_organisms = 1;

      ## Skip the N first organisms of the list (can be useful if the
      ## task was interrupted).
    } elsif ($arg eq "-skip") {
      $main::skip = shift(@arguments);
      &RSAT::error::FatalError($main::skip, "Invalid value for option -skip, should be a Natural number") 
	  unless (&IsNatural($main::skip));

      ## Skip the N first species of the NCBI list
    } elsif ($arg eq "-skip_species") {
      $main::skip_species = shift(@arguments);
      &RSAT::error::FatalError($main::skip_species, "Invalid value for option -skip_species, should be a Natural number") 
	  unless (&IsNatural($main::skip_species));

      ## Stop after the N first organisms of the list (can be useful
      ## for tests on a subset of the list of oganisms).
    } elsif ($arg eq "-last") {
      $main::last = shift(@arguments);
      &RSAT::error::FatalError($main::last, "Invalid value for option -last, should be a Natural number") 
	  unless (&IsNatural($main::last));

      ## Stop after the N first species of the NCBI list
    } elsif ($arg eq "-last_species") {
      $main::last_species = shift(@arguments);
      &RSAT::error::FatalError($main::last_species, "Invalid value for option -last_species, should be a Natural number") 
	  unless (&IsNatural($main::last_species));


      ## Full organism name (may include spaces)
    } elsif ($arg eq "-organism") {
      $user_specified_orgname = shift(@arguments);

      ## synonyms
    } elsif ($arg =~ /-syn/) {
      $infile{synonyms} = shift(@arguments);

      ## Specify the limits of upstream regions
    } elsif ($arg eq "-up_from") {
      $up_from = shift(@arguments);
      &FatalError($up_from, "Invalid value for the up_from parameter (must be integer)") unless (&IsInteger($up_from));
      &FatalError($up_from, "Invalid value for the up_from parameter (must be negative)") if ($up_from >= 0);

    } elsif ($arg eq "-up_to") {
      $up_to = shift(@arguments);
      &FatalError(join ("\t", $up_to, "Invalid value for the to parameter (must be integer)")) unless (&IsInteger($up_to));

    } elsif ($arg eq "-prefid") {
      $parse_options .= join(" " , " -prefid", shift(@arguments), shift(@arguments));

    } elsif ($arg eq "-date") {
      $force_date = shift(@arguments);

      ## installation dir
    } elsif ($arg =~ /^-dir/) {
      $install_dir = shift(@arguments);

      ## Refseq dir
    } elsif ($arg =~ /^-refseq/) {
      $dir{refseq} = shift(@arguments);

      ## Genbank dir
    } elsif ($arg =~ /^-genbank/) {
      &RSAT::error::FatalError("Option -genbank is obsolete, use -refseq instead.");

      ## ENSEMBL dir
    } elsif ($arg =~ /^-ensembl/) {
      $dir{ensembl} = shift(@arguments);

      ## data source
    } elsif ($arg eq "-source") {
      $source = shift(@arguments);

      ## Batch mode
    } elsif ($arg eq "-batch") {
      $batch = 1;

      ## Queue for the job scheduler
    } elsif ($arg eq "-queue") {
      $cluster_queue = shift(@arguments);

      ## Masking modes
    } elsif ($arg eq "-rm") {
      $task{mask} = 1;
      push @masking_modes, "-rm";

      ## task selection
    } elsif (($arg =~ /^-task/) 
	     || ($arg =~ /^-step/)) {
      my @requested_tasks = split ",", shift(@arguments);
      foreach my $task (@requested_tasks) {
	next unless $task;
	if ($supported_task{$task}) {
	  $task{$task} = 1;
	} else {
	  &RSAT::error::FatalError("Unsupported task '$task'. \n\tSupported: $supported_tasks");
	}
      }

      ## image format
    } elsif ($arg eq "-img_format") {
      $img_format = lc(shift(@arguments));

      ## local configuration file specified with an environment variable
    } elsif ($arg =~ /^-local/) {
      unless ($ENV{'RSA_LOCAL_CONFIG'}) {
	die "Error : local config file must be specified \nin an environment variable RSA_LOCAL_CONFIG\n";
      }
      $local_config = 1;

      ## alternative configuration file
    } elsif ($arg eq "-config") {
      $ENV{'RSA_LOCAL_CONFIG'}  = shift(@arguments);
      unless ($ENV{'RSA_LOCAL_CONFIG'}) {
	die "Error : local config file must be specified \nin an environment variable RSA_LOCAL_CONFIG\n";
      }
      $local_config = 1;

      ## Store a backup copy of organism table before overwriting it
    } elsif ($arg eq "-backup_org_table") {
      $main::backup_org_table = 1;

=pod

=item B<-dry>

Dry run: print the commands but do not execute them.

=cut
    } elsif ($arg eq "-dry") {
	$main::dry = 1;;

=pod

=item B<-nodie>

Do not die in case a sub-program returns an error.

The option -nodie allows you to circumvent problems with specific
sub-tasks, but this is not recommended because the results may be
incomplete.

=cut

      } elsif ($arg eq "-nodie") {
	$main::die_on_error = 0;

    }
  }
}

################################################################
## Retrieve start and stop codons and calculate word occurrences
## (for checking)
sub StartAndStopCodons {
  &RSAT::message::TimeWarn("&StartAndStopCodons()", $organism_short_name) if ($main::verbose >= 2);
  #    my $label =  "orf";
  my $label =  "id,ctg,reg_left,reg_right,orf_strand";
  my $prefix = $dir{genome}."/".${organism_short_name};
  my $job_prefix = ${organism_short_name}."_start_codons";
  my $command = &RSAT::server::GetProgramPath("retrieve-seq");
  $command .= " -v 1";
  $command .= " -org ".$organism_short_name;
  $command .= " -all";
  $command .= " -type upstream -feattype cds -from 0 -to 2 ";
  $command .= " -format wc -nocomment -label ".$label;
  $command .= " -o ".$prefix."_start_codons.wc";
  $command .= " ; ";
  $command .= &RSAT::server::GetProgramPath("oligo-analysis");
  $command .= " -type dna -l 3 -return occ,freq -v -format wc -1str -sort";
  $command .= " -i ".$prefix."_start_codons.wc";
  $command .= " -o ".$prefix."_start_codon_frequencies.tab";
#  &RSAT::message::Debug($command) if ($main::verbose >= 10);
  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
#  &doit($command, $dry, $die_on_error, $main::verbose);
  &RSAT::message::Info("\tStart codons", $prefix."_start_codons.wc") if ($main::verbose >= 2);
  &RSAT::message::Info("\tStart codon frequencies", $prefix."_start_codon_frequencies.tab") if ($main::verbose >= 2);

  $job_prefix = ${organism_short_name}."_stop_codons";
  $command = &RSAT::server::GetProgramPath("retrieve-seq");
  $command .= " -v 1";
  $command .= " -org ".$organism_short_name;
  $command .= " -all";
  $command .= " -type downstream  -feattype cds -from -2 -to 0 ";
  $command .= " -format wc -nocomment -label ".$label;
  $command .= " -o ".$prefix."_stop_codons.wc";
  $command .= " ; ";
  $command .= &RSAT::server::GetProgramPath("oligo-analysis");
  $command .= " -type dna -l 3 -return occ,freq -v -format wc -1str -sort";
  $command .= " -i ".$prefix."_stop_codons.wc";
  $command .= " -o ".$prefix."_stop_codon_frequencies.tab";

  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);  
#  &doit($command, $dry, $die_on_error, $main::verbose);
  &RSAT::message::Info("\tStop codons", $prefix."_stop_codons.wc") if ($main::verbose >= 2);
  &RSAT::message::Info("\tStop codon frequencies", $prefix."_stop_codon_frequencies.tab") if ($main::verbose >= 2);
}


################################################################
## Parse the genome from Genbank files
sub ParseGenome {
  my ($species, $strain, $organism_name, $strain_dir, $genome_dir, $org_nb) = @_;
  &RSAT::message::TimeWarn("Parsing genome", $org_nb, $species, $strain, $organism_name, $strain_dir) if ($main::verbose >= 2);

  my $command = "$ENV{RSAT}/perl-scripts/parse-genbank.pl -v ".($main::verbose -1);
  $command .= " -i ".$strain_dir;
  $command .= $parse_options;
  if ($dir{ensembl}) {
    $command .= " -ext dat";
  } else {
    $command .= " -ext gbff"; ## 2019 format in NCBI
    $command .= " -genomeInFasta "; ## 2019 format in NCBI has the genome in fasta version separated from the genebank format
  }
  $command .= " -org ".$organism_name;
  #    if ($source ne $null) {
  $command .= " -source '$source'";
  #    }
  $command .= " -o ".$genome_dir;
  &doit($command, $dry, $die_on_error, $main::verbose, $batch, $job_prefix, $log_handle, $err_handle, $cluster_queue);
}

################################################################
## Clean up unnecessary files to save disk space
sub CleanUp {
  &RSAT::message::TimeWarn("&CleanUp()") if ($main::verbose >= 2);
  chdir $dir{genome};

  ## delete files with intergenic and gene segment sequences
  my @files = ();
  foreach my $seq_type (qw(intergenic gene upstream upstream-noorf)) {
    foreach my $format (qw(wc fasta)) {
      foreach my $extension ("", ".gz") {
	foreach my $segments ("", "_segments") {
	  foreach my $purged ("", "_purged") {
	    my $file = "${organism_short_name}_${seq_type}${segments}${purged}.${format}${extension}";
	    if (-e $file) {
	      push @files, $file;
	    }
	  }
	}
      }
    }
  }
  foreach my $file (@files) {
    my $command = "rm -f $file";
    &doit($command, $dry, $die_on_error, $main::verbose);
  }
}

