#!/usr/bin/perl
############################################################
#
# $Id: install-organism,v 1.90 2009/11/02 05:36:00 jvanheld Exp $
#
# Time-stamp: <2003-10-21 01:20:28 jvanheld>
#
############################################################
## Add the "lib/" directory located next to this script to the module
## search path, then load the RSAT libraries.
push @INC, "$`lib/" if ($0 =~ /([^(\/)]+)$/);
require "RSA.lib";
require RSAT::util;

#### Default values of the global parameters

## Tab-delimited table of supported organisms
$config_table = "$ENV{RSAT}/data/supported_organisms.tab";

## Sequence masking modes (by default, a single mode without masking)
@masking_modes = ("");

## Oligonucleotide lengths, in the order in which they are computed
@oligo_lengths = (6, 1, 2, 3, 4, 5, 7, 8);

$verbose = 0;
$batch = 0;
$die_on_error = 1;
$img_format = $ENV{rsat_img_format} || "png";
$noov = "";
$strands = "-1str";

#### temporarily inactivated because for human genome it suppresses almost all sequences !!!!
$purged_frequencies = 0;

## Sequence formats flagged as supported for installation
@supported_for_installation{qw(fasta raw filelist)} = (1, 1, 1);

## Tasks that can be requested by the user
%supported_task = map { $_ => 1 } qw(
    parse
    config
    start_stop
    allup
    seq_len_distrib
    genome_segments
    oligos
    distrib
    ncf
    protein_freq
    gene_freq
    genome_freq
    upstream_freq
    intergenic_freq
    ensembl_freq
    dyads
    clean
    phylogeny
    all
);
$supported_tasks = join(",", sort(keys(%supported_task)));

## Tasks actually selected (filled elsewhere) and options for the parser
%task = ();
$parse_options = "";

## Place-holder for undefined values, default data source and taxonomy
$null = "<NULL>";
$source = "NCBI";
$taxonomy = $null;

################################################################
############################ arguments #########################
################################################################

&ReadArguments();

################################################################
## Initialize parameters
$start_time = &AlphaDate();

### installation date (a date forced via $force_date takes precedence
### over the current system date)
local $install_date = $force_date ||`date '+%Y/%m/%d %H:%M:%S'`;
chomp $install_date;

## Store the starting directory.
## The output of `pwd` ends with a newline, which must be removed:
## otherwise every later "chdir $dir{main}" would target a
## non-existing path and fail.
$dir{main} = `pwd`;
chomp $dir{main};

## At least one task must have been selected
@task = keys %task;
if ($#task == -1) {
    &RSAT::error::FatalError("You should specify at least one task.\nSupported tasks\n\t$supported_tasks\n");
}

## The pseudo-task "all" activates every supported task
if ($task{all}) {
    %task = %supported_task;
}


################################################################
#################### check argument values #####################
################################################################

### if new data is parsed from Genbank, automatically update the config
#$task{config} = 1 if ($task{parse}); 

#### Organism short name: prompt the user if it has not been set yet
if (!$organism_short_name) {
    print "Enter organism short name : ";
    $organism_short_name = <STDIN>;
    $organism_short_name = &trim($organism_short_name);
    if ($organism_short_name =~ /\s/) {
	die "\n\tError : invalid short name '$organism_short_name' \n\tshort name cannot contain spaces\n\n";
    }
    if ($organism_short_name !~ /\S/) {
	die "\n\tError: you should specify a short name for your organism\n\n";
    }
}

#### Organism full name: derived from the short name when the latter
#### contains underscores, prompted otherwise
if (!$organism_full_name) {
    if (index($organism_short_name, "_") >= 0) {
	($organism_full_name = $organism_short_name) =~ tr/_/ /;
    } else {
	print "Enter organism full name : ";
	$organism_full_name = <STDIN>;
	$organism_full_name = &trim($organism_full_name);
	if ($organism_full_name !~ /\S/) {
	    die "\n\tError: you should specify a full name for the organism\n\n";
	}
    }
}

#### The tasks oligos and dyads are only meaningful together with at
#### least one of the sequence-type tasks listed below
if ($task{oligos} || $task{dyads}) {
    my $selected_seq_types = grep { $task{$_} } qw(upstream_freq
						   intergenic_freq
						   protein_freq
						   genome_freq
						   ensembl_freq);
    if (!$selected_seq_types) {
	&RSAT::error::FatalError("The tasks 'oligos' and 'dyads' require to specify at least one sequence type among the following. ", 
				 "\n\tupstream_freq,intergenic_freq,genome_freq,protein_freq,ensembl_freq");
    }
}

################################################################
## Input directory
## The Genbank directory is only required for the parsing task
if ($task{parse}) {
    if (!$dir{genbank}) {
	## No directory set so far: fall back on the environment
	## variable GENBANK_DIR, then on the variable $GENBANK_DIR
	my $default_genbank_dir = $ENV{GENBANK_DIR} || $GENBANK_DIR;
	if ($default_genbank_dir) {
	    $dir{genbank} = $default_genbank_dir;
	} else {
	    &RSAT::error::FatalError("You should specify the directory where Genbank genomes can be found.");
	}
    }
    if (!-d $dir{genbank}) {
	&RSAT::error::FatalError("Genbank directory $dir{genbank} does not exists");
    }
}

################################################################
##################### installation directories #################
################################################################
umask 0002;

## Choose the installation directory, by order of priority:
## 1) directory already set in $dir{install}
## 2) data directory previously recorded for this organism
## 3) default directory below $RSAT/data/genomes
my $install_dir_origin;
if ($dir{install}) {
    $install_dir_origin = "Installing genome in directory specified on the command line";
} elsif ($supported_organism{$organism_short_name}->{'data'}) {
    $dir{install} = $supported_organism{$organism_short_name}->{'data'};
    $install_dir_origin = "Installing genome in directory previously specified in the config file";
} else {
    $dir{install} = $ENV{RSAT}."/data/genomes/".$organism_short_name;
    $install_dir_origin = "Installing genome in default directory";
}
&RSAT::message::Info($install_dir_origin, $dir{install}) if ($main::verbose >= 1);

## Check the installation directory and its sub-directories
&RSAT::util::CheckOutDir($dir{install});
foreach my $subdir_spec (["genome", "genome"], ["oligos", "oligo-frequencies"]) {
    my ($dir_key, $subdir_name) = @{$subdir_spec};
    $dir{$dir_key} = "$dir{install}/$subdir_name";
    &RSAT::util::CheckOutDir($dir{$dir_key});
}

## Default output files (values already set are left untouched)
$outfile{features} ||= "$dir{genome}/feature.tab";
$outfile{synonyms} ||= "$dir{genome}/feature_names.tab";
$outfile{genome} ||= "$dir{genome}/contigs.txt";


################################################################
###################### Installation tasks ######################
################################################################

## Open an output stream for messages
## Open an output stream for messages
$out = &OpenOutputFile($outputfile);
&Verbose() if ($verbose >= 1);

## Run the selected tasks, in a fixed order
&ParseGenome() if ($task{parse});

&UpdateConfig() if ($task{config});

&CreatePhylogeny() if ($task{phylogeny});

&StartAndStopCodons() if ($task{start_stop});

&IntergenicSegments() if ($task{genome_segments});

&AllUpstream() if ($task{allup});

&CalcFrequencies() if (($task{oligos}) || ($task{distrib}) || ($task{dyads}));

&CleanUp() if ($task{clean});

## Come back to the starting directory. The path is stripped from any
## trailing whitespace (it comes from the output of `pwd`, which ends
## with a newline) since chdir would otherwise fail.
(my $starting_dir = $dir{main}) =~ s/\s+\z//;
chdir($starting_dir)
    || warn "; WARNING: cannot come back to starting directory '$starting_dir'\n";
&RSAT::message::Info(join("\t", "working dir",  `pwd`)) if ($main::verbose >= 2);

## Touch the installation dir to indicate the last modification date
system "touch $dir{install}";

## Report the job start and end times
if ($verbose >= 1) {
    $done_time  = &AlphaDate();
    print $out "; Job started $start_time\n";
    print $out "; Job done    $done_time\n";
}

## Close the output stream only after the last message has been
## printed (closing it earlier made the time report print on a closed
## file handle when an output file was specified).
close $out if ($outputfile);

exit(0);



################################################################
#####################     subroutines     ######################
################################################################


################################################################
#### display verbosity message
sub Verbose {
    ## Print the command arguments, the selected tasks and the main
    ## parameters (organism, directories, files) on the stream $out.

    ## Print one "label <tab> value" row with the common layout
    my $print_row = sub {
	my ($label, $value) = @_;
	printf {$out} ";    %-25s\t%s\n", $label, $value;
    };

    print {$out} "; install-organism ";
    &PrintArguments($out);
    print {$out} ";\n; Installing organism\n";
    print {$out} "; -------------------\n";

    ## Selected tasks
    print {$out} ";\n; Tasks:\n";
    for my $selected_task (keys %task) {
	print {$out} ";\t$selected_task\n";
    }

    ## Configuration files
    print {$out} ";\n; Config files:", $rsa_config,"\n";
    $print_row->("RSAT config", $main::config_table);
    if ($ENV{'RSA_LOCAL_CONFIG'}) {
	$print_row->('$RSA_LOCAL_CONFIG', $ENV{'RSA_LOCAL_CONFIG'});
    }

    ## Organism description
    print {$out} ";\n; Organism parameters:\n";
    $print_row->("ID", $organism_short_name);
    $print_row->("Name", $organism_full_name);
    $print_row->("Update date", $install_date);
    $print_row->("data source", $source);

    ## Input files, then installation directory and output files
    print {$out} ";\n; Directories and files:\n";
    ## NOTE(review): "genome" is not among the keys of %supported_task
    ## visible in this file -- confirm that this branch can be reached.
    if ($task{genome}) {
	$print_row->("genome sequence format", $seq_format);
	$print_row->("genome sequence file", $infile{genome});
    }
    if ($infile{features}) {
	$print_row->("feature table", $infile{features});
    }
    if ($infile{ptt}) {
	$print_row->("feature file", $infile{ptt});
    }
    if ($infile{synonyms}) {
	$print_row->("synonyms", $infile{synonyms});
    }
    $print_row->("Genbank dir", $dir{genbank});
    $print_row->("installation dir", $dir{install});
    $print_row->("genome sequence file", $outfile{genome});
    $print_row->("feature table", $outfile{features});
    $print_row->("synonyms", $outfile{synonyms});
}


################################################################
## Update configuration file
sub UpdateConfig {
  ## Read the taxonomy of the organism from the file
  ## genome/organism.tab of the installation directory, set the
  ## default limits of the upstream regions ($up_from, $up_to) when
  ## they have not been defined yet, then export the configuration in
  ## both formats (perl file and tab-delimited table).

  ### read taxonomy from the parsing result
  local $organism_table = $dir{install}."/genome/organism.tab";
  ($org_handle) = &OpenInputFile($organism_table);

  ## Column containing the taxonomy (1-based). The default value is
  ## used when the file header does not describe a taxonomy field.
  my $taxonomy_field = 2;
  while (my $line = <$org_handle>) {
    chomp $line;
    next unless ($line =~ /\S/); ## skip empty lines
    if ($line =~ /^-- field\s+(\d+)\s+taxonomy/) {
      ## Header line indicating the column of the taxonomy
      $taxonomy_field = $1;
    } elsif ($line =~ /^--/) {
      ## Other header/comment lines
      next;
    } else {
      ## Data line: first column is the ID, taxonomy is in the column
      ## indicated above
      @fields = split("\t", $line);
      $id = $fields[0];
      $taxonomy = $fields[$taxonomy_field - 1];
      &RSAT::message::Info (join("\t", "Parsed taxonomy from organism.tab", "Id",$id, "Taxonomy", $taxonomy)) if ($main::verbose >= 2);
    }
  }
  close $org_handle;


  #### default limits of upstream region for retrieve-seq
  unless (defined($up_from)) {
    if (defined($supported_organism{$organism_short_name}->{'up_from'})) {
      ## Re-use the value of the previous configuration
      $up_from = $supported_organism{$organism_short_name}->{'up_from'};
    } elsif ($taxonomy =~ /^(?:Bacteria|Archaea|Viruses)/) {
      $up_from = -400;
    } elsif ($taxonomy =~ /^Eukaryota;\s*Fungi/) {
      $up_from = -800;
    } elsif ($taxonomy =~ /^Eukaryota;\s*Metazoa/) {
      ## The space after the semicolon is optional, so that the test
      ## works with "Eukaryota; Metazoa" as well as "Eukaryota;Metazoa"
      $up_from = -2000;
    } else {
      $up_from = -1000;
    }
  }
  unless (defined($up_to)) {
    if (defined($supported_organism{$organism_short_name}->{'up_to'})) {
      $up_to = $supported_organism{$organism_short_name}->{'up_to'};
    } else {
      $up_to = -1;
    }
  }
  &RSAT::message::Info(join("\t", "Upstream region limits from", $up_from, "to", $up_to)) if ($main::verbose >= 2);

  ## 2009/05/13: The old perl config file should not be used anymore,
  ## but is still exported for the sake of backward compatibility
  &UpdateConfigPerl();

  ## Since 2009/05/13
  &UpdateConfigTab();
}

################################################################
## Update the perl config file
sub UpdateConfigPerl {
  ## Rewrite the perl configuration file: the lines of the previous
  ## file are kept, except those defining the current organism, and a
  ## new set of definitions is appended for this organism.
  ##
  ## The file to update is $RSA_LOCAL_CONFIG when $local_config is
  ## set, and $RSAT/data/supported_organisms.pl otherwise.

  ## When set to 1, the previous definitions of the organism are kept
  ## as comments instead of being discarded
  my $comment_previous_config = 0;

  if ($local_config) {
    $config_to_update = $ENV{'RSA_LOCAL_CONFIG'} ;
  } else {
    $config_to_update = "$ENV{RSAT}/data/supported_organisms.pl";
  }
  unless ((defined($config_to_update)) && ($config_to_update =~ /\S/)) {
    &RSAT::error::FatalError("The local configuration requires to define the environment variable RSA_LOCAL_CONFIG");
  }

  ## Check if the organism was already installed before
  if (defined($supported_organism{$organism_short_name}->{'genome'})) {
      &RSAT::message::Warning(join("\t", $organism_short_name, "already defined in the config file\n",
				   $config_to_update,
				   "\n\tprevious config will be commented")) if ($main::verbose >= 2);
  }

  #### read previous config
  ## The buffer is reset so that successive calls do not accumulate
  ## the content of the file. A missing file is not an error (it is
  ## created below).
  $previous_config = "";
  if (open(my $previous_handle, "<", $config_to_update)) {
    while (my $line = <$previous_handle>) {
      chomp $line;
      last if ($line =~ /return/); ## the final "return" is re-written below

      ## The organism name is quoted (\Q...\E) to be matched literally
      if (($line =~ /supported_organism\{\'\Q$organism_short_name\E\'\}/) && ($line !~ /^\#/)) {
	#### comment the previous config
	if ($comment_previous_config) {
	  $previous_config .= "# ${line} # reinstalled on\t${install_date}\n";
	}
      } else {
	$previous_config .= $line."\n";
      }
    }
    close $previous_handle;
  }

  #### build the new config for the organism
  my $new_org_config = 	 "\n#### $organism_short_name\t$organism_full_name\t$install_date\n";
  $new_org_config .= "\$supported_organism{'$organism_short_name'}->{'name'} = \"$organism_full_name\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'data'} = \"$dir{install}\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'last_update'} = \"".$install_date."\";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'source'} = \"$source\";\n";
  ## OLIVIER SAND SHOULD CHECK IF THIS RESTRICTION FOR ensembl IS STILL VALID
  unless ($source eq 'ensembl') {
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'features'} = \"$outfile{features}\";\n";
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'genome'} = \"$outfile{genome}\";\n";
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'seq_format'} = \"filelist\";\n";
  }
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'taxonomy'} = \"$taxonomy\";\n";
  if (defined($outfile{synonyms})) {
    $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'synonyms'} = \"$outfile{synonyms}\";\n";
  }
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'up_from'} = ".$up_from.";\n";
  $new_org_config .=  "\$supported_organism{'$organism_short_name'}->{'up_to'} = ".$up_to.";\n";

  #### Replace absolute paths by relative paths
  ## The RSAT path is quoted (\Q...\E) to be matched literally, and
  ## the substitution is skipped when the variable is empty (an empty
  ## pattern would re-use the last successful pattern).
  if ((defined($ENV{RSAT})) && (length($ENV{RSAT}) > 0)) {
    $new_org_config =~ s|\Q$ENV{RSAT}\E|\$ENV\{RSAT\}\/|g;
  }
  $new_org_config =~ s|\/\/|/|g;

  #### write new config
  &RSAT::message::Info ("Updating supported organisms", $config_to_update) if ($verbose >= 1);

  ## "or" (low precedence) is required here: with "||" the test would
  ## apply to the file name rather than to the result of open, and a
  ## failure would remain unnoticed.
  open(my $config_handle, ">", $config_to_update)
    or die "Error: cannot write config file $config_to_update ($!)\n";
  print $config_handle $previous_config;
  print $config_handle $new_org_config;
  print $config_handle "\nreturn 1;\n";
  close($config_handle)
    or die "Error: cannot close config file $config_to_update ($!)\n";
}

################################################################
## Update the tab-delimited file 
sub UpdateConfigTab {
  ## Update the entry of the current organism in %supported_organism,
  ## then export the table of supported organisms to the file
  ## $main::config_table.
  ##
  ## Arguments: optional list of key => value pairs (name, data,
  ## last_update, source, features, genome, seq_format, taxonomy,
  ## synonyms, up_to, up_from). Each value overrides the corresponding
  ## global variable of the script.
  my %args = @_;

  ## Entry of the organism (created if it does not exist yet)
  my $org_config = ($supported_organism{$organism_short_name} ||= {});

  ## Update the hash variable for the new organism
  $org_config->{'name'} = $args{name} || $main::organism_full_name;
  $org_config->{'data'} = $args{data} || $dir{install};
  $org_config->{'last_update'} = $args{last_update} ||  $main::install_date;
  $org_config->{'source'} = $args{source} || $main::source;
  ## OLIVIER SAND SHOULD CHECK IF THIS RESTRICTION FOR ensembl IS STILL VALID
  unless ($source eq 'ensembl') {
    $org_config->{'features'} = $args{features} || $main::outfile{features};
    $org_config->{'genome'} = $args{genome} || $main::outfile{genome};
    $org_config->{'seq_format'} = $args{seq_format} || "filelist";
  }
  $org_config->{'taxonomy'} = $args{taxonomy} || $main::taxonomy;
  if (defined($main::outfile{synonyms})) {
    $org_config->{'synonyms'} = $args{synonyms} || $main::outfile{synonyms};
  }

  ## Upstream limits are numbers: 0 is a valid explicit value, so the
  ## argument is only ignored when it is undefined or empty (a simple
  ## "||" would discard 0).
  foreach my $limit ('up_to', 'up_from') {
    if ((defined($args{$limit})) && ($args{$limit} ne "")) {
      $org_config->{$limit} = $args{$limit};
    } else {
      $org_config->{$limit} = ($limit eq 'up_to') ? $main::up_to : $main::up_from;
    }
  }

  ## Export the updated table of supported organisms
  &RSAT::OrganismManager::export_supported_organisms($main::config_table);
}

################################################################
## Create a directory for the taxonomic group of the organism
sub CreatePhylogeny {
    ## Read the taxonomy of the organism from genome/organism.tab,
    ## convert it into a directory path below $RSAT/data/phylogeny
    ## (one sub-directory per taxonomic level) and create, in this
    ## directory, a symbolic link to the installation directory.
    my $taxonomy = "";

    ## Column containing the taxonomy (1-based), read from the header
    my $taxonomy_field;

    ### read taxonomy from the parsing result
    $organism_table = $dir{install}."/genome/organism.tab";

    ($org_handle) = &OpenInputFile($organism_table);
    while (my $line = <$org_handle>) {
	chomp $line;

	## Skip empty lines: they would otherwise be treated as data
	## lines and erase a taxonomy parsed from a previous line
	next unless ($line =~ /\S/);

	if ($line =~ /^-- field\s+(\d+)\s+taxonomy/) {
	    ## Header line indicating the column of the taxonomy
	    $taxonomy_field = $1;
	} elsif ($line =~ /^--/) {
	    ## Other header/comment lines
	    next;
	} elsif (defined($taxonomy_field)) {
	    @fields = split("\t", $line);
	    $taxonomy = $fields[$taxonomy_field - 1];
	    &RSAT::message::Info ("Taxonomy\t$taxonomy\n") if ($main::verbose >= 2);
	} else {
	    &Warning("Cannot read taxonomy in file $organism_table\n");
	}
    }
    close $org_handle;

    if ($taxonomy) {
	$taxonomy = &trim($taxonomy);
	$taxonomy =~ s|\s*;\s*|/|g; ## Each taxonomic level becomes a subdirectory
	$taxonomy =~ s|\s+|_|g; ## I prefer to avoid spaces in directory names
	$taxonomy =~ s|\(|_|g; ## this character cannot be used for a directory name
	$taxonomy =~ s|\)|_|g; ## this character cannot be used for a directory name
	$taxonomy =~ s|\,|.|g; ## Not fatal, but usually not found in folder names.
	$taxonomy =~ s|\:|.|g; ## Not fatal, but usually not found in folder names.
	$dir{taxonomy} = $ENV{RSAT}."/data/phylogeny/".$taxonomy;
	$dir{taxonomy} =~ s|//|/|g;
	my ($org_dir) = &ShortFileName($dir{install});
	&RSAT::util::CheckOutDir($dir{taxonomy});
	if ($main::verbose >= 2) {
	    &RSAT::message::Info("Taxonomy directory", $dir{taxonomy});
	    &RSAT::message::Info("Organism directory", $org_dir);
	    &RSAT::message::Info("Link to directory", $dir{install});
	}

	## Replace the link to the installation directory. The option
	## -f avoids an error message when the link does not exist yet
	## (first installation).
	&doit("cd $dir{taxonomy}; rm -f $org_dir; ln -s $dir{install} .",0,0,$verbose);
    } else {
	&RSAT::error::FatalError("Cannot identify taxonomy in table ".$organism_table);
    }

}

################################################################
### extract the non-redudant set of intergenic and gene sequences
sub IntergenicSegments {
    ## Run coding-or-not for the organism from the genome directory,
    ## then post-process the resulting gene and intergenic segment
    ## files (length distribution, compression, optional purge).
    chdir $dir{genome};

    #### retrieve intergenic sequences
    my $verbosity_option = ($verbose >= 1) ? "-v " : "";
    my $segment_command = "coding-or-not ".$verbosity_option."-org $organism_short_name -return ncs,cs,pos,seq,stats \n\n";
    &doit($segment_command, $dry_run, $die_on_error, $verbose);

    for my $segment_type ("gene", "intergenic") {
	my $segment_file = "$dir{genome}/${organism_short_name}_${segment_type}_segments.wc";

	#### draw sequence length distributions
	if ($task{seq_len_distrib}) {
	    &SeqLengthDistribution($segment_file, "wc", $segment_type, 50);
	}

	#### compress sequence file
	&doit("gzip -f $segment_file", $dry_run, $die_on_error, $verbose);

	#### purge sequences
	## NOTE(review): the purge runs after the file has been
	## compressed, and receives the uncompressed file name --
	## confirm this order before re-activating $purged_frequencies.
	if ($purged_frequencies) {
	    &PurgeSequences($segment_file);
	}
    }

    chdir $dir{main};
}


################################################################
### extract the complete set of upstream sequences
sub AllUpstream {
    ## Retrieve all the upstream sequences of the organism with
    ## retrieve-seq, for each masking mode, with and without the
    ## option -noorf, then post-process each resulting file.
    if ($verbose >= 1) {
	warn "; Retrieving all upstream sequences\n";
    }

    my $upstream_format = "fasta";
    for my $masking_option (@masking_modes) {
	for my $noorf_option ("", "-noorf") {
	    my $upstream_type = "upstream".$noorf_option.$masking_option;
	    my $upstream_file = "$dir{genome}/${organism_short_name}_${upstream_type}.fasta";

	    #### retrieve upstream sequences
	    my $retrieve_command = join("",
					"retrieve-seq ${noorf_option} ${masking_option} -format $upstream_format",
					" -type upstream -all -org $organism_short_name ",
					" -all -o $upstream_file\n\n");
	    &doit($retrieve_command, $dry_run, $die_on_error, $verbose);

	    #### draw sequence length distributions (only with -noorf)
	    if (($noorf_option) && ($task{seq_len_distrib})) {
		&SeqLengthDistribution($upstream_file, $upstream_format, $upstream_type, 50);
	    }

	    #### compress sequence file
	    &doit("gzip -f $upstream_file", $dry_run, $die_on_error, $verbose);

	    #### purge sequences
	    ## NOTE(review): the purge runs after the file has been
	    ## compressed -- confirm this order before re-activating
	    ## $purged_frequencies.
	    if ($purged_frequencies) {
		&PurgeSequences($upstream_file);
	    }
	}
    }

    chdir $dir{main};
}
    
################################################################
#### draw an histogram of sequence lengths
sub SeqLengthDistribution {
    ## Compute the lengths of the sequences of a file, their
    ## distribution by class intervals, and draw the distribution.
    ##
    ## Arguments:
    ##   $seq_file  sequence file
    ##   $format    sequence format (default: fasta)
    ##   $seq_type  sequence type, used in file names and in the title
    ##   $ci        class interval for classfreq (default: 50)
    my ($seq_file, $format, $seq_type, $ci) = @_;
    $ci ||= 50;
    $format ||= "fasta";
    my $classfreq_from = 0;
    chdir $dir{genome};

    if ($main::verbose >= 2) {
	my $pwd = `pwd`;
	chomp $pwd;
	&RSAT::message::TimeWarn(join("\t", "SeqLengthDistribution", $seq_file, $seq_type, "Working dir", $pwd));
    }

    ## Common prefix of the result files (table and figure)
    my $lengths_prefix = "${organism_short_name}_${seq_type}_segments_lengths";

    ## The X axis is logarithmic for gene and intergenic segments
    my $log_axis = (($seq_type eq "intergenic") || ($seq_type eq "gene")) ? " -xlog 2" : "";

    ## NOTE(review): the table of lengths is read by cut and written
    ## by classfreq within the same pipeline, under the same file name
    ## -- confirm that this cannot truncate the input before it is read.
    my $command = join("",
		       "sequence-lengths -i $seq_file -format ", $format,
		       " -o ${lengths_prefix}.tab",
		       "; cut -f 2 ${lengths_prefix}.tab ",
		       "| classfreq -ci $ci -v -from $classfreq_from",
		       " -o ${lengths_prefix}.tab",
		       "; XYgraph -i ${lengths_prefix}.tab",
		       " -xcol 3 -ycol 7,8,9 -lines -xmin $classfreq_from",
		       " -ymin 0 -ymax 1  -xsize 600 -ysize 400 -legend",
		       " -xgtask1 100 -xgtask2 250 -ygtask1 0.1",
		       $log_axis,
		       " -xleg1 'sequence length (bp)'",
		       " -yleg1 'frequency'",
		       " -title1 '", $organism_full_name, "'",
		       " -title2 'length distribution of ${seq_type} sequences'",
		       " -format ", $img_format,
		       " -o ${lengths_prefix}.", $img_format);
    &doit($command, $dry_run, $die_on_error, $verbose);
}

################################################################
#### purge sequences
sub PurgeSequences {
    ## Run purge-sequence on a sequence file, then compress the result.
    ##
    ## Arguments:
    ##   $seq_file  sequence file to purge
    ##   $format    extension of the sequence file (default: fasta)
    ##
    ## The purged file is written in the current directory, under the
    ## name <basename of $seq_file without ".$format">_purged.$format
    my ($seq_file, $format) = @_;
    $format = "fasta" unless ($format);

    ## Build the name of the purged file in pure Perl. Calling the
    ## shell command basename through backticks left a newline (and
    ## the dot of the extension) in the middle of the file name.
    my $purged_seq_file = $seq_file;
    $purged_seq_file =~ s|^.*/||;            ## suppress the directory
    $purged_seq_file =~ s/\.\Q$format\E$//;  ## suppress the extension
    $purged_seq_file .= "_purged.$format";

    ## NOTE(review): the input format is always "wc", whatever the
    ## value of $format -- confirm that this is intended.
    my $command = "purge-sequence -i $seq_file -format wc -ml 300 -mis 9 -2str -o $purged_seq_file";
    &doit($command, $dry_run, $die_on_error, $verbose);

    #### compress purged sequence file
    $command = "gzip -f $purged_seq_file";
    &doit($command, $dry_run, $die_on_error, $verbose);
}

################################################################
### calculate oligo and dyad frequencies in different sequence types :
### - intergenic
### - upstream
### - genomic
### 
sub CalcFrequencies {
    ## Dispatch the calculation of oligo, distribution and dyad
    ## frequencies over the selected sequence types (upstream,
    ## ensembl, intergenic, genome, gene, protein).
    ##
    ## The package variables $seq_type, $seq_file and $seq_format are
    ## updated for each sequence type. The commands are run from the
    ## oligo-frequencies directory.
    chdir $dir{oligos};

    ################################################################
    # upstream sequences, for each masking mode, without and with ORFs
    if ($task{upstream_freq}) {
	for my $masking (@masking_modes) {
	    for my $noorf ("-noorf", "") {
		$seq_type = "upstream${noorf}${masking}";
		$seq_format = "fasta";
		$seq_file = $purged_frequencies
		    ? "${organism_short_name}_${seq_type}_purged.fasta"
		    : "${organism_short_name}_${seq_type}.fasta";
		my $upstream_path = $dir{genome}."/".$seq_file;

		&RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies") if ($main::verbose >= 1);
		if ($task{oligos}) {
		    &CalcOligoFreq($upstream_path, $seq_format, $seq_type);
		}
		## occurrence distributions are only computed without -noorf
		if ($task{distrib} && !($noorf)) {
		    &CalcOligoDistrib($upstream_path, $seq_format, $seq_type);
		}
		if ($task{dyads}) {
		    &CalcDyadFreq($upstream_path, $seq_format, $seq_type);
		}
	    }
	}
    }

    ################################################################
    # ensembl sequences (only oligo frequencies of "utr" sequences
    # with the option -maskcoding)
    if ($task{ensembl_freq}) {

	## Recover the upstream limits of a previous configuration
	if (!defined($up_from)
	    && defined($supported_organism{$organism_short_name}->{'up_from'})) {
	    $up_from = $supported_organism{$organism_short_name}->{'up_from'};
	}
	if (!defined($up_to)
	    && defined($supported_organism{$organism_short_name}->{'up_to'})) {
	    $up_to = $supported_organism{$organism_short_name}->{'up_to'};
	}

	my $maskcoding = "-maskcoding";
	my $type = "utr";
	for my $masking (@masking_modes) {
	    $seq_type = "${type}${maskcoding}${masking}";
	    $seq_file = "${organism_short_name}_${seq_type}.fasta";
	    $seq_format = "fasta";
	    &RSAT::message::TimeWarn("Calculating upstream oligo and dyad frequencies") if ($main::verbose >= 1);
	    if ($task{oligos}) {
		&CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type);
	    }
	}
    }

    ################################################################
    # intergenic segments
    if ($task{intergenic_freq}) {
	$seq_type = "intergenic";
	&RSAT::message::TimeWarn("Calculating ${seq_type} oligo and dyad frequencies") if ($main::verbose >= 1);
	($seq_file, $seq_format) = $purged_frequencies
	    ? ("${organism_short_name}_intergenic_segments_purged.fasta", "fasta")
	    : ("${organism_short_name}_intergenic_segments.wc", "wc");
	my $intergenic_path = $dir{genome}."/".$seq_file;
	if ($task{oligos}) {
	    &CalcOligoFreq($intergenic_path, $seq_format, $seq_type);
	}
	if ($task{dyads}) {
	    &CalcDyadFreq($intergenic_path, $seq_format, $seq_type);
	}
    }

    ################################################################
    # full genome (list of contig files)
    if ($task{genome_freq}) {
	$seq_type = "genomic";
	$seq_format = "filelist";
	$seq_file = $outfile{'genome'};
	if ($task{oligos}) {
	    &CalcOligoFreq($seq_file, $seq_format, $seq_type);
	}
	if ($task{dyads}) {
	    &CalcDyadFreq($seq_file, $seq_format, $seq_type);
	}
    }

    ################################################################
    # gene segments
    if ($task{gene_freq}) {
	$seq_type = "gene";
	($seq_file, $seq_format) = $purged_frequencies
	    ? ("${organism_short_name}_gene_segments_purged.fasta", "fasta")
	    : ("${organism_short_name}_gene_segments.wc", "wc");
	my $gene_path = $dir{genome}."/".$seq_file;
	if ($task{oligos}) {
	    &CalcOligoFreq($gene_path, $seq_format, $seq_type);
	}
	if ($task{dyads}) {
	    &CalcDyadFreq($gene_path, $seq_format, $seq_type);
	}
    }

    ################################################################
    # protein sequences (oligopeptide frequencies only)
    if ($task{protein_freq}) {
	$seq_type = "protein";
	$seq_file = "${organism_short_name}_aa.fasta";
	$seq_format = "fasta";
	if ($task{oligos}) {
	    &CalcOligoFreq($dir{genome}."/".$seq_file, $seq_format, $seq_type);
	}
    }

    chdir $dir{main};
}

################################################################
## Calculate oligonucleotide frequencies for a specified sequence
## file
sub CalcOligoFreq {
  ## Run oligo-analysis on a sequence file for each combination of
  ## overlap mode, strand option and oligo length, and compress each
  ## result file.
  ##
  ## Arguments:
  ##   $seq_file    sequence file
  ##   $seq_format  sequence format
  ##   $seq_type    sequence type; "protein" selects peptide
  ##                parameters (lengths 1 to 3, no strand option)
  my ($seq_file,$seq_format,$seq_type) = @_;

  ## Parameters depending on the type of residue
  my $is_protein = ($seq_type eq "protein");
  my @lengths = $is_protein ? (1..3) : @oligo_lengths;
  my $oligo_seq_type = $is_protein ? "prot" : "dna";
  my $residue_type = $is_protein ? "pept" : "nt";
  my @strand_options = $is_protein ? ("") : ("-1str", "-2str");

  for my $overlap_option ("-noov", "-ovlp") {
    for my $strand_option (@strand_options) {
      for my $length (@lengths) {
	&RSAT::message::TimeWarn("Calculating oligo frequencies",$seq_file, $seq_format, $seq_type, "l=".$length, $overlap_option, $strand_option) if ($main::verbose >= 1);

	## Job prefix and output file (package variables)
	$job_prefix = "${organism_short_name}_oligo_${length}";
	$out_file = "$dir{oligos}/${length}${residue_type}_${seq_type}_${organism_short_name}${overlap_option}${strand_option}.freq";

	## NOTE(review): the strand option appears twice in the
	## command, and "-type dna" is also passed for protein
	## sequences -- confirm against the options of oligo-analysis.
	my $command = join("",
			   "$SCRIPTS/oligo-analysis -v 1 ${strand_option} -i $seq_file -format $seq_format ",
			   " ", $strand_option,
			   " -seqtype ", $oligo_seq_type,
			   " ", $overlap_option,
			   " -l ", $length, " -type dna ",
			   " -return freq,occ",
			   " -o ", $out_file,
			   "; gzip -f ", $out_file);
	&doit($command, $dry_run, $die_on_error, $verbose, $batch, $job_prefix);
      }
    }
  }
}

################################################################
#### calculate oligonucleotide distribution for a specified sequence
#### file
sub CalcOligoDistrib {
  ## For each combination of overlap mode, strand option and oligo
  ## length, compute the occurrence distributions with oligo-analysis,
  ## then fit a negbin and a Poisson distribution on the result with
  ## fit-distribution. Each result file is compressed.
  ##
  ## Arguments: sequence file, sequence format, sequence type
  my ($seq_file,$seq_format,$seq_type) = @_;
  @strands = ("-1str", "-2str");
  for my $overlap_option ("-noov", "-ovlp") {
    for my $strand_option (@strands) {
      for my $length (@oligo_lengths) {
	$job_prefix = "${organism_short_name}_oligo_${length}";

	## Common prefix of the distribution and fitting files
	my $file_prefix = "$dir{oligos}/${length}nt_${seq_type}_${organism_short_name}${overlap_option}${strand_option}";

	#### Calculate occurrence distributions in the sequence file
	my $distrib_file = $file_prefix."_distrib.tab";
	my $distrib_command = join("",
				   "$SCRIPTS/oligo-analysis -v 1 ${strand_option} -i $seq_file -format $seq_format ",
				   " ", $strand_option,
				   " ", $overlap_option,
				   " -l ", $length, " -type dna",
				   " -return occ -distrib",
				   " -o ", $distrib_file,
				   " ; gzip -f ", $distrib_file);
	&doit($distrib_command, $dry_run, $die_on_error, $verbose, $batch, $job_prefix);

	#### Fit a Poisson and a negbin on occurrence distribution
	for my $theor ("negbin", "poisson") {
	  my $fitting_file = $file_prefix."_${theor}.tab";
	  my $fitting_command = join("",
				     "fit-distribution -v 1 -i ", $distrib_file,
				     " -o ", $fitting_file,
				     " -distrib ", $theor,
				     " ; gzip -f ", $fitting_file);
	  &doit($fitting_command, $dry_run, $die_on_error, $verbose);
	}
      }
    }
  }
}


################################################################
#### calculate dyad frequencies for a specified sequence
#### file
sub CalcDyadFreq {
  ## Run dyad-analysis on a sequence file for each combination of
  ## overlap mode, strand option and monad length (spacings from 0 to
  ## 20), and compress each result file.
  ##
  ## Arguments: sequence file, sequence format, sequence type
  my ($seq_file,$seq_format,$seq_type) = @_;

  ## Parameters of the analysis (package variables)
  ($min_spacing, $max_spacing) = (0, 20);
  @monad_lengths = (3,2,1);
  @strands = ("-1str", "-2str");

  for my $overlap_option ("-noov", "-ovlp") {
    for my $strand_option (@strands) {
      for my $monad_length (@monad_lengths) {
	&RSAT::message::TimeWarn("Calculating dyad frequencies",$seq_file, $seq_format, $seq_type, "l=".$monad_length, $overlap_option, $strand_option) if ($main::verbose >= 1);

	## Job prefix and output file (package variables). The output
	## file has no directory: it is written in the working directory.
	$job_prefix = "${organism_short_name}_dyad_${monad_length}";
	$dyad_file = "dyads_${monad_length}nt_sp${min_spacing}-${max_spacing}_${seq_type}_${organism_short_name}${overlap_option}${strand_option}".".freq";

	my $command = join("",
			   "$SCRIPTS/dyad-analysis -v 1 -i $seq_file -format $seq_format",
			   " -timeout 240000 ",
			   " -type any -seqtype dna",
			   " $strand_option",
			   " $overlap_option",
			   " -sp ${min_spacing}-${max_spacing}",
			   " -l $monad_length",
			   " -return freq,occ",
			   " -o $dyad_file",
			   "; gzip -f $dyad_file ");
	&doit($command, $dry_run, $die_on_error, $verbose, $batch, $job_prefix);
      }
    }
  }
}


################################################################
#### display full help message
##
## Prints the complete manual of install-organism, then exits with
## status 0 (this sub never returns).
##
## The text is sent through the pager "more". If the pager cannot be
## started, the text is printed on STDOUT instead of being lost.
##
## The here-document is interpolated: $supported_tasks and $ENV{RSAT}
## are deliberately expanded, whereas literal dollar signs and at-signs
## of the text must be escaped with a backslash.
sub PrintHelp {
    my $help_handle;
    my $paged = open($help_handle, "|-", "more");
    $help_handle = \*STDOUT unless ($paged);
    print $help_handle <<End_of_help;
NAME
	install-organism

AUTHOR
        Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

USAGE
        install-organism -org organism_name

DESCRIPTION
	Add support for an organism in RSA-tools.

	This script is a task manager, which (depending on the
	selected tasks) manages different steps necessary for the
	installation of an organism from the NCBI flat files :

	- parse the .gbk files

	- add the organism in the config file

	- calculate trinucleotide frequencies in the start and stop
          codons (a way to check consistency of the gene locations)

	- calculates oligonucleotide and dyad frequencies

CATEGORY
	Data management.

OPTIONS
	-h	(must be first argument) display full help message
	-help	(must be first argument) display options
	-v	verbose
	-n	dry run (print commands without executing them)

    MANDATORY ARGUMENTS
	-org	organism name without spaces 
	        (e.g. Saccharomyces_cerevisiae)


    OPTIONAL ARGUMENTS
	-organism
		Full name of the organism 
		(e.g. 'Saccharomyces cerevisiae').

	-source	data source

	-dir dir{install}
		Absolute path of the installation directory. 
		BEWARE : you should provide the absolute path of the
		installation directory, not the relative path.

	-batch  run some tasks (for example the calibration of oligos
		and dyads) in batch mode.  This option works on our
		lab cluster, but could be adapted for other
		configurations by adapting the method &doit() in the
		utilities (\$RSAT/lib/RSA.lib).

	-config
		Specify an alternative organism configuration file for the
		genome to be installed.

		By default, the organism configuration file is 
			   \$RSAT/data/genomes/supported_organisms.pl

	-local
		Absolute path of a RSA local config file.

		By default, the newly installed organism is added to
		the main RSA config file (provided the user has write
		access to the RSA config file).

		In addition to the organisms installed by the RSAT
		system administrator (found in
		$ENV{RSAT}/RSA.config), users can install some
		organisms locally.

		For this, the user must first define an environment
		variable called RSA_LOCAL_CONFIG, and indicating the
		absolute path of the local config file.  
		E.g.  
		  export RSA_LOCAL_CONFIG=/home/fred/RSA.local.config

		When install-organism is called with the option
		-local, the new organism is added to the file
		indicated by the environment variable RSA_LOCAL_CONFIG
		rather than the main RSA config file.

	-syn	synonym table
		A tab-delimited file containing two columns. The
		first column contains a gene ID, the second a gene
		name.
	-up_from distal limit of the upstream regions (e.g. -800 for yeast)
	-up_to	proximal limit of the upstream regions (e.g. -1)
	-genbank
		genbank directory

		A directory containing a mirror of the NCBI genbank
		genome directory:
		       http://ftp.ncbi.nih.gov/genomes
		Normally, the genbank directory is specified by
		defining a global variable GENBANK_DIR in the config
		file. The option -genbank allows to overwrite this
		value.

	-prefid feattype idname
	        passed to parse-genbank.pl

	-date last_update

	        Force the 'last_update' attribute to a given date. 

		This option is used by download-organism to ensure
		that the local genome has the same installation date
		as the server, rather than using the date of download
		as update date.

	-ensembl
		ENSEMBL directory. Directory containing the ENSEMBL
		flat files in Genbank format (ext .dat)

		Example: 
		ftp.ensembl.org/pub/current_worm/data/flatfiles/genbank

	-task	specification of a single installation task
		    e.g.
			install-organism -task dyads
		supported tasks: $supported_tasks

		Description of the tasks
		------------------------
		genome	     format genome sequence
			     (obsolete)

		features     prepare feature table
			     (obsolete)

		config	     update configuration file

		start_stop   
			     calculate start and stop codon
			     frequencies

		allup	     retrieve all upstream sequences

		genome_segments
			     retrieve sequences and limits of genome segments
			     (intergenic, genic)

		oligos	     calculate oligonucleotide frequencies

			     This task requires to specify, in
			     addition, the type(s) of sequences for
			     which oligo frequencies have to be
			     calculated (upstream_freq,
			     intergenic_freq, genome_freq).

		dyads	     calculate dyad frequencies

			     This task requires to specify, in
			     addition, the type(s) of sequences for
			     which dyad frequencies have to be
			     calculated (upstream_freq,
			     intergenic_freq, genome_freq).

		ncf	     calculate oligo and dyad frequencies in
			     intergenic segments

		upstream_freq
			     calculate oligo and dyad frequencies for
			     all upstream sequences

		intergenic_freq
			     calculate oligo and dyad frequencies for
			     all intergenic sequences

		genome_freq  
			     calculate oligo and dyad frequencies for
			     the whole genome sequence. This is not
			     recommended for higher organisms, where
			     the genome represents several Gigabases,
			     and the computation of all oligo and dyad
			     frequencies might take ages.

		clean	     remove unnecessary sequence files

	-rm	calibrate oligo and dyad frequencies on repeat masked
		sequences, in addition to the non-masked sequences.

	-img_format
		image format for the graphs of sequence length distribution

SEE ALSO

    download-organism

        The program I<install-organism> performs all the formatting
	and calibration tasks for importing genomes from the reference
	databases (NCBI, EMBL) to RSAT.

	The program I<download-organism> transfers the RSAT-formatted
	genomes from a RSAT server. 

 	If a genome is available on the RSAT server, it is recommended
	to use download-organism in order to obtain it immediately in
	the RSAT format, rather than install-organism.


End_of_help
    ## Closing the pipe waits for the pager to terminate
    close $help_handle if ($paged);
    exit(0);
}

################################################################
#### Display short help message
##
## Prints the one-line-per-option summary, then exits (this sub never
## returns). The text is sent through the pager "more"; if the pager
## cannot be started, it is printed on STDOUT instead of being lost.
## $supported_tasks is interpolated in the here-document.
sub PrintOptions {
  my $help_handle;
  my $paged = open($help_handle, "|-", "more");
  $help_handle = \*STDOUT unless ($paged);
  print $help_handle <<End_short_help;
install-organism options
------------------------
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-v		verbose
-n		dry run (print commands without executing them)
-org		organism name without spaces (e.g. Saccharomyces_cerevisiae);
-organism	full organism name (e.g. Saccharomyces cerevisiae)
-source		data source (e.g. ncbi);
-dir		absolute path of the installation directory
-batch  	run some tasks (for example the calibration of oligos and dyads) in batch mode.
-config		alternative organism configuration file
-local		update local config file 
		(specified by the environment variable RSA_LOCAL_CONFIG)
-genbank	genbank directory
-ensembl	ensembl directory
-task		installation task ($supported_tasks)
-rm		calibrate oligo and dyad frequencies on repeat masked sequences
-syn		synonym table
-up_from       	distal limit of the upstream regions (e.g. -800 for yeast)
-up_to		proximal limit of the upstream regions (e.g. -1)
-prefid feattype idname     passed to parse-genbank.pl
-date 		 force last_update to a given date (for synchro between server and local installation)
-img_format	 image format for the graphs of sequence length distribution
End_short_help
  ## Closing the pipe waits for the pager to terminate
  close $help_handle if ($paged);
  exit;
}

################################################################
#### Read arguments
##
## Scans @ARGV and stores the options in package variables ($verbose,
## $dry_run, $source, $organism_short_name, $organism_full_name,
## $up_from, $up_to, $parse_options, $force_date, $batch, $img_format,
## $local_config, $outputfile, %infile, %dir, %task, @masking_modes).
##
## Every position of @ARGV is tested as a potential option name, and
## the value of an option is read from the following position. Option
## values are thus also compared to the option names, which is why the
## patterns below must be anchored at the start of the argument.
##
## Options -h and -help print the help and exit. Invalid values for
## -up_from, -up_to and -task raise a fatal error.
sub ReadArguments {
    foreach my $i (0..$#ARGV) {
        my $option = $ARGV[$i];
        my $value = $ARGV[$i+1]; ## undef when the option is the last argument

        ### verbose ###
        if ($option eq "-v") {
            ## The verbosity level is optional (default: 1)
            if (&IsNatural($value)) {
                $verbose = $value;
            } else {
                $verbose = 1;
            }

            #### dry run
        } elsif ($option eq "-n") {
            $dry_run = 1;
            $verbose = 1;

            ### detailed help
        } elsif ($option eq "-h") {
            &PrintHelp();

            ### list of options
        } elsif ($option eq "-help") {
            &PrintOptions();

            #### output file
        } elsif ($option eq "-o") {
            $outputfile = $value;

            ### data source
        } elsif ($option eq "-source") {
            $source = $value;

            ### organism
        } elsif ($option eq "-org") {
            $organism_short_name = $value;
        } elsif ($option eq "-organism") {
            $organism_full_name = $value;

            #### synonyms
            ## The pattern is anchored: an unanchored match would also
            ## catch any option value containing the string "-syn"
            ## (e.g. an organism name).
        } elsif ($option =~ /^-syn/) {
            $infile{synonyms} = $value;

            #### Specify the limits of upstream regions
        } elsif ($option eq "-up_from") {
            $up_from = $value;
            &FatalError(join ("\t", $up_from, "Invalid value for the up_from parameter (must be integer)")) unless (&IsInteger($up_from));
            &FatalError(join ("\t", $up_from, "Invalid value for the up_from parameter (must be negative)")) if ($up_from >= 0);

        } elsif ($option eq "-up_to") {
            $up_to = $value;
            &FatalError(join ("\t", $up_to, "Invalid value for the to parameter (must be integer)")) unless (&IsInteger($up_to));

            #### preferred IDs (takes two values, passed to parse-genbank.pl)
        } elsif ($option eq "-prefid") {
            $parse_options .= join(" " , " -prefid", $value, $ARGV[$i+2]);

        } elsif ($option eq "-date") {
            $force_date = $value;

            #### installation dir
        } elsif ($option =~ /^-dir/) {
            $dir{install} = $value;

            #### Genbank dir
        } elsif ($option =~ /^-genbank/) {
            $dir{genbank} = $value;

            #### ENSEMBL dir
        } elsif ($option =~ /^-ensembl/) {
            $dir{ensembl} = $value;

            #### Batch mode
        } elsif ($option eq "-batch") {
            $batch = 1;

            #### Masking modes
        } elsif ($option eq "-rm") {
            push @masking_modes, "-rm";

            #### task selection (comma-separated list, -step is a synonym)
        } elsif (($option =~ /^-task/)
                 || ($option =~ /^-step/)) {
            my @requested_tasks = split ",", $value;
            foreach my $task (@requested_tasks) {
                next unless $task;
                if ($supported_task{$task}) {
                    $task{$task} = 1;
                } else {
                    &RSAT::error::FatalError("Unsupported task '$task'. \n\tSupported: $supported_tasks");
                }
            }

            ### image format
        } elsif ($option eq "-img_format") {
            $img_format = lc($value);

            #### local configuration file specified with an environment variable
        } elsif ($option =~ /^-local/) {
            unless ($ENV{'RSA_LOCAL_CONFIG'}) {
                die "Error : local config file must be specified \nin an environment variable RSA_LOCAL_CONFIG\n";
            }
            $local_config = 1;

            #### alternative configuration file
            ## The file is transmitted via the environment variable
            ## RSA_LOCAL_CONFIG, as for the option -local.
        } elsif ($option =~ /^-config/) {
            unless ($value) {
                die "Error : the option -config must be followed by the path of a config file\n";
            }
            $ENV{'RSA_LOCAL_CONFIG'} = $value;
            $local_config = 1;

        }
    }
}

################################################################
#### Retrieve the start and stop codons of all CDS and count the
#### trinucleotide occurrences (consistency check of gene locations).
##
## Two shell commands are passed to &doit(), first for the start
## codons, then for the stop codons. Each command runs retrieve-seq,
## stores the codons in <genome dir>/<organism>_<start|stop>_codons.wc,
## then runs oligo-analysis on that file to produce
## <genome dir>/<organism>_<start|stop>_codon_frequencies.
##
## Reads the globals $organism_short_name, $dir{genome}, $dry_run,
## $die_on_error and $verbose.
sub StartAndStopCodons {
    my $label_fields = join(",", qw(id ctg reg_left reg_right orf_strand));
    my $file_prefix = "$dir{genome}/${organism_short_name}";

    ## Separators between the pieces of the command (the number of
    ## spaces is kept identical to the historical command lines).
    my $gap = " " x 3;
    my $short_gap = " " x 2;

    ## Sequence type and limits for each kind of codon
    my @codon_specs = (
        ["start", "-type upstream -feattype CDS -from 0 -to 2"],
        ["stop", "-type downstream".$short_gap."-feattype CDS -from 0 -to -2"],
    );

    foreach my $spec (@codon_specs) {
        my ($codon, $location_options) = @{$spec};
        my $codon_file = "${file_prefix}_${codon}_codons.wc";
        my $frequency_file = "${file_prefix}_${codon}_codon_frequencies";
        my $codon_command = join($gap,
            "retrieve-seq -v -org ${organism_short_name} -all",
            $location_options,
            "-format wc -nocomment -label ${label_fields}",
            "-o ${codon_file} ; oligo-analysis -type dna -l 3 -return occ,freq -v -format wc -1str -sort",
            "-i ${codon_file} -o ${frequency_file}");
        &doit($codon_command, $dry_run, $die_on_error, $verbose);
    }

}
# ################################################################
# #### retrieve start and stop codons and calculate word occurrences
# #### (for checking)
# sub StartAndStopCodons {
#     my $command = "retrieve-seq -v -org ${organism_short_name} -all  \ ";
#     $command .= "-type downstream -from 0 -to -2  \ ";
#     $command .= "-format wc -nocomment -label orf \ ";
#     $command .= " | oligo-analysis -type dna -l 3 -return occ,freq -v -format wc -1str -sort \ ";
#     $command .= " -o $dir{genome}/${organism_short_name}_stop_codon_frequencies";
#     &doit($command, $dry_run, $die_on_error, $verbose);

#     $command = "retrieve-seq -v -org ${organism_short_name} -all  \ ";
#     $command .= "-type upstream -from 0 -to 2  \ ";
#     $command .= "-format wc -nocomment -label orf \ ";
#     $command .= " | oligo-analysis -type dna -l 3 -return occ,freq -v -format wc -1str -sort \ ";
#     $command .= " -o $dir{genome}/${organism_short_name}_start_codon_frequencies";
#     &doit($command, $dry_run, $die_on_error, $verbose);
# }



# ################################################################
# #  Generates a RSAT feature table (.ftt) 
# #  from a NCBI feature table (.ptt)
# #
# #  usage
# #    &ParseNcbiFeatures($ptt_file);
# sub ParseNcbiFeatures {
#     my ($ptt) = @_;
    
#     open PTT, $ptt || die "Error : cannot read ppt file $ptt\n";
#     open FTT, ">$outfile{features}" || die "Error: cannot write file $outfile{features}\n";
#     open SYNONYMS, ">$outfile{synonyms}" || die "Error: cannot write file $outfile{synonyms}\n";
#     print FTT ";ID";
#     print FTT "\tTYPE";
#     print FTT "\tNAME";
#     print FTT "\tCONTIG";
#     print FTT "\tLEFT";
#     print FTT "\tRIGHT";
#     print FTT "\tSTRAND";
#     print FTT "\tDESCR";
#     print FTT "\t\n";
    
#     while (<PTT>) {
# 	@fields =  split;
# 	$pos = shift @fields;
# 	#      $pos = &trim(substr($line,0,20));
# 	if ($pos =~ /^(\d+)\.\.(\d+)$/) {
# 	    $left = $1;
# 	    $right = $2;
# 	    $strand = shift @fields;
# 	    #	$strand = &trim(substr($line,20,3));
# 	    $strand =~ s/\+/D/;
# 	    $strand =~ s/\-/R/;
# 	    $len = shift @fields;
# 	    $id = shift @fields;
# 	    #	$id = &trim(substr($line,29,9));
# 	    $name = shift @fields;
# 	    #	$name = &trim(substr($line,38,15));
# 	    if ($name eq "") {
# 		$name = $id;
# 	    }
	    
# 	    $descr = join " ", @fields;
# 	    #	$descr = &trim(substr($line,63));
# 	    if (($organism_short_name eq "hinf") && ($descr =~ /\((\S+)\)\s*$/)) {
# 		$synonym = $name;
# 		$name = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    } elsif (($organism_short_name eq "bsub") && ($descr =~ /alternate gene name: (.+)/)) {
# 		if ($1 =~ /;/) {
# 		    $synonyms = $`;
# 		} else {
# 		    $synonyms = $1;
# 		}
# 		@synonyms = split ",", $synonyms;
# 		foreach $synonym (@synonyms) {
# 		    $synonym =~ s/^\s+//;
# 		    $synonym =~ s/\s+$//;
# 		    print SYNONYMS "$id\t$synonym\n";
# 		}
# 	    } elsif (($organism_short_name eq "ecoli") & ($descr =~ /^(b\d+)\s/)) {
# 		$synonym = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    } elsif (($organism_short_name eq "mtub") & ($descr =~ /^(Rv\d+)\s/)) {
# 		$synonym = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    } elsif (($organism_short_name eq "aquae") & ($descr =~ /^(aq_\d+)\s/)) {
# 		$synonym = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    }
	    
# 	    #### get the contig sequence identifier for bacteria
# 	    open CTG, $outfile{genome} ||
# 		die "Error: cannot open contig list file $outfile{genome}\n";
# 	    while (<CTG>) {
# 		chomp;
# 		@fields = split "\t";
# 		$chom = $fields[1];

# 	    }
# 	    close CTG;
# 	    #$ctg = $organism_short_name;
# 	    $type = "CDS";
# 	    print FTT "$id";
# 	    print FTT "\t$type";
# 	    print FTT "\t$name";
# 	    print FTT "\t$ctg";
# 	    print FTT "\t$left";
# 	    print FTT "\t$right";
# 	    print FTT "\t$strand";
# 	    print FTT "\t$descr";
# 	    print FTT "\n";
# 	} 
	
#     }
    
#     close SYNONYMS;
#     close PTT;
#     close FTT;
# }

# sub ParseNcbiFeatures {
#     ################################################################
#     #  Generates a RSAT feature table (.ftt) 
#     #  from a NCBI feature table (.ptt)
#     #
#     #  usage
#     #    &ParseNcbiFeatures($ptt_file);
#     ################################################################
#     my ($ptt) = @_;
    
#     open PTT, $ptt || die "Error : cannot read ppt file $ptt\n";
#     open FTT, ">$outfile{features}" || die "Error: cannot write file $outfile{features}\n";
#     open SYNONYMS, ">$outfile{synonyms}" || die "Error: cannot write file $outfile{synonyms}\n";
#     print FTT ";ID";
#     print FTT "\tTYPE";
#     print FTT "\tNAME";
#     print FTT "\tCTG";
#     print FTT "\tLEFT";
#     print FTT "\tRIGHT";
#     print FTT "\tSTRAND";
#     print FTT "\tDESCR";
#     print FTT "\t\n";
    
#     while (<PTT>) {
# 	@fields =  split;
# 	$pos = shift @fields;
# 	#      $pos = &trim(substr($line,0,20));
# 	if ($pos =~ /^(\d+)\.\.(\d+)$/) {
# 	    $left = $1;
# 	    $right = $2;
# 	    $strand = shift @fields;
# 	    #	$strand = &trim(substr($line,20,3));
# 	    $strand =~ s/\+/D/;
# 	    $strand =~ s/\-/R/;
# 	    $len = shift @fields;
# 	    $id = shift @fields;
# 	    #	$id = &trim(substr($line,29,9));
# 	    $name = shift @fields;
# 	    #	$name = &trim(substr($line,38,15));
# 	    if ($name eq "") {
# 		$name = $id;
# 	    }
	    
# 	    $descr = join " ", @fields;
# 	    #	$descr = &trim(substr($line,63));
# 	    if (($organism_short_name eq "hinf") && ($descr =~ /\((\S+)\)\s*$/)) {
# 		$synonym = $name;
# 		$name = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    } elsif (($organism_short_name eq "bsub") && ($descr =~ /alternate gene name: (.+)/)) {
# 		if ($1 =~ /;/) {
# 		    $synonyms = $`;
# 		} else {
# 		    $synonyms = $1;
# 		}
# 		@synonyms = split ",", $synonyms;
# 		foreach $synonym (@synonyms) {
# 		    $synonym =~ s/^\s+//;
# 		    $synonym =~ s/\s+$//;
# 		    print SYNONYMS "$id\t$synonym\n";
# 		}
# 	    } elsif (($organism_short_name eq "ecoli") & ($descr =~ /^(b\d+)\s/)) {
# 		$synonym = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    } elsif (($organism_short_name eq "mtub") & ($descr =~ /^(Rv\d+)\s/)) {
# 		$synonym = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    } elsif (($organism_short_name eq "aquae") & ($descr =~ /^(aq_\d+)\s/)) {
# 		$synonym = $1;
# 		print SYNONYMS "$id\t$synonym\n";
# 	    }
	    
# 	    #### get the contig sequence identifier for bacteria
# 	    open CTG, $outfile{genome} ||
# 		die "Error: cannot open contig list file $outfile{genome}\n";
# 	    while (<CTG>) {
# 		chomp;
# 		@fields = split "\t";
# 		$chom = $fields[1];

# 	    }
# 	    close CTG;
# 	    $ctg = $organism_short_name;
# 	    $type = "CDS";
# 	    print FTT "$id";
# 	    print FTT "\t$type";
# 	    print FTT "\t$name";
# 	    print FTT "\t$ctg";
# 	    print FTT "\t$left";
# 	    print FTT "\t$right";
# 	    print FTT "\t$strand";
# 	    print FTT "\t$descr";
# 	    print FTT "\n";
# 	} 
	
#     }
    
#     close SYNONYMS;
#     close PTT;
#     close FTT;
# }

# sub ParseNCBIFeatures {
#     ################################################################
#     #  Generates a RSAT feature table (.ftt) 
#     #  from a NCBI feature table (.ptt)
#     #
#     #  usage
#     #    &ParseNcbiFeatures($ptt_file);
#     ################################################################
#     my ($ptt) = @_;
    
#     open PTT, $ptt 
# 	|| die "Error : cannot read ppt file $ptt\n";
#     open SYNONYMS, ">$outfile{synonyms}" 
# 	|| die "Error: cannot write file $outfile{synonyms}\n";

#     #### create a feature table file
#     open FTT, ">$outfile{features}" 
# 	|| die "Error: cannot write file $outfile{features}\n";
#     #### print feature table header
#     print FTT ";ID";
#     print FTT "\tTYPE";
#     print FTT "\tNAME";
#     print FTT "\tCTG";
#     print FTT "\tLEFT";
#     print FTT "\tRIGHT";
#     print FTT "\tSTRAND";
#     print FTT "\tDESCR";
#     print FTT "\t\n";
    
    
#     #### read genbank feature table
#     my $line=0;
#     while (<PTT>) {
# 	$line++;
# 	chomp;
# 	next unless (/\S/);
# 	@fields =  split "\t";
	
# 	#### parse location
# 	my $location = shift @fields;
# 	$location = &trim($location);
# 	if ($location =~ /^(\d+)\.\.(\d+)$/) {
# 	    $left = $1;
# 	    $right = $2;
# 	} else {
# 	    warn "Line $line\tInvalid location :$location\tskipping this row.\n";
# 	    next;
# 	}
	
# 	#### parse strand
# 	$strand = shift @fields;
# 	$strand = &trim($strand);
# 	$strand =~ s/\+/D/;
# 	$strand =~ s/\-/R/;

# 	#### parse length
# 	$len = shift @fields;

# 	#### parse PID
# 	$id = shift @fields;

# 	#### parse name
# 	$name = shift @fields;
# 	if (($name eq "") || ($name eq "-")){
# 	    $name = $id;
# 	}
	
# 	#### parse synonym
# 	$synonym = shift @fields;
# 	if (($synonym) && ($synonym ne "-")){
# 	    print SYNONYMS "$id\t$synonym\n";
# 	}

# 	#### parse code 
# 	$code = shift @fields;

# 	#### parse COG 
# 	$cog = shift @fields;

# 	#### parse descroption
# 	$descr = join " ", @fields;

	
# 	#### get the contig sequence identifier for bacteria
# 	open CTG, $outfile{genome} ||
# 	    die "Error: cannot open contig list file $outfile{genome}\n";
# 	while (<CTG>) {
# 	    chomp;
# 	    @fields = split "\t";
# 	    $ctg = $fields[1];
	    
# 	}
# 	close CTG;

# 	$type = "CDS";

# 	print FTT "$id";
# 	print FTT "\t$type";
# 	print FTT "\t$name";
# 	print FTT "\t$ctg";
# 	print FTT "\t$left";
# 	print FTT "\t$right";
# 	print FTT "\t$strand";
# 	print FTT "\t$descr";
# 	print FTT "\n";
	
#     }
    
#     close SYNONYMS;
#     close PTT;
#     close FTT;
# }


################################################################
#### Parse the genome from Genbank (or ENSEMBL) flat files
##
## Selects the source directory, stores it in $dir{source}, then runs
## parse-genbank.pl (via &doit()) with $dir{genome} as output.
##
## Source directory, by order of priority:
##   1. $dir{ensembl}, if defined (the parser is then called with
##      the options -ext dat and -org);
##   2. $dir{genbank}/<organism>;
##   3. $dir{genbank}/Bacteria/<organism>.
## A fatal error is raised if none of them is found.
##
## The option -source is always passed to the parser, and the content
## of $parse_options is inserted as is in the command.
sub ParseGenome {
    if ($dir{ensembl}) {
        $dir{source} = $dir{ensembl};
    } else {
        my @candidate_dirs = (
            "$dir{genbank}/$organism_short_name",
            "$dir{genbank}/Bacteria/$organism_short_name",
        );
        my ($genbank_dir) = grep { -d $_ } @candidate_dirs;
        if (defined($genbank_dir)) {
            $dir{source} = $genbank_dir;
        } else {
            &RSAT::error::FatalError("Cannot find a directory $organism_short_name in genbank dir $dir{genbank}" );
        }
    }

    ## Build the parsing command piece by piece
    my @command_parts = (
        "$ENV{RSAT}/perl-scripts/parse-genbank.pl -v 1",
        " -i ".$dir{source},
        $parse_options,
    );
    if ($dir{ensembl}) {
        push @command_parts, " -ext dat", " -org ".$organism_short_name;
    }
    push @command_parts, " -source '$source'", " -o ".$dir{genome};

    &doit(join("", @command_parts), $dry_run, $die_on_error, $verbose);
}

################################################################
## Clean up unnecessary files to save disk space
##
## Moves to the genome directory ($dir{genome}) and removes the
## sequence files (intergenic, gene, upstream, upstream-noorf) in wc
## and fasta format, compressed or not, purged or not, with or without
## the "_segments" suffix. Only existing files are removed, with one
## "rm -f" command per file, passed to &doit() so that the dry run
## mode is respected.
##
## A fatal error is raised if the genome directory cannot be entered:
## the file names are relative, so going on would remove files from
## whatever directory happens to be the current one.
sub CleanUp {
    chdir($dir{genome})
        || &RSAT::error::FatalError(join("\t", "Cannot change to genome directory", $dir{genome}, $!));

    #### collect the existing files with intergenic and gene segment sequences
    ## Note: the qw() lists must be enclosed in parentheses, since a
    ## bare qw() is not accepted as loop list by Perl >= 5.18.
    my @files = ();
    foreach my $seq_type (qw(intergenic gene upstream upstream-noorf)) {
        foreach my $format (qw(wc fasta)) {
            foreach my $extension ("", ".gz") {
                foreach my $segments ("", "_segments") {
                    foreach my $purged ("", "_purged") {
                        my $file = "${organism_short_name}_${seq_type}${segments}${purged}.${format}${extension}";
                        push @files, $file if (-e $file);
                    }
                }
            }
        }
    }

    #### delete the files
    foreach my $file (@files) {
        my $command = "rm -f $file";
        &doit($command, $dry_run, $die_on_error, $verbose);
    }
}

