#!/usr/bin/env perl
############################################################
#
# $Id: parse-gtf,v 1.48 2013/10/03 17:24:24 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

parse-gtf

=head1 VERSION

$program_version

=head1 DESCRIPTION

Parse GTF and GFF3 files.

=head1 AUTHORS

Jacques.van-Helden@univ-amu.fr & Bruno Contreras-Moreira

=head1 CATEGORY

=over

=item genome management

=back

=head1 USAGE

parse-gtf [-i inputfile] [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

A single file in GTF (General Transfer Format) or GFF3 format.

Format specification
  http://www.ensembl.org/info/website/upload/gff.html

If the input file name terminates with .gz, it is uncompressed on the
fly.

=head1 OUTPUT FORMAT

The file is parsed and exported to a set of tab-delimited files
separated by biological feature type (CDS, transcript, gene), and
normalized according to the SQL rules : for each feature type, one
table contains all unique attributes, with one attribute per column
(e.g. cds.tab), and each multi-valued attribute is exported in a
separate table (e.g. cds_names.tab)

=head1 SEE ALSO

=head1 WISH LIST

=over

=item B<-download_gtf>

Download the gtf file from user-specified URL.

=item B<-download_fasta>

Download fasta file with genomic sequences from user-specified URL.

=item B<-download_fasta_rm>

Download fasta file with repeat-masked genomic sequences from user-specified URL.


=back

=cut


BEGIN {
  ## Add to @INC the lib/ folder located next to this script. The
  ## pattern matches the script file name at the end of $0, so that the
  ## prematch variable holds the directory part of the path.
  push(@INC, $`."lib/") if ($0 =~ m{([^(/)]+)$});
}
require "RSA.lib";

use HTTP::Tiny;
use JSON;

## Names of the 9 columns of a GTF/GFF row, in file order
our @gtf_fields = qw(seqname source feature start end score strand frame attribute);

## Rename GTF fields according to RSAT nomenclature
our %gtf_to_rsat_fields = (
  feature_id => "id",
  seqname    => "ctg",
  feature    => "type",
  gene_name  => "name",
  start      => "left",
  end        => "right",
  strand     => "strand",
  attribute  => "description",
);

## Fields exported for all the feature types parsed from the GTF file
my @common_fields = qw(feature_id feature gene_name seqname start end strand gene_biotype);

## Field order for RSAT
#     $col{'id'} = 0;
#     $col{'type'} = 1;
#     $col{'name'} = 2;
#     $col{'ctg'} = 3;
#     $col{'left'} = 4;
#     $col{'right'} = 5;
#     $col{'strand'} = 6;
#     $col{'descr'} = 7;
#     $col{'location'} = 8;

## Fields exported in each output table. The keys are the lowercase
## names of the output tables (one file [key].tab is opened per key),
## the values are the ordered lists of exported fields.
##
## This is a package hash: it is read by the main block under the
## name %out_fields.
our %out_fields = (
  ## Tables filled from the rows of the GTF file
  gene        => [@common_fields, "attribute"],
  transcript  => [@common_fields, qw(gene_id transcript_id transcript_name transcript_biotype)],
  exon        => [@common_fields, "attribute"],
  cds         => [@common_fields, qw(gene_id transcript_id exon_number protein_id)],
  start_codon => [@common_fields, "attribute"],
  stop_codon  => [@common_fields, "attribute"],

  ## Tables describing the organism and its sequences
  organism    => [qw(id taxonomy source)],
  contig      => [qw(id length raw_file description)],
  contig_rm   => [qw(id length raw_file description)],
);


################################################################
## Main package
package main;
{

  ################################################################
  ## Default values of the parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.48 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

  ## Description of the organism
  our $organism_name = "";       ## Organism name
  our $taxid = "";               ## Taxonomic ID of the organism
  our $taxonomy;                 ## Taxonomic description (required for some programs)
  our $gtf_source = "parse_gtf"; ## Should be set to the real source, e.g. ensembl or ensemblgenomes, with the option -gtf_source

  ## Input/output files and streams
  our %infile = ();
  our %outfile = ();
  our $in = STDIN;
  our $out = STDOUT;

  ## Miscellaneous options
  our $verbose = 0;
  our $last = 0;       ## Stop parsing after a user-specified number of rows
  our $null = "<NA>";  ## Non-allocated values
  our $batch = 0;

  ## Supported tasks, as a list, as a string (for the error messages)
  ## and as an index (for the validation of the option -task)
  our @supported_tasks = qw(parse_gtf split_features seq_len index_fasta parse_fasta config install all);
  our $supported_tasks = join (",", @supported_tasks);
  our %supported_task = map { $_ => 1 } @supported_tasks;
  our %task = ();

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## The output directory is required, and created if needed
  if (!$dir{output}) {
    &RSAT::error::FatalError("The output directory must be specified (option -o).");
  }
  &RSAT::util::CheckOutDir($main::dir{output}, "", 775);

  ## The GTF file is required
  if (!$infile{"gtf"}) {
    &RSAT::error::FatalError("The GTF file must be specified (option -i).");
  }

  ## By default, the organism name is the name of the GTF file without
  ## its .gz and .gtf extensions
  if (!$organism_name) {
    ($organism_name = &ShortFileName($infile{"gtf"})) =~ s/\.gz$//;
    $organism_name =~ s/\.gtf$//;
  }

  ## Without any user-specified task, parse the GTF file, as well as
  ## the fasta files if some were provided
  if (!%task) {
    $task{"parse_gtf"} = 1;
    $task{"parse_fasta"} = 1 if ($infile{"fasta"} || $infile{fasta_rm});
  }

  ## The task "all" activates all the supported tasks
  %task = %supported_task if ($task{"all"});

  ################################################################
  ## Open output streams

  ## Log file: $out is the stream used for the verbose report and the
  ## execution time
  &RSAT::message::TimeWarn("Opening log stream") if ($main::verbose >= 2);
  $outfile{log} = $dir{output}."/log.txt";
  $out = &OpenOutputFile($outfile{log});

  ## One tab-delimited file per table, starting with a commented
  ## description of its columns
  foreach my $feature (keys %out_fields) {
    $outfile{$feature} = $dir{output}."/".$feature.".tab";
    my $handle = $handle{$feature} = &OpenOutputFile($outfile{$feature});

    ## Column names, converted to the RSAT nomenclature where a
    ## conversion is defined
    my @header_fields = map {
      defined($gtf_to_rsat_fields{$_}) ? $gtf_to_rsat_fields{$_} : $_
    } @{$out_fields{$feature}};

    print $handle "; GTF file: ", $infile{"gtf"}, "\n";
    print $handle "; Feature type: ", $feature, "\n";
    print $handle "; Column content:\n";
    my $column_nb = 0;
    foreach my $field (@header_fields) {
      $column_nb++;
      print $handle join("\t", ";", $column_nb, $field), "\n";
    }
    print $handle '#', join ("\t", @header_fields), "\n";
  }

  ## Specific handles for the files contigs.txt (genomic sequences)
  ## and contigs_rm.txt (repeat-masked sequences)
  foreach my $suffix ("", "_rm") {
    my $key = "contigs_txt".$suffix;
    $outfile{$key} = $dir{output}."/contigs".$suffix.".txt";
    $handle{$key} = &RSAT::util::OpenOutputFile($outfile{$key});
  }

 

  ################################################################
  ## Extract one GTF file per subtype of features
  if ($task{split_features}) {
    &RSAT::message::TimeWarn("Splitting features by type") if ($main::verbose >= 2);

    ## The split files are named by replacing the .gtf (or .gtf.gz)
    ## extension of the input file. Any other file name is refused: the
    ## substitution would leave it unchanged, and the shell redirection
    ## would then overwrite the input file itself.
    unless ($infile{"gtf"} =~ /\.gtf(\.gz)?$/) {
      &RSAT::error::FatalError("Task split_features requires an input file with extension .gtf or .gtf.gz", $infile{"gtf"});
    }

    ## A compressed input file is uncompressed on the fly
    my $gtf_is_compressed = ($infile{"gtf"} =~ /\.gz$/);

    ## NOTE(review): CDS is listed in the documentation of this task
    ## but is not extracted here - confirm whether it should be added.
    my @feature_types = qw(exon
                           five_prime_utr
                           gene
                           start_codon
                           stop_codon
                           three_prime_utr
                           transcript);
    foreach my $type (@feature_types) {
      ($outfile{$type.'_gtf'} = $infile{"gtf"}) =~ s/\.gtf(\.gz)?$/_${type}.gtf/;
      &RSAT::message::TimeWarn("\t".$type, $outfile{$type.'_gtf'}) if ($main::verbose >= 2);

      ## Select the rows having the current type in the third column
      my $filter = "awk '\$3 == \"${type}\"'";
      my $cmd;
      if ($gtf_is_compressed) {
        $cmd = "gunzip -c ".$infile{gtf}." | ".$filter." > ".$outfile{$type.'_gtf'};
      } else {
        $cmd = $filter." ".$infile{gtf}." > ".$outfile{$type.'_gtf'};
      }
      &RSAT::util::doit($cmd);
    }


    ################################################################
    ## Compute introns
    &RSAT::message::Warning("I STILL NEED TO COMPUTE INTRONS");

    ################################################################
    ## Derive other feature types from the ones found in the GTF file.
    ##
    ## Each row of the source file gives one row with a single-position
    ## feature: start and end are both set to the value found in
    ## $plus_column for the features on the + strand, and in
    ## $minus_column for the features on the - strand (column 4 is the
    ## start and column 5 the end of the GTF row). The result is sorted
    ## by sequence name and position.
    ##
    ## Arguments:
    ##   $derived_type  type of the derived features (e.g. tss)
    ##   $source_type   type of the source features; the file
    ##                  $outfile{$source_type.'_gtf'} must have been
    ##                  created by the split above
    ##   $plus_column   GTF column used for the + strand
    ##   $minus_column  GTF column used for the - strand
    ##
    ## The name of the result file is stored in
    ## $outfile{$derived_type.'_gtf'}.
    sub derive_feature {
      my ($derived_type, $source_type, $plus_column, $minus_column) = @_;
      my $source_file = $outfile{$source_type.'_gtf'};
      (my $derived_file = $infile{'gtf'}) =~ s/\.gtf(\.gz)?$/_${derived_type}.gtf/;
      $outfile{$derived_type.'_gtf'} = $derived_file;
      my $tmp_file = $derived_file.'_tmp';

      ## One awk command per strand: the first one creates the
      ## temporary file, the second one appends to it
      my @cmd_steps = ();
      foreach my $strand_spec (['+', $plus_column, '>'], ['-', $minus_column, '>>']) {
        my ($strand, $column, $redirect) = @{$strand_spec};
        my $awk_fields = join('"\t"',
                              '$1', '$2', '"'.$derived_type.'"',
                              '$'.$column, '$'.$column,
                              '$6', '$7', '$8', '$9');
        push @cmd_steps, "awk -F'\\t' '\$7==\"".$strand."\" {print ".$awk_fields."}' ".$source_file." ".$redirect." ".$tmp_file;
      }
      push @cmd_steps, "sort -k1,1 -k4,4n ".$tmp_file." > ".$derived_file;
      push @cmd_steps, "rm -f ".$tmp_file;

      &RSAT::message::TimeWarn("Computing ".$derived_type." coordinates from ".$source_type, $derived_file) if ($main::verbose >= 2);
      &RSAT::util::doit(join("; ", @cmd_steps));
    }

    ## 5' ends: start on the + strand, end on the - strand.
    ## 3' ends: end on the + strand, start on the - strand.
    &derive_feature("tss", "transcript", 4, 5);
    &derive_feature("tts", "transcript", 5, 4);
    &derive_feature("gene_start", "gene", 4, 5);
    &derive_feature("gene_end", "gene", 5, 4);

  }


  ################################################################
  ## Generate all the tables required by RSAT.
  ##
  ## This combines the parsing of the GTF file + some REST query to
  ## collect organism properties.

  if ($task{"parse_gtf"}) {

    ################################################################
    ## Create a file with organism attributes, required for RSAT
    &RSAT::message::TimeWarn("Getting organism attributes from REST web services") if ($main::verbose >= 2);
    my %organism = ();
    $organism{id} = $organism_name;
    $organism{source} = $gtf_source;
    $organism{taxonomy} = $null;

    ## Try to get the taxonomy from the Ensembl REST interface
    if ((defined($taxid)) && ($taxid ne "")) {
      &RSAT::message::TimeWarn("Getting taxonomy for", $organism_name, "taxid=".$taxid) if ($main::verbose >= 2);
      my ($taxonomy, $sp_name) = ('', '');
      my $server = 'http://rest.ensembl.org'; # should be in %ENV
      my $ext = '/taxonomy/classification/'.$taxid.'?';

      my $http = HTTP::Tiny->new();
      my $response = $http->get($server.$ext, { headers => { 'Content-type' => 'application/json' } });

      if ($response->{success}) {
        ## The reply is expected to be a JSON array of taxa. It is
        ## decoded within an eval, and its type is checked, so that an
        ## unexpected reply gives a warning rather than aborting the
        ## script.
        my $json = eval { decode_json($response->{content}) };
        if (ref($json) eq 'ARRAY') {

          ## Check whether the first child of the first taxon is
          ## actually a species name
          my $last_taxon = $json->[0];
          if ($last_taxon->{'children'}) {
            my $last_children = $last_taxon->{'children'}->[0];
            if (($last_children->{'scientific_name'}) &&
                ($last_children->{'scientific_name'} =~ m/^[A-Z]\w+ \w+/)) {
              $sp_name = $last_children->{'scientific_name'};
            }
          }

          ## Concatenate the taxa in reverse order, followed by the
          ## species name
          foreach my $taxon (reverse(@$json)) {
            $taxonomy .= $taxon->{'scientific_name'}.';';
          }
          $taxonomy .= $sp_name;

        } else {
          &RSAT::message::Warning("Could not parse the reply of", $server.$ext);
        }
      } else {
        &RSAT::message::Warning("failed connecting to $server$ext");
      }

      $organism{id} = $taxid;
      $organism{taxonomy} = $taxonomy;

      if ($taxonomy) {
        &RSAT::message::TimeWarn("REST taxonomy:", $taxonomy) if ($main::verbose >= 2);
      } else {
        &RSAT::message::Warning("Could not get taxonomy for organism", $organism_name);
      }
    } else {
      &RSAT::error::FatalError("No TAXID for organism", $organism_name);
    }

    ## Export the organism table
    my @organism_fields = ();
    foreach my $field (@{$out_fields{organism}}) {
      push @organism_fields, $organism{$field};
    }
    my $handle = $handle{"organism"};
    print $handle join("\t", @organism_fields), "\n";


    ################################################################
    ## Parse GTF file
    &RSAT::message::TimeWarn("Parsing file", "GTF", $main::infile{"gtf"}) if ($main::verbose >= 2);
    ($main::in) = &OpenInputFile($main::infile{"gtf"});

    ## Output tables that do not describe features of the GTF file. A
    ## row whose feature type bears one of these names (e.g. "contig")
    ## must not be written in the table of the same name, which has
    ## different columns.
    my %reserved_table = map { $_ => 1 } qw(organism contig contig_rm);

    my $row_nb = 0; ## Row counter
    while (my $line = <$main::in>) {
      next unless ($line =~ /\S/); ## Skip empty rows
      next if ($line =~ /^#/); ## Skip header, directive and comment rows (#!..., ##..., #...)

      $row_nb++;

      if (($main::last > 0) && ($row_nb > $main::last)) {
        &RSAT::message::Warning("Stopping the parsing after", $last, "rows (option -last).");
        last;
      }

      ## Store the 9 columns of the row in the attributes hash table
      chomp($line);
      my @fields = split("\t", $line);
      my %attributes = ();
      @attributes{@gtf_fields} = @fields[0..$#gtf_fields];

      ## Increment counter per feature type
      my $feature = lc($attributes{feature});
      $feature_count{$feature}++;
      $contig_count{$attributes{seqname}}++;
      $feature_per_contig_count{$attributes{seqname}}{$feature}++;

      ## Format some specific attributes according to RSAT requirements
      if (defined($attributes{strand})) {
        $attributes{strand} =~ s/\+/D/;
        $attributes{strand} =~ s/\-/R/;
      }

      next unless (defined($handle{$feature})); ## Skip feature types not defined in export list
      next if ($reserved_table{$feature}); ## Skip feature types named as a non-feature table

      ## Parse the "attribute" field of the gtf, which contains several attributes
      # there are alternative formats:
      #1    tair    gene    3631    5899    .   +   .   gene_id "AT1G01010"; gene_version "1"; gene_name "NAC001"; ...
      #1    tair    CDS 3760    3913    .   +   0   gene_id "AT1G01010"; ... protein_id "AT1G01010.1";...
      #Chr01    phytozomev10    gene    27355   28320   .   -   .   ID=Glyma.01G000100.Wm82.a2.v1;Name=Glyma.01G000100;...
      #Chr01    phytozomev10    CDS 27656   27824   .   -   1   ID=Glyma.01G000100.1.Wm82.a2.v1.CDS.3;Parent=...
      #contig_1000513   Cufflinks   gene    946 4541    .   +   .   ID=TCONS_00000005;gene_id=XLOC_000003;...
      # GDR (genome db for Rosaceae) gff-version 3
      #PAV_r1.0chr1 maker   gene    65131   87167   .   +   .   ID=Pav_sc0000551.1_g020.1.mk;Note=PREDICTED: importin-11 isoform X2
      #PAV_r1.0chr1 maker   exon    65131   65355   .   +   .   ID=Pav_sc0000551.1_g020.1.mk:exon:1;Parent=Pav_sc0000551.1_g020.1.mk
      #PAV_r1.0chr1 maker   CDS 65215   65355   .   +   0   ID=Pav_sc0000551.1_g020.1.mk:CDS:1;Parent=Pav_sc0000551.1_g020.1.mk
      #
      # new supported attributes: key=value, Parent, Name, \w+.CDS.\d+,
      #
      # NOTE(review): both patterns capture values without whitespace
      # only (\S+), so a quoted value containing a space is not stored,
      # and a key=value is truncated at its first space.
      my @attributes = split /;\s*+/, $attributes{attribute};
      foreach my $attr (@attributes) {
        next unless ($attr =~ /(\S+)\s+\"(\S+)\"/ || $attr =~ /(\S+?)=(\S+)/);
        my ($key, $value) = ($1, $2);
        $attributes{$key} = $value;

        ## Capture exon number from JGI/GDR annotations in GTF format
        if (($feature eq 'cds') && ($value =~ m/CDS.(\d+)/)) {
          $attributes{exon_number} = $1;
        }
      }

      ## Set gene_id if undefined (as in gff3 made with maker)
      if ($attributes{ID} && !$attributes{gene_id}) {
        $attributes{gene_id} = $attributes{ID};
      }

      ## Use gene ID as gene name if not defined
      unless ($attributes{gene_name}) {
        $attributes{gene_name} = $attributes{gene_id} || $attributes{Name} || $attributes{Parent} || $null;
      }
      unless ($attributes{transcript_name}) {
        $attributes{transcript_name} = $attributes{transcript_id} || $attributes{Name} || $attributes{Parent} || $null;
      }

      ## Define feature-specific fields. By default, the feature ID is
      ## built from the feature type and its rank in the file.
      $attributes{feature_id} = $feature."_".$feature_count{$feature};
      if ($feature eq "gene") {
        $attributes{feature_id} = $attributes{gene_id} || $attributes{Name};
        $attributes{gene_id}  ||= $attributes{Name};
      } elsif ($feature eq "cds") {
        $attributes{feature_id}      = $attributes{protein_id} || $attributes{Parent};
        $attributes{gene_id}       ||= $attributes{Parent}; # in JGI GTF this is really protein_id
        $attributes{protein_id}    ||= $attributes{Parent};
        $attributes{transcript_id} ||= $attributes{Parent};
      }

      ## Export the fields selected for this feature type, undefined
      ## values being replaced by the null string
      my @values = map { defined($attributes{$_}) ? $attributes{$_} : $null } @{$out_fields{$feature}};
      my $handle = $handle{$feature};
      print $handle join("\t", @values), "\n";

    }
    close $main::in if ($main::infile{"gtf"});

  }


  ################################################################
  ## Compute sequence lengths, which will be required for example
  ## for some bedtools operation (option -g)
  if ($task{"seq_len"}) {
    foreach my $fasta_type ("fasta", "fasta_rm", "fasta_pep") {
      ## Skip the sequence types for which no file was specified:
      ## the command would be run without input file name.
      next unless ($infile{$fasta_type});

      ## The result file is named after the fasta file
      ($outfile{$fasta_type."_lengths"} = $infile{$fasta_type}) =~ s/\.fa$//;
      $outfile{$fasta_type."_lengths"} .= "_lengths.tab";

      my $cmd = "sequence-lengths -v 0 -i ".$infile{$fasta_type}." -o ".$outfile{$fasta_type."_lengths"};
      &RSAT::message::TimeWarn("Computing sequence lengths", $fasta_type, $outfile{$fasta_type."_lengths"}) if ($main::verbose >= 2);
      &RSAT::util::doit($cmd);
    }
  }

  ################################################################
  ## Index the fasta file (required for bedtools getfasta, which is
  ## called by retrieve-seq-bed);
  if ($task{"index_fasta"}) {
    foreach my $fasta_type ("fasta", "fasta_rm", "fasta_pep") {
      ## Skip the sequence types for which no file was specified:
      ## the command would be run without file name.
      next unless ($infile{$fasta_type});

      my $cmd = "samtools faidx ".$infile{$fasta_type};
      &RSAT::message::TimeWarn("Indexing fasta file", $fasta_type, $infile{$fasta_type}) if ($main::verbose >= 2);
      &RSAT::util::doit($cmd);
    }
  }

  ################################################################
  ## Parse the fasta file
  if ($task{"parse_fasta"}) {

    ## Parse genomic sequences: each sequence is exported in a separate
    ## raw file, and described in the contig table and in the
    ## contigs.txt file (contig_rm and contigs_rm.txt for the
    ## repeat-masked sequences).
    foreach my $fasta_type ("fasta", "fasta_rm") {

      my $fasta_file = $infile{$fasta_type};
      next unless ($fasta_file);

      &RSAT::message::TimeWarn("Parsing file", $fasta_type, $fasta_file) if ($main::verbose >= 2);
      my ($in, $input_dir) = &OpenInputFile($fasta_file);
      my $seq_nb = 0;
      my %args = ();

      ## Suffix of the raw files and of the output tables
      my $seq_suffix = ($fasta_type eq "fasta_rm") ? "_rm" : "";
      my $contig_handle = $handle{"contig".$seq_suffix};
      my $contigs_txt_handle = $handle{"contigs_txt".$seq_suffix};

      while ((($current_seq, $current_id, @comments) = &ReadNextSequence($in, "fasta", $input_dir, "", "", %args)) &&
             (($current_seq ne "") || ($current_id ne ""))) {
        $seq_nb++;
        $contig{id} = $current_id;
        $contig{length} = length($current_seq);
        $contig{raw_file} = $current_id.$seq_suffix.".raw";
        $contig{raw_file_path} = $dir{output}."/".$contig{raw_file};
        $contig{description} = join("; ", @comments);
        &RSAT::message::Info("Fasta sequence", $seq_nb, $current_id, $contig{raw_file}) if ($main::verbose >= 4);

        ## Print sequence in raw file
        my $raw = &OpenOutputFile($contig{raw_file_path});
        &PrintNextSequence($raw, "raw", 0, $current_seq, $current_id);
        close $raw;

        ## Print contig table
        my @contig_fields = ();
        foreach my $field (@{$out_fields{"contig".$seq_suffix}}) {
          push @contig_fields, $contig{$field};
        }
        print $contig_handle join("\t", @contig_fields), "\n";

        ## Print contigs.txt file, required for retrieve-seq
        print $contigs_txt_handle join ("\t", $contig{raw_file}, $contig{id}, "NA"), "\n";
      }
      close($in);
    }

    ################################################################
    ## Install peptidic sequences if required.
    ##
    ## Currently we just read the file and write it in the right place
    ## of the genome directory.
    ##
    ## TO DO: adapt the header to indicate the link between gene ID,
    ## transcript ID and protein ID. This will be useful for
    ## genome-blast.
    ##
    ## For this reason, I already read the content of the file, since
    ## I will need to modify sequence headers.
    $pep_fasta_file = $infile{"fasta_pep"};
    if ($pep_fasta_file) {
      ## Open a specific handle for peptidic sequences
      $outfile{"peptidic_sequences"} = $dir{output}."/peptidic_sequences.fasta";
      $handle{"peptidic_sequences"} = &RSAT::util::OpenOutputFile($outfile{"peptidic_sequences"});
      my $handle = $handle{"peptidic_sequences"};

      &RSAT::message::TimeWarn("Parsing file", "fasta_pep", $pep_fasta_file) if ($main::verbose >= 2);
      my ($in, $input_dir) = &OpenInputFile($pep_fasta_file);
      my $seq_nb = 0;  ## Sequence counter
      my %args = ();
      while ((($current_seq, $current_id, @comments) = &ReadNextSequence($in, "fasta", $input_dir, "", "", %args)) &&
             (($current_seq ne "") || ($current_id ne ""))) {
        $seq_nb++;
        &PrintNextSequence($handle, "fasta", 0, $current_seq, $current_id, @comments);
      }

      ## Release the input stream, as done for the genomic sequences
      close($in);
    }
  }

  ################################################################
  ## Update the RSAT configuration and/or install the organism in
  ## RSAT. Both tasks call install-organism with the same organism
  ## description, and differ by the option -task.
  my $install_organism_cmd = join(" ",
                                  $ENV{RSAT}."/perl-scripts/install-organism",
                                  "-v 1",
                                  "-source", $gtf_source,
                                  "-org", $organism_name);

  ## Update RSAT config
  if ($task{"config"})  {
    &RSAT::message::TimeWarn("Configuring RSAT for organism", $organism_name) if ($main::verbose >= 2);
    &RSAT::util::doit($install_organism_cmd." -task config");
  }

  ## Install the organism in RSAT. The option -batch is transmitted to
  ## install-organism.
  if ($task{"install"})  {
    &RSAT::message::TimeWarn("Installing organism in RSAT", $organism_name) if ($main::verbose >= 2);
    my $batch_option = $batch ? " -batch" : "";
    &RSAT::util::doit($install_organism_cmd." -task default".$batch_option);
  }


  ################################################################
  ## Print the verbose report in the log file
  if ($main::verbose >= 1) {
    &Verbose();
  }

  ################################################################
  ## Report execution time, close the output streams and exit
  &close_and_quit();
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Close output file and quit
sub close_and_quit {
  ## Report the execution time in the log stream, close the log stream
  ## and all the output files registered in %handle, then exit with
  ## status 0. This sub takes no argument and never returns.

  ## Report execution time
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
  print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified

  ## Close the log stream
  if ($dir{output}) {
    close $main::out;
    &RSAT::message::TimeWarn("Output directory", $dir{output}) if ($main::verbose >= 2);
  }

  ## Close each output file exactly once. Buffered write errors are
  ## only detected when the file is closed, so a failure is reported.
  foreach my $key (keys(%handle)) {
    close($handle{$key})
      || &RSAT::message::Warning("Could not close output file", $outfile{$key}, $!);
  }

  exit(0);
}


################################################################
## Display full help message 
sub PrintHelp {
  ## Format the POD documentation of this script with pod2text, then
  ## exit. The command is passed as a list, so that the script path is
  ## not interpreted by a shell (it can contain spaces or special
  ## characters).
  system("pod2text", "-c", $0);
  exit()
}

################################################################
## Display short help message
sub PrintOptions {
  ## No separate short help is available: the full help message is
  ## displayed, and PrintHelp terminates the script.
  PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  ## Parse the command-line options found in @ARGV and store their
  ## values in the package variables of the script (%infile, %dir,
  ## %task, $verbose, ...). An unknown option or an unsupported task
  ## raises a fatal error. The documentation of each option is
  ## interleaved with the code, as POD sections.
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);


=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      ## The verbosity level is optional: without value, it is set to 1
      if (&RSAT::util::IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();


=pod

=item B<-i inputfile>

GTF file. If the file bears the .gz extension, it will be
automatically uncompressed during the parsing.

This option is mandatory.

=cut
    } elsif ($arg eq "-i") {
      $main::infile{"gtf"} = shift(@arguments);

=pod

=item B<-fasta genome_seq_file>

Path to a file containing the genome sequences. This file must be in
fasta format, and the sequence IDs must correspond to the seqname
field of the parsed GTF file.

The file will be parsed to generate the raw sequence files required
for RSAT (named [seq_id].raw), and the script will export a file
contigs.txt indicating which raw sequence file contains the sequence
of each contig.

=cut

    } elsif ($arg eq "-fasta") {
      $main::infile{"fasta"} = shift(@arguments);

=pod

=item B<-fasta_rm repeat_masked_genome_seq_file>

Path to a file containing the repeat-masked genome sequences. This
file must be in fasta format, and the sequence IDs must correspond to
the seqname field of the parsed GTF file.

The file will be parsed to generate the raw sequence files required
for RSAT (named [seq_id]_rm.raw), and the script will export a file
contigs_rm.txt indicating which raw sequence file contains the
repeat-masked sequence of each contig.


=cut

    } elsif ($arg eq "-fasta_rm") {
      $main::infile{"fasta_rm"} = shift(@arguments);

=pod

=item B<-fasta_pep peptidic_sequence_file>

Install the peptidic sequence file, which will be required for RSAT
installation, in order to compute oligopeptide frequencies, and to run
genome-blast.

B<Warning>: we still need to fix the link between gene IDs and product
IDs.

=cut

    } elsif ($arg eq "-fasta_pep") {
      $main::infile{"fasta_pep"} = shift(@arguments);

=pod

=item B<-last N>

Stop parsing after N rows (for debugging and testing).

=cut

    } elsif ($arg eq "-last") {
      $main::last = shift(@arguments);
      &RSAT::error::FatalError($main::last, "Invalid value for option -last. Must be Natural number.")
        unless (&RSAT::util::IsNatural($main::last));

=pod

=item B<-o output directory>

Directory where the parsing result will be stored.

This option is mandatory.

=cut
    } elsif ($arg eq "-o") {
      $dir{output} = shift(@arguments);

=pod

=item B<-org_name organism_name>

Organism name for the installation on RSAT.

If not specified, the organism name is derived from the name of the
GTF file.

=cut
    } elsif ($arg eq "-org_name") {
      $organism_name = shift(@arguments);

=pod

=item B<-gtf_source source>

Source of the GTF file.

=cut
    } elsif ($arg eq "-gtf_source") {
      $gtf_source = shift(@arguments);

=pod

=item B<-taxid taxonomic ID>

Taxonomic ID of the organism. The program collects the corresponding
taxonomy from the Ensembl REST server.

This option is required for the task I<parse_gtf>.

=cut
    } elsif ($arg eq "-taxid") {
      $taxid = shift(@arguments);


=pod

=item B<-task task1,task2,...>

Supported tasks:

=over


=item I<parse_gtf>

Parse the GTF file to extract genomic features (genes, transcripts,
...).

=item I<split_features>

Split the GTF file into one separate GTF file per feature type,
according to the EnsemblGenomes typology.

=over

=item CDS

(part of the typology, but not extracted by the current
implementation)

=item exon

=item five_prime_utr

=item gene

=item start_codon

=item stop_codon

=item three_prime_utr

=item transcript

=back

In addition, from the above features we derive the coordinates of the
following feature types.

=over

=item intron

Introns are computed as the difference between genes and exons ("gene
minus exon").

B<Warning>: the computation of introns is not implemented yet.

=item gene_start

5' end of each gene (taking into account its strand)

=item gene_end

3' end of each gene (taking into account its strand)

=item TSS

transcription start site, i.e. the 5' end of each transcript

=item TTS

transcription termination site, i.e. the 3' end of each transcript

=back

=item I<parse_fasta>

Parse the fasta file(s) specified with the options -fasta and/or
-fasta_rm, and export the sequences in raw format (required for RSAT).

B<Note>: in the near future I (JvH) intend to avoid this, because it
duplicates all the sequences -> occupies twice more space than
needed. I first need to adapt retrieve-seq in order to get sequences
directly from the indexed fasta genomes, via bedtools getfasta.

=item I<seq_len>

Compute the length of the sequences (chromosomes, masked chromosomes
and peptides). Chromosome lengths are useful for some bedtools
commands (e.g. shuffle).

=item I<config>

Update the RSAT genome table to enable using the parsed genome with
RSAT I<retrieve-seq> and other tools.

=item I<install>

Run installation steps (compute oligo and dyad frequencies, ...) in
order to provide full support for the parsed genome on the RSAT
server.

=back

=cut
    } elsif ($arg eq "-task") {
      ## Tasks are provided as a comma-separated list; empty items are
      ## ignored
      my @requested_tasks = split ",", shift (@arguments);
      foreach my $task (@requested_tasks) {
        next unless $task;
        if ($supported_task{$task}) {
          $task{$task} = 1;
        } else {
          &RSAT::error::FatalError("Task '$task' is not supported. \n\tSupported: $supported_tasks");
        }
      }

=pod

=item B<-batch>

This option is passed to install-organism in order to send the
computation of oligo and dyad frequency tables to a job scheduler. It
requires to have a job scheduler properly configured in RSAT
configuration files.

=cut

    } elsif ($arg eq "-batch") {
      $batch = 1;

    } else {
      &RSAT::error::FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
sub Verbose {
  ## Print in the log stream ($out) the command line, the program
  ## version, the input and output files, and the number of parsed
  ## rows per feature type and per contig.

  ## Command line and program version
  print $out "; parse-gtf ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;

  ## Input and output files (the sections are skipped when empty)
  foreach my $file_set (["; Input files\n", \%main::infile],
                        ["; Output files\n", \%main::outfile]) {
    my ($title, $files) = @{$file_set};
    next unless (%{$files});
    print $out $title;
    foreach my $key (keys(%{$files})) {
      printf $out ";\t%-13s\t%s\n", $key, $files->{$key};
    }
  }

  ## Feature types found in the GTF file. In the column headers, the
  ## suffix _codon is removed from the type names.
  my @feature_types = sort(keys(%feature_count));
  @feature_type_headers = @feature_types;
  s/_codon//g foreach (@feature_type_headers);

  ## Number of rows per feature type
  print $out "; Parsed rows per feature type\n";
  printf $out ";\t%-13s\t%s\n", $_, $feature_count{$_} foreach (@feature_types);

  ## Number of rows per contig: total, then one column per feature type
  print $out "; Parsed rows per contig\n";
  printf $out ";\t%-13s\t%s\n", "Contig", join("\t", "Total",  @feature_type_headers);
  foreach my $contig (sort (keys( %contig_count))) {
    ## An empty total is replaced by 0
    $contig_count{$contig} =~ s/^$/0/g;
    my @contig_counts = ($contig_count{$contig});

    ## Feature types absent from the contig are reported with a 0
    push @contig_counts, map {
      defined($feature_per_contig_count{$contig}{$_}) ? $feature_per_contig_count{$contig}{$_} : 0
    } @feature_types;

    printf $out ";\t%-13s\t%s\n", $contig, join("\t", @contig_counts);
  }


}


__END__
