#! /usr/bin/perl 
use strict;
open GTF, "TreeseiV2_FilteredModelsv2.0.gtf";
open CDSTAB, ">cds.tab";
open CONTIGS, ">contigs.txt";
open SYNONYMS, ">cds_names.tab";
open SEQIN, "Trichoderma_reeseiV2.fasta";
open PROTSEQ, "TreeseiV2_FilteredModelsv2.0.proteins.fasta";
open AA, ">Trichoderma_reesei_aa.fasta";


my %features = ();
my %contigs = ();

while (my $ligne = <GTF>) {
  chomp $ligne;
#   next if ($ligne !~ /transcript/);
  my @lignecp = split /\t/, $ligne;
  my $scaffold = $lignecp[0];
  my $transcript_gene = $lignecp[8];
  my @transcript_genecp = split ";", $transcript_gene;
  my $gene_id = $transcript_genecp[0];
  $gene_id =~ s/gene_id \"//;
  $gene_id =~ s/\"//;
  my $transcript_id = $transcript_genecp[1];
  my $dir = $lignecp[6];
  # fill the hash    $transcript_id =~ s/transcript_id \"//;
  if (!defined ($features{$gene_id}{"transcript_id"})) {
    $transcript_id =~ s/ transcript_id \"//;
    $transcript_id =~ s/\"//;
    $features{$gene_id}{"transcript_id"} = $transcript_id if ($transcript_id ne "");
  } else {
    $transcript_id = $features{$gene_id}{"transcript_id"};
  }
  $features{$gene_id}{"contig"} = $scaffold;
  $features{$gene_id}{"dir"} = $dir;
  
  if ($lignecp[2] eq "start_codon") {
    $features{$gene_id}{"start"} = $lignecp[3] if ($dir eq "+");
    $features{$gene_id}{"start"} = $lignecp[4] if ($dir eq "-");
  }
  if ($lignecp[2] eq "stop_codon") {
    $features{$gene_id}{"stop"} = $lignecp[4] if ($dir eq "+");
    $features{$gene_id}{"stop"} = $lignecp[3] if ($dir eq "-");
  } 
  if ($lignecp[2] eq "CDS") {
    if ($dir eq "+") {
      $features{$gene_id}{"cdsstart"} = $lignecp[3] if ($lignecp[3] < $features{$gene_id}{"cdsstart"} || !defined( $features{$gene_id}{"cdsstart"}));
      $features{$gene_id}{"cdsstop"} = $lignecp[4] if ($lignecp[4] > $features{$gene_id}{"cdsstop"} || !defined( $features{$gene_id}{"cdsstop"}));
    } else {
      $features{$gene_id}{"cdsstart"} = $lignecp[4] if ($lignecp[4] > $features{$gene_id}{"cdsstart"} || !defined( $features{$gene_id}{"cdsstart"}));
      $features{$gene_id}{"cdsstop"} = $lignecp[3] if ($lignecp[3] < $features{$gene_id}{"cdsstop"} || !defined( $features{$gene_id}{"cdsstop"}));
    }
  }
  
  print CONTIGS join "\t", $scaffold.".tab", $scaffold, "linear\n" if (!defined($contigs{$scaffold}));
  $contigs{$scaffold}++;
}


print CDSTAB "-- dump date   	20080308_233549\n-- class       	Genbank::CDS\n-- table       	cds\n-- table       	main\n-- field 1	id\n-- field 2	GI\n-- field 3	GeneID\n-- field 4	chrom_position\n-- field 5	chromosome\n-- field 6	codon_start\n-- field 7	contig\n-- field 8	description\n-- field 9	end_pos\n-- field 10	gene\n-- field 11	gene_id\n-- field 12	name\n-- field 13	organism\n-- field 14	product\n-- field 15	protein_id\n-- field 16	start_pos\n-- field 17	strand\n-- field 18	taxid\n-- field 19	type\n-- header\n-- id	GI	GeneID	chrom_position	chromosome	codon_start	contig	description	end_pos	gene	gene_id	name	organism	product	protein_id	start_pos	strand	taxid	type\n";

foreach my $gene_id (keys %features){
  my $transcript_id = $features{$gene_id}{"transcript_id"};
  my $start = $features{$gene_id}{"start"} || $features{$gene_id}{"cdsstart"};
  my $stop = $features{$gene_id}{"stop"} || $features{$gene_id}{"cdsstop"};
  if (!defined($start)) {
     $start = $features{$gene_id}{"cdsstart"};
#      print "$gene_id $start\n";
#      exit(0);
  }
  
  
  my $dir = $features{$gene_id}{"dir"};

  
  
  if ($start > $stop) {
    my $swap = $stop;
    $stop = $start;
    $start = $swap;
  }
  my $chrom_position = $start."..".$stop;
  if ($dir eq "-") {
    $chrom_position = "complement(".$chrom_position.")";
    $dir = "R";
  } else {
    $dir = "D";
  }
  my $contig = $features{$gene_id}{"contig"};
  my $org_id = 51453;
  my $organism = "Trichoderma reesei";
  my $type = "CDS";
  my @outputline = ();
  $outputline[0] = $gene_id;
  $outputline[1] = "";
  $outputline[2] = $gene_id;
  $outputline[3] = $chrom_position;
  $outputline[4] = "";
  $outputline[5] = 1;
  $outputline[6] = $contig;
  $outputline[7] = "";
  
  $outputline[8] = $stop;
  $outputline[9] = $gene_id;
  $outputline[10] = $gene_id;
  $outputline[11] = $gene_id;
  $outputline[12] = $organism;
  $outputline[13] = $transcript_id;
  $outputline[14] = $transcript_id;
  $outputline[15] = $start;
  $outputline[16] = $dir;
  $outputline[17] = $org_id;
  $outputline[18] = $type;
  print CDSTAB join "\t", @outputline;
  print CDSTAB "\n";
  print SYNONYMS join "\t", $gene_id, $gene_id, "primary"."\n";
  print SYNONYMS join "\t", $gene_id, $transcript_id, "alternate"."\n";
}

while (my $ligne = <SEQIN>) {
  chomp $ligne;
  if ($ligne =~ />/) {
    close SEQOUT;
    my $file = $ligne.".tab";
    $file =~ s/>//;
    open SEQOUT, ">$file";
  } else {
    print SEQOUT $ligne;
  }
}

while (my $ligne = <PROTSEQ>) {
  chomp $ligne;
  if ($ligne =~ />/) {
    my @lignecp = split /\|/, $ligne;
    my $id = $lignecp[3];
    print AA ">$id\n";
  } else {
    $ligne =~ s/\*//;
    print AA $ligne."\n";
  }
} 