#! /usr/bin/perl
use strict;
# Open the GFF annotation input and the three tab-delimited output tables.
# Every open is checked so that a missing input or an unwritable output stops
# the script instead of silently producing empty files.  The bareword handle
# names are kept because the GFF parsing loop below refers to them.
open(GFF, '<', "candida_21.gff")
  or die "Cannot open candida_21.gff for reading: $!\n";
open(CDS, '>', "cds.tab")
  or die "Cannot open cds.tab for writing: $!\n";
open(NAMES, '>', "cds_names.tab")
  or die "Cannot open cds_names.tab for writing: $!\n";
open(ORG, '>', "organism.tab")
  or die "Cannot open organism.tab for writing: $!\n";

# organism.tab: dump header followed by one data row (id 5476).
# The tabs inside the string are literal and significant.
print ORG "-- dump date   	20080308_233524
-- class       	Genbank::Organism
-- table       	organism
-- table       	main
-- field 1	id
-- field 2	taxonomy
-- header
-- id	taxonomy
5476	Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; Saccharomycetaceae; Candida";

# Nothing else is written to organism.tab; closing here surfaces buffered
# write errors.
close(ORG) or die "Error while closing organism.tab: $!\n";

# cds.tab: dump header only; the data rows are appended by the GFF loop.
print CDS "-- dump date   	20080308_233549
-- class       	Genbank::CDS
-- table       	cds
-- table       	main
-- field 1	id
-- field 2	GI
-- field 3	GeneID
-- field 4	chrom_position
-- field 5	chromosome
-- field 6	codon_start
-- field 7	contig
-- field 8	description
-- field 9	end_pos
-- field 10	gene
-- field 11	gene_id
-- field 12	name
-- field 13	organism
-- field 14	product
-- field 15	protein_id
-- field 16	start_pos
-- field 17	strand
-- field 18	taxid
-- field 19	type
-- header
-- id	GI	GeneID	chrom_position	chromosome	codon_start	contig	description	end_pos	gene	gene_id	name	organism	product	protein_id	start_pos	strand	taxid	type
";


# Convert each ORF line of the GFF input into one row of cds.tab and one or
# more rows of cds_names.tab (a "primary" row plus one "alternate" row per
# distinct alias).
while (my $ligne = <GFF>) {
  next if ($ligne =~ /^#/);      # skip comment / pragma lines
  next if ($ligne !~ /ORF/);     # keep only lines that mention ORF
  chomp $ligne;
  my @lignecp = split /\t/, $ligne;

  # The chromosome label is the last character of the sequence id in
  # column 1: chop returns the character it removes.
  my $seqid = $lignecp[0];
  my $chrom = chop $seqid;

  my $start = $lignecp[3];
  my $end = $lignecp[4];
  my $dir = $lignecp[6];

  # Coordinates are always written with start <= end.  This is computed once
  # per line, independently of the attribute column, so a line with an empty
  # attribute column still gets a position and a strand.
  if ($start > $end) {
    ($start, $end) = ($end, $start);
  }
  my $chrompos = "$start..$end";
  my $rsat_dir = "D";
  if ($dir eq "-") {
    $chrompos = "complement($start..$end)";
    $rsat_dir = "R";
  }

  # Column 9 holds the ';'-separated key=value attributes.
  my @attributes = defined $lignecp[8] ? split(/;/, $lignecp[8]) : ();
  my $id = "";
  my $name = "";
  my @aliases = ();
  my $descr = "";
  foreach my $item (@attributes) {
    # Keys are matched at the start of the attribute only, so that e.g. an
    # "ID=" substring inside another attribute's value is not mistaken for
    # the ID attribute.
    if ($item =~ s/^\s*ID=//) {
      $id = $item;
      push @aliases, $item;
    } elsif ($item =~ s/^\s*Name=//) {
      $name = $item;
      push @aliases, $item;
    } elsif ($item =~ s/^\s*Note=//) {
      $descr = htmldecode($item);
    } elsif ($item =~ s/^\s*orf_classification=//) {
      # Decode only the new fragment: the description collected so far has
      # already been decoded and must not be decoded a second time.
      $descr .= "; " . htmldecode($item);
      print "$descr\n";          # description echoed on STDOUT, as before
    } elsif ($item =~ s/^\s*Alias=//) {
      push @aliases, split(/,/, $item);
    }
  }

  print NAMES join "\t", $id, $id, "primary\n";
  foreach my $alias (@aliases) {
    next if ($alias eq $id);     # the id itself is already the primary name
    print NAMES join "\t", $id, $alias, "alternate\n";
  }
  print CDS join "\t", $id, "", $id, $chrompos, $chrom, $start, "chrom_".$chrom, $descr, $end, $id, $id, $name, "Candida_albicans", $id, $id, $start, $rsat_dir, "5476", "CDS\n";
}

# All rows are written: close the handles so buffered write errors surface.
close(GFF);
close(CDS) or die "Error while closing cds.tab: $!\n";
close(NAMES) or die "Error while closing cds_names.tab: $!\n";


# Split the multi-FASTA chromosome file into one raw sequence file per
# record (chrom_<X>.raw, sequence lines concatenated without newlines) and
# list each of those files in contigs.txt.
open(my $chrom_fh, '<', "Ca21_chromosomes.fasta")
  or die "Cannot open Ca21_chromosomes.fasta for reading: $!\n";
open(my $contigs_fh, '>', "contigs.txt")
  or die "Cannot open contigs.txt for writing: $!\n";

my $raw_fh;      # output handle of the record currently being written
my $raw_file;    # its file name, kept for error messages
while (my $ligne = <$chrom_fh>) {
  chomp $ligne;
  if ($ligne =~ /^\>/) {
    # A new record starts: finish the previous output file, if any.
    if (defined $raw_fh) {
      close($raw_fh) or die "Error while closing $raw_file: $!\n";
    }
    $ligne =~ s/\>//;
    my @lignecp = split / /, $ligne;
    # The chromosome label is the last character of the first word of the
    # header: chop returns the character it removes.
    my $seqname = $lignecp[0];
    my $chrom = chop $seqname;
    my $contig = "chrom"."_$chrom";
    $raw_file = "$contig.raw";
    # Three-argument open: the name is derived from file content and must
    # never be interpreted as an open mode.
    open($raw_fh, '>', $raw_file)
      or die "Cannot open $raw_file for writing: $!\n";
    print {$contigs_fh} join("\t", $raw_file, $contig, "linear\n");
  } elsif (defined $raw_fh) {
    # Sequence lines found before the first header have no output file and
    # are dropped.
    print {$raw_fh} $ligne;
  }
}

if (defined $raw_fh) {
  close($raw_fh) or die "Error while closing $raw_file: $!\n";
}
close($contigs_fh) or die "Error while closing contigs.txt: $!\n";
close($chrom_fh);

# Copy the protein FASTA file, reducing every header line to its first
# space-delimited word; sequence lines are copied unchanged.
open(my $prot_fh, '<', "orf_trans_all_assembly_21.fasta")
  or die "Cannot open orf_trans_all_assembly_21.fasta for reading: $!\n";
open(my $aa_fh, '>', "Candida_albicans_aa.fasta")
  or die "Cannot open Candida_albicans_aa.fasta for writing: $!\n";
while (my $ligne = <$prot_fh>) {
  if ($ligne =~ /^\>/) {
    # Remove the line ending before splitting: a header without any space
    # would otherwise keep its newline and be followed by a blank line.
    chomp $ligne;
    my ($header) = split / /, $ligne, 2;
    print {$aa_fh} $header."\n";
  } else {
    print {$aa_fh} $ligne;
  }
}
close($prot_fh);
close($aa_fh) or die "Error while closing Candida_albicans_aa.fasta: $!\n";

# htmldecode($string)
#
# Returns a copy of $string in which the percent-escapes listed in the table
# below are replaced by the corresponding character.  Any other text,
# including percent-escapes that are not in the table, is returned unchanged.
#
# The replacement is done in a single pass so that the output of one rule is
# never decoded again by another one: "%252F" yields "%2F", not "/".
sub htmldecode {
  my ($string) = @_;
  # The table lives inside the sub because the sub is called before the
  # end of the script has been executed.
  my %char_for = (
    '%20' => ' ',
    '%3B' => ';',
    '%2C' => ',',
    '%25' => '%',
    '%2F' => '/',
    '%3C' => '<',
    '%2B' => '+',
    '%3E' => '>',
    '%3D' => '=',
    '%3A' => ':',
    # NOTE(review): %22 is a double quote but is written as a single quote,
    # presumably on purpose for the output tables - confirm.
    '%22' => "'",
    '%26' => '&',
    '%5B' => '[',
    '%5D' => ']',
  );
  $string =~ s/(%[0-9A-F]{2})/exists $char_for{$1} ? $char_for{$1} : $1/ge;
  return $string;
}