#! /usr/bin/perl use strict; open GTF, "TreeseiV2_FilteredModelsv2.0.gtf"; open CDSTAB, ">cds.tab"; open CONTIGS, ">contigs.txt"; open SYNONYMS, ">cds_names.tab"; open SEQIN, "Trichoderma_reeseiV2.fasta"; open PROTSEQ, "TreeseiV2_FilteredModelsv2.0.proteins.fasta"; open AA, ">Trichoderma_reesei_aa.fasta"; my %features = (); my %contigs = (); while (my $ligne = ) { chomp $ligne; # next if ($ligne !~ /transcript/); my @lignecp = split /\t/, $ligne; my $scaffold = $lignecp[0]; my $transcript_gene = $lignecp[8]; my @transcript_genecp = split ";", $transcript_gene; my $gene_id = $transcript_genecp[0]; $gene_id =~ s/gene_id \"//; $gene_id =~ s/\"//; my $transcript_id = $transcript_genecp[1]; my $dir = $lignecp[6]; # fill the hash $transcript_id =~ s/transcript_id \"//; if (!defined ($features{$gene_id}{"transcript_id"})) { $transcript_id =~ s/ transcript_id \"//; $transcript_id =~ s/\"//; $features{$gene_id}{"transcript_id"} = $transcript_id if ($transcript_id ne ""); } else { $transcript_id = $features{$gene_id}{"transcript_id"}; } $features{$gene_id}{"contig"} = $scaffold; $features{$gene_id}{"dir"} = $dir; if ($lignecp[2] eq "start_codon") { $features{$gene_id}{"start"} = $lignecp[3] if ($dir eq "+"); $features{$gene_id}{"start"} = $lignecp[4] if ($dir eq "-"); } if ($lignecp[2] eq "stop_codon") { $features{$gene_id}{"stop"} = $lignecp[4] if ($dir eq "+"); $features{$gene_id}{"stop"} = $lignecp[3] if ($dir eq "-"); } if ($lignecp[2] eq "CDS") { if ($dir eq "+") { $features{$gene_id}{"cdsstart"} = $lignecp[3] if ($lignecp[3] < $features{$gene_id}{"cdsstart"} || !defined( $features{$gene_id}{"cdsstart"})); $features{$gene_id}{"cdsstop"} = $lignecp[4] if ($lignecp[4] > $features{$gene_id}{"cdsstop"} || !defined( $features{$gene_id}{"cdsstop"})); } else { $features{$gene_id}{"cdsstart"} = $lignecp[4] if ($lignecp[4] > $features{$gene_id}{"cdsstart"} || !defined( $features{$gene_id}{"cdsstart"})); $features{$gene_id}{"cdsstop"} = $lignecp[3] if ($lignecp[3] < $features{$gene_id}{"cdsstop"} || !defined( $features{$gene_id}{"cdsstop"})); } } print CONTIGS join "\t", $scaffold.".tab", $scaffold, "linear\n" if (!defined($contigs{$scaffold})); $contigs{$scaffold}++; } print CDSTAB "-- dump date 20080308_233549\n-- class Genbank::CDS\n-- table cds\n-- table main\n-- field 1 id\n-- field 2 GI\n-- field 3 GeneID\n-- field 4 chrom_position\n-- field 5 chromosome\n-- field 6 codon_start\n-- field 7 contig\n-- field 8 description\n-- field 9 end_pos\n-- field 10 gene\n-- field 11 gene_id\n-- field 12 name\n-- field 13 organism\n-- field 14 product\n-- field 15 protein_id\n-- field 16 start_pos\n-- field 17 strand\n-- field 18 taxid\n-- field 19 type\n-- header\n-- id GI GeneID chrom_position chromosome codon_start contig description end_pos gene gene_id name organism product protein_id start_pos strand taxid type\n"; foreach my $gene_id (keys %features){ my $transcript_id = $features{$gene_id}{"transcript_id"}; my $start = $features{$gene_id}{"start"} || $features{$gene_id}{"cdsstart"}; my $stop = $features{$gene_id}{"stop"} || $features{$gene_id}{"cdsstop"}; if (!defined($start)) { $start = $features{$gene_id}{"cdsstart"}; # print "$gene_id $start\n"; # exit(0); } my $dir = $features{$gene_id}{"dir"}; if ($start > $stop) { my $swap = $stop; $stop = $start; $start = $swap; } my $chrom_position = $start."..".$stop; if ($dir eq "-") { $chrom_position = "complement(".$chrom_position.")"; $dir = "R"; } else { $dir = "D"; } my $contig = $features{$gene_id}{"contig"}; my $org_id = 51453; my $organism = "Trichoderma reesei"; my $type = "CDS"; my @outputline = (); $outputline[0] = $gene_id; $outputline[1] = ""; $outputline[2] = $gene_id; $outputline[3] = $chrom_position; $outputline[4] = ""; $outputline[5] = 1; $outputline[6] = $contig; $outputline[7] = ""; $outputline[8] = $stop; $outputline[9] = $gene_id; $outputline[10] = $gene_id; $outputline[11] = $gene_id; $outputline[12] = $organism; $outputline[13] = $transcript_id; $outputline[14] = $transcript_id; $outputline[15] = $start; $outputline[16] = $dir; $outputline[17] = $org_id; $outputline[18] = $type; print CDSTAB join "\t", @outputline; print CDSTAB "\n"; print SYNONYMS join "\t", $gene_id, $gene_id, "primary"."\n"; print SYNONYMS join "\t", $gene_id, $transcript_id, "alternate"."\n"; } while (my $ligne = ) { chomp $ligne; if ($ligne =~ />/) { close SEQOUT; my $file = $ligne.".tab"; $file =~ s/>//; open SEQOUT, ">$file"; } else { print SEQOUT $ligne; } } while (my $ligne = ) { chomp $ligne; if ($ligne =~ />/) { my @lignecp = split /\|/, $ligne; my $id = $lignecp[3]; print AA ">$id\n"; } else { $ligne =~ s/\*//; print AA $ligne."\n"; } }