#!/usr/bin/perl -w use strict; use lib "/jolidisk/software/src/ensembl/v65/ensembl/modules"; use lib "/jolidisk/software/src/ensembl/v65/ensembl-compara/modules"; use lib "/jolidisk/software/src/ensembl/bioperl-1.2.3/"; use Bio::EnsEMBL::Registry; use POSIX; #use Data::Dumper; my %fileh = ( "ES" => '/jolidisk/users/ballester/mechali/data/reads_filtered/ES_indiff.bed2', "NP" => '/jolidisk/users/ballester/mechali/data/reads_filtered/s_3_NP.bed2', "19H" => '/jolidisk/users/ballester/mechali/data/reads_filtered/s_1_ES_19H.bed2', "input" => '/jolidisk/users/ballester/mechali/data/reads_filtered/input.bed2', ); my $bedes = $fileh{"ES"}; my $bed19h = $fileh{"19H"}; my $bednp = $fileh{"NP"}; my $bedinput = $fileh{"input"}; #-- Output open OUT_ES, ">", "out_es_bins2.txt" or die $!; open OUT_19H, ">", "out_19h_bins2.txt" or die $!; open OUT_NP, ">", "out_np_bins2.txt" or die $!; my $bin = 100; my $kb = 5000; my $verbose; # = 1 ; #--------------------------- # Connect to DBs #-------------------------- my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_registry_from_db( -host => 'ensembldb.ensembl.org', -user => 'anonymous', ); print "api V:",$registry->software_version,"\n" if ($verbose); #-- Get the Compara Adaptors my $gdb_a = $registry->get_adaptor("Compara", "compara", "GenomeDB"); my $dnafrag_a = $registry->get_adaptor("Compara", "compara", "Dnafrag"); my $gdb = $gdb_a->fetch_by_dbID("57"); #-- Get DnaFrog - Chro - Length my $h_dnafrags; my $dnafrags = $dnafrag_a->fetch_all_by_GenomeDB_region($gdb, 'chromosome'); foreach my $dnafrag(@$dnafrags){ my $e = $dnafrag->length; my $chr = $dnafrag->name; $h_dnafrags->{$chr} = $e; } #-- load all BED / ORI my $file = "/jolidisk/users/ballester/mechali/analysis/heatmap/concat_MACS_merged300bp.bed"; open(FILE, $file) or die("Unable to open file"); my $hash_peaks; my $hash_nbs; while (my $line = ) { chomp $line; my ($chro, $start, $end, $nb ) = split(" ", $line); push(@{ $hash_peaks->{$chro} }, $line); $hash_nbs->{$chro}++; } my @sorted_desc = reverse sort { $hash_nbs->{$a} cmp $hash_nbs->{$b} } keys %$hash_nbs; #--------------------------- #-- Read/Bins BED files by 100bp bins #-- Chip - Input for each bin #-- Each window was divided into 100 bins of 100 bp in size. An enrichment value was assigned to each bin by counting the number of sequencing reads in that bin and subtracting the number of reads in the same bin of an input library. #--------------------------- my $hoh_es = &binBed($bedes, $bin, $h_dnafrags); my $hoh_np = &binBed($bednp, $bin, $h_dnafrags); my $hoh_19h = &binBed($bed19h, $bin, $h_dnafrags); my $hoh_input = &binBed($bedinput, $bin, $h_dnafrags); #-- get their RFs, nbs of RFs #-- Foreach description group get the Bed data foreach my $chro (@sorted_desc){ #print join(" ",$chro,$hash_nbs->{$chro})."\n"; my $lines = $hash_peaks->{$chro}; foreach my $line (@$lines){ my ($chro2, $start, $end, $nb ) = split(" ", $line); my $center = floor($start + ($end - $start)/2) ; my $center_floor = floor($center / $bin) * $bin +1 ; ## print "->$center $center_floor\n"; #if ($chro eq "M"){ $chro = "MT"} my $range_s = $center_floor - $kb ; my $range_e = $center_floor + $kb ; my @line_array_es; my @line_array_np; my @line_array_19h; #my @line_array_hnf6; my $size_array = 0; for (my $coord = $range_s; $coord <= $range_e ; $coord+=$bin) { my $start = $coord; my $end = $coord + $bin -1; #my $chro = $module->seq_region_name; #if ($chro eq "M"){ $chro = "MT"}; #-- get reads counts for each bins my $tmpreads_es = $hoh_es->{$chro}->{join(":",$chro,$start,$end)} ; my $tmpreads_np = $hoh_np->{$chro}->{join(":",$chro,$start,$end)} ; my $tmpreads_19h = $hoh_19h->{$chro}->{join(":",$chro,$start,$end)} ; my $reads_input = 0; $reads_input = $hoh_input->{$chro}->{join(":",$chro,$start,$end)} if ($hoh_input->{$chro}->{join(":",$chro,$start,$end)}); my $reads_es; if (!$tmpreads_es){ $reads_es = 0 ; }else{ if ($reads_input > 0){ $reads_es = $tmpreads_es - $reads_input ; }else{ $reads_es = $tmpreads_es; } } push @line_array_es, $reads_es; my $reads_np; if (!$tmpreads_np){ $reads_np = 0 ; }else{ if ($reads_input > 0){ $reads_np = $tmpreads_np - $reads_input ; }else{ $reads_np = $tmpreads_np; } } push @line_array_np, $reads_np; my $reads_19h; if (!$tmpreads_19h){ $reads_19h = 0 ; }else{ if ($reads_input > 0){ $reads_19h = $tmpreads_19h - $reads_input ; }else{ $reads_19h = $tmpreads_19h; } } push @line_array_19h, $reads_19h; $size_array++ ; }#<-- end bins # $i++; ##?? if (scalar @line_array_es == $size_array ){ my $line_es ; my $line_np ; my $line_19h ; foreach my $val (@line_array_es){ $line_es .= $val." "; } foreach my $val (@line_array_19h){ $line_19h .= $val." "; } foreach my $val (@line_array_np){ $line_np .= $val." "; } chop $line_es; chop $line_19h; chop $line_np; print OUT_ES join("\t",$chro2, $start, $end, $nb)." ".$line_es."\n"; print OUT_19H join("\t",$chro2, $start, $end, $nb)." ".$line_19h."\n"; print OUT_NP join("\t",$chro2, $start, $end, $nb)." ".$line_np."\n"; ##?? } }#<- end module # my $fake_line; # for (0..$bin){ # $fake_line .= "999 "; # } # chop $fake_line; # print OUT_ES $fake_line."\n"; # print OUT_NP $fake_line."\n"; # print OUT_19H $fake_line."\n"; }#<- end description module close(OUT_ES); close(OUT_19H); close(OUT_NP); #--------------------------- #-- Read/Bins BED files by 100bp bins #-- Chip - Input for each bin #-- Each window was divided into 100 bins of 100 bp in size. An enrichment value was assigned to each bin by counting the number of sequencing reads in that bin and subtracting the number of reads in the same bin of an input library. #--------------------------- #---------------------------------------------- #-- SUBROUTINES #---------------------------------------------- sub binBed{ ## print "Enter binBed\n"; my ($file, $bin, $h_dnafrags) = @_; #----------------------------------- #-- Create the hash of bins #----------------------------------- my $hoh; # my $dnafrags = $dnafrag_a->fetch_all_by_GenomeDB_region($gdb, 'chromosome'); # foreach my $dnafrag(@$dnafrags){ while ( my ($chr, $e) = each(%$h_dnafrags) ) { my $h; my $s = 1; #my $e = $dnafrag->length; #my $chr = $dnafrag->name; # print $chr, "\n"; for (my $coord = $s; $coord <= $e ; $coord+=$bin) { my $end = $coord + $bin -1; my $tag = join(":", "chr".$chr,$coord,$end); # print "$tag\n" ; #1:1:500 #1:501:1000 #1:1001:1500 #1:1501:2000 $h->{$tag} = 0; } $hoh->{$chr} = $h; } #-------------------- #-- read file #-------------------- # print "open $file\n"; open(FILE, $file) or die("Unable to open file"); ## print "open file\n"; #----------------------------------- #-- populate the hash of bins with nb of reads #----------------------------------- my $i = 0 ; while (my $line = ) { chop $line; #chr1 12 53 HWUSI-EAS582_186:4:29:334:1442/1 0 + my ($chro, $rs, $re, $tmp, $score, $strand) = split(" ",$line); #$chro =~ s/chr//; #if ($chro eq "M"){ $chro = "MT"} #-- this round the start coordinate of the read into a multiple of #-- the bin size eg 152,632 for a bin of 100 => 152,600 my $range_s = floor($rs / $bin) * $bin +1 ; my $range_e = $range_s + $bin -1; my $key = join(":", $chro,$range_s,$range_e); $hoh->{$chro}->{$key}++; my $nb = $hoh->{$chro}->{$key}; # print " => $line -- $key $nb\n"; $i++; } return $hoh; } __END__ ./heatmap.pl