#!/usr/bin/perl -w use strict; use lib "/jolidisk/software/src/ensembl/v65/ensembl/modules"; use lib "/jolidisk/software/src/ensembl/v65/ensembl-compara/modules"; use lib "/jolidisk/software/src/ensembl/bioperl-1.2.3/"; use Bio::EnsEMBL::Registry; use POSIX; #use Data::Dumper; my $bedcoord = $ARGV[0]; my $reads = $ARGV[1]; my $bin = $ARGV[2]; my $kb = $ARGV[3]; my $input = $ARGV[4]; my $verbose; # = 1 ; #--------------------------- # Connect to DBs #-------------------------- my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_registry_from_db( -host => 'ensembldb.ensembl.org', -user => 'anonymous', ); print "api V:",$registry->software_version,"\n" if ($verbose); #-- Get the Compara Adaptors my $gdb_a = $registry->get_adaptor("Compara", "compara", "GenomeDB"); my $dnafrag_a = $registry->get_adaptor("Compara", "compara", "Dnafrag"); my $gdb = $gdb_a->fetch_by_dbID("57"); #-- Get DnaFrog - Chro - Length my $h_dnafrags; my $dnafrags = $dnafrag_a->fetch_all_by_GenomeDB_region($gdb, 'chromosome'); foreach my $dnafrag(@$dnafrags){ my $e = $dnafrag->length; my $chr = $dnafrag->name; $h_dnafrags->{$chr} = $e; } #-- load all BED / ORI open(FILE, $bedcoord) or die("Unable to open $bedcoord"); my $hash_peaks; my $hash_nbs; my @desc; while (my $line = ) { chomp $line; my ($chro, $start, $end) = split(" ", $line); #push(@{ $hash_peaks->{$chro} }, $line); push (@desc, $line); $hash_nbs->{$chro}++; } #my @sorted_desc = reverse sort { $hash_nbs->{$a} cmp $hash_nbs->{$b} } keys %$hash_nbs; #--------------------------- #-- Read/Bins BED files by 100bp bins #-- Chip - Input for each bin #-- Each window was divided into 100 bins of 100 bp in size. An enrichment value was assigned to each bin by counting the number of sequencing reads in that bin and subtracting the number of reads in the same bin of an input library. #--------------------------- my $hoh_reads = &binBed($reads, $bin, $h_dnafrags); my $hoh_input = &binBed($input, $bin, $h_dnafrags) if ($input) ; #-- get their RFs, nbs of RFs #-- Foreach description group get the Bed data foreach my $line (@desc){ #foreach my $chro (@sorted_desc){ #print join(" ",$chro,$hash_nbs->{$chro})."\n"; # my $lines = $hash_peaks->{$chro}; # foreach my $line (@$lines){ my ($chro2, $start, $end, $nb) = split(" ", $line); my $center = floor($start + ($end - $start)/2) ; my $center_floor = floor($center / $bin) * $bin +1 ; ## print "->$center $center_floor\n"; #if ($chro eq "M"){ $chro = "MT"} my $range_s = $center_floor - $kb ; my $range_e = $center_floor + $kb ; my @line_array_es; my $size_array = 0; for (my $coord = $range_s; $coord <= $range_e ; $coord+=$bin) { my $start = $coord; my $end = $coord + $bin -1; #my $chro = $module->seq_region_name; #if ($chro eq "M"){ $chro = "MT"}; #-- get reads counts for each bins my $tmpreads_es = $hoh_reads->{$chro2}->{join(":",$chro2,$start,$end)} ; my $reads_input = 0; $reads_input = $hoh_input->{$chro2}->{join(":",$chro2,$start,$end)} if ($hoh_input->{$chro2}->{join(":",$chro2,$start,$end)} && ($input)); my $reads_es; if (!$tmpreads_es){ $reads_es = 0 ; }else{ if ($reads_input > 0){ $reads_es = $tmpreads_es - $reads_input ; }else{ $reads_es = $tmpreads_es; } } push @line_array_es, $reads_es; $size_array++ ; }#<-- end bins # $i++; ##?? if (scalar @line_array_es == $size_array ){ my $line_es ; foreach my $val (@line_array_es){ $line_es .= $val." "; } chop $line_es; print join("\t",$chro2, $start, $end, $nb)." ".$line_es."\n"; ##?? } # }#<- end module # my $fake_line; # for (0..$bin){ # $fake_line .= "999 "; # } # chop $fake_line; # print OUT_ES $fake_line."\n"; # print OUT_NP $fake_line."\n"; # print OUT_19H $fake_line."\n"; }#<- end description module #--------------------------- #-- Read/Bins BED files by 100bp bins #-- Chip - Input for each bin #-- Each window was divided into 100 bins of 100 bp in size. An enrichment value was assigned to each bin by counting the number of sequencing reads in that bin and subtracting the number of reads in the same bin of an input library. #--------------------------- #---------------------------------------------- #-- SUBROUTINES #---------------------------------------------- sub binBed{ ## print "Enter binBed\n"; my ($file, $bin, $h_dnafrags) = @_; #----------------------------------- #-- Create the hash of bins #----------------------------------- my $hoh; # my $dnafrags = $dnafrag_a->fetch_all_by_GenomeDB_region($gdb, 'chromosome'); # foreach my $dnafrag(@$dnafrags){ while ( my ($chr, $e) = each(%$h_dnafrags) ) { my $h; my $s = 1; #my $e = $dnafrag->length; #my $chr = $dnafrag->name; # print $chr, "\n"; for (my $coord = $s; $coord <= $e ; $coord+=$bin) { my $end = $coord + $bin -1; my $tag = join(":", "chr".$chr,$coord,$end); # print "$tag\n" ; #1:1:500 #1:501:1000 #1:1001:1500 #1:1501:2000 $h->{$tag} = 0; } $hoh->{$chr} = $h; } #-------------------- #-- read file #-------------------- warn "open $file\n"; open(FILE, $file) or die("Unable to open ". $file); #----------------------------------- #-- populate the hash of bins with nb of reads #----------------------------------- my $i = 0 ; while (my $line = ) { chop $line; #chr1 12 53 HWUSI-EAS582_186:4:29:334:1442/1 0 + my ($chro, $rs, $re, $tmp, $score, $strand) = split(" ",$line); #$chro =~ s/chr//; #if ($chro eq "M"){ $chro = "MT"} #-- this round the start coordinate of the read into a multiple of #-- the bin size eg 152,632 for a bin of 100 => 152,600 my $range_s = floor($rs / $bin) * $bin +1 ; my $range_e = $range_s + $bin -1; my $key = join(":", $chro,$range_s,$range_e); $hoh->{$chro}->{$key}++; my $nb = $hoh->{$chro}->{$key}; # print " => $line -- $key $nb\n"; $i++; } return $hoh; } __END__ ./hm_generic.pl concat_MACS_merged300bp.bed /jolidisk/users/ballester/mechali/data/geo/GSE28254_henk/GSM821394_mES_V6.5_H3K27m3_ChIP.reads.bed #-------------------------------------------- #-- Heatmap mark histo #-------------------------------------------- cd /jolidisk/users/ballester/mechali/data/geo/GSE36114_cell2012/ ./hm_generic.pl ~/jmechali/analysis/heatmap/concat_MACS_merged300bp.bed GSM881348_E14_H2AZ.bed > GSM881348_E14_H2AZ.hm ./hm_generic.pl ~/jmechali/analysis/heatmap/concat_MACS_merged300bp.bed GSM881349_E14_H3K27ac.bed > GSM881349_E14_H3K27ac.hm ./hm_generic.pl ~/jmechali/analysis/heatmap/concat_MACS_merged300bp.bed GSM881350_E14_H3K27me3.bed > GSM881350_E14_H3K27me3.hm ./hm_generic.pl ~/jmechali/analysis/heatmap/concat_MACS_merged300bp.bed GSM881351_E14_H3K36me3.bed > GSM881351_E14_H3K36me3.hm ./hm_generic.pl ~/jmechali/analysis/heatmap/concat_MACS_merged300bp.bed GSM881352_E14_H3K4me1.bed > GSM881352_E14_H3K4me1.hm ./hm_generic.pl ~/jmechali/analysis/heatmap/concat_MACS_merged300bp.bed GSM881353_E14_H3K4me2.bed > GSM881353_E14_H3K4me2.hm ./hm_generic.pl ~/jmechali/analysis/heatmap/concat_MACS_merged300bp.bed GSM881354_E14_H3K4me3.bed > GSM881354_E14_H3K4me3.hm