Index: matrix-scan
===================================================================
RCS file: /cvs/rsat/rsa-tools/perl-scripts/matrix-scan,v
retrieving revision 1.184
retrieving revision 1.185
diff -r1.184 -r1.185
4c4
< # $Id: matrix-scan,v 1.184 2010/12/20 16:05:48 jvanheld Exp $
---
> # $Id: matrix-scan,v 1.185 2010/12/22 14:03:04 jvanheld Exp $
304a305
>   local $top_seq = 0;
333c334
<   $infinite = "999"; ## String to print for infinite values (e.g. when the eval < e-300, it i rounded to 0, thus the significance is infinite)
---
>   $infinite = "350"; ## String to print for infinite values (e.g. when the eval < e-300, it is rounded to 0, thus the significance is infinite)
680d680
< 
683c683,684
< 	################################################################
---
> 
>     ################################################################
746,748c747,748
<   		$calc_fields{sites} = 0; #do not calculate sites if matrix-scan-quick is supported
<   		$calc_fields{pval} = 0; #pval is already calculated by matrix-scan-quick
<   	}
---
>     $calc_fields{sites} = 0; #do not calculate sites if matrix-scan-quick is supported
>   }
834,840c834,840
<     	## bg window : calculate bg pseudo frequency
<     	if ($bg_method ne "input") {
<     		unless ($bg_pseudo_user_specified) {
<      			$main::bg_pseudo = sqrt($main::window)/(sqrt($main::window) + $main::window);
<      			&RSAT::message::Debug("window length: ", $main::window,"bg pseudo-frequency calculated:",$main::bg_pseudo) if ($main::verbose >= 3);
<       		}
<     	}
---
>     ## bg window : calculate bg pseudo frequency
>     if ($bg_method ne "input") {
>       unless ($bg_pseudo_user_specified) {
> 	$main::bg_pseudo = sqrt($main::window)/(sqrt($main::window) + $main::window);
> 	&RSAT::message::Debug("window length: ", $main::window,"bg pseudo-frequency calculated:",$main::bg_pseudo) if ($main::verbose >= 5);
>       }
>     }
865c865
<       &RSAT::message::Debug("Creating temporary background model file", $tmp_bg_file) if ($main::verbose >= 3);
---
>       &RSAT::message::Debug("Creating temporary background model file", $tmp_bg_file) if ($main::verbose >= 5);
884c884
<   &RSAT::message::TimeWarn(join("\t", "PRIOR", join(" ", %prior))) if ($main::verbose >= 3);
---
>   &RSAT::message::TimeWarn(join("\t", "PRIOR", join(" ", %prior))) if ($main::verbose >= 4);
908c908
< 	      if ($main::verbose >= 3);
---
> 	      if ($main::verbose >= 4);
958c958
< # 					      %prior, join(" ", $matrix->getPrior()))) if ($main::verbose >= 3);
---
> # 					      %prior, join(" ", $matrix->getPrior()))) if ($main::verbose >= 5);
964c964
< # 					     )) if ($main::verbose >= 3);
---
> # 					     )) if ($main::verbose >= 5);
968,978c968,977
< 		## Set the matrix prior
< 		$matrix->setMarkovModel($bg_model);
< 		&RSAT::message::TimeWarn(join("\t", "Setting matrix priors for matrix", $matrix->get_attribute("name"),
< 					      %prior, join(" ", $matrix->getPrior()))) if ($main::verbose >= 3);
< 		## Calculate min and max weight values
< 		my ($Wmin, $Wmax, $Wrange) = $matrix->weight_range() if ($return_fields{weight_limits} || $calc_fields{normw});
< 		&RSAT::message::TimeWarn(join("\t", "Calculated weight range for matrix",
< 					      $matrix->get_attribute("name"),
< 					      $Wmin, $Wmax, $Wrange,
< 					     )) if ($main::verbose >= 3);
< 
---
> 	  ## Set the matrix prior
> 	  $matrix->setMarkovModel($bg_model);
> 	  &RSAT::message::TimeWarn(join("\t", "Setting matrix priors for matrix", $matrix->get_attribute("name"),
> 					%prior, join(" ", $matrix->getPrior()))) if ($main::verbose >= 4);
> 	  ## Calculate min and max weight values
> 	  my ($Wmin, $Wmax, $Wrange) = $matrix->weight_range() if ($return_fields{weight_limits} || $calc_fields{normw});
> 	  &RSAT::message::TimeWarn(join("\t", "Calculated weight range for matrix",
> 					$matrix->get_attribute("name"),
> 					$Wmin, $Wmax, $Wrange,
> 				       )) if ($main::verbose >= 4);
989c988
< 				) if ($main::verbose >= 3);
---
> 				) if ($main::verbose >= 4);
1139c1138
<   &RSAT::message::TimeWarn("Scaning sequences with", scalar(@matrices), "matrices") if ($main::verbose >= 2);
---
>   &RSAT::message::TimeWarn("Scanning sequences with", scalar(@matrices), "matrices") if ($main::verbose >= 2);
1142a1142
> 
1145a1146,1159
>     ## Sequence length
>     $current_seq = lc($current_seq);
>     my $seq_len = length($current_seq);
>     $sum_seq_len += $seq_len;
> 
>     ## Update sequence number
>     $sequence_number++;
> 
>     ## Top sequences
>     if (($top_seq > 0) && ($sequence_number > $top_seq)) {
>       &RSAT::message::Info("Stopped after $top_seq sequences") if ($main::verbose >= 2);
>       last;
>     }
> 
1151c1165
<     $sequence_scores->{$seq_id}->{length} = length($current_seq);
---
>     $sequence_scores->{$seq_id}->{length} = $seq_len;
1156,1204d1169
< 
<     ## Count the number of N residues and increment the corresponding counter
<     my $n_seq = $current_seq;
<     $n_seq =~ s/[^N]//gi;
<     $n_residues += length($n_seq);
< 
<     ## Sequence length
<     $current_seq = lc($current_seq);
<     $sequence_number++;
<     my $seq_len = length($current_seq);
<     $sum_seq_len += $seq_len;
< 
<     ## Calculate the offset
<     local $orig_pos = 0; ## $orig_pos is local because it has to be passed to subroutines for CRER positions
<     local $ref_strand = "D"; ## Strand of the sequence relative to the
< 			     ## reference (chromosome) for genomic
< 			     ## coordinates. if this strand is R, all
< 			     ## matches have to be reversed.
<     if ($origin eq "end") {
<       $orig_pos = $seq_len + $main::offset + 1;
<     } elsif ($origin eq "center") {
<       $orig_pos = &round(($seq_len+1)/2) + $main::offset;
<     } elsif ($origin eq "start") {
<       $orig_pos = $main::offset;
<     } elsif ($origin eq "genomic") {
<       ($ref_org, $ref_chrom, $ref_start, $ref_end, $ref_strand, $genome_format, $browser_url) = &parse_genomic_coordinates($seq_id, @seq_comments);
<       if ((defined($ref_strand)) && ($ref_strand eq "D")) {
< 	$orig_pos = -$ref_start +1 + $main::offset;
< #      } elsif ($ref_strand eq "R") {
<       } else {
< 	$ref_strand = "DR";
< 	$orig_pos = -$ref_end +1 + $main::offset;
< #      } else {
< #	&RSAT::error::FatalError($ref_strand, "Invalid strand specification for genomic coordinates");
<       }
<       $seq_id = $ref_chrom;
< 
<       &RSAT::message::Debug("Reference for genomic coordinates",
< 			    "start=".$ref_start,
< 			    "end=".$ref_end,
< 			    "strand=".$ref_strand,
< 			    "orig_pos=".$orig_pos,
< 			   ) if ($main::verbose >= 10);
<     }
< 
<     ################################################################
<     ## Report sequence limits
<     &PrintSequenceLimits($seq_id, $seq_len, $orig_pos, $ref_strand) if ($return_fields{limits});
< 
1234c1199,1200
<     ### CRER detection
---
>     ################################################################
>     ### Initialize variables for CRER detection
1244c1210
<       ## adapt seed to that final crer size does not exceed the desired crer_size
---
>       ## adapt seed so that final CRER size does not exceed the desired crer_size
1256,1258c1222,1231
<     &RSAT::message::TimeWarn("Scanning sequence", $sequence_number,  $seq_id,"len=".$seq_len, "orig=".$orig_pos,  $seq_id,  "with ".scalar(@matrices)." PSSM")
<       if ($main::verbose >= 3);
<     &RSAT::message::psWarn("Scanning sequence with ".scalar(@matrices)." PSSM", $sequence_number,  $seq_id,"len=".$seq_len, "orig=".$orig_pos,  $seq_id) if (($main::verbose >= 2) && (($sequence_number % 50) == 0));
---
>     &RSAT::message::TimeWarn("Scanning sequence", $sequence_number,
> 			     $seq_id,"len=".$seq_len, 
> 			     "orig=".$origin, "offset=".$offset,
> 			     $seq_id,  "with ".scalar(@matrices)." PSSM")
>       if (($main::verbose >= 4) || (($main::verbose >= 3) && (($sequence_number % 10) == 1)));
>     &RSAT::message::psWarn("Scanning sequence with ".scalar(@matrices)." PSSM", $sequence_number,
> 			   $seq_id,"len=".$seq_len, 
> 			   "orig=".$origin, "offset=".$offset,
> 			   $seq_id) 
>       if (($main::verbose >= 4) && (($sequence_number % 100) == 1));
1309c1282
< 		&RSAT::message::Debug("added word", $added_word,"begins at" ,$pos - ($markov + 2) ) if ($main::verbose >= 3);
---
> 		&RSAT::message::Debug("added word", $added_word,"begins at" ,$pos - ($markov + 2) ) if ($main::verbose >= 5);
1334c1307
< 		&RSAT::message::Debug("deleted word", $deleted_word,"begins at" ,($pos + $ncol -1) -1) if ($main::verbose >= 3);
---
> 		&RSAT::message::Debug("deleted word", $deleted_word,"begins at" ,($pos + $ncol -1) -1) if ($main::verbose >= 5);
1455,1456c1428,1429
<     	&RSAT::message::TimeWarn( "Returning individual crer for sequence", $seq_id)
< 	  if ($main::verbose >= 3);
---
>     	&RSAT::message::TimeWarn( "Returning individual CRER for sequence", $seq_id)
> 	  if ($main::verbose >= 4);
1460c1433
< 	  &RSAT::message::Info("No crer in sequence",
---
> 	  &RSAT::message::Info("No CRER in sequence",
1462c1435
< 			      ) if ($main::verbose >= 3);
---
> 			      ) if ($main::verbose >= 4);
1466c1439
< 	  &RSAT::message::TimeWarn("Sorting crer for sequence", $seq_id) if ($main::verbose >= 3);
---
> 	  &RSAT::message::TimeWarn("Sorting CRER for sequence", $seq_id) if ($main::verbose >= 5);
1474c1447
<       	## Return the crer
---
>       	## Return the CRER
1492c1465
< 	if ($main::verbose >= 3);
---
> 	if ($main::verbose >= 4);
1500c1473
< 			    ) if ($main::verbose >= 3);
---
> 			    ) if ($main::verbose >= 4);
1504c1477
< 	&RSAT::message::TimeWarn("Sorting matches for sequence", $seq_id) if ($main::verbose >= 3);
---
> 	&RSAT::message::TimeWarn("Sorting matches for sequence", $seq_id) if ($main::verbose >= 4);
1554c1527
<    if ($quick_scan) {
---
>    if (($return_fields{sites}) && ($quick_scan)) {
1703c1676
< 		&QuickScan($matrix,"distrib");
---
> 	&QuickScan($matrix,"distrib");
1714c1687
< 	&RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
---
> 	&RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
1734c1707
< 	&RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
---
> 	&RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
1769c1742
< 	&RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
---
> 	&RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
1773c1746
< 	  my $prior = $null;
---
> 	  my $site_pval = $null;
1777c1750
< 	      $prior  = $exp_score_distrib->{$matrix_name}->{$score}->{occ_prior};
---
> 	      $site_pval  = $exp_score_distrib->{$matrix_name}->{$score}->{occ_prior};
1779c1752
< 				    $matrix_name, $score, $prior) if ($main::verbose >= 5);
---
> 				    $matrix_name, $score, $site_pval) if ($main::verbose >= 5);
1785c1758
< 	    $prior = ${$main::pval{$matrix}}{$score};
---
> 	    $site_pval = ${$main::pval{$matrix}}{$score};
1787,1789c1760,1761
< 	  $score_distrib->{$matrix_name}->{$score}->{occ_prior} = $prior;
< 	  if ($prior eq $null) {
< 	
---
> 	  $score_distrib->{$matrix_name}->{$score}->{occ_prior} = $site_pval;
> 	  if ($site_pval eq $null) {
1792c1764
< 	    $score_distrib->{$matrix_name}->{$score}->{exp_occ} = $prior * $n;
---
> 	    $score_distrib->{$matrix_name}->{$score}->{exp_occ} = $site_pval * $n;
1805c1777
< 	&RSAT::message::TimeWarn("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
---
> 	&RSAT::message::TimeWarn("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
1841c1813
< 	  if ($main::verbose >= 3) {
---
> 	  if ($main::verbose >= 4) {
1864c1836
<       &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
---
>       &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
1991a1964
> -top_seq #	only scan the top # sequences
2213a2187,2200
> =pod
> 
> =item B<-top_seq #>
> 
> Only scan the top # sequences. This option allows one to perform
> quick tests or to scan only a given number of sequences at the top of
> the input file (e.g. for a collection of ChIP-seq peaks).
> 
> =cut
>     } elsif ($arg eq "-top_seq") {
>       $main::top_seq = shift(@arguments);
>       &RSAT::error::FatalError("$main::top_seq. Invalid top_seq, should be an Integer number.")
> 	unless (&RSAT::util::IsInteger($main::top_seq));
> 
3110,3112c3097,3100
<   my $matrix = shift;
<   my $scan_mode = shift;
<   my $total_scored = shift;
---
>   my ($matrix, $scan_mode, $total_scored) = @_;
> 
> 
>   &RSAT::message::TimeWarn("Scanning with matrix-scan-quick", "mode=".$scan_mode) if ($main::verbose >= 2);
3130c3118
<   &RSAT::message::TimeWarn("Exported partial matrix to file", $tmp_matrix_file) if ($main::verbose >= 2);
---
>   &RSAT::message::TimeWarn("Exported partial matrix to file", $tmp_matrix_file) if ($main::verbose >= 3);
3217a3206
>       $score_distrib->{$matrix_name}->{$score}->{score} = $score; ## This is just a trick to filter the distrib on score thresholds
3221d3209
<       $score_distrib->{$matrix_name}->{$score}->{score} = $score; ## This is just a trick to filter the distrib on score thresholds
3736c3724
< Calculate the score of the crer
---
> Calculate the score of the CRER
4026c4014
<       &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
---
>       &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
4048c4036
<       &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
---
>       &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
4159c4147
<     my @sorted_scores = sort {$a <=> $b} keys %{$score_distrib->{$matrix_name}};
---
>     &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 4));
4161d4148
<     &RSAT::message::Info("score range", $matrix_name, $sorted_scores[0], $sorted_scores[$#sorted_scores], scalar(@sorted_scores)) if ((scalar(@sorted_scores) >= 1) && ($main::verbose >= 3));
4162a4150
>     my @sorted_scores = ();
4167c4155,4158
<       &RSAT::message::Info("Sorting distribution by occ_pval") if ($main::verbose >= 4);
---
>       &RSAT::message::Info("Sorted distribution by occ_pval") if ($main::verbose >= 4);
>     } else {
>       @sorted_scores = sort {$a <=> $b} keys %{$score_distrib->{$matrix_name}};
>       &RSAT::message::Info("Sorted distribution by weight score") if ($main::verbose >= 4);
