#!/usr/bin/perl

############################################################
#
# $Id: position-analysis,v 1.60 2011/02/17 05:07:46 rsat Exp $
#
# Time-stamp: <2003-10-21 01:16:40 jvanheld>
#
############################################################
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";
require RSAT::util;
require RSAT::stats;
use POSIX qw(sysconf _PC_CHOWN_RESTRICTED);
#use Math::CDF;
use Statistics::Distributions;

################################################################
###### default values for parameters
##
## All variables below are package globals (the script does not run
## under "use strict"). They are set before ReadArguments() is called.
$class_interval = 20;	## width of a positional window
$in_format = "fasta";	## input sequence format
$strands = "1str";	## "1str" or "2str" (see options -1str / -2str)
$group_rc = 0;		## group reverse complementary pairs (only acted upon by SumStrands)
$img_format = $ENV{rsat_img_format} || "png";
$XYgraph_command = "$SCRIPTS/XYgraph";
$log_base = log(10);	## divisor converting natural logarithms to log10 (used for sig)
$MAX_SIG = 75; ## Value to be displayed for the significance when the P-value is lower than precision limit (thus, P-val = 0)

## Origin and offset
$origin = "start";
@supported_origins = qw(start center end);
$supported_origin{$_} = 1 foreach (@supported_origins);
$supported_origins = join ",", @supported_origins;
$offset = 0;

#### initialize other variables
$start_time = &RSAT::util::StartScript();
$sequence_number = 0;

$max_seq_length = 0;
$last = 0; ## max number of sequences to treat (for quick testing)
$max_graphs = -1; #  maximal number of graphs to export

$nb_possible_pos = 0;
$sum_seq_length = 0;
$no_check = 0;
$no_filter = 0;
$no_filter_graphs = 0;

%supported_seq_type = ("dna"=>1,
		       "any"=>1);
## Comma-separated list of the supported sequence types. The list has
## to be joined explicitly: "sort" evaluated in scalar context has
## undefined behaviour.
$supported_seq_types = join ",", sort keys %supported_seq_type;
$seq_type = "any";

&ReadArguments();

##############################
### check parameter values ###
##############################

### The oligonucleotide length must be strictly positive
unless ($oligo_length > 0) {
  print "\tYou should specify an oligonucleotide length > 0.\n";
  print "\tType position-analysis -h for more info..\n";
  exit;
}

#### min and max positions to take into account for chi-square
#### calculation: when both are specified, they must be in increasing
#### order
if ((&IsInteger($min_pos)) && (&IsInteger($max_pos)) && ($max_pos < $min_pos)) {
    &RSAT::error::FatalError( "min position should be smaller than max position");
}

## Title
$title = "position-analysis" unless ($title);

################################################################
#### specific treatment for output file, because if graphs are
#### requested, they must be saved in the same directory as the output
#### file
if ($outputfile) {
    ## File::Basename is used rather than the external "dirname"
    ## command, so that file names containing spaces or shell
    ## metacharacters are handled correctly.
    require File::Basename;
    $dir{output} = File::Basename::dirname($outputfile);
}
&RSAT::util::CheckOutDir($dir{output});
$out = &OpenOutputFile($outputfile);

### open sequence file ###
&CheckInputSeqFormat($in_format);
($in, $input_dir) = &OpenInputFile($inputfile);

&LocalReadPatterns() if ($pattern_file);
&ReadSequence();
## String comparison is required here: with the numeric operator "==",
## both operands evaluate to 0 and the test is always true.
&CheckDNA() if ($seq_type eq "dna");
&CalcClasses();

### statistics on oligo occurrences
&RSAT::message::TimeWarn("Calculating sums of occurrences") if ($main::verbose >= 2);
foreach my $oligo_seq (keys %pattern) {
  $sum_occurrences += $pattern{$oligo_seq}->{occ};
  $sum_overlaps += $pattern{$oligo_seq}->{overlaps};
}

&SumStrands() if ($strands eq "2str");

&RSAT::message::Info("Number of patterns before filtering", scalar(keys(%pattern))) if ($main::verbose >= 2);

#### if a pattern file has been specified, forget info about other patterns
if ($pattern_file) {
    foreach my $oligo_seq (keys %pattern) {
	delete $pattern{$oligo_seq} unless ($selected_pattern{$oligo_seq});
    }
    &RSAT::message::Info("Pattern file filtering", scalar(keys(%pattern)), "remaining patterns") if ($main::verbose >= 2);
}


#### check threshold on occurrences
if (defined($lth{'occ'})) {
  foreach my $oligo_seq (keys %pattern) {
    delete $pattern{$oligo_seq} if ($pattern{$oligo_seq}->{occ} < $lth{'occ'});
  }
  &RSAT::message::Info("Filtering by occurrences", scalar(keys(%pattern)), "remaining patterns") if ($main::verbose >= 2);
}


&CalcExpected();

&CalcChi() if ($return{chi});

&RSAT::message::Info("Chi and sig computed", scalar(keys(%pattern)), "remaining patterns") if ($main::verbose >= 2);

&Verbose() if ($main::verbose >= 1);


&PrintResult();

################################################################
### generate XYgraphs
if ($return{graphs}) {

    ## Quote a string so that the shell passes it as a single word
    ## (titles are user-supplied and may contain quotes or spaces).
    my $shell_quote = sub {
	my ($text) = @_;
	$text = "" unless (defined($text));
	$text =~ s/'/'\\''/g;
	return "'".$text."'";
    };

    ## Directory of the main output. $dir{output} is only set above when
    ## an output file was specified; fall back on the current directory
    ## rather than building paths starting at "/".
    my $output_dir = (defined($dir{output}) && ($dir{output} ne "")) ? $dir{output} : ".";

    #### directory for storing the graphs
    $rel_dir{graphs}="graphs";
    if ($outputfile) {
	require File::Basename;
	$basename = File::Basename::basename($outputfile);
	$basename =~ s/\.tab$//;
	$rel_dir{graphs} = $basename."_".$rel_dir{graphs};
    }

    $dir{graphs} = $output_dir."/".$rel_dir{graphs};
    unless (-d $dir{graphs}) {
	warn "Creating directory $dir{graphs}\n" if ($main::verbose >= 1);
	mkdir($dir{graphs}, 0755);
	unless (-d $dir{graphs}) {
	    &RSAT::error::FatalError("Cannot create graph directory $dir{graphs}");
	}
    }

    $date = &AlphaDate();
    chomp $date;
    print ";$date\tgenerating the graphs ... \n" if ($main::verbose >= 2);

    #### index for the graphs
    if ($outputfile) {
	$index_file = $outputfile;
	$index_file =~ s/\.tab$//; ## Suppress tab extension
	$index_file .= "_graph_index.html";
    } else {
	$index_file = "$output_dir/graph_index_${oligo_length}nt_ci${class_interval}_$strands.html";
    }

    &RSAT::message::Info("Index file", $index_file) if ($main::verbose >= 2);
    open(my $index, ">", $index_file)
	|| &RSAT::error::FatalError("Cannot write graph index file $index_file");

    ## Unbuffered output, so that the index can be consulted while the
    ## graphs are being generated.
    {
	my $previous_handle = select($index);
	$| = 1;
	select($previous_handle);
    }

    print $index "<html>";
    print $index "<title>$title ; ${oligo_length}nt $strands $noov</title>";
    print $index "<body>\n";
    print $index "<h1>$title</h1>";
    print $index "<table>\n";
    print $index "<tr>\n";
    print $index "<th>Sequence</th>\n";
    print $index "<th>Occ</th>\n";
    print $index "<th>Chi2</th>\n";
    print $index "<th>Rank</th>\n";
    print $index "<th>Eval</th>\n";
    print $index "<th>Sig</th>\n";
    print $index "</tr>\n";

    #### generate one graph for each oligo
    ## NOTE(review): @sorted_keys is not set in the visible part of the
    ## script; presumably filled by PrintResult() -- confirm.
    my $graphs_done = 0;
    foreach my $oligo_seq (@sorted_keys) {
	if ($return{chi}) {
	    next unless (&IsReal($pattern{$oligo_seq}->{chi_square}) || ($no_filter_graphs));
	}

	## Stop once the requested number of graphs has been exported
	if (($max_graphs > 0) && ($graphs_done >= $max_graphs)) {
	    &RSAT::message::Warning("Exported $graphs_done graphs");
	    last;
	}
	$graphs_done++;

	my $score = "NA";
	my $Eval = sprintf "%.1g", $pattern{$oligo_seq}->{Eval};
	my $sig = $pattern{$oligo_seq}->{sig};
	my $graph_file_name = join("", $oligo_seq, "_ci", $class_interval, "_", $strands, "_pos_distrib.",$img_format);
	my $graph_path = $dir{graphs}."/".$graph_file_name;
	my $title2 = $pattern{$oligo_seq}->{in_bound_occ}." occurrences";
	$title2 .= ", class interval=$class_interval";
	if ($score_column > 0) {
	    ## LocalReadPatterns() stores the scores in %score; the
	    ## per-pattern attribute is used if some other code defined it.
	    my $pattern_score = $pattern{$oligo_seq}->{score};
	    $pattern_score = $score{$oligo_seq} unless (defined($pattern_score));
	    $score = sprintf "score = %.2f", $pattern_score;
	} elsif ($return{chi}) {
	    if (&IsReal($pattern{$oligo_seq}->{chi_square})) {
		$score = sprintf "%.2f", $pattern{$oligo_seq}->{chi_square};
	    } else {
		$score = $pattern{$oligo_seq}->{chi_square};
	    }
	    $title2 .= "; chi2 = $score";
	    $title2 .= "; Eval=".$Eval;
	    $title2 .= "; sig=".$sig;
	}
	&RSAT::message::Debug("Exporting graph for oligo", $oligo_seq, $graph_path) if ($main::verbose >= 5);

	## Class boundaries are stored in the hashes %class_min and
	## %class_max (see CalcClasses), indexed by class number.
	my $command = $XYgraph_command." -o ".$shell_quote->($graph_path);
	$command .= " -lines -xcol 1 -ycol 2,3 -legend ";
	$command .= " -title1 ".$shell_quote->("$title ; $oligo_seq distribution profile")." ";
	$command .= " -title2 ".$shell_quote->($title2)." ";
	$command .= " -xleg1 'position' -yleg1 'class frequency' ";
	$command .= " -ymin 0 -xmin $class_min{$min_class} -xmax $class_max{$max_class} ";
	$command .= " -xgstep2 $class_interval -xsize 600 -ysize 200";
	$command .= " -format ".$img_format;
	print "; $command\n" if ($main::verbose >= 3);

	## Send the observed and expected profiles to XYgraph
	my $xy;
	unless (open($xy, "| $command")) {
	    &RSAT::message::Warning("Cannot run XYgraph for pattern $oligo_seq");
	    next;
	}
	print $xy ";class\tocc_$oligo_seq\texp_$oligo_seq\n"; ### header line
	foreach my $class ($min_class..$max_class) {
	    print $xy "$class_center{$class}\t";
	    print $xy "$class_freq{$oligo_seq}{$class}\t";
	    print $xy "$exp_occ{$oligo_seq}{$class}\n";
	}
	## Closing the pipe waits for XYgraph to finish writing the image
	unless (close($xy)) {
	    &RSAT::message::Warning("XYgraph reported an error for pattern $oligo_seq");
	}

	print $index "<tr>\n";
	print $index "<td><A HREF=\"$rel_dir{graphs}/$graph_file_name\">$oligo_seq</A></td>\n";
	print $index "<td>", $pattern{$oligo_seq}->{in_bound_occ}, "</td>\n";
	print $index "<td>", $score, "</td>\n";
	print $index "<td>", $pattern{$oligo_seq}->{rank}, "</td>\n";
	print $index "<td>", $Eval, "</td>\n";
	print $index "<td>", $sig, "</td>\n";
	print $index "</tr>\n";
    }

    print $index "</TABLE>\n";
    print $index "</BODY></HTML>\n";
    close $index;
}



################################################################
## Close output stream
## Report the execution time (only when some verbosity was requested),
## then close the output stream and terminate with a success status.
my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
if ($main::verbose >= 1) {
    print $main::out $exec_time;
}
close $out;

exit(0);


################################################################
## Sum occurrences and profiles of reverse complement patterns for strand insensitive analysis ###
## Reads and updates the globals %pattern (attribute "occ") and
## %class_freq; fills %occurrences_2strands. When $group_rc is set, the
## member of each reverse complementary pair which sorts last is
## removed from %pattern and %class_freq.
##
## Takes no argument and returns no meaningful value.
sub SumStrands {
  &RSAT::message::TimeWarn("Summing occurrences of reverse complementary patterns") if ($main::verbose >= 2);

  ## Reverse complement of each pattern, computed once for all
  my %rc_of = ();
  foreach my $oligo_seq (keys %pattern) {
    $rc_of{$oligo_seq} = lc(&ReverseComplement($oligo_seq));
  }

  ################################################################
  ## Sum occurrences. The sums are collected in a separate hash before
  ## being copied back, so that no pattern is summed twice. As for the
  ## position profiles below, reverse palindromic patterns are not
  ## added to themselves.
  foreach my $oligo_seq (keys %pattern) {
    my $rc_oligo_seq = $rc_of{$oligo_seq};
    my $occ_2str = $pattern{$oligo_seq}->{occ} || 0;
    if (($rc_oligo_seq ne $oligo_seq) && (exists($pattern{$rc_oligo_seq}))) {
      $occ_2str += $pattern{$rc_oligo_seq}->{occ} || 0;
    }
    $occurrences_2strands{$oligo_seq} = $occ_2str;
    $occurrences_2strands{$rc_oligo_seq} = $occ_2str;
  }
  ## Only patterns already present are updated (no entry is created for
  ## a reverse complement which was not observed in the sequences)
  foreach my $oligo_seq (keys %pattern) {
    $pattern{$oligo_seq}->{occ} = $occurrences_2strands{$oligo_seq};
  }

  ################################################################
  ## Sum position profiles
  for my $class ($min_class..$max_class) {
    my %both_strands = ();
    foreach my $oligo_seq (keys %pattern) {
      my $rc_oligo_seq = $rc_of{$oligo_seq};
      if ($rc_oligo_seq eq $oligo_seq) {
	$both_strands{$oligo_seq} = $class_freq{$oligo_seq}{$class};
      } else {
	my $rc_freq = 0;
	if (exists($class_freq{$rc_oligo_seq})) {
	  $rc_freq = $class_freq{$rc_oligo_seq}{$class} || 0;
	}
	$both_strands{$oligo_seq} = ($class_freq{$oligo_seq}{$class} || 0) + $rc_freq;
      }
    }
    foreach my $oligo_seq (keys %both_strands) {
      $class_freq{$oligo_seq}{$class} = $both_strands{$oligo_seq};
    }
  }

  #### if requested, group results by pairs of reverse complements ####
  if ($group_rc) {
    &RSAT::message::TimeWarn("Grouping patterns by pairs of reverse complements") if ($main::verbose >= 2);
    foreach my $oligo_seq (keys %rc_of) {
      my $rc_oligo_seq = $rc_of{$oligo_seq};
      if ($rc_oligo_seq gt $oligo_seq) { ### only suppress one oligo from the pair
	delete $pattern{$rc_oligo_seq};
	delete $class_freq{$rc_oligo_seq};
      }
    }
  }
}


################################################################
#### read patterns from a file
## Read the selection of patterns from the file $pattern_file.
##
## Lines starting with ";" and blank lines are skipped. The first
## whitespace-separated word of each other line is taken as a pattern
## (converted to lowercase). When $score_column is positive, the
## corresponding column is stored as the score of the pattern.
##
## Fills the globals %selected_pattern, %score and @selected_patterns.
## Dies if the pattern file cannot be opened.
sub LocalReadPatterns {
  $date = &AlphaDate();
  chomp $date;
  warn ";$date\treading pattern file ...\n" if ($main::verbose >= 1);

  ## "or" (low precedence) is required here: with "||" the alternative
  ## is attached to the file name rather than to the result of open().
  open(my $patterns, "<", $pattern_file)
    or die "Error: cannot open pattern file $pattern_file\n";

  while (my $line = <$patterns>) {
    next if ($line =~ /^;/);	## comment line
    next unless ($line =~ /\S/);	## empty line
    chomp($line);

    ## Columns are separated by any whitespace (leading whitespace is
    ## ignored), so that the first word of the line is the pattern.
    my @fields = split(" ", $line);
    my $pattern_seq = lc($fields[0]);
    warn ";\t$pattern_seq\n" if ($main::verbose >= 2);
    $selected_pattern{$pattern_seq} = 1;
    $score{$pattern_seq} = $fields[$score_column-1] if ($score_column > 0);
  }
  close $patterns;
  @selected_patterns = keys %selected_pattern;
}


################################################################
#### Read input sequence and calculate oligo distributions 
## Read all input sequences from the stream $in and count the
## occurrences of each oligonucleotide per positional class.
##
## Globals updated:
##   %pattern        attributes "occ" and "overlaps" of each oligo
##   %class_freq     occurrences per oligo and per class
##   %pos_per_class  number of scanned positions per class
##   %seq_per_class  number of sequences covering each class
##   $min_class, $max_class   extreme class numbers over all sequences
##   @seq_length, @id_list, $sum_seq_length, $max_seq_length,
##   $max_last_pos, $nb_possible_pos, $sum_seq_per_class,
##   $sequence_number
##
## Takes no argument and returns no meaningful value.
sub ReadSequence {
  &RSAT::message::TimeWarn(join("\t", "Reading sequences"))
    if ($main::verbose >= 2);

  ## Numeric origins (legacy syntax) use classes starting at position 0
  ## rather than 1. The test does not depend on the sequence.
  my $zero_based_classes = (($origin eq "-0") || ($origin < 0));

  ## Read all sequences and count oligo occurrences per class interval
  $sequence_number = 0;
  while ((($current_seq, $current_id, @comments) = &ReadNextSequence($in, $in_format, $input_dir, "", $mask)) &&
	 (($current_seq ne "") || ($current_id ne ""))) {

    ## Stop before counting the sequence, so that $sequence_number
    ## remains the number of sequences actually treated.
    if (($last > 0) && ($sequence_number >= $last)) {
      &RSAT::message::Warning("Stopped after $last sequences (option -last was used)");
      last;
    }
    $sequence_number++;

    ### remove tabs and blank spaces ###
    $current_seq = &FoldSequence($current_seq,0);

    ### statistics about sequences ###
    $seq_length[$sequence_number] = length($current_seq);
    $sum_seq_length += $seq_length[$sequence_number];
    $id_list[$sequence_number] = $current_id;
    my $last_pos = $seq_length[$sequence_number] - $oligo_length + 1;
    $max_last_pos = &max($max_last_pos, $last_pos);

    ## Compute the origin
    my $ref_pos = 0;
    if ($origin eq "center") {
      $ref_pos = &round(($seq_length[$sequence_number]+1)/2);
    } elsif ($origin eq "end") {
      $ref_pos = $seq_length[$sequence_number] + 1;
    }
    $ref_pos += $offset;

    &RSAT::message::TimeWarn (join("\t",
				   "",
				   "Reading sequence",
				   $sequence_number,
				   "id=".$current_id,
				   "length_sum=".$sum_seq_length,
				   "last_pos=".$last_pos,
				   "ref_pos=".$ref_pos,
				  )) if (($main::verbose >= 3) || (($main::verbose >= 2) && ($sequence_number%500==0)));

    #### count oligonucleotides ####
    ## A C-style loop is used so that the position is incremented even
    ## when an oligo is skipped with "next".
    my %classes_in_this_seq = ();
    for (my $current_pos = 1; $current_pos <= $last_pos; $current_pos++) {
      my $relative_pos = $current_pos - $ref_pos;
      my $class;
      if ($zero_based_classes) {
	$class = POSIX::floor($relative_pos/$class_interval);
      } else {
	$class = POSIX::floor(($relative_pos - 1)/$class_interval);
      }

      ## Every scanned position is counted, even if the oligo found
      ## there is not part of the selected patterns
      $pos_per_class{$class}++;
      $classes_in_this_seq{$class}++;

      my $oligo_seq = lc(substr($current_seq,$current_pos-1,$oligo_length));

      ## With a pattern file, ignore oligos which are neither selected
      ## nor the reverse complement of a selected pattern
      if (%selected_pattern) {
	unless ($selected_pattern{$oligo_seq}) {
	  my $rc = lc(&ReverseComplement($oligo_seq));
	  next unless ($selected_pattern{$rc});
	}
      }

      if (($noov)
	  && ($last_pos{$oligo_seq} > 0)
	  && ($current_pos < ($last_pos{$oligo_seq} + $oligo_length))) {
	## Occurrence overlapping the previous counted occurrence
	$pattern{$oligo_seq}->{overlaps}++;
      } else {
	$pattern{$oligo_seq}->{occ}++;
	$class_freq{$oligo_seq}{$class}++;
	$last_pos{$oligo_seq} = $current_pos;
      }
    }

    #### max and min classes
    ## Sequences shorter than the oligo length have no position, thus no
    ## class: they must not affect the class range.
    if (%classes_in_this_seq) {
      my $current_min_class = &min (keys %classes_in_this_seq);
      my $current_max_class = &max (keys %classes_in_this_seq);
      if (defined($min_class)) {
	$min_class = &min ($current_min_class, $min_class);
      } else {
	$min_class = $current_min_class;
      }
      if (defined($max_class)) {
	$max_class = &max ($current_max_class, $max_class);
      } else {
	$max_class = $current_max_class;
      }

      &RSAT::message::Debug (
			     "current_min_class = ".$current_min_class,
			     "min_class = ".$min_class,
			     "current_max_class = ".$current_max_class,
			     "max_class = ".$max_class,
			    ) if ($main::verbose >= 4);

      for my $class ($current_min_class..$current_max_class) {
	$seq_per_class{$class}++;
	$sum_seq_per_class++;
      }
    }

    ## Positions of the last occurrences are specific to each sequence
    undef %last_pos;
  }
  undef $current_seq;		### release the memory occupied
  close $in;


  ### statistics on sequence lengths
  &RSAT::message::TimeWarn ("Calculating stats on sequence lengths")
    if ($main::verbose >= 2);

  for my $s (1..$sequence_number) {
    if ($seq_length[$s] >= $oligo_length) {
      my $positions = $seq_length[$s] + 1 - $oligo_length;
      $positions *= 2 if ($strands eq "2str");
      $nb_possible_pos += $positions;
    }
    $max_seq_length = &max($max_seq_length, $seq_length[$s]);

    &RSAT::message::Debug($s, $seq_length[$s], $sum_seq_length, $nb_possible_pos) if ($main::verbose >= 4);
  }
  &RSAT::message::TimeWarn(join("\t", "Finished reading sequences. Number of patterns", scalar(keys %pattern)))
    if ($main::verbose >= 2);

}


################################################################
#### calculate class intervals
## Define the boundaries and center of each positional class between
## $min_class and $max_class (globals %class_min, %class_max and
## %class_center), then select the range of classes used for the
## chi-square ($min_calc_class, $max_calc_class, $calc_class_nb) and
## sum the positions scanned in these classes ($sum_pos_per_class).
sub CalcClasses {
    if ($main::verbose >= 2) {
	&RSAT::message::TimeWarn("Calculating classes");
    }

    ### class definition
    $class_nb = $max_class + 1;
    foreach my $window ($min_class..$max_class) {
	my $lower = $window * $class_interval;
	my $upper = ($window + 1) * $class_interval;

	## Numeric origins give classes starting at 0, other origins
	## classes starting at 1
	if (($origin eq  '-0') || ($origin < 0)) {
	    $upper = $upper - 1;
	} else {
	    $lower = $lower + 1;
	}

	$class_min{$window} = $lower;
	$class_max{$window} = $upper;
	$class_center{$window} = ($class_min{$window} + $class_max{$window})/2;

	if ($main::verbose >= 4) {
	    warn join ("\t",
		       $window,
		       "class_min=".$class_min{$window},
		       "class_max=".$class_max{$window},
		       "class_center=".$class_center{$window},
		       ), "\n";
	}
    }

    #### min and max classes for calculating the chi2: derived from the
    #### user-specified positions when these are integers, and from the
    #### observed classes otherwise
    $min_calc_class = $min_class;
    if (&IsInteger($min_pos)) {
	$min_calc_class = POSIX::floor($min_pos/$class_interval);
    }
    $max_calc_class = $max_class;
    if (&IsInteger($max_pos)) {
	$max_calc_class = POSIX::floor($max_pos/$class_interval);
    }
    $calc_class_nb = $max_calc_class - $min_calc_class + 1;

    #### positions per class
    $sum_pos_per_class  = 0;
    foreach my $window ($min_calc_class..$max_calc_class) {
	$sum_pos_per_class = $sum_pos_per_class + $pos_per_class{$window};
    }
}



################################################################
#### calculate expected frequencies per class interval
## For each pattern, count the occurrences falling in the classes used
## for the calculation (attribute "in_bound_occ"), then distribute them
## over these classes proportionally to the number of positions scanned
## in each class (global %exp_occ). The expected value is "NA" when no
## position was scanned. The globals $max_freq and $min_freq are updated
## with the observed and expected values.
sub CalcExpected {
    if ($main::verbose >= 2) {
	&RSAT::message::TimeWarn("Calculating expected occurrences");
    }

    for my $word (sort keys %pattern) {

	### occurrences within the bounds of the calculation
	for my $window ($min_calc_class..$max_calc_class) {
	    $pattern{$word}->{in_bound_occ} += $class_freq{$word}{$window};
	}

	### expected occurrences for each class
	for my $window ($min_calc_class..$max_calc_class) {

	    ### classes without any occurrence get an explicit 0
	    $class_freq{$word}{$window} = 0
		unless (&IsNatural($class_freq{$word}{$window}));

	    if ($sum_pos_per_class > 0) {
		$exp_occ{$word}{$window} =  $pattern{$word}->{in_bound_occ} * $pos_per_class{$window}/$sum_pos_per_class;
	    } else {
		$exp_occ{$word}{$window} =  "NA";
	    }

	    my @observed_and_expected = ($class_freq{$word}{$window}, $exp_occ{$word}{$window});
	    $max_freq = &max($max_freq, @observed_and_expected);
	    $min_freq = &min($min_freq, @observed_and_expected);
	}
    }
}

################################################################
#### calculate chi square statistics to compare expected and observed
#### frequencies
## For each pattern, compare the observed and expected class
## frequencies with a chi-square statistics, and derive the P-value,
## E-value (P-value multiplied by the number of patterns tested) and
## significance (minus log of the E-value, divided by $log_base).
##
## The results are stored as attributes of $pattern{$oligo_seq}
## (chi_square, df, Pval, Eval, sig). Patterns are deleted from
## %pattern when they do not pass the thresholds $lth{sig} / $lth{chi},
## or (unless $no_check or $no_filter is set) when their chi-square is
## not a Real number.
##
## NOTE(review): ChiSquare() is defined outside of this file section;
## the code below assumes that it returns the chi2 between parentheses
## when the applicability conditions are not met -- confirm.
sub CalcChi {

  ### calculate chi2 to compare the position distribution with a flat line
  &RSAT::message::TimeWarn("Calculating chi values") if ($main::verbose >= 2);
  my $nb_tests = scalar(keys(%pattern));
  foreach my $oligo_seq (sort keys %pattern) {

    ## Observed values first, then expected values
    @chi_values = ();
    foreach my $class ($min_calc_class..$max_calc_class) {
      push @chi_values, $class_freq{$oligo_seq}{$class};
    }
    foreach my $class ($min_calc_class..$max_calc_class) {
      push @chi_values, $exp_occ{$oligo_seq}{$class};
    }
    ($pattern{$oligo_seq}->{chi_square}, $pattern{$oligo_seq}->{df}) = &ChiSquare("goodfit", 2, $calc_class_nb, @chi_values);

    ## When the applicability conditions are not checked, the
    ## parentheses are removed before any numeric use of the chi2:
    ## a value between parentheses would otherwise be evaluated as 0,
    ## and the pattern would always get a P-value of 1.
    if ($no_check) {
      $pattern{$oligo_seq}->{chi_square} =~ s/\(//;
      $pattern{$oligo_seq}->{chi_square} =~ s/\)//;
    }

    ## Compute P-value
    if ($pattern{$oligo_seq}->{chi_square} > 0) {
      $pattern{$oligo_seq}->{Pval} = &Statistics::Distributions::chisqrprob($pattern{$oligo_seq}->{df},
									    $pattern{$oligo_seq}->{chi_square});
    } else {
      $pattern{$oligo_seq}->{Pval} = 1;
    }

    if ($pattern{$oligo_seq}->{Pval} <= 0) {
      $pattern{$oligo_seq}->{Pval} = 0;	## Lower boundary for the computation of P-values with the CDF library
      $pattern{$oligo_seq}->{Eval} = 0;
      $pattern{$oligo_seq}->{sig} = $MAX_SIG;
    } else {
      $pattern{$oligo_seq}->{Eval} = $pattern{$oligo_seq}->{Pval}*$nb_tests;
      $pattern{$oligo_seq}->{sig} = sprintf ("%.2f", -log($pattern{$oligo_seq}->{Eval})/$log_base);
    }

    #### check threshold on significance
    if (defined($lth{'sig'})) {
      if ($pattern{$oligo_seq}->{sig} < $lth{'sig'}) {
	&RSAT::message::Debug( "Deleting pattern", $oligo_seq, "significance", $pattern{$oligo_seq}->{sig}, "sig threshold", $lth{sig}) if ($main::verbose >= 5);
	delete $pattern{$oligo_seq};
	next;
      }
    }

    #### check or not the applicability condition for the chi2
    unless (($no_check) || (&IsReal($pattern{$oligo_seq}->{chi_square}))) {
      &RSAT::message::Info($oligo_seq, $pattern{$oligo_seq}->{chi_square}, "does not fit conditions for the chi-square")
	if ($main::verbose >= 2);
      unless ($no_filter) {
	print $out "; WARNING: $oligo_seq discarded\n"
	  if ($main::verbose >= 3);
	delete $pattern{$oligo_seq};
	## Skip the remaining tests: accessing an attribute of the
	## deleted pattern would create an empty entry in %pattern.
	next;
      }
    }

    #### check the threshold on chi-square
    if ((&IsReal($lth{chi})) &&
	($pattern{$oligo_seq}->{chi_square} < $lth{chi})) {
      &RSAT::message::Debug( "Deleted pattern", $oligo_seq, "chi2", $pattern{$oligo_seq}->{chi_square}, "below threshold", $lth{chi})
	if ($main::verbose >= 5);

      delete $pattern{$oligo_seq};
    }
  }
}


################################################################
## Compute the P-value of over-representation for word in each window.
## Unfinished routine (see the warning below). In its current state, it
## stores the result of sum_of_poisson() for each pattern and each
## class in %pval, and collects the expected occurrences of the current
## pattern in @chi_values.
## NOTE(review): $m, $s and $r are not set in the visible part of the
## script.
sub CalcWindowPvalues {

  ### BEGINNING OF DEVELOPMENT, NOT AT ALL FUNCTIONAL

  ### calculate chi2 to compare the position distribution with a flat line
  if ($main::verbose >= 2) {
    &RSAT::message::TimeWarn("Calculating chi values");
  }
  my $nb_tests = scalar(keys(%pattern));

  for my $word (sort keys %pattern) {
    @chi_values = ();
    for my $window ($min_calc_class..$max_calc_class) {
      $pval{$word}{$window} = &sum_of_poisson($m,$s,$r);
      push(@chi_values, $exp_occ{$word}{$window});
    }
  }
}

################################################################
#### help message
## Display the detailed help message through the pager "more", then
## exit with status 0. If the pager cannot be started, the message is
## printed on the standard output rather than being silently lost.
sub PrintHelp {
  my $help;
  unless (open($help, "|-", "more")) {
    open($help, ">&", \*STDOUT)
      or die "Error: cannot open an output stream for the help message\n";
  }
  print $help <<End_of_help;
NAME
	position-analysis

        1998 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

DESCRIPTION
	Calculates the positional distribution of oligonucleotides in
	a set of sequences, and detects those which significantly
	discard from a homogeneous distribution.

CATEGORY
	sequences
	pattern-discovery

DETAILED DESCRIPTION

	This programs takes a sequence set as input, and calculates
	the number of occurrences of each word in a set of
	non-overlapping positional windows. The window width (in
	number of residues) is specified with the option -ci (class
	interval).

	The expected number of occurrences per window is then computed
	on the basis of a model of homogeneous repartitionof the
	occurrences. Beware : homogeneous does not necessarily means
	"flat". Indeed, if the sequence set contains sequences of
	unequal lengths, the number of sequence fragments varies from
	window to window.

	Observed and expected occurrences are compared using the
	chi-squared formula:

		chisq = SUM_i ( (obs-exp)^2 / exp )

	where i is the window number. 

	A P-value is calculated for each word.

	  Pval = P(chisq >= x)

USAGE
	position-analysis [-i inputfile]  [-format input_format]
                [-o outputfile] -l length -ci class_interval
                [-1str | -2str] [-grouprc | -nogrouprc]

	position-analysis [-h | -help]
		provides a detailed or synthetic documentation


OPTIONS
	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-seqtype dna|any
		  Sequence type
	-last #
	      Stop after # sequences (for quick testing)

	      The possibility to limit the analysis to a few sequences
	      (e.g.50 of the input file can be useful for fine-tuing
	      the options and ensuring that the result will appear as
	      requested. Note that this option should not be used for
	      the final analysis, since the significance drastically
	      depends on the number of input sequences.

	-mask upper|lower
		Mask lower or uppercases, respecively, i.e. replace
		selected case by N characters.

	-format	input file format. Must be followed by one of the 
		following options:
		   fasta (default)
		   wconsensus
		   IG
		   filelist
		   raw
		See below for the description of these formats.

	-o file	outputfile. Returns a list of the oligonucleotides 
		encountered in the sequences, with their frequencies.

	-v \#	verbose level. 

	-l	oligonucleotide length.

	-ci	class interval (default 20 bases). 
		The width of the position classes (in number of bases)

	-origin start | center | end
		Reference for calculating positions.


		The value should be chosen according to the sequence
		type. For instance:

		-origin start for downstream sequences

		-origin end for promoter sequences

		-origin center can be useful for ChIP-seq peaks, which
			can have variable lengths, but are supposed to
			be more or less centred on the TF binding
			qsites.

	-offset
		Add an offset to site positions. The offset value must
		be an Integer number (positive, null or
		negative). This option allows to select an arbitrary
		position as origin.

		Example: the option I<-offset -100> can be used to
		specify the transcription start site (TSS) as origin,
		in a collection of promoter sequences including 100
		residues downstream of the TSS.

		Note: in previous versions, -origin was used to
		specify both the reference point and the offset. Since
		March 2010, the offset is specified with the option
		-offset.

	-grouprc        group reverse complement pairs

	-nogrouprc      do not group reverse complement pairs

	-sort
		sort oligonucleotides according to the bias in
		distribution profile

	-1str
		inactivates the summation of occurrences on both
		strands.

	-2str
		oligonucleotide occurrences found on both stands are
		summed.

	-noov	no overlap
			overlapping occurrences of the same pattern are 
			only taken into account once

	-return	fields_to_return
		supported fields:
			distrib	occurrences found in each position class
			exp	expected occurrences for each class
			graphs	one graph file per oligont profile
			chi	chi-square value
			rank	rank of the pattern according to the
				sorting criterion
		several fields can be entered, separated by commas

	-lth_chi #	lower threshold on chi2
		return only words with a chi2 value > #

	-lth_sig #	lower threshold on significance
		return only words with a sig value > #

	-lth_occ #	lower threshold on occurrences
		return only words with a number of occurrences > #

	-uth_rank #	upper threshold on rank
		return maximum # words

	-max_graphs #	maximal number of graphs to export

	-pl pattern_file
		A file containing a selection of patterns.  The
		analysis is then restricted to these patterns.  The
		first word of each new line is considered as a new
		pattern.  A score can be associated to each pattern
		with the option -sc.

	-sc #	score column
		(only valid whith the option -pl)
		The column containing a score value for each pattern
		of the pattern file

	-minpos #
		minimal position to take into account for the
		chi-square calculation This value must be a multiple
		of the class interval.

	-maxpos #
		maximal position to take into account for the
		chi-square calculation This value must be a multiple
		of the class interval.

	-nocheck
		do not check the applicability condition on the
		chi-square. By default, the program checks that each
		class has at least 5 observations. The chi-square is
		bracketed for words which do not fill this
		conditions. It is now recognized that this condition
		is too restrictive, and that the chi2 is still valid
		with smaller clas effective. We allow to suppress the
		checking, but the responsibility is left to the user,
		to decide whether the chi2 is or not significant.

	-nofilter
		Do not discard oligos which do not fit the condition
		of applicability. Instead, mark them by including the
		chi2 value in curly brackets.

	-img_format
		Image format (this parameter is passed to XYgraph).

	-title
		Title for the index table and position profile plots.

End_of_help
  close $help;
  exit(0);
}

################################################################
#### short description of the options
## Display the short list of options through the pager "more", then
## exit with status 0. If the pager cannot be started, the list is
## printed on the standard output rather than being silently lost.
##
## The "(default)" marks are attached to -1str and -nogrouprc, in
## agreement with the default values of the script ($strands = "1str",
## $group_rc = 0).
sub PrintOptions {
  my $help;
  unless (open($help, "|-", "more")) {
    open($help, ">&", \*STDOUT)
      or die "Error: cannot open an output stream for the help message\n";
  }
  print $help <<End_of_help;
position-analysis options
----------------------
-h		display complete help message
-help		display this list of options
-i		input file
-seqtype       sequence type (dna|any)
-last #	      Stop after # sequences (for quick testing)
-mask upper|lower	mask upper- or lowercases, respectively
-format		input sequence format
-o 		output file
-v \#		verbose level
-l		oligonucleotide length
-ci		class interval (default 20 bases). 
-origin [start|center|end]	Define pos as the origin for calculating positions
-offset #	add a given number to site positions (change the reference point).
-1str		inactivate summation of occ on both strands (default)
-2str		sum occurences on both strands
-grouprc	group reverse complement pairs
-nogrouprc	do not group reverse complement pairs (default)
-noov		no overlap
-sort		sort oligonucleotides according to the score
-return		chi,distrib,exp,graph,rank
-lth_chi	lower threshold on chi2
-lth_sig	lower threshold on significance of the chi2 statistics
-lth_occ       	lower threshold on occurrences
-uth_rank      	upper threshold on rank
-max_graphs #	maximal number of graphs to export
-pl		pattern list
-img_format	image format (this parameter is passed to XYgraph)
-sc		score column
-minpos		minimal position for chi-square calculation
-maxpos		maximal position for chi-square calculation
-nocheck	do not check applicability condition for the chi2
-nofilter	don\'t discard oligos which do not fit applicability condition
-title		title for the index table and position profile plots.

End_of_help
    close $help;
    exit(0);
}

################################################################
## Read the command-line arguments from @ARGV and store them in the
## script-level (package) variables.
##
## Every element of @ARGV is inspected in turn. When an element is a
## recognized option taking a value, the value is read from the
## following element. Elements which are not recognized (including the
## values of the options) are silently skipped.
##
## Invalid values are reported with &RSAT::error::FatalError() for the
## options which are checked here. For -l, -ci, -minpos and -maxpos, a
## value of the wrong type is silently ignored.
sub ReadArguments {
    ## The loop index is not called $a, to avoid shadowing the special
    ## variable used by sort.
    foreach my $i (0..$#ARGV) {
        my $arg = $ARGV[$i];
        my $value = $ARGV[$i+1]; ## undef when $arg is the last element

        ### verbose ###
        if ($arg eq "-v") {
            if (&IsNatural($value)) {
                $main::verbose = $value;
            } else {
                $main::verbose = 1;
            }

            ### detailed help
        } elsif ($arg eq "-h") {
            &PrintHelp();

            ### list of options
        } elsif ($arg eq "-help") {
            &PrintOptions();

            ### input file
        } elsif ($arg eq "-i") {
            $inputfile = $value;

            ## mask
        } elsif ($arg eq "-mask") {
            $mask = $value;
            &CheckMask($mask);

            ### output file
        } elsif ($arg eq "-o") {
            $outputfile = $value;

            ### oligomer length
        } elsif (($arg eq "-l") && (&IsNatural($value))) {
            $oligo_length = $value;

            ### class interval
        } elsif (($arg eq "-ci") && (&IsNatural($value))) {
            $class_interval = $value;

            ### sequence format
        } elsif ($arg eq "-format") {
            $in_format = lc($value);

            ### strands
        } elsif ($arg eq "-1str") {
            $strands = "1str";
            $sum_rc = 0;
            $group_rc = 0;

        } elsif ($arg eq "-2str") {
            $strands = "2str";
            $sum_rc = 1;
            $group_rc = 1;

            ### grouping of reverse complements
        } elsif ($arg eq "-grouprc") {
            $strands = "2str";
            $group_rc = 1;
        } elsif ($arg eq "-nogrouprc") {
            $group_rc = 0;

            ### sort the result according to significance
        } elsif ($arg eq "-sort") {
            $sort_result = 1;

            ### no overlap between successive matches
        } elsif ($arg eq "-noov") {
            $noov = 1;

            ### lower threshold on chi square
        } elsif ($arg =~ /^-lth_chi/) {
            $lth{chi} = $value;
            $return{chi} = 1;
            unless (($lth{chi} >= 0) && (&IsReal($lth{chi}))) {
                &RSAT::error::FatalError("Threshold on chi2 must be a positive number") ;
            }

            ### lower threshold on significance
        } elsif ($arg =~ /^-lth_sig/) {
            $lth{'sig'} = $value;
            $return{'sig'} = 1;
            unless (&IsReal($lth{'sig'})) {
                &RSAT::error::FatalError("Threshold on sig must be a Real number") ;
            }

            ### lower threshold on occurrences
        } elsif ($arg eq "-lth_occ") {
            $lth{'occ'} = $value;
            unless (&IsNatural($lth{'occ'})) {
                &RSAT::error::FatalError("Threshold on occurrences must be a natural number") ;
            }

            ### obsolete synonym of -lth_occ
        } elsif ($arg eq "-oth") {
            $lth{'occ'} = $value;
            unless (&IsNatural($lth{'occ'})) {
                &RSAT::error::FatalError("Threshold on occurrences must be a natural number") ;
            }
            &RSAT::message::Warning("option -oth is obsolete, please use -lth_occ instead");

            ### Upper threshold on rank
        } elsif ($arg eq "-uth_rank") {
            $uth{rank} = $value;
            unless (&IsNatural($uth{rank})) {
                &RSAT::error::FatalError("Threshold on rank must be a natural number") ;
            }

            ### obsolete synonym of -uth_rank
        } elsif ($arg eq "-rth") {
            $uth{rank} = $value;
            unless (&IsNatural($uth{rank})) {
                &RSAT::error::FatalError("Threshold on rank must be a natural number") ;
            }
            &RSAT::message::Warning("option -rth is obsolete, please use -uth_rank instead");

            ### return values
            ### (fields are recognized by substring; unknown fields are ignored)
        } elsif ($arg eq "-return") {
            @fields_to_return = split ",", $value;
            foreach my $field (@fields_to_return) {
                if ($field =~ /dist/) {
                    $return{'distrib'} = 1;

                } elsif ($field =~ /chi/) {
                    $return{chi} = 1;

                } elsif ($field =~ /exp/) {
                    $return{'exp'} = 1;

                } elsif ($field =~ /graph/) {
                    $return{graphs} = 1;

                } elsif ($field =~ /rank/) {
                    $return{'rank'} = 1;
                }
            }

            #### sequence type
        } elsif ($arg =~ /^-seqtype/i) {
            $seq_type = lc($value);
            unless ($supported_seq_type{$seq_type}) {
                &RSAT::error::FatalError("$seq_type is not a supported sequence type. Supported: $supported_seq_types");
            }

            #### last
            #### (1 is a valid value: the message requires a strictly
            #### positive number, so the test is >= 1 and not > 1)
        } elsif ($arg =~ /^-last/i) {
            $last = lc($value);
            unless ((&IsNatural($last)) && ($last >= 1)) {
                &RSAT::error::FatalError("$last is not a valid value for -last. Must be a strictly positive Natural number.");
            }

            #### max number of graphs (1 is a valid value, see -last)
        } elsif ($arg =~ /^-max_graphs/i) {
            $max_graphs = lc($value);
            unless ((&IsNatural($max_graphs)) && ($max_graphs >= 1)) {
                &RSAT::error::FatalError("$max_graphs is not a valid value for the option -max_graphs. Must be a strictly positive Natural number.");
            }

            ### do not check applicability condition for the chi2
        } elsif ($arg eq "-nocheck") {
            $no_check = 1;

            ### do not discard oligos which do not fit the applicability condition
        } elsif ($arg eq "-nofilter") {
            $no_filter = 1;

            ### predefined pattern list
            ### The expression is anchored: an unanchored /-pl/ would also
            ### match option values such as the file name "seq-plots.fasta"
            ### and overwrite the pattern file with the next argument.
        } elsif ($arg =~ /^-pl/) {
            $pattern_file = $value;

            ### score column
        } elsif ($arg eq "-sc") {
            $score_column = $value;
            unless (&IsNatural($score_column)) {
                &RSAT::error::FatalError( "Score column must be a natural number\n");
            }

            ### origin for positions
        } elsif ($arg eq "-origin") {
            $origin = $value;

            ## For backwards compatibility: an integer origin is converted
            ## into an origin name (start|end) plus an offset. The string
            ## "-0" has to be tested explicitly, since it is numerically
            ## equal to 0 but designates the end of the sequence.
            if (&IsInteger($origin)) {
                if ($origin eq "-0") {
                    $offset = 0;
                    $origin = "end";
                } elsif ($origin < 0) {
                    $offset = $origin;
                    $origin = "end";
                } else {
                    $offset = $origin;
                    $origin = "start";
                }
            } elsif (!$supported_origin{$origin}) {
                &RSAT::error::FatalError($origin, "Invalid value for origin. Supported: $supported_origins.");
            }

            ### offset
        } elsif ($arg eq "-offset") {
            $offset = $value;
            &RSAT::message::Warning("Offset", $offset);
            &RSAT::error::FatalError($offset, "is not a valid value for offset. Should be an Integer number.")
                unless (&IsInteger($offset));

            ### image format
        } elsif ($arg eq "-img_format") {
            $img_format = $value;

            ### title
        } elsif ($arg eq "-title") {
            $title = $value;

            ### min and max positions
        } elsif (($arg eq "-minpos") && (&IsInteger($value))) {
            $min_pos = $value;
        } elsif (($arg eq "-maxpos") && (&IsInteger($value))) {
            $max_pos = $value;

        }
    }
}

################################################################
## Print the verbose header on the output handle $out.
##
## Each line starts with a semicolon. The header reports, in this
## order: the command arguments (via &PrintArguments()), the citation,
## the main parameters and thresholds, the sequence statistics, the
## selected patterns (if any), the oligonucleotide statistics, the
## class parameters and the table of sequences per class.
##
## All the values are read from script-level variables; nothing is
## returned.
sub Verbose {
    print $out "; position-analysis";
    &PrintArguments($out);

    printf $out "; %s\n", "Citation: van Helden, et al. (2000).  Nucleic Acids Res 28, 1000-1010.";

    ## Parameters and thresholds
    printf $out "; %-29s\t%s\n", "Sequence file", $inputfile if ($inputfile);
    printf $out "; %-29s\t%s\n", "Sequence format", $in_format;
    printf $out "; %-29s\t%s\n", "Sequence type", $seq_type;
    printf $out "; %-29s\t%s\n", "Output file", $outputfile if ($outputfile);
    printf $out "; %-29s\t%d\n", "Oligo length", $oligo_length;
    printf $out "; %-29s\t%f\n", "Lower threshold on chi", $lth{chi} if (&IsReal($lth{chi}));
    ## The thresholds on sig and rank can be set on the command line,
    ## they are thus reported like the other ones.
    printf $out "; %-29s\t%s\n", "Lower threshold on sig", $lth{'sig'} if (&IsReal($lth{'sig'}));
    printf $out "; %-29s\t%d\n", "Lower threshold on occurrences", $lth{'occ'} if (&IsNatural($lth{'occ'}));
    printf $out "; %-29s\t%d\n", "Upper threshold on rank", $uth{rank} if (defined($uth{rank}));
    if ($strands eq "2str") {
        printf $out "; %-29s\n", "Occurrences counted  on both strands";
        if ($group_rc) {
            printf $out "; %-29s\n", "grouped by pairs of reverse complements";
        }
    } else {
        printf $out "; %-29s\n", "Occurrences counted  on a single  strands";
    }
    print $out "; Conditions of applicability not checked !\n" if $no_check;
    print $out "; WARNING ! chi2 is shown between curly braces when the applicability conditions are not satisfied.\n" if $no_filter;

    ## Sequence statistics
    print $out "; Sequence statistics:\n";
    printf $out ";\t%-29s\t%d\n", "Nb of sequences", $sequence_number;
    printf $out ";\t%-29s\t%d\n", "Sum of sequence lengths", $sum_seq_length;
    printf $out ";\t%-29s\t%d\n", "Min sequence length", $min_seq_length;
    printf $out ";\t%-29s\t%d\n", "Max sequence length", $max_seq_length;
    ## The average is only defined when there is at least one sequence
    ## (no division by zero). The format has exactly two conversions,
    ## so exactly two values are passed.
    printf $out ";\t%-29s\t%d\n", "Average sequence length", $sum_seq_length/$sequence_number if ($sequence_number > 0);
    printf $out ";\t%-29s\t%d\n", "Possible positions", $nb_possible_pos;

    ## The detailed list of sequences is only printed for small
    ## sequence sets (at most 100 sequences). Sequences are indexed
    ## from 1 in @seq_length and @id_list.
    unless ($sequence_number > 100) {
        print $out "; Sequences:\n";
        print $out ";\t#\tlength\tID\n";
        foreach my $s (1..$sequence_number) {
            print $out ";\t$s\t$seq_length[$s]\t$id_list[$s]\n";
        }
    }

    if ($#selected_patterns >=0) {
        print $out join ("\n;\t", "; Selected patterns", @selected_patterns), "\n";
    }

    ## Oligonucleotide statistics
    print $out "; Oligonucleotide statistics:\n";
    printf $out ";\t%-21s\t%d\n", "Total occurrences", $sum_occurrences;
    if ($noov) {
        printf $out ";\t%-21s\t%d\n", "Total overlaps", $sum_overlaps;
    }

    ## Class parameters
    print $out "; Class parameters:\n";

    printf $out ";\t%-21s\t%d\n", "Class interval", $class_interval;
    printf $out ";\t%-21s\t%d\n", "Min position", $min_pos if (&IsInteger($min_pos));
    printf $out ";\t%-21s\t%d\n", "Max position", $max_pos if (&IsInteger($max_pos));
    printf $out ";\t%-21s\t%d\n", "Number of classes", $calc_class_nb;
    printf $out ";\t%-21s\t%d\n", "Total positions", $sum_pos_per_class;
    printf $out ";\t%-21s\t%d\n", "Degrees of freedom", $calc_class_nb - 1;

    ## One row per class; the class number is printed relative to the
    ## first class taken into account, starting from 1.
    print $out "; Sequences per class:\n";
    print $out join ("\t", ";", "class", "\[min", "max\]", "mid", "seq", "occ"), "\n";
    for my $class ($min_calc_class..$max_calc_class) {
        print $out join ("\t",  ";",
                         $class - $min_calc_class + 1,
                         "\[".$class_min{$class},
                         $class_max{$class}."\]",
                         $class_center{$class},
                         $seq_per_class{$class},
                         $pos_per_class{$class}), "\n";
    }

    print $out ";\n";
}


################################################################
## Print the result table on the output handle $out.
##
## Steps:
## 1. Order the patterns (keys of %pattern). With -sort, by decreasing
##    value of the score column, of the chi-square or of the number of
##    occurrences (in this order of priority); without -sort,
##    alphabetically.
## 2. Retain the top-ranking patterns if an upper threshold on rank
##    has been defined.
## 3. Print the column descriptions (verbose >= 1) and the header line.
## 4. Print one row per pattern. The rank is also stored in
##    $pattern{...}->{rank}.
##
## The ordered list of patterns is left in the script-level array
## @sorted_keys.
sub PrintResult {

    &RSAT::message::TimeWarn("Printing results", scalar(keys(%pattern)), "patterns") if ($main::verbose >= 2);

    #### sort oligonucleotides, either according to their significance
    #### or alphabetically
    if ($sort_result) {
        &RSAT::message::TimeWarn("Sorting results") if ($main::verbose >= 2);

        ## Attribute used as sorting criterion
        my $sort_field = "in_bound_occ";
        if ($score_column > 0) {
            $sort_field = "score";
        } elsif ($return{chi}) {
            $sort_field = "chi_square";
        }

        ## Decreasing order of the criterion. Ties are broken
        ## alphabetically: without this, patterns with equal scores come
        ## out in hash order, which differs from run to run, so that
        ## ranks and rank-based selection would not be reproducible.
        @sorted_keys = sort {
            ($pattern{$b}->{$sort_field} <=> $pattern{$a}->{$sort_field})
                || ($a cmp $b)
        } keys %pattern;
    } else {
        @sorted_keys = sort keys %pattern;
    }
    &RSAT::message::TimeWarn("Sorted", scalar(@sorted_keys), "patterns") if ($main::verbose >= 2);

    ## Select top ranking patterns if required
    if (defined($uth{rank})) {
        my $max_rank = &min(scalar(@sorted_keys), $uth{rank});
        @sorted_keys = @sorted_keys[0..($max_rank -1)];
        &RSAT::message::TimeWarn("Retained", scalar(@sorted_keys), "top-raking patterns") if ($main::verbose >= 2);
    }


    #### Output columns
    #### (@out_col and @col_descriptions are filled in parallel)
    @out_col = (); @col_descriptions = ();
    push @out_col, 'seq'; push @col_descriptions, 'pattern sequence';
    push @out_col, 'id'; push @col_descriptions, 'pattern identifier';
    push @out_col, 'occ'; push @col_descriptions, 'pattern occurrences';
    if ($noov) {push @out_col, "over"; push @col_descriptions, 'overlapping occurrences (discarded)';}
    if ($return{chi}) {
        push @out_col, "chi2" ; push @col_descriptions, 'observed chi-square';
        push @out_col, "df" ; push @col_descriptions, 'degrees of freedom';
        push @out_col, "Pval" ; push @col_descriptions, 'P-value (probability for one word to be a false positive)';
        push @out_col, "Eval" ; push @col_descriptions, 'E-value; expected number of false positives (Eval = Pval * nb_tests)';
        push @out_col, "sig" ; push @col_descriptions, 'Significance (sig = -log10(Eval))';
    }
    if ($return{'rank'}) {
        push @out_col, "rank"; push @col_descriptions, 'rank of the pattern according to sorting criterion';
    }
    if ($return{'distrib'}) {
        for my $class ($min_calc_class..$max_calc_class) {
            push @out_col, $class_center{$class}; push @col_descriptions, 'observed class occurrences';
        }
        ## Expected occurrences are only exported together with the
        ## observed distribution.
        if ($return{'exp'}) {
            for my $class ($min_calc_class..$max_calc_class) {
                push @out_col, $class_center{$class}; push @col_descriptions ,'expected class occurrences';
            }
        }
    }

    ## Column content description
    if ($main::verbose >= 1) {
        print $out "; column headers\n";
        foreach my $c (0..$#out_col) {
            printf $out ";\t%d\t%-15s\t%s\n", $c+1, $out_col[$c], $col_descriptions[$c];
        }
    }

    &RSAT::message::TimeWarn("Printing results") if ($main::verbose >= 2);
    ### Print header
    print $out "#", join("\t", @out_col), "\n";


    ### Pattern distributions (one row per pattern)
    my $rank = 0;
    foreach my $oligo_seq (@sorted_keys) {
        $rank++;
        $pattern{$oligo_seq}->{rank} = $rank;
        print $out "$oligo_seq"; ### Pattern sequence
        print $out "\t", &PatternID($oligo_seq, $sum_rc); ### Pattern ID

        ### Occurrences
        print $out "\t", $pattern{$oligo_seq}->{in_bound_occ};
        if ($noov) {
            unless (defined($pattern{$oligo_seq}->{overlaps})) {
                $pattern{$oligo_seq}->{overlaps} = 0;
            }
            print $out "\t", $pattern{$oligo_seq}->{overlaps};
        }

        ### chi-square value
        if ($return{chi}) {
            printf $out "\t%.1f", $pattern{$oligo_seq}->{chi_square};
            printf $out "\t%d", $pattern{$oligo_seq}->{df};
            printf $out "\t%.1e", $pattern{$oligo_seq}->{Pval};
            printf $out "\t%.2g", $pattern{$oligo_seq}->{Eval};
            print $out "\t", $pattern{$oligo_seq}->{sig};
        }

        #### Rank
        if ($return{'rank'}) {
            print $out "\t$rank";
        }

        ### Position distribution (observed occurrences per class)
        if ($return{'distrib'}) {
            for my $class ($min_calc_class..$max_calc_class) {
                print $out "\t$class_freq{$oligo_seq}{$class}";
            }
        }

        ### Expected occurrences per class, on the same row
        if (($return{'exp'}) && ($return{'distrib'})) {
            for my $class ($min_calc_class..$max_calc_class) {
                printf $out "\t%.1f", $exp_occ{$oligo_seq}{$class};
            }
        }
        print $out "\n";
    }
}


################################################################
## Discard from %pattern every oligo whose sequence contains at least
## one character other than a, c, g or t (case-insensitive).
sub CheckDNA {
    if ($main::verbose >= 2) {
        &RSAT::message::TimeWarn("Checking DNA");
    }

    ## Collect the offending keys first, then remove them in a single
    ## hash-slice delete.
    my @non_dna_oligos = grep { /[^atcg]/i } keys %pattern;
    delete @pattern{@non_dna_oligos};

    return;
}

