#!/usr/bin/perl

############################################################
#
# $Id: position-analysis,v 1.36 2009/11/05 00:32:07 jvanheld Exp $
#
# Time-stamp: <2003-10-21 01:16:40 jvanheld>
#
############################################################
## Make the "lib/" directory located next to this script visible to
## "require": everything that precedes the last path component of $0 is
## used as directory prefix.
if ($0 =~ m{([^(/)]+)$}) {
    my $script_dir = substr($0, 0, $-[0]);   ## text before the match
    push @INC, $script_dir . "lib/";
}
require "RSA.lib";
require RSAT::util;
use POSIX qw(sysconf _PC_CHOWN_RESTRICTED);
use Math::CDF;


################################################################
###### default values for parameters
$class_interval = 20;        ## width of the position classes (option -ci)
$in_format = "fasta";        ## input sequence format (option -format)
$strands = "1str";           ## single-strand counting unless -2str/-grouprc
$group_rc = 0;               ## do not group reverse complements by default
$img_format = $ENV{rsat_img_format} || "png";
$XYgraph_command = "$SCRIPTS/XYgraph";
$log_base = log(10);         ## divisor used to convert natural logs to log10

#### initialize other variables
$start_time = &RSAT::util::AlphaDate();
$sequence_number = 0;

$max_seq_length = 0;
$last = 0; ## max number of sequences to treat (for quick testing)
$max_graphs = 0; #  maximal number of graphs to export

$nb_possible_pos = 0;
$sum_seq_length = 0;
$no_check = 0;
$no_filter = 0;
$no_filter_graphs = 0;

%supported_seq_type = ("dna"=>1,
		       "any"=>1);
## The list is only used in the error message of the option -seqtype.
## "sort" has no defined result in scalar context, so the string is
## built explicitly with join.
$supported_seq_types = join(",", sort keys %supported_seq_type);
$seq_type = "any";

&ReadArguments();

##############################
### check parameter values ###
##############################


### check oligonucleotide length
unless ($oligo_length > 0) {
  print "\tYou should specify an oligonucleotide length > 0.\n";
  print "\tType position-analysis -h for more info..\n";
  exit;
}


#### min and max classes to take into account for chi-square calculation
if ((&IsInteger($min_pos)) && (&IsInteger($max_pos))) {
    if ($max_pos < $min_pos) {
	&RSAT::error::FatalError( "min position should be smaller than max position");
    }
}


################################################################
#### specific treatment for output file, because if graphs are
#### requested, they must be saved in the same directory as the output
#### file

if ($outputfile) {
    ## File::Basename avoids spawning a shell, which would break on file
    ## names containing spaces or shell metacharacters.
    require File::Basename;
    $dir{output} = File::Basename::dirname($outputfile);
}
&RSAT::util::CheckOutDir($dir{output});
$out = &OpenOutputFile($outputfile);

### open sequence file ###
&CheckInputSeqFormat($in_format);
($in, $input_dir) = &OpenInputFile($inputfile);

&LocalReadPatterns() if ($pattern_file);
&ReadSequence();
## String comparison: the numeric "==" evaluates both sides to 0, so the
## DNA check was applied whatever the sequence type.
&CheckDNA() if ($seq_type eq "dna");
&CalcClasses();

### statistics on oligo occurrences
&RSAT::message::TimeWarn("Calculating sums of occurrences") if ($main::verbose >= 2);
foreach my $oligo_seq (sort keys %pattern) {
  $sum_occurrences += $pattern{$oligo_seq}->{occ};
  $sum_overlaps += $pattern{$oligo_seq}->{overlaps};
}

&SumStrands() if ($strands eq "2str");

#### if a pattern file has been specified, forget info about other patterns
if ($pattern_file) {
    foreach my $oligo_seq (sort keys %pattern) {
	unless ($selected_pattern{$oligo_seq}) {
	    delete $pattern{$oligo_seq}->{occ};
	}
    }
}


#### check threshold on occurrences
if (&IsNatural($low_thr{'occ'})) {
    foreach my $oligo_seq (sort keys %pattern) {
	if ($pattern{$oligo_seq}->{occ} < $low_thr{'occ'}) {
	    delete $pattern{$oligo_seq}->{occ};
	}
    }
}


&CalcExpected();

&CalcChi() if ($return{'chi'});

&Verbose() if ($main::verbose >= 1);


&PrintResult();

################################################################
### generate XYgraphs
###
### One image per oligo is produced by piping its observed/expected
### class profile to XYgraph, and an HTML index table linking to all
### images is written next to the output file.
if ($return{'graph'}) {

    #### Without output file no output directory has been set: fall back
    #### on the current directory rather than building paths under "/".
    unless ((defined($dir{output})) && ($dir{output} ne "")) {
	$dir{output} = ".";
    }

    #### directory for storing the graphs
    $rel_dir{graphs}="graphs";
    if ($outputfile) {
	require File::Basename;
	$basename = File::Basename::basename($outputfile);
	$rel_dir{graphs} = $basename."_".$rel_dir{graphs};
    }

    $dir{graphs} = $dir{output}."/".$rel_dir{graphs};
    unless (-d $dir{graphs}) {
	warn "Creating directory $dir{graphs}\n" if ($main::verbose >= 1);
	mkdir($dir{graphs}, 0755);
	unless (-d $dir{graphs}) {
	    &RSAT::error::FatalError("Cannot create graph directory $dir{graphs}");
	}
    }


    $date = POSIX::strftime("%H:%M:%S", localtime());
    print ";$date\tgenerating the graphs ... \n" if ($main::verbose >= 2);

    #### index for the graphs
    if ($outputfile) {
	$index_file = "${outputfile}_graph_index.html";
    } else {
	$index_file = "$dir{output}/graph_index_${oligo_length}nt_ci${class_interval}_$strands.html";
    }

    &RSAT::message::Info($index_file) if ($main::verbose >= 0);
    my $index;
    open($index, ">", $index_file)
	or &RSAT::error::FatalError("Cannot write graph index file $index_file");
    ## Unbuffered, so that the index on disk always reflects the graphs
    ## generated so far.
    select((select($index), $| = 1)[0]);
    print $index "<HTML><BODY>\n";
    print $index "<TABLE>\n";
    print $index "<TR>\n";
    print $index "<TH>Sequence</TH>\n";
    print $index "<TH>Occ</TH>\n";
    print $index "<TH>Chi2</TH>\n";
    print $index "<TH>Rank</TH>\n";
    print $index "<TH>Eval</TH>\n";
    print $index "<TH>Sig</TH>\n";
    print $index "</TR>\n";

    #### generate one graph for each oligo
    my $graphs_done = 0;
    foreach my $oligo_seq (@sorted_keys) {
	next unless (&IsReal($pattern{$oligo_seq}->{chi_square}) || ($no_filter_graphs));

	## Test the limit before counting, so that the reported number
	## is the number of graphs actually exported.
	if (($max_graphs > 0) && ($graphs_done >= $max_graphs)) {
	    &RSAT::message::Warning("Exported $graphs_done graphs");
	    last;
	}
	$graphs_done++;

	my $score = "NA";
	my $Eval = sprintf "%.1g", $pattern{$oligo_seq}->{Eval};
	my $sig = sprintf "%.2f", $pattern{$oligo_seq}->{sig};
	my $graph_file_name = join("", $oligo_seq, "_ci", $class_interval, "_", $strands, "_pos_distrib.",$img_format);
	my $title2 = $pattern{$oligo_seq}->{in_bound_occ}." occurrences";
	$title2 .= ", class interval=$class_interval";
	if ($score_column > 0) {
	  $score = sprintf "score = %.2f", $pattern{$oligo_seq}->{score};
	} elsif ($return{'chi'}) {
	  if (&IsReal($pattern{$oligo_seq}->{chi_square})) {
	    $score = sprintf "%.2f", $pattern{$oligo_seq}->{chi_square};
	  } else {
	    $score = $pattern{$oligo_seq}->{chi_square};
	  }
	  $title2 .= "; chi2 = $score";
	  $title2 .= "; Eval=".$Eval;
	  $title2 .= "; sig=".$sig;
	}

	## Class limits are stored in hashes indexed by class number (see
	## CalcClasses); the X axis spans from the first to the last class.
	my $command = "$XYgraph_command -o $dir{graphs}/$graph_file_name";
	$command .= " -lines -xcol 1 -ycol 2,3 -legend ";
	$command .= " -title1 '$oligo_seq distribution profile' ";
	$command .= " -title2 '$title2' ";
	$command .= " -xleg1 'position' -yleg1 'class frequency' ";
	$command .= " -ymin 0 -xmin $class_min{$min_class} -xmax $class_max{$max_class} ";
	$command .= " -xgstep2 $class_interval -xsize 600";
	$command .= " -format ".$img_format;
	print "; $command\n" if ($main::verbose >= 2);

	my $xy;
	unless (open($xy, "| $command")) {
	    &RSAT::message::Warning("Could not run XYgraph for $oligo_seq");
	    next;
	}
	print $xy ";class\tocc_$oligo_seq\texp_$oligo_seq\n"; ### header line
	foreach my $class ($min_class..$max_class) {
	    print $xy "$class_center{$class}\t";
	    print $xy "$class_freq{$oligo_seq}{$class}\t";
	    print $xy "$exp_occ{$oligo_seq}{$class}\n";
	}
	## Closing the pipe waits for XYgraph to terminate, so the image
	## is complete before it is referenced in the index.
	unless (close($xy)) {
	    &RSAT::message::Warning("XYgraph reported an error for $oligo_seq");
	}

	print $index "<tr>\n";
	print $index "<td><A HREF=\"$rel_dir{graphs}/$graph_file_name\">$oligo_seq</A></td>\n";
	print $index "<td>", $pattern{$oligo_seq}->{in_bound_occ}, "</td>\n";
	print $index "<td>", $score, "</td>\n";
	print $index "<td>", $pattern{$oligo_seq}->{rank}, "</td>\n";
	print $index "<td>", $Eval, "</td>\n";
	print $index "<td>", $sig, "</td>\n";
	print $index "</tr>\n";
    }
    print $index "</TABLE>\n";
    print $index "</BODY></HTML>\n";
    close($index);
}


## Report the start and end times of the job (verbose mode only), then
## close the output stream and leave with a success status.
if ($main::verbose >= 1) {
    $done_time = `date '+%d/%m/%y %H:%M:%S %Z'`;   ## keeps its trailing newline
    print $out ";Job started ", $start_time, "\n";
    print $out ";Job done    ", $done_time;
}

close($out);

exit(0);


################################################################
## Sum occurrences and profiles of reverse complement patterns for strand insensitive analysis ###
## Merge the counts of each oligo with those of its reverse complement.
##
## Reads and updates the globals %pattern (field "occ") and %class_freq,
## over the classes $min_class..$max_class. When $group_rc is set, only
## one member of each pair of reverse complements is retained.
## Takes no argument and returns nothing useful.
sub SumStrands {
  &RSAT::message::TimeWarn("Summing occurrences of reverse complementary patterns") if ($main::verbose >= 2);

  ################################################################
  ## Sum occurrences
  ##
  ## All sums are computed from the single-strand counts before any of
  ## them is stored, so that a pair is not summed twice. Reverse
  ## palindromes are not doubled (same rule as for the profiles below),
  ## and %pattern is only read with "exists" to avoid creating entries
  ## for oligos which were never observed.
  %occurrences_2strands = ();
  foreach my $oligo_seq (keys %pattern) {
    my $rc_oligo_seq = lc(&ReverseComplement($oligo_seq));
    my $occ_sum = $pattern{$oligo_seq}->{occ} || 0;
    if (($rc_oligo_seq ne $oligo_seq) && (exists $pattern{$rc_oligo_seq})) {
      $occ_sum += $pattern{$rc_oligo_seq}->{occ} || 0;
    }
    ## Patterns without any counted occurrence keep an undefined "occ"
    $occurrences_2strands{$oligo_seq} = $occ_sum if ($occ_sum > 0);
  }
  foreach my $oligo_seq (keys %occurrences_2strands) {
    $pattern{$oligo_seq}->{occ} = $occurrences_2strands{$oligo_seq};
  }

  ################################################################
  ## Sum position profiles
  for my $class ($min_class..$max_class) {
    ## First pass: compute the sums from the single-strand frequencies
    my %both_strands = ();
    foreach my $oligo_seq (sort keys %pattern) {
      my $rc_oligo_seq = lc(&ReverseComplement($oligo_seq));
      if ($rc_oligo_seq eq $oligo_seq) {
	$both_strands{$oligo_seq} = $class_freq{$oligo_seq}{$class};
      } else {
	$both_strands{$oligo_seq} = $class_freq{$oligo_seq}{$class} + $class_freq{$rc_oligo_seq}{$class};
      }
    }
    ## Second pass: store them
    foreach my $oligo_seq (sort keys %pattern) {
      $class_freq{$oligo_seq}{$class} = $both_strands{$oligo_seq};
    }
  }

  #### if requested, group results by pairs of reverse complements ####
  if ($group_rc) {
    &RSAT::message::TimeWarn("Grouping patterns by pairs of reverse complements") if ($main::verbose >= 2);
    foreach my $oligo_seq (keys %pattern) {
      my $rc_oligo_seq = lc(&ReverseComplement($oligo_seq));
      if ($rc_oligo_seq gt $oligo_seq) { ### only suppress one oligo from the dyad
	delete $pattern{$rc_oligo_seq};
	delete $class_freq{$rc_oligo_seq};
      }
    }
  }
}


################################################################
#### read patterns from a file
## Read the selection of patterns from the file $pattern_file.
##
## Lines starting with ";" and blank lines are skipped. The first word
## of each remaining line is a pattern (converted to lowercase), which
## is registered in %selected_pattern. When $score_column is > 0, the
## word found in that column (1-based) is stored in %score. The list of
## patterns is also stored in @selected_patterns.
## Dies if the pattern file cannot be opened.
sub LocalReadPatterns {
    $date = POSIX::strftime("%H:%M:%S", localtime());
    warn ";$date\treading pattern file ...\n" if ($main::verbose >= 1);

    open(my $patterns, "<", $pattern_file)
	or die "Error: cannot open pattern file $pattern_file\n";
    while (my $line = <$patterns>) {
	next if ($line =~ /^;/);
	next unless ($line =~ /\S/);
	chomp($line);
	## Fields are separated by any whitespace; leading blanks are ignored
	my @fields = split(' ', $line);
	my $pattern = lc($fields[0]);
	warn ";\t$pattern\n" if ($main::verbose >= 2);
	$selected_pattern{$pattern} = 1;
	$score{$pattern} = $fields[$score_column-1] if ($score_column > 0);
    }
    close($patterns);
    @selected_patterns = keys %selected_pattern;
}


################################################################
#### Read input sequence and calculate oligo distributions 
## Read all input sequences and count oligo occurrences per position class.
##
## Sequences are obtained from &ReadNextSequence() on the input stream
## $in. For each position, the class index is derived from the position
## relative to $origin (counted from the sequence end when $origin is
## negative or "-0") and from $class_interval.
##
## Globals updated: %pattern (fields "occ" and "overlaps"), %class_freq,
## %pos_per_class, %seq_per_class, $sum_seq_per_class, $min_class,
## $max_class, @seq_length, @id_list, $sequence_number, $max_last_pos,
## $sum_seq_length, $nb_possible_pos, $min_seq_length, $max_seq_length.
sub ReadSequence {
    &RSAT::message::TimeWarn(join("\t", "Reading sequences"))
	if ($main::verbose >= 2);

    ## Loop-invariant settings
    my $from_end = (($origin eq "-0") || ($origin < 0)) ? 1 : 0;
    ## A hash is true in boolean context when it is not empty
    ## (defined(%hash) is a fatal error since Perl 5.22).
    my $filter_patterns = (%selected_pattern) ? 1 : 0;

    #### read all sequences and count oligo occurrences per class interval
    $sequence_number = 0;
    while ((($current_seq, $current_id, @comments) = &ReadNextSequence($in, $in_format, $input_dir, "", $mask)) &&
	   (($current_seq ne "") || ($current_id ne ""))) {

	$sequence_number++;

	if (($last > 0) && ($sequence_number > $last)) {
	    &RSAT::message::Warning("Stopeed after $last sequences (option -last was used)");
	    last;
	}

	### remove tabs and blank spaces ###
	$current_seq = &FoldSequence($current_seq,0);

	### statistics about sequences ###
	$seq_length[$sequence_number] = length($current_seq);
	$id_list[$sequence_number] = $current_id;
	my $last_pos = $seq_length[$sequence_number] - $oligo_length + 1;
	$max_last_pos = &max($max_last_pos, $last_pos);

	## Reference position from which relative positions are computed
	my $ref_pos = 0;
	if ($from_end) {
	    $ref_pos = $seq_length[$sequence_number] + $origin + 1;
	} else {
	    $ref_pos = $origin;
	}

	&RSAT::message::TimeWarn (join("\t",
				       "",
				       "Reading sequence",
				       $sequence_number,
				       $current_id,
				       "len=".$seq_length[$sequence_number],
				       "last_pos=".$last_pos,
				       "ref_pos=".$ref_pos,
				      )) if (($main::verbose >= 3) || (($main::verbose >= 2) && ($sequence_number%50==0)));

	#### count oligonucleotides ####
	## A C-style loop is used on purpose: the increment is also
	## executed after "next", whereas a while loop incrementing the
	## position at the end of its body never progresses once an
	## unselected oligo is skipped.
	my %classes_in_this_seq = ();
	for (my $current_pos = 1; $current_pos <= $last_pos; $current_pos++) {
	    my $relative_pos = $current_pos - $ref_pos;
	    my $class;
	    if ($from_end) {
		$class = POSIX::floor($relative_pos/$class_interval);
	    } else {
		$class = POSIX::floor(($relative_pos - 1)/$class_interval);
	    }

	    ## Every position counts for the class sizes, whatever the oligo
	    $pos_per_class{$class}++;
	    $classes_in_this_seq{$class}++;

	    my $oligo_seq = lc(substr($current_seq,$current_pos-1,$oligo_length));

	    ## With a pattern file, only count the selected patterns and
	    ## their reverse complements.
	    if ($filter_patterns) {
		my $rc = lc(&ReverseComplement($oligo_seq));
		next unless (($selected_pattern{$oligo_seq}) || ($selected_pattern{$rc}));
	    }

	    if (($no_overlap)
		&& ($last_pos{$oligo_seq} > 0)
		&& ($current_pos < ($last_pos{$oligo_seq} + $oligo_length))) {
		## Overlaps the previous counted occurrence of the same oligo
		$pattern{$oligo_seq}->{overlaps}++;
	    } else {
		$pattern{$oligo_seq}->{occ}++;
		$class_freq{$oligo_seq}{$class}++;
		$last_pos{$oligo_seq} = $current_pos;
	    }
	}


	#### max and min classes
	my $current_min_class = &min (keys %classes_in_this_seq);
	my $current_max_class = &max (keys %classes_in_this_seq);
	if (defined($min_class)) {
	    $min_class = &min ($current_min_class, $min_class);
	} else {
	    $min_class = $current_min_class;
	}
	if (defined($max_class)) {
	    $max_class = &max ($current_max_class, $max_class);
	} else {
	    $max_class = $current_max_class;
	}

	&RSAT::message::Debug (
			       "current_min_class = ".$current_min_class,
			       "min_class = ".$min_class,
			       "current_max_class = ".$current_max_class,
			       "max_class = ".$max_class,
			      ) if ($main::verbose >= 4);

	for my $class ($current_min_class..$current_max_class) {
	    $seq_per_class{$class}++;
	    $sum_seq_per_class++;
	}

	## Positions of the last occurrences are specific to each sequence
	undef %last_pos;
    }
    undef $current_seq; ### release the memory occupied
    close $in;


    ### statistics on sequence lengths
    &RSAT::message::TimeWarn ("Calculating stats on sequence lengths")
      if ($main::verbose >= 2);

    for my $s (1..$sequence_number) {
	$sum_seq_length += $seq_length[$s];
	if ($seq_length[$s] >= $oligo_length) {
	    if ($strands eq "2str") {
		$nb_possible_pos += 2*($seq_length[$s] + 1 - $oligo_length);
	    } else {
		$nb_possible_pos += $seq_length[$s] + 1 - $oligo_length;
	    }
	}
	$max_seq_length = &max($max_seq_length, $seq_length[$s]);

	## The minimal length is reported in verbose mode
	if ((!defined($min_seq_length)) || ($seq_length[$s] < $min_seq_length)) {
	    $min_seq_length = $seq_length[$s];
	}

	&RSAT::message::Debug($s, $seq_length[$s], $sum_seq_length, $nb_possible_pos) if ($main::verbose >= 4);
    }
    &RSAT::message::TimeWarn(join("\t", "Finished reading sequences. Number of patterns", scalar(keys %pattern)))
      if ($main::verbose >= 2);

}


################################################################
#### calculate class intervals
## Define the limits and the center of each position class, and select
## the classes taken into account for the chi-square.
##
## Globals set: $class_nb, %class_min, %class_max, %class_center,
## $min_calc_class, $max_calc_class, $calc_class_nb, $sum_pos_per_class.
sub CalcClasses {
    &RSAT::message::TimeWarn("Calculating classes") if ($main::verbose >= 2);

    ### class definition
    $class_nb = $max_class + 1;
    foreach my $class ($min_class..$max_class) {
	## Classes start at a multiple of the interval when positions are
	## counted from the sequence end, and one position further otherwise.
	my $shift = (($origin eq  '-0') || ($origin < 0)) ? 0 : 1;
	my $lower = $class*$class_interval + $shift;
	my $upper = ($class + 1)*$class_interval - 1 + $shift;

	$class_min{$class} = $lower;
	$class_max{$class} = $upper;
	$class_center{$class} = ($lower + $upper)/2;

	if ($main::verbose >= 4) {
	    warn join ("\t",
		       $class,
		       "class_min=".$class_min{$class},
		       "class_max=".$class_max{$class},
		       "class_center=".$class_center{$class},
		      ), "\n";
	}
    }

    #### min and max classes for calculating the chi2: user-specified
    #### positions when provided, all the classes otherwise
    $min_calc_class = (&IsInteger($min_pos))
	? POSIX::floor($min_pos/$class_interval)
	: $min_class;
    $max_calc_class = (&IsInteger($max_pos))
	? POSIX::floor($max_pos/$class_interval)
	: $max_class;
    $calc_class_nb = $max_calc_class - $min_calc_class + 1;

    #### positions per class
    $sum_pos_per_class  = 0;
    $sum_pos_per_class += $pos_per_class{$_} foreach ($min_calc_class..$max_calc_class);
}



################################################################
#### calculate expected frequencies per class interval
## Compute, for each pattern, the number of occurrences expected in each
## class used for the chi-square ($min_calc_class..$max_calc_class).
##
## The occurrences found within these classes (field "in_bound_occ" of
## %pattern) are distributed proportionally to the number of positions
## of each class. Globals updated: %pattern, %class_freq (undefined
## frequencies are set to 0), %exp_occ, $max_freq, $min_freq.
sub CalcExpected {
    &RSAT::message::TimeWarn("Calculating expected occurrences") if ($main::verbose >= 2);

    for my $oligo_seq (sort keys %pattern) {

	## Total occurrences within the selected classes
	for my $class ($min_calc_class..$max_calc_class) {
	    $pattern{$oligo_seq}->{in_bound_occ} += $class_freq{$oligo_seq}{$class};
	}

	for my $class ($min_calc_class..$max_calc_class) {
	    ## Classes where the oligo was not observed get a null frequency
	    $class_freq{$oligo_seq}{$class} = 0
		unless (&IsNatural($class_freq{$oligo_seq}{$class}));

	    ## Expected occurrences; undetermined when no position was counted
	    $exp_occ{$oligo_seq}{$class} = ($sum_pos_per_class > 0)
		? $pattern{$oligo_seq}->{in_bound_occ} * $pos_per_class{$class}/$sum_pos_per_class
		: "NA";

	    ## Extreme values over all patterns and classes
	    my @values = ($class_freq{$oligo_seq}{$class}, $exp_occ{$oligo_seq}{$class});
	    $max_freq = &max($max_freq, @values);
	    $min_freq = &min($min_freq, @values);
	}
    }
}

################################################################
#### calculate chi square statistics to compare expected and observed
#### frequencies
## Compare observed and expected class frequencies of each pattern.
##
## For each pattern, &ChiSquare() is called with the observed frequencies
## followed by the expected ones; the fields chi_square, df, Pval, Eval
## and sig are stored in %pattern. Patterns are discarded (fields "occ"
## and possibly "chi_square" deleted) according to the thresholds on sig
## and chi2 and, unless -nocheck or -nofilter was specified, when the
## chi-square is not a real number.
sub CalcChi {

    &RSAT::message::TimeWarn("Calculating chi values") if ($main::verbose >= 2);

    ## Number of tests, used to convert P-values into E-values
    my $nb_tests = scalar(keys(%pattern));
    my @calc_classes = ($min_calc_class..$max_calc_class);

    foreach my $oligo_seq (sort keys %pattern) {
	my $stats = ($pattern{$oligo_seq} ||= {});

	## Observed values first, expected values next
	@chi_values = ();
	push @chi_values, $class_freq{$oligo_seq}{$_} foreach (@calc_classes);
	push @chi_values, $exp_occ{$oligo_seq}{$_} foreach (@calc_classes);
	($stats->{chi_square}, $stats->{df}) = &ChiSquare("goodfit", 2, $calc_class_nb, @chi_values);

	## P-value, E-value and significance
	$stats->{Pval} = 1 - &Math::CDF::pchisq($stats->{chi_square}, $stats->{df});
	if ($stats->{Pval} <= 0) {
	    ## The significance is capped since the logarithm is undefined
	    $stats->{Pval} = 0;
	    $stats->{Eval} = 0;
	    $stats->{sig} = 20;
	} else {
	    $stats->{Eval} = $stats->{Pval}*$nb_tests;
	    $stats->{sig} = -log($stats->{Eval})/$log_base;
	}

	#### threshold on significance
	if ((defined($low_thr{'sig'})) && ($stats->{sig} < $low_thr{'sig'})) {
	    delete $stats->{occ};
	    next;
	}

	#### check or not the applicability condition for the chi2
	if ($no_check) {
	    #### suppress parentheses in the output
	    foreach ($stats->{chi_square}) {
		s/\(//;
		s/\)//;
	    }
	} elsif (!(&IsReal($stats->{chi_square}))) {
	    print $out "; WARNING: $oligo_seq $pattern{$oligo_seq}->{chi_square}  does not fit conditions for the chi-square\n"
		if ($main::verbose >= 2);
	    unless ($no_filter) {
		print $out "; WARNING: $oligo_seq discarded\n"
		    if ($main::verbose >= 3);
		delete $stats->{occ};
		delete $stats->{chi_square};
	    }
	}

	#### check the threshold on chi-square
	next unless (&IsReal($low_thr{'chi'}));
	if ($stats->{chi_square} < $low_thr{'chi'}) {
	    print $out "; WARNING: $oligo_seq below the chi threshold $pattern{$oligo_seq}->{chi_square} < $low_thr{'chi'}\n"
		if ($main::verbose >= 3);

	    delete $stats->{occ};
	    delete $stats->{chi_square};
	}
    }

}

################################################################
#### help message
## Display the detailed help message through the pager "more", then exit
## with status 0.
sub PrintHelp {
  my $help_message = <<End_of_help;
NAME
	position-analysis

        1998 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

DESCRIPTION
	Calculates the positional distribution of oligonucleotides in
	a set of sequences, and detects those which significantly
	discard from a homogeneous distribution.

CATEGORY
	sequences
	pattern-discovery

DETAILED DESCRIPTION

	This programs takes a sequence set as input, and calculates
	the number of occurrences of each word in a set of
	non-overlapping positional windows. The window width (in
	number of residues) is specified with the option -ci (class
	interval).

	The expected number of occurrences per window is then computed
	on the basis of a model of homogeneous repartitionof the
	occurrences. Beware : homogeneous does not necessarily means
	"flat". Indeed, if the sequence set contains sequences of
	unequal lengths, the number of sequence fragments varies from
	window to window.

	Observed and expected occurrences are compared using the
	chi-squared formula:
	
		chisq = SUM_i ( (obs-exp)^2 / exp )

	where i is the window number. 

	A P-value is calculated for each word.

	  Pval = P(chisq >= x)

USAGE
	position-analysis [-i inputfile]  [-format input_format]
                [-o outputfile] -l length -ci class_interval
                [-1str | -2str] [-grouprc | -nogrouprc]

	position-analysis [-h | -help]
		provides a detailed or synthetic documentation


OPTIONS
	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-seqtype dna|any
		  Sequence type
	-last #
	      Stop after # sequences (for quick testing)

	      The possibility to limit the analysis to a few sequences
	      (e.g.50 of the input file can be useful for fine-tuing
	      the options and ensuring that the result will appear as
	      requested. Note that this option should not be used for
	      the final analysis, since the significance drastically
	      depends on the number of input sequences.

	-mask upper|lower
		Mask lower or uppercases, respecively, i.e. replace
		selected case by N characters.

	-format	input file format. Must be followed by one of the 
		following options:
		   fasta (default)
		   wconsensus
		   IG
		   filelist
		   raw
		See below for the description of these formats.

	-o file	outputfile. Returns a list of the oligonucleotides 
		encountered in the sequences, with their frequencies.

	-v \#	verbose level. 

	-l	oligonucleotide length.

	-ci	class interval (default 20 bases). 
		The width of the position classes (in number of bases)

	-grouprc        group reverse complement pairs

	-nogrouprc      do not group reverse complement pairs

	-sort	
		sort oligonucleotides according to the bias in
		distribution profile

	-1str
		inactivates the summation of occurrences on both
		strands.

	-2str
		oligonucleotide occurrences found on both stands are
		summed.

	-noov	no overlap
			overlapping occurrences of the same pattern are 
			only taken into account once

	-return	fields_to_return
		supported fields:
			distrib	occurrences found in each position class
			exp	expected occurrences for each class
			graph	a rgaph file per oligont profile
			chi	chi-square value
			rank	rank of the pattern according to the
				sorting criterion
		several fields can be entered, separated by commas

	-lth_chi #	lower threshold on chi2
		return only words with a chi2 value > #

	-lth_sig #	lower threshold on significance
		return only words with a sig value > #

	-oth #	lower threshold on occurrences
		return only words with a number of occurrences > #

	-rth #	upper threshold on rank
		return maximum # words

	-max_graphs #	maximal number of graphs to export

	-pl pattern_file
		A file containing a selection of patterns.  The
		analysis is then restricted to these patterns.  The
		first word of each new line is considered as a new
		pattern.  A score can be associated to each pattern
		with the option -sc.

	-sc #	score column
		(only valid whith the option -pl)
		The column containing a score value for each pattern
		of the pattern file

	-minpos #
		minimal position to take into account for the
		chi-square calculation This value must be a multiple
		of the class interval.

	-maxpos #
		maximal position to take into account for the
		chi-square calculation This value must be a multiple
		of the class interval.

	-nocheck
		do not check the applicability condition on the
		chi-square. By default, the program checks that each
		class has at least 5 observations. The chi-square is
		bracketed for words which do not fill this
		conditions. It is now recognized that this condition
		is too restrictive, and that the chi2 is still valid
		with smaller clas effective. We allow to suppress the
		checking, but the responsibility is left to the user,
		to decide whether the chi2 is or not significant.

	-nofilter
		Do not discard oligos which do not fit the condition
		of applicability. Instead, mark them by including the
		chi2 value in curly brackets.

	-origin
		reference for calculating positions. 

		If a negative value is specified, coordinates are
		calculated with respect to the specified position
		relative to the end rather than the start of the
		sequences.

		The default value is 0, meaning that positions are
		calculated from sequence start (5\' side).

		To calculate the positions relative to sequence end,
		use the option:
			     -origin -0

	-img_format
		Image format (this parameter is passed to XYgraph)


End_of_help

  ## Send the text to the pager
  open(HELP, "| more");
  print HELP $help_message;
  close(HELP);
  exit(0);
}

################################################################
#### short description of the options
## Display the one-line summary of each option through the pager "more",
## then exit with status 0.
##
## The summary is aligned with the actual behaviour of the program: the
## default is single-strand counting, grouping of reverse complements
## is only activated by -2str or -grouprc, and -rth is an upper
## threshold on the rank.
sub PrintOptions {
  open HELP, "| more";
  print HELP <<End_of_help; 
position-analysis options
----------------------
-h		display complete help message
-help		display this list of options
-i		input file
-seqtype       sequence type (dna|any)
-last #	      Stop after # sequences (for quick testing)
-mask upper|lower	mask upper- or lowercases, respectively
-format		input sequence format
-o 		output file
-v \#		verbose level
-l		oligonucleotide length
-ci		class interval (default 20 bases). 
-1str		inactivate summation of occ on both strands (default)
-2str		sum occurrences on both strands
-grouprc	group reverse complement pairs (default with -2str)
-nogrouprc	do not group reverse complement pairs
-noov		no overlap
-sort		sort oligonucleotides according to the score
-return		chi,distrib,exp,graph,rank
-lth_chi	lower threshold on chi2
-lth_sig	lower threshold on significance
-oth		lower threshold on occurrences
-rth		upper threshold on rank
-max_graphs #	maximal number of graphs to export
-pl		pattern list
-origin		origin (-0 to calculate positions relative to sequence end)
-img_format	image format (this parameter is passed to XYgraph)
-sc		score column
-minpos		minimal position for chi-square calculation
-maxpos		maximal position for chi-square calculation
-nocheck	do not check applicability condition for the chi2
-nofilter	don\'t discard oligos which do not fit applicability condition
End_of_help
  close HELP;
  exit(0);
}

################################################################
#### read arguments ####
## Parse the command-line arguments (@ARGV) and set the corresponding
## global parameters.
##
## Each element of @ARGV is compared with the supported options; the
## value of an option is the element which follows it. Invalid values
## are reported with &RSAT::error::FatalError(), except for -l, -ci,
## -minpos and -maxpos, which are ignored when their value is not valid.
## The options -h and -help display a help message and exit.
sub ReadArguments {
    ## The index is not named $a, which is reserved for sort blocks
    foreach my $i (0..$#ARGV) {
	my $option = $ARGV[$i];
	my $value = $ARGV[$i+1];

	### verbose ###
	if ($option eq "-v") {
	    if (&IsNatural($value)) {
		$main::verbose = $value;
	    } else {
		$main::verbose = 1;
	    }

	    ### detailed help
	} elsif ($option eq "-h") {
	    &PrintHelp();

	    ### list of options
	} elsif ($option eq "-help") {
	    &PrintOptions();

	    ### input file
	} elsif ($option eq "-i") {
	    $inputfile = $value;

	    ## mask
	} elsif ($option eq "-mask") {
	    $mask = $value;
	    &CheckMask($mask);

	    ### output file
	} elsif ($option eq "-o") {
	    $outputfile = $value;

	    ### oligomer length
	} elsif (($option eq "-l") && (&IsNatural($value))) {
	    $oligo_length = $value;

	    ### class interval
	} elsif (($option eq "-ci") && (&IsNatural($value))) {
	    $class_interval = $value;

	    ### sequence format
	} elsif ($option eq "-format") {
	    $in_format = lc($value);

	    ### strands
	} elsif ($option eq "-1str") {
	    $strands = "1str";
	    $sum_rc = 0;
	    $group_rc = 0;

	} elsif ($option eq "-2str") {
	    $strands = "2str";
	    $sum_rc = 1;
	    $group_rc = 1;

	    ### grouping of reverse complements
	} elsif ($option eq "-grouprc") {
	    $strands = "2str";
	    $group_rc = 1;
	} elsif ($option eq "-nogrouprc") {
	    $group_rc = 0;

	    ### sort the result according to significance
	} elsif ($option eq "-sort") {
	    $sort_result = 1;

	    ### no overlap between successive matches
	} elsif ($option eq "-noov") {
	    $no_overlap = 1;

	    ### lower threshold on chi square
	} elsif ($option =~ /^-lth_chi/) {
	    $low_thr{'chi'} = $value;
	    $return{'chi'} = 1;
	    unless (($low_thr{'chi'} >= 0) && (&IsReal($low_thr{'chi'}))) {
		&RSAT::error::FatalError("Threshold on chi2 must be a positive number") ;
	    }

	    ### lower threshold on significance
	} elsif ($option =~ /^-lth_sig/) {
	    $low_thr{'sig'} = $value;
	    $return{'sig'} = 1;
	    unless (&IsReal($low_thr{'sig'})) {
		&RSAT::error::FatalError("Threshold on sig must be a Real number") ;
	    }

	    ### lower threshold on occurrences
	} elsif ($option =~ /^-oth/) {
	    $low_thr{'occ'} = $value;
	    unless (&IsNatural($low_thr{'occ'})) {
		&RSAT::error::FatalError("Threshold on occurrences must be a natural number") ;
	    }

	    ### Upper threshold on rank
	} elsif ($option =~ /^-rth/) {
	    $upper_thr{rank} = $value;
	    unless (&IsNatural($upper_thr{rank})) {
		&RSAT::error::FatalError("Threshold on rank must be a natural number") ;
	    }

	    ### return values
	} elsif ($option eq "-return") {
	    @fields_to_return = split ",", $value;
	    foreach my $field (@fields_to_return) {
		if ($field =~ /dist/) {
		    $return{'distrib'} = 1;

		} elsif ($field =~ /chi/) {
		    $return{'chi'} = 1;

		} elsif ($field =~ /exp/) {
		    $return{'exp'} = 1;

		} elsif ($field =~ /graph/) {
		    $return{'graph'} = 1;

		} elsif ($field =~ /rank/) {
		    $return{'rank'} = 1;
		}
	    }

	    #### sequence type
	} elsif ($option =~ /^-seqtype/i) {
	    $seq_type = lc($value);
	    unless ($supported_seq_type{$seq_type}) {
		&RSAT::error::FatalError("$seq_type is not a supported sequence type. Supported: $supported_seq_types");
	    }

	    #### last
	} elsif ($option =~ /^-last/i) {
	    $last = lc($value);
	    ## Any strictly positive number is valid, including 1
	    unless ((&IsNatural($last)) && ($last >= 1)) {
		&RSAT::error::FatalError("$last is not a valid value for -last. Must be a strictly positive Natural number.");
	    }

	    #### max number of graphs
	} elsif ($option =~ /^-max_graphs/i) {
	    $max_graphs = lc($value);
	    ## Any strictly positive number is valid, including 1
	    unless ((&IsNatural($max_graphs)) && ($max_graphs >= 1)) {
		&RSAT::error::FatalError("$max_graphs is not a valid value for the option -max_graphs. Must be a strictly positive Natural number.");
	    }

	    ### do not check applicability condition for the chi2
	} elsif ($option eq "-nocheck") {
	    $no_check = 1;

	    ### do not discard oligos which do not fit the applicability condition
	} elsif ($option eq "-nofilter") {
	    $no_filter = 1;

	    ### predefined pattern list
	    ### (anchored, so that a value which merely contains "-pl",
	    ### such as a file name, is not taken for the option)
	} elsif ($option =~ /^-pl/) {
	    $pattern_file = $value;

	    ### score column
	} elsif ($option eq "-sc") {
	    $score_column = $value;
	    unless (&IsNatural($score_column)) {
		&RSAT::error::FatalError( "Score column must be a natural number\n");
	    }

	    ### origin for the calculation of positions
	} elsif ($option =~ /^-orig/) {
	    $origin = $value;
	    unless (&IsInteger($origin)) {
		&RSAT::error::FatalError( "Invalid value $origin\tOrigin must be an integer number\n");
	    }

	    ### image format
	} elsif ($option eq "-img_format") {
	    $img_format = $value;

	    ### min and max positions
	} elsif (($option eq "-minpos") && (&IsInteger($value))) {
	    $min_pos = $value;
	} elsif (($option eq "-maxpos") && (&IsInteger($value))) {
	    $max_pos = $value;

	}
    }
}

################################################################
## Print the verbose report.
##
## Writes a series of comment lines (each starting with ";") to the
## output handle $out: command line, citation, parameter values,
## sequence statistics, oligonucleotide statistics and the table of
## position classes.
##
## Takes no argument; every value is read from global variables
## ($inputfile, $in_format, $seq_type, %low_thr, $strands, ...), which
## are expected to have been filled in before this sub is called.
sub Verbose {
    ## Program name, followed by whatever PrintArguments() writes to $out
    ## (presumably the command-line arguments -- defined outside this sub).
    print $out "; position-analysis";
    &PrintArguments($out);

    printf $out "; %s\n", "Citation: van Helden, et al. (2000).  Nucleic Acids Res 28, 1000-1010.";

    ## Input/output parameters. File names are only reported when set.
    printf $out "; %-29s\t%s\n", "Sequence file", $inputfile if ($inputfile);
    printf $out "; %-29s\t%s\n", "Sequence format", $in_format;
    printf $out "; %-29s\t%s\n", "Sequence type", $seq_type;
    printf $out "; %-29s\t%s\n", "Output file", $outputfile if ($outputfile);
    printf $out "; %-29s\t%d\n", "Oligo length", $oligo_length;

    ## Thresholds are only reported when they hold a value of the expected type.
    printf $out "; %-29s\t%f\n", "Lower threshold on chi", $low_thr{'chi'} if (&IsReal($low_thr{'chi'}));
    printf $out "; %-29s\t%d\n", "Lower threshold on occurrences", $low_thr{'occ'} if (&IsNatural($low_thr{'occ'}));

    ## Strand treatment
    if ($strands eq "2str") {
	printf $out "; %-29s\n", "Occurrences counted  on both strands";
	if ($group_rc) {
	    printf $out "; %-29s\n", "grouped by pairs of reverse complements";
	}
    } else {
	printf $out "; %-29s\n", "Occurrences counted  on a single  strands";
    }

    ## Reminders about the chi2 applicability options
    print $out "; Conditions of applicability not checked !\n" if $no_check;
    print $out "; WARNING ! chi2 is shown between curly braces when the applicability conditions are not satisfied.\n" if $no_filter;

    ## Sequence statistics
    print $out "; Sequence statistics:\n";
    printf $out ";\t%-29s\t%d\n", "Nb of sequences", $sequence_number;
    printf $out ";\t%-29s\t%d\n", "Sum of sequence lengths", $sum_seq_length;
    printf $out ";\t%-29s\t%d\n", "Min sequence length", $min_seq_length;
    printf $out ";\t%-29s\t%d\n", "Max sequence length", $max_seq_length;
    ## The guard avoids a division by zero when no sequence was read.
    ## The %d conversion reports the integer part of the mean.
    printf $out ";\t%-29s\t%d\n", "Average sequence length", $sum_seq_length/$sequence_number if ($sequence_number > 0);
    printf $out ";\t%-29s\t%d\n", "Possible positions", $nb_possible_pos;

    ## Per-sequence listing, skipped for more than 100 sequences.
    ## @seq_length and @id_list are indexed from 1 here.
    unless ($sequence_number > 100) {
	print $out "; Sequences:\n";
	print $out ";\t#\tlength\tID\n";
	foreach my $s (1..$sequence_number) {
	    print $out ";\t$s\t$seq_length[$s]\t$id_list[$s]\n";
	}
    }

    ## Patterns selected by the user, if any
    if ($#selected_patterns >=0) {
	print $out join ("\n;\t", "; Selected patterns", @selected_patterns), "\n";
    }

    ## Oligonucleotide statistics
    print $out "; Oligonucleotide statistics:\n";
    printf $out ";\t%-21s\t%d\n", "Total occurrences", $sum_occurrences;
    ## NOTE(review): this sub tests $no_overlap (singular) whereas
    ## PrintResult tests $no_overlaps (plural) -- confirm which variable
    ## the option parser actually sets.
    if ($no_overlap) {
	printf $out ";\t%-21s\t%d\n", "Total overlaps", $sum_overlaps;
    }

    ## Class parameters
    print $out "; Class parameters:\n";
    printf $out ";\t%-21s\t%d\n", "Class interval", $class_interval;
    printf $out ";\t%-21s\t%d\n", "Min position", $min_pos if (&IsInteger($min_pos));
    printf $out ";\t%-21s\t%d\n", "Max position", $max_pos if (&IsInteger($max_pos));
    printf $out ";\t%-21s\t%d\n", "Number of classes", $calc_class_nb;
    printf $out ";\t%-21s\t%d\n", "Total positions", $sum_pos_per_class;
    printf $out ";\t%-21s\t%d\n", "Degrees of freedom", $calc_class_nb - 1;

    ## One line per class; the class number is reported relative to the
    ## first class taken into account (starting at 1).
    print $out "; Sequences per class:\n";
    print $out join ("\t", ";", "class", "\[min", "max\]", "mid", "seq", "occ"), "\n";
    for my $class ($min_calc_class..$max_calc_class) {
	print $out join ("\t",  ";",
			 $class - $min_calc_class + 1,
			 "\[".$class_min{$class},
			 $class_max{$class}."\]",
			 $class_center{$class},
			 $seq_per_class{$class},
			 $pos_per_class{$class}), "\n";
    }

    print $out ";\n";
}


################################################################
## Print the result table to the output handle $out.
##
## Steps:
##   1. order the keys of %pattern (by score, chi-square or occurrences
##      when $sort_result is set, alphabetically otherwise);
##   2. keep only the top-ranking patterns if $upper_thr{rank} is defined;
##   3. build the list of output columns according to %return;
##   4. print the column descriptions (verbose mode), the header line and
##      one row per pattern.
##
## Takes no argument; works on global variables. Side effects: fills
## @sorted_keys, @out_col and @col_descriptions, and stores the rank of
## each printed pattern in $pattern{...}->{rank}.
sub PrintResult {

    #### Sort oligonucleotides, either according to their significance
    #### (decreasing values) or alphabetically
    if ($sort_result) {
	&RSAT::message::TimeWarn("Sorting results") if ($main::verbose >= 2);
	if ($score_column > 0) {
	    @sorted_keys = sort {
		$pattern{$b}->{score} <=> $pattern{$a}->{score}
	    } keys %pattern;
	} elsif ($return{'chi'}) {
	    @sorted_keys = sort {
		$pattern{$b}->{chi_square} <=> $pattern{$a}->{chi_square}
	    } keys %pattern;
	} else {
	    @sorted_keys = sort {
		$pattern{$b}->{in_bound_occ} <=> $pattern{$a}->{in_bound_occ}
	    } keys %pattern;
	}
    } else {
	@sorted_keys = sort keys %pattern;
    }

    #### Upper threshold on rank: keep at most $upper_thr{rank} patterns.
    #### The list is only truncated when it is longer than the threshold:
    #### slicing beyond the end of the array would pad it with undefined
    #### keys, which would then be printed as empty rows.
    if (defined($upper_thr{rank})) {
	my $max_rank = $upper_thr{rank};
	if ($max_rank < scalar(@sorted_keys)) {
	    $max_rank = 0 if ($max_rank < 0);
	    splice(@sorted_keys, $max_rank);
	}
    }

    #### Output columns (names and descriptions are kept in parallel arrays)
    @out_col = (); @col_descriptions = ();
    push @out_col, 'seq'; push @col_descriptions, 'pattern sequence';
    push @out_col, 'id'; push @col_descriptions, 'pattern identifier';
    push @out_col, 'occ'; push @col_descriptions, 'pattern occurrences';
    ## NOTE(review): Verbose tests $no_overlap (singular) whereas this sub
    ## tests $no_overlaps (plural) -- confirm which variable the option
    ## parser actually sets.
    if ($no_overlaps) {push @out_col, "over"; push @col_descriptions, 'overlapping occurrences (discarded)';}
    if ($return{'chi'}) {
	push @out_col, "chi2" ; push @col_descriptions, 'observed chi-square';
	push @out_col, "df" ; push @col_descriptions, 'degrees of freedom';
	push @out_col, "Pval" ; push @col_descriptions, 'P-value (probability for one word to be a false positive)';
	push @out_col, "Eval" ; push @col_descriptions, 'E-value; expected number of false positives (Eval = Pval * nb_tests)';
	push @out_col, "sig" ; push @col_descriptions, 'Significance (sig = -log10(Eval))';
    }
    if ($return{'rank'}) {
	push @out_col, "rank"; push @col_descriptions, 'rank of the pattern according to sorting criterion';
    }
    if ($return{'distrib'}) {
	## One column per class, labelled with the class center
	for my $class ($min_calc_class..$max_calc_class) {
	    push @out_col, $class_center{$class}; push @col_descriptions, 'observed class occurrences';
	}
	## Expected occurrences are only reported together with the
	## observed distribution
	if ($return{'exp'}) {
	    for my $class ($min_calc_class..$max_calc_class) {
		push @out_col, $class_center{$class}; push @col_descriptions ,'expected class occurrences';
	    }
	}
    }

    #### Column descriptions (verbose mode only)
    if ($main::verbose >= 1) {
	print $out "; column headers\n";
	foreach my $c (0..$#out_col) {
	    printf $out ";\t%d\t%-15s\t%s\n", $c+1, $out_col[$c], $col_descriptions[$c];
	}
    }

    &RSAT::message::TimeWarn("Printing results") if ($main::verbose >= 2);

    #### Header line
    print $out ";", join("\t", @out_col), "\n";

    #### One row per pattern, in the order defined above
    my $rank = 0;
    foreach my $oligo_seq (@sorted_keys) {
	$rank++;
	$pattern{$oligo_seq}->{rank} = $rank;

	### Pattern sequence
	print $out "$oligo_seq";

	### Pattern identifier
	## NOTE(review): $sum_rc is not set anywhere in the visible part of
	## the file, whereas the reverse complement grouping flag is named
	## $group_rc -- confirm that $sum_rc is the intended argument.
	print $out "\t", &PatternID($oligo_seq, $sum_rc);

	### Occurrences
	print $out "\t", $pattern{$oligo_seq}->{in_bound_occ};
	print $out "\t", $pattern{$oligo_seq}->{overlaps} if ($no_overlaps);

	### Chi-square statistics
	if ($return{'chi'}) {
	    printf $out "\t%.1f", $pattern{$oligo_seq}->{chi_square};
	    printf $out "\t%d", $pattern{$oligo_seq}->{df};
	    printf $out "\t%.1g", $pattern{$oligo_seq}->{Pval};
	    printf $out "\t%.1g", $pattern{$oligo_seq}->{Eval};
	    printf $out "\t%.2f", $pattern{$oligo_seq}->{sig};
	}

	### Rank
	if ($return{'rank'}) {
	    print $out "\t$rank";
	}

	### Observed position distribution
	if ($return{'distrib'}) {
	    for my $class ($min_calc_class..$max_calc_class) {
		print $out "\t$class_freq{$oligo_seq}{$class}";
	    }
	}

	### Expected position distribution, on the same row
	if (($return{'exp'}) && ($return{'distrib'})) {
	    for my $class ($min_calc_class..$max_calc_class) {
		printf $out "\t%.1f", $exp_occ{$oligo_seq}{$class};
	    }
	}
	print $out "\n";
    }
}


################################################################
## Discard the occurrence counts of oligos containing letters outside
## the canonical DNA alphabet.
##
## Every key of %pattern containing at least one character other than
## a, c, g or t (case-insensitive) gets its {occ} field deleted. The
## entry itself remains in %pattern; only the {occ} field is removed.
sub CheckDNA {
    if ($main::verbose >= 2) {
	&RSAT::message::TimeWarn("Checking DNA");
    }

    ## Collect the offending oligos first, then strip their counts
    my @non_dna_oligos = grep { $_ =~ /[^atcg]/i } keys %pattern;
    for my $non_dna_oligo (@non_dna_oligos) {
	delete $pattern{$non_dna_oligo}->{occ};
    }
}

