#!/usr/bin/perl

############################################################
#
# $Id: dyad-analysis,v 1.63 2009/11/05 00:32:07 jvanheld Exp $
#
# Time-stamp: <2003-10-22 23:46:08 jvanheld>
#
############################################################
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";
require "RSA.disco.lib";

################################################################
#### Initialise parameters
####
#### All the variables below are package globals shared with the
#### subroutines of this script. The defaults set here can be overridden
#### by the command-line options parsed in &ReadArguments().
local $start_time = `date '+%d/%m/%y %H:%M:%S %Z'`; ## Printed in the verbose output at the end of the job
local %default_return_fields = (occ=>1,proba=>1); ## Fields returned when the option -return is not used
local $timeout = 0; ## This is principally for the Web server (passed to alarm(); 0 schedules no alarm)
local @alphabet = ("a","c","g","t"); ## For &all_oligos()

#### Background models
#### Keys are the model names accepted by the option -bg.
local %supported_bg = ('upstream'=>1,
		 'upstream-noorf'=>1,
		 'intergenic'=>1,
		 'monads'=>1,
		 'input'=>1,
		 'upstream-rm'=>1,
		 'upstream-noorf-rm'=>1
		);
$supported_bg = join ",", sort keys %supported_bg; ## Comma-separated list, used in error messages
$background_model="monads"; ## Default background model : estimate expected dyad freq from observed monad freq


local $org_or_taxon; ## Name of the organism or taxon for the background model
local $taxon = 0; ## Specify if the background model is taxon-specific (1) or organism-specific (0)

## Dyad parameters
$oligo_length = 3; ## Length of each of the two monads composing a dyad
%dyad_occ_sum = (); ## Sum of dyad occurrences, indexed by spacing
%dyad_ovl_sum = (); ## Sum of overlapping dyad occurrences, indexed by spacing
$min_spacing = $max_spacing = 0;
$accepted_dyad_type{rep} = 1;  #direct or inverted repeats
$accepted_dyad_type{dr} = 1;  #direct repeats
$accepted_dyad_type{ir} = 1;  #inverted repeats
$accepted_dyad_type{any} = 1; #any dyad, repeat or not
$dyad_type = "any";

## File containing a restricted list of accepted patterns
$infile{accepted_patterns} = ""; 
%accepted_patterns = ();

## Count dyads on a single strand or on both strands
$strands = "-2str";
$sum_rc = 1; ## Sum the occurrences of each dyad with those of its reverse complement
$group_rc = 1; ## Report each pair of reverse complements on a single row

## Input sequences
$seq_type = "DNA";
$input_format = "fasta";

### Maximum number of sequences to report in the verbose message
$max_seq_verbose = 50; 

## Expected frequency file
$infile{exp_freq} = "";

## Counters
$nb_possible_dyads = 0;
$nb_tested_patterns = 0;
$nb_possible_pos = 0;
$sum_seq_length = 0;

## The number of possible positions depends on the dyad spacing.
## Note: this hash is a distinct variable from the scalar
## $nb_possible_pos initialised above.
%nb_possible_pos = (); 

## Discard dyads that contain non-ACGT characters, if the sequence type is DNA.
%discarded_dyad_positions = (); 
%valid_dyad_positions = ();

$discarded_monad_positions = 0;

################################################################
## Read command line arguments
&ReadArguments();



################################################################
## Define the list of allowed monads (oligos): for DNA sequences, every
## oligo of length $oligo_length returned by &all_oligos() for @alphabet.
if ($seq_type eq "DNA") {
  foreach my $oligo (&all_oligos($oligo_length, @alphabet)) {
    $allowed_monads{$oligo} = 1; 
  }
}


################################################################
## Timeout control: the whole analysis is performed inside an eval
## block, in order to stop it if it lasts more than 10'. This eval was
## essentially conceived for the web server. On the command-line, the
## timeout can be extended with the option -timeout.
## The ALRM handler raises an exception whose message contains "timeout";
## the code following the eval relies on this word to recognise timeouts.
$SIG{ALRM} = sub {die "timeout" };
eval {
    ## Arm the timer. With the default $timeout of 0, no alarm is scheduled.
    alarm($timeout);

    ################################################################
    ## Check argument values 

    ## Special cases where we need to count patterns with zero occurrences
    if (($tail eq "left") ||
	($tail eq "two")) {
	$zeroocc = 1;
    }

    ################################################################
    ## Read a file containing a selection of accepted patterns
    if ($infile{accepted_patterns}) {
      &ReadAcceptedPatterns($infile{accepted_patterns}, "dyads");
    }

    ################################################################
    ## Check pre-defined frequency table for intergenic frequency
    ## calibration

      ## Suffix for the -noov/-ovlp option
      if ($noov) {
	$noov_suffix = "-noov";
      } else { 
	$noov_suffix = "-ovlp";
      }

    ## NOTE(review): $str (passed below as str=>$str) is not assigned
    ## anywhere in the visible part of this script, where the strand
    ## option is stored in $strands -- confirm that $str is set elsewhere.
    if ($mncf) {
	## localize monad intergenic frequency file
#	$infile{monad_exp_freq} .= "$data_dir/oligo-frequencies/${oligo_length}nt_intergenic_${organism_name}.freq";
	$infile{monad_exp_freq} = &ExpectedFreqFile($org_or_taxon, $oligo_length, $background_model,
					      type=>"oligo",
					      noov=>$noov_suffix, str=>$str, taxon=>$taxon);
    } elsif ($background_model eq "monads") {
      ## Nothing to locate: expected dyad frequencies are derived from monad frequencies
      &RSAT::message::Info("Background model", "monads") if ($main::verbose >= 3);
    } elsif ($background_model eq "exp_freq_file") {
      ## The file was specified on the command line (option -exp)
      &RSAT::message::Info("User-specified expected frequency file", $infile{exp_freq}) if ($main::verbose >= 3);
    } else {
      ## Check organism
      ## Any other background model: locate the pre-computed dyad
      ## frequency file for the organism or taxon.

      $infile{exp_freq} = &ExpectedFreqFile($org_or_taxon, $oligo_length, $background_model,
					    type=>"dyad",
					    noov=>$noov_suffix, str=>$str, taxon=>$taxon);
#      $infile{exp_freq} = &ExpectedFreqFile($org_or_taxon, $oligo_length, $background_model,
#					    type=>"dyad",
#					    noov=>$noov_suffix, str=>$str);
    }

    ## Check fields to calculate
    &CheckCalcFields();

    ## Check spacing values and order
    ## With grouped spacings, the list contains a single element
    ## "min,max", which is used both as hash key and inside the dyad
    ## pattern (n{min,max}).
    if ($group_sp) {
	@spacing_list = ("$min_spacing,$max_spacing");
    } else {
	@spacing_list = ($min_spacing..$max_spacing);
    }
    unless ($oligo_length > 0) {
	&RSAT::error::FatalError("Invalid oligonucleotide length specification.");
    }

    ## check sequence format
    if ($input_format eq "") {
	&RSAT::error::FatalError ("You did not specify the input sequence format.",
				  "Type dyad-analysis -help for info.");
    } elsif (not $accepted_input_seq{$input_format}) {
      &RSAT::error::FatalError ($input_format,"is not a valid input sequence format",
				"Type dyad-analysis -help for info.");
    }

    ################################################################
    ## Start the analysis
    ## The order of the following calls matters: each step uses the
    ## fields of %patterns and %oligo filled by the previous ones.
    &CalcPossibleDyads(); ### Calculate all possible dyads
    &CountDyads(); ### Read sequence and count dyads 
    &CountZeroOcc() if ($zeroocc); ## Count patterns with zero occurrences
    &CalcOccSum(); ## Calculate the sum of occurrences
    if ($sum_rc) {
	&SumReverseComplements() ; ## Group dyads by pairs of reverse complements
	&CountZeroOcc2str(); ## Count patterns with zero occurrences on both strands
    }
    &CalcMonadFrequencies() if ($calc_fields{freq}); ### Calculate observed monad frequencies
    &CalcDyadFrequencies() if ($calc_fields{freq}); ### Calculate observed dyad frequencies

    ## Open output stream
    $out = &OpenOutputFile($outputfile); 

    &CalcExpectedMonadFreq(); ## Calculate expected monad frequencies
    &CalcExpectedDyadFreq() if ($calc_fields{exp_freq}); ### Calculate expected dyad frequencies
    &CalcExpectedOcc() if ($calc_fields{exp_occ}); ### Calculate expected dyad occurrences
    
    ################################################################
    ## Check some thresholds to filter out undue dyads before starting
    ## the time-consuming computation of probabilities
    &CheckThresholds("occ");
    &CheckThresholds("obs_freq");
    &CheckThresholds("exp_freq");
    &CheckThresholds("exp_occ");
    &CalcRatio() if ($calc_fields{ratio}); ### Calculate obs/exp ratio
    &CalcProba() if ($calc_fields{proba}); ### Calculate probabilities

    ################################################################
    ## Choose between a single row per pair of reverse complements, or
    ## two separate rows (with identical statistics)
    if ($sum_rc) {
	if ($group_rc) {
	    &GroupRC();
	} else {
	    &UngroupRC();
	}
    }

    ## Print the parameters of the analysis
    &PrintVerbose() if ($verbose); 

    ## Print the result
    &PrintResult();

    ## Report execution time
    ## (both time stamps come from the date command and end with a newline)
    if ($verbose) {
	$done_time = `date '+%d/%m/%y %H:%M:%S %Z'`;
	print $out ";Job started $start_time";
	print $out ";Job done    $done_time";
    }


    ###### close input stream
    close $in if ($infile{input});
    
    ###### close output stream
    close $out if ($outputfile);

    ## Disarm the timer: the analysis completed in time
    alarm(0);
}; ###eval 



################################################################
#### report out of time errors to the rsa-tools administrator
if ($@) {
  ## Work on a private copy of the error: $@ is a global variable, which
  ## any later eval (including one hidden in a library call) can overwrite.
  my $eval_error = $@;

  ## Cancel a timer that may still be pending. When the eval was left
  ## because of an error other than the timeout, the alarm set at its
  ## beginning is still armed, and the ALRM handler would die outside of
  ## any eval while the error is being reported.
  alarm(0);

  if ($eval_error =~ /timeout/) {
    ## Mail the parameters of the interrupted job to the administrator.
    ## The report is only written if the pipe could be opened.
    if (open ERROR_REPORT, "| $mail_command $ENV{SERVER_ADMIN}") {
      $out = ERROR_REPORT; ## &PrintVerbose() writes to the global $out
      print $out "!!!!!!!!!!!!!!!! Time out ($timeout seconds) during execution of dyad-analysis !!!!!!!!!!!!!!!!\n";
      &PrintArguments($out);
      &PrintVerbose();
      close ERROR_REPORT;
    } else {
      warn "dyad-analysis: cannot send the timeout report to $ENV{SERVER_ADMIN}: $!\n";
    }
    &RSAT::error::FatalError( "Timeout (after $timeout seconds of processing)",
			      "The error has automatically been reported to $ENV{SERVER_ADMIN}",
			      "Please contact this person for more details");
  } else {
    ## Any other exception raised during the analysis
    &RSAT::error::FatalError($eval_error);
  }

  ## NOTE(review): only reached if &RSAT::error::FatalError() returns;
  ## the message looks like a debugging leftover.
  die "HERE\n";
}

exit(0);

################################################################
##################### SUBROUTINE DEFINITION ####################
################################################################

################################################################
## Activate the calculation of a field when the user imposed a
## threshold on a field that depends on it.
##
## Usage: &CheckThresholdDependencies($field, $dependency);
##   $field       field name looked up in the lower (%lth) and upper
##                (%uth) threshold tables
##   $dependency  key set to 1 in %calc_fields if either threshold is
##                defined for $field
sub CheckThresholdDependencies {
    my ($field, $dependency) = @_;
    $calc_fields{$dependency} = 1
	if (defined($lth{$field}) || defined($uth{$field}));
}

################################################################
## Derive the fields to calculate (%calc_fields) from the fields to
## return (%return_fields) and from the thresholds (%lth, %uth).
##
## - If no return field was requested, %default_return_fields is used.
## - A threshold on a field activates the calculation of the fields it
##   depends on.
## - proba, ratio and zscore require the expected occurrences and
##   frequencies, which are then also added to the returned fields.
## - Expected frequencies require observed frequencies, and both
##   frequencies and expected occurrences require occurrences.
sub CheckCalcFields {
    ## Test the hash in boolean context (true if it is not empty):
    ## defined(%hash) is a fatal error since Perl 5.22.
    %return_fields = %default_return_fields unless (%return_fields);
    %calc_fields = %return_fields;

    ## Check dependencies between fields and thresholds.
    ## Each row is (field with a threshold, field to calculate).
    my @threshold_dependencies = (
	["occ",           "occ"],
	["occ_P",         "occ"],
	["occ_E",         "occ"],
	["occ_sig",       "occ"],
	["occ_P",         "proba"],
	["occ_E",         "proba"],
	["occ_sig",       "proba"],
	["observed_freq", "freq"],
	["exp_freq",      "exp_freq"],
	["zscore",        "zscore"],
	["mseq",          "mseq"],
	["ms_P",          "proba"],
	["ms_E",          "proba"],
	["ms_sig",        "proba"],
	["ms_P",          "mseq"],
	["ms_E",          "mseq"],
	["ms_sig",        "mseq"],
	## NOTE(review): a threshold on ratio activates mseq rather than
	## ratio; this looks like a copy-paste slip -- confirm the intent.
	["ratio",         "mseq"],
	);
    foreach my $dependency (@threshold_dependencies) {
	&CheckThresholdDependencies(@{$dependency});
    }

    ## Check dependencies between fields
    if (($calc_fields{proba}) ||
	($calc_fields{ratio}) ||
	($calc_fields{zscore})) {
	$calc_fields{exp_occ} = 1;
	$calc_fields{exp_freq} = 1;

	## Automatically return exp freq and exp occ in the output
	$return_fields{exp_occ} = 1;
	$return_fields{exp_freq} = 1;
    }

    ## Expected frequencies are calculated after the observed frequencies
    if ($calc_fields{exp_freq}) {
	$calc_fields{freq} = 1;
    }

    ## Occurrences are needed for expected occurrences and for frequencies
    if ($calc_fields{exp_occ}) {
	$calc_fields{occ} = 1;
    }
    if ($calc_fields{freq}) {
	$calc_fields{occ} = 1;
    }
}

################################################################
## Convert monad (oligo) occurrences into relative frequencies.
##
## For each entry of %oligo, the field freq is set to occ divided by
## $sum_oligo_count. When occurrences are summed on both strands
## ($sum_rc), the field freq_2str is computed in the same way from
## occ_2str.
sub CalcMonadFrequencies {
    ## Single-strand frequencies
    foreach my $monad (keys %oligo) {
	$oligo{$monad}->{freq} = $oligo{$monad}->{occ}/$sum_oligo_count;
    }

    ## Nothing more to do for a single-strand analysis
    return unless ($sum_rc);

    ## Two-strands frequencies
    foreach my $monad (keys %oligo) {
	$oligo{$monad}->{freq_2str} = $oligo{$monad}->{occ_2str}/$sum_oligo_count;
    }
}

################################################################
## Calculate expected monad frequencies.
##
## Fills the field exp_freq of each entry of %oligo.
##
## - With the option -mncf ($mncf), the frequencies are read from the
##   monad frequency file located by the main program
##   ($infile{monad_exp_freq}). For backward compatibility,
##   $infile{exp_freq} is used if no monad file has been defined.
##   Lines starting with ';' are comments; the first two
##   whitespace-separated columns are the oligo sequence and its
##   frequency. Only oligos already present in %oligo are updated.
## - Otherwise, the frequencies observed in the input sequences (field
##   freq) are used as expected frequencies.
sub CalcExpectedMonadFreq {
    &RSAT::message::TimeWarn("Calculating expected monad frequencies") if ($main::verbose >= 2);

    if ($mncf) {
	################################################################
	## Read expected monad frequencies from a file.
	## The main program stores the monad file name in
	## $infile{monad_exp_freq}, not in $infile{exp_freq} (which is
	## reserved for dyad frequency files).
	my $monad_freq_file = $infile{monad_exp_freq} || $infile{exp_freq};
	my ($exp) = &OpenInputFile($monad_freq_file);

	## Read into a lexical variable, in order to leave the caller's $_ untouched
	while (my $line = <$exp>) {
	    next if ($line =~ /^;/);
	    chomp($line);
	    my ($oligo_seq, $freq) = split(' ', $line);
	    $oligo_seq = lc($oligo_seq);
	    if ((&IsReal($freq)) && (defined($oligo{$oligo_seq}))) {
		$oligo{$oligo_seq}->{exp_freq} = $freq;
	    }
	}
	close $exp;

    } else {
	################################################################
	## Use monad frequencies observed in the input file to
	## estimate expected monad frequencies
	foreach my $oligo_seq (keys %oligo) {
	    $oligo{$oligo_seq}->{exp_freq} = $oligo{$oligo_seq}->{freq};
	}
    }
}

################################################################
## Calculate the expected frequency of each dyad (field exp_freq of
## %patterns).
##
## Two models are supported:
## - if an expected frequency file is defined ($infile{exp_freq}), the
##   frequencies are loaded with &ReadExpectedFrequencies();
## - otherwise, they are estimated from the expected monad frequencies
##   with &CalcExpFreqFromMonads(), for each dyad having a defined
##   number of occurrences.
##
## In both cases, the entries of %patterns without a defined number of
## occurrences are deleted at the end.
##
## Note: earlier versions also normalized the expected frequencies to a
## sum of 1 and summed them per pair of reverse complements; that code
## had been disabled and is not reproduced here.
sub CalcExpectedDyadFreq {
    &RSAT::message::TimeWarn("Calculating expected dyad frequencies") if ($main::verbose >= 2);

    if (!$infile{exp_freq}) {
	################################################################
	## Calculate expected dyad freq on the basis of monad frequencies
	foreach my $first_monad (keys %oligo) {
	    foreach my $gap (@spacing_list) {
		foreach my $second_monad (&SecondElement($first_monad)) {
		    ### build the dyad
		    my $dyad = $first_monad."n{".$gap."}".$second_monad;
		    if (defined($patterns{$dyad}->{occ})) {
			$patterns{$dyad}->{exp_freq} = &CalcExpFreqFromMonads($first_monad, $second_monad);
		    }
		}
	    }
	}

    } else {
	################################################################
	## Read expected dyad frequencies from a user-specified file
	&ReadExpectedFrequencies($infile{exp_freq}, $sum_rc);
    }

    ################################################################
    ## Delete the entry for dyads having an expected frequency but no
    ## occurrence (this happens if the expected frequency file
    ## contains spacings that were not used in the current analysis)
    foreach my $dyad (keys %patterns) {
	delete $patterns{$dyad} unless (defined($patterns{$dyad}->{occ}));
    }

}


################################################################
## Calculate the expected frequency of a dyad from the expected
## frequencies of its two monads.
##
## Usage: my $exp_freq = &CalcExpFreqFromMonads($oligo1, $oligo2);
##   $oligo1, $oligo2  the two monads of the dyad (keys of %oligo)
##
## Returns the product of the expected frequencies of both monads.
## - With $sum_rc, the product of the frequencies of the reverse
##   complementary monads is added, except when the reverse complement
##   of $oligo1 is $oligo2 (the dyad is then its own reverse
##   complement and must not be counted twice).
## - With $group_sp, the result is multiplied by the number of grouped
##   spacings ($max_spacing - $min_spacing + 1).
sub CalcExpFreqFromMonads {
    my ($oligo1, $oligo2) = @_;

    my $freq1 = $oligo{$oligo1}->{exp_freq};
    my $freq2 = $oligo{$oligo2}->{exp_freq};
    my $exp_freq =  $freq1*$freq2;

    if ($sum_rc) {
	## All the variables are declared in a parenthesized list:
	## "my $a, $b;" only declares $a, so that $b would be a package
	## global retaining its value from one call to the next.
	my ($oligo1_rc, $oligo2_rc) = (&SmartRC($oligo1), &SmartRC($oligo2));
	my ($freq1_rc, $freq2_rc) = (0, 0);
	if ($oligo1_rc ne $oligo2) {
	    $freq1_rc = $oligo{$oligo1_rc}->{exp_freq};
	    $freq2_rc = $oligo{$oligo2_rc}->{exp_freq};
	}
	$exp_freq += $freq1_rc*$freq2_rc;
    }

    if ($group_sp) {
	$exp_freq *= ($max_spacing-$min_spacing + 1);
    }

    return ($exp_freq);
}


################################################################
## Sum occurrences of reverse complementary dyads and monads.
##
## Dyads (%patterns): the fields occ and overlaps of each dyad are
## replaced by the sum over the dyad and its reverse complement. Dyads
## identical to their reverse complement keep their own counts and get
## the remark "inv_rep".
##
## Monads (%oligo): the field occ_2str receives the sum of the
## occurrences of the monad and of its reverse complement (the field
## occ is left unchanged). Palindromic monads are counted only once.
##
## &GroupRC() is called at the end.
sub SumReverseComplements {
    &RSAT::message::TimeWarn("Summing occurrences of reverse complement pairs") if ($main::verbose >= 2);

    ################################################################
    ## Sum dyad occurrences
    foreach my $pattern_seq (keys %patterns) {
	my $rc_pattern = &SmartRC($pattern_seq);
	if ($rc_pattern eq $pattern_seq) {
	    #### don't count twice the reverse palindroms !!!!!!!
	    $patterns{$pattern_seq}->{'remark'} = "inv_rep";
	    $patterns{$pattern_seq}->{occ_2str} = $patterns{$pattern_seq}->{occ};
	    $patterns{$pattern_seq}->{overlaps_2str} = $patterns{$pattern_seq}->{overlaps};

	} elsif (($rc_pattern gt $pattern_seq) || (!exists($patterns{$rc_pattern}))) {
	    ## Each pair is treated once, from its alphabetically smaller
	    ## member. If the smaller member has no entry in %patterns it
	    ## will never be visited by this loop, so the pair is treated
	    ## from the larger member; otherwise its counts would be lost.
	    $patterns{$rc_pattern}->{occ_2str} = $patterns{$pattern_seq}->{occ_2str} =
		$patterns{$pattern_seq}->{occ} + $patterns{$rc_pattern}->{occ};
	    $patterns{$rc_pattern}->{overlaps_2str} = $patterns{$pattern_seq}->{overlaps_2str} =
		$patterns{$pattern_seq}->{overlaps} + $patterns{$rc_pattern}->{overlaps};
	}
    }

    ## Replace the single-strand counts by the two-strand counts. This is
    ## done in a second pass, because the sums above must be calculated
    ## from the original single-strand values.
    foreach my $pattern_seq (keys %patterns) {
	$patterns{$pattern_seq}->{occ} = $patterns{$pattern_seq}->{occ_2str};
	$patterns{$pattern_seq}->{overlaps} = $patterns{$pattern_seq}->{overlaps_2str};
    }

    ################################################################
    ## Sum monad (oligo) occurrences
    foreach my $oligo_seq (sort keys %oligo) {
	my $rc_oligo = &SmartRC($oligo_seq);
	if ($rc_oligo eq $oligo_seq) {
	    ## Palindromic monad: don't count it twice. The test must
	    ## compare the monad with its own reverse complement (the
	    ## dyad loop variable is not in scope here).
	    $oligo{$oligo_seq}->{occ_2str} = $oligo{$oligo_seq}->{occ};
	} else {
	    $oligo{$oligo_seq}->{occ_2str} = $oligo{$oligo_seq}->{occ} + $oligo{$rc_oligo}->{occ};
	}
    }

    &GroupRC();
}


################################################################
### Sum dyad occurrences and overlaps for each spacing value.
###
### Fills %dyad_occ_sum and %dyad_ovl_sum, indexed by the elements of
### @spacing_list. When a file of accepted patterns was specified, only
### the accepted dyads are taken into account. Dyads having neither a
### defined number of occurrences nor a defined number of overlaps are
### skipped.
sub CalcOccSum {
  &RSAT::message::TimeWarn("Calculating sums of occurrences per spacing") if ($main::verbose >= 2);
  foreach $spacing (@spacing_list) {
    foreach $oligo1 (keys %oligo) {
      foreach $oligo2 (&SecondElement($oligo1)) {
	### build the dyad, with either the grouped or the single spacing
	my $gap = $group_sp ? $min_spacing.",".$max_spacing : $spacing;
	my $pattern_seq = $oligo1."n{".$gap."}".$oligo2;

	### restrict the sum to accepted patterns, if any
	if ($infile{accepted_patterns}) {
	  next unless ($accepted_patterns{$pattern_seq});
	}

	### skip dyads without any count
	next if ((!defined($patterns{$pattern_seq}->{occ})) &&
		 (!defined($patterns{$pattern_seq}->{overlaps})));

	$dyad_occ_sum{$spacing} += $patterns{$pattern_seq}->{occ};
	$dyad_ovl_sum{$spacing} += $patterns{$pattern_seq}->{overlaps};
      }
    }
  }
  &RSAT::message::TimeWarn(scalar(keys(%patterns)), "distinct dyads") if ($main::verbose >= 2);
}


################################################################
## Calculate observed dyad frequencies.
##
## For each dyad having a defined number of occurrences, the field
## obs_freq is set to its occurrences divided by the sum of dyad
## occurrences for the same spacing (%dyad_occ_sum), or to 0 if this
## sum is null.
sub CalcDyadFrequencies {
    &RSAT::message::TimeWarn("Calculating dyad frequencies") if ($main::verbose >= 2);

    ### frequencies are calculated for each spacing value independently
    foreach $spacing (@spacing_list) {
	foreach $oligo1 (keys %oligo) {
	    foreach $oligo2 (&SecondElement($oligo1)) {
		### build the dyad, with either the grouped or the single spacing
		$pattern_seq = $group_sp
		    ? "${oligo1}n\{$min_spacing,$max_spacing\}${oligo2}"
		    : "${oligo1}n\{$spacing\}${oligo2}";

		next unless (defined($patterns{$pattern_seq}->{occ}));

		### avoid dividing by a null sum
		$patterns{$pattern_seq}->{obs_freq} = ($dyad_occ_sum{$spacing} == 0)
		    ? 0
		    : $patterns{$pattern_seq}->{occ}/$dyad_occ_sum{$spacing};
	    }
	}
    }

}  ### /CalcDyadFrequencies


################################################################
## Calculate expected occurrences.
##
## For each dyad having a defined number of occurrences, the field
## exp_occ is set to its expected frequency (exp_freq) multiplied by
## the sum of dyad occurrences for the same spacing (%dyad_occ_sum).
##
## Note: a variant subtracting the positions excluded by the -noov
## option had been written but was disabled; it is not reproduced
## here.
sub CalcExpectedOcc {
    &RSAT::message::TimeWarn("Calculating expected occurrences") if ($main::verbose >= 2);

    foreach my $gap (@spacing_list) {
	foreach my $first_monad (sort keys %oligo) {
	    foreach my $second_monad (&SecondElement($first_monad)) {
		### build the dyad
		my $dyad = $first_monad."n{".$gap."}".$second_monad;
		if (defined($patterns{$dyad}->{occ})) {
		    ### calculate expected occurrences
		    my $dyad_stats = $patterns{$dyad};
		    $dyad_stats->{exp_occ} = $dyad_stats->{exp_freq}*$dyad_occ_sum{$gap};
		}
	    }
	}
    }
}

## ##############################################################
## Calculate dyad probabilities
##
## For each dyad having a defined number of occurrences, this routine
## - annotates repeats in the field remark (inv_rep, dir_rep);
## - checks that the expected frequency is strictly positive;
## - calculates the self-overlap coefficient (&OverlapCoeff()), when
##   proba or zscore has to be calculated;
## - calculates the z-score, if requested ($calc_fields{zscore});
## - calculates the occurrence P-value with &binomial_boe(), if
##   requested ($calc_fields{proba}), and counts the tested patterns.
## The multi-testing corrections (&MultiTestCorrections()) and the
## thresholds on zscore, occ_P, occ_E and occ_sig are then applied.
sub CalcProba {
    &RSAT::message::TimeWarn("Calculating probabilities") if ($main::verbose >= 2);

#    foreach my $pattern_seq (keys %patterns) {

    
    ## Note: we need to build the pattern for each spacing separately,
    ## rather than taking the keys of %patterns because the number of
    ## valid dyads depends on the spacing.
    foreach my $spacing (@spacing_list) {
	foreach my $oligo1 (sort keys %oligo) {
	    foreach my $oligo2 (&SecondElement($oligo1)) {
		### build the dyad
		my $pattern_seq= "${oligo1}n\{$spacing\}${oligo2}";
		next unless (defined($patterns{$pattern_seq}->{occ})); 
		
		### Detect repeats
		### (the remark of an inverted repeat is overwritten,
		### whereas "dir_rep" is appended to the existing remark)
		if (($sum_rc) && (&Palindromic($pattern_seq))) {
		    $patterns{$pattern_seq}->{remark} = "inv_rep";
		}
		if ($oligo1 eq $oligo2) {
		    $patterns{$pattern_seq}->{remark} .= "dir_rep";
		}
		
		## Check expected frequency
		## Beware: the second test below is an independent "if",
		## not an "elsif". A dyad with an undefined expected
		## frequency thus triggers the warning and then also
		## enters the second block (undef is treated as 0 in the
		## numerical comparison).
		if (!defined($patterns{$pattern_seq}->{exp_freq})) {
		    
		    &RSAT::message::Warning (join("\t", "Undefined expected frequency for dyad", 
						  $pattern_seq));
		} if ($patterns{$pattern_seq}->{exp_freq} <= 0) {
		    
		    #### Expected frequency might not be defined if
		    #### max_spacing is > than the max spacing of the
		    #### expected frequency file
		    #### NOTE(review): 20 is presumably the largest
		    #### spacing of the pre-computed frequency files --
		    #### confirm.
		    if ($max_spacing > 20) {
			#### calculate expected frequency on the basis of monad frequencies
			#### NOTE(review): exp_occ is not updated after
			#### this change of exp_freq -- confirm that
			#### this is intended.
			$patterns{$pattern_seq}->{exp_freq} = &CalcExpFreqFromMonads($oligo1, $oligo2);
			$patterns{$pattern_seq}->{remark} = "monad exp freq ".$patterns{$pattern_seq}->{remark};
			
		    } else {
			#### The dyad is skipped (neither z-score nor P-value)
			#### but remains in %patterns
			&RSAT::message::Warning (join("\t", "Expected frequency must be > 0 for dyad", 
						      $pattern_seq,
						      $patterns{$pattern_seq}->{exp_freq}));
			next;
#			delete $patterns{$pattern_seq};
		    }
		}

		### self-overlap coefficient
		### (calculated on the concatenation of the two monads,
		### without the spacing)
		if (($calc_fields{proba}) || ($calc_fields{zscore})) {
		    $oligo_pair = $oligo1.$oligo2;
		    $overlap_coeff{$pattern_seq} = &OverlapCoeff($oligo_pair);
		}
		
		### calculate Z-scores ###
		### The z-score is "NA" when the estimated variance is
		### not strictly positive.
		if ($calc_fields{zscore}) {
		    $patterns{$pattern_seq}->{var_est} = $valid_dyad_positions{$spacing}*$patterns{$pattern_seq}->{exp_freq}*(2*$overlap_coeff{$pattern_seq} - 1 - (4*$oligo_length+1)*$patterns{$pattern_seq}->{exp_freq});
		    if ($patterns{$pattern_seq}->{var_est} <= 0) {
			$patterns{$pattern_seq}->{zscore} = "NA";
		    } else {
			$patterns{$pattern_seq}->{zscore} = sprintf "%7.2f", ($patterns{$pattern_seq}->{occ} - $patterns{$pattern_seq}->{exp_occ})/sqrt($patterns{$pattern_seq}->{var_est});
		    }
		}
		
		
		### calculate occurrence probability ####
		### Arguments of &binomial_boe(): expected frequency,
		### number of valid positions for this spacing, observed
		### occurrences (p, T and x in the debug message below).
		if ($calc_fields{proba}) {
		    $patterns{$pattern_seq}->{occ_P} =  &binomial_boe($patterns{$pattern_seq}->{exp_freq},$valid_dyad_positions{$spacing},$patterns{$pattern_seq}->{occ});
		    $main::nb_tested_patterns++;
#		    &RSAT::message::Debug("binomial test",
#					  "p= ".$patterns{$pattern_seq}->{exp_freq},
#					  "T= ".$valid_dyad_positions{$spacing},
#					  "x= ".$patterns{$pattern_seq}->{occ},
#					 ) if ($main::verbose >= 10);
		}
	    }
	}
    }  

    ## Corrections for multi-testing
    ## (the hash %patterns is passed as a flattened list after the number
    ## of tested patterns)
    &MultiTestCorrections($nb_tested_patterns, %patterns);
#     ### Calculate occurrence significance
#     &RSAT::message::Info(join ("\t", "Correcting for multiple testing", $main::nb_tested_patterns)) if ($main::verbose >= 2);
#     foreach my $pattern_seq (keys %patterns) {
# 	if (&IsReal ($patterns{$pattern_seq}->{occ_P})) {
# 	    $patterns{$pattern_seq}->{occ_E} =  $patterns{$pattern_seq}->{occ_P}*$main::nb_tested_patterns;
# 	    if ($patterns{$pattern_seq}->{occ_E} > 0) {
# 		$patterns{$pattern_seq}->{occ_sig} =  -log($patterns{$pattern_seq}->{occ_E})/log(10);
# 	    } else {
# 		$patterns{$pattern_seq}->{occ_sig} =  999;
# 	    }
# 	} else {
# 	    $patterns{$pattern_seq}->{occ_E} = "NA";
# 	    $patterns{$pattern_seq}->{occ_sig} = "NA";
# 	}
# 	&RSAT::message::Debug("Calculating E-value",  
# 			      $pattern_seq,
# 			      $patterns{$pattern_seq}->{occ},
# 			      $patterns{$pattern_seq}->{occ_P},
# 			      $main::nb_tested_patterns,
# 			      $patterns{$pattern_seq}->{occ_E},
# 			      $patterns{$pattern_seq}->{occ_sig},
# 			     ) if ($main::verbose >= 0);
#     }

    #### threshold filtering ####
    &CheckThresholds("zscore");
    &CheckThresholds("occ_P");		      
    &CheckThresholds("occ_E");
    &CheckThresholds("occ_sig");
		      

} ### CalcProba

################################################################
## Read arguments 
sub ReadArguments {
    foreach $a (0..$#ARGV) {
	### verbose ###
	if ($ARGV[$a] eq "-v") {
	    if (&IsNatural($ARGV[$a+1])) {
		$verbose = $ARGV[$a+1];
	    } else {
		$verbose = 1;
	    }

	    ### detailed help
	} elsif ($ARGV[$a] eq "-h") {
	    &PrintHelp();

	    ### list of options
	} elsif ($ARGV[$a] eq "-help") {
	    &PrintOptions;

	    ### input file ###
	} elsif ($ARGV[$a] eq "-i") {
	    $infile{input} = $ARGV[$a+1];

	    ### input format ###
	} elsif ($ARGV[$a] eq "-format") {
	    $input_format = lc($ARGV[$a+1]);

	    ## mask
	} elsif ($ARGV[$a] eq "-mask") {
	    $mask = $ARGV[$a+1];
	    &CheckMask($mask);

	    ### output file ###
	} elsif ($ARGV[$a] eq "-o") {
	    $outputfile = $ARGV[$a+1];

	    ### oligo length ###
	} elsif (($ARGV[$a] eq "-l") && (&IsNatural($ARGV[$a+1]))) {
	    $oligo_length = $ARGV[$a+1];

	    ### spacing ###
	} elsif ($ARGV[$a] =~ /^-sp/i) {
	    if (&IsNatural($ARGV[$a+1])) {
		$min_spacing = $max_spacing = $ARGV[$a+1];
	    } elsif ($ARGV[$a+1] =~ /^(\d+)\-(\d+)$/) {
		$min_spacing = &min($1,$2);
		$max_spacing = &max($1,$2);
	    } else {
		print "Error: invalid spacing specification\n";
		print "Type dyad-analysis -help for more info\n";
		exit;
	    }

	    ### dyad type ###
	} elsif ($ARGV[$a] =~ /^-type/i) {
	    $dyad_type = lc($ARGV[$a+1]);
	    die "Error : '$dyad_type' is not a valid dyad type" unless $accepted_dyad_type{$dyad_type};

	    ### analyze only selected dyads
	} elsif ($ARGV[$a] =~ /^-accept/i) {
	    $infile{accepted_patterns} = $ARGV[$a+1];

	    ### sort results ###
	} elsif ($ARGV[$a] =~ /^-sort/i) {
	    $sort_results = 1;

	    ################################################################
	    #### Left-tail or two-tail significance test  
	} elsif ($ARGV[$a] eq "-under") {
	    $tail = "left";

	} elsif ($ARGV[$a] =~ /^\-two_tail/) {
	    $tail = "two";

	    ################################################################
	    ## Also count patterns not observed
	} elsif ($ARGV[$a] eq "-zeroocc") {
	    $zeroocc = 1;

	    ### expected frequency table ###
	} elsif ($ARGV[$a] =~ /^-exp/i) {
	    $infile{exp_freq} = $ARGV[$a+1];
	    $background_model = "exp_freq_file";

	    ### timeout setting
	} elsif ($ARGV[$a] =~ /^-timeout/i) {
	    $timeout = $ARGV[$a+1];
	    die "Error: timeout value should be integer\n"
		unless &IsNatural($timeout);

	    ### use oligo non-coding frequencies as expected frequencies
	} elsif ($ARGV[$a] =~ /^-ncf/i) {
	    &RSAT::message::Warning ("option -ncf is deprecated, use '-bg intergenic' instead");
	    $background_model = "intergenic";
#	    $rescale_freq = 1;

	    ### specify a background model for estimating expected frequencies
	} elsif ($ARGV[$a] =~ /^-bg/i) {
	    $background_model = $ARGV[$a+1];
	    $background_model =~ s/ncf/intergenic/;
	    $background_model =~ s/input/monads/;
	    unless ($supported_bg{$background_model}) {
		&RSAT::error::FatalError("Invalid background model\t$background_model\tsupported: $supported_bg");
	    }

	    ### organism (for selecting an organism-specific background model)
	} elsif ($ARGV[$a] =~ /^-org/i) {
	    if ($taxon) {
		&RSAT::message::FatalError("Options -org and -taxon are mutually exclusive");
	    }
	    $org_or_taxon = $ARGV[$a+1];
	    &CheckOrganismName($org_or_taxon);

	    ### taxon (for selecting a taxon-specific background model)
	  } elsif ($ARGV[$a] =~ /^-taxon/i) {
	    $taxon = 1;
	    $org_or_taxon = $ARGV[$a+1];
	    &CheckTaxon($org_or_taxon);

	    ### use monad intergenic frequencies as expected frequencies
	} elsif ($ARGV[$a] =~ /^-mncf/i) {
	    $mncf = 1;

	    ### single strand count ###
	} elsif ($ARGV[$a] =~ /^-1str/i) {
	    $strands = "-1str";
	    $sum_rc = 0;
	    $group_rc = 0;

	    ### sum count on both strands ###
	} elsif ($ARGV[$a] =~ /^-2str/i) {
	    $strands = "-2str";
	    $sum_rc = 1;

	    ### group pairs of reverse complements
	} elsif ($ARGV[$a] =~ /^-grouprc/i) {
	    $group_rc = 1;

	    ### group pairs of reverse complements
	} elsif ($ARGV[$a] =~ /^-nogrouprc/i) {
	    $group_rc = 0;

	    ### proteic sequence ###
	} elsif ($ARGV[$a] =~ /^-prot/i) {
	    $sum_rc = 0;
	    $proteic = 1;

	    ### group spacings ###
	} elsif ($ARGV[$a] =~ /^-groupsp/i) {
	    $group_sp = 1;

	    ################################################################
	    ### Thresholds

	    ### Lower threshold
	} elsif ($ARGV[$a] eq "-lth") {
	    my $thr_field = $ARGV[$a+1];
	    my $thr_value =  $ARGV[$a+2];
	    unless ($supported_threshold{$thr_field}) {
		&RSAT::error::FatalError("Invalid threshold field $thr_field. Supported: $supported_thresholds");
	    }
	    $lth{$thr_field} = $thr_value;

	    ### Upper threshold
	} elsif ($ARGV[$a] eq "-uth") {
	    my $thr_field = $ARGV[$a+1];
	    my $thr_value =  $ARGV[$a+2];
	    unless ($supported_threshold{$thr_field}) {
		&RSAT::error::FatalError("Invalid threshold field $thr_field. Supported: $supported_thresholds");
	    }
	    $uth{$thr_field} = $thr_value;


	    ### threshold on significance ###
	} elsif ($ARGV[$a] =~ /^-thosig/i) {
	    &RSAT::message::Warning("Option -thosig is obsolete. Use option -lth occ_sig instead");
	    $lth{occ_sig} = $ARGV[$a+1];
	    &FatalError ("Threshold on occurrence significance must be a real number") unless (&IsReal($lth{occ_sig}));

	    ### output fields ###
	} elsif ($ARGV[$a] =~ /^-return/i) {
	    @return_fields = split ",", $ARGV[$a+1];
	    foreach $field (@return_fields) {
		if ($field eq "proba") {
		    $return_fields{proba} = 1;
		} elsif  ($field =~ /^zsc/i) {
		    $return_fields{zscore} = 1;
		} elsif  ($field eq "freq") {
		    $return_fields{freq} = 1;
		} elsif  ($field eq "exp_freq") {
		    $return_fields{exp_freq} = 1;
		} elsif  ($field =~ /^observed_freq/i) {
		    $return_fields{freq} = 1;
		} elsif  ($field eq "occ") {
		    $return_fields{occ} = 1;
		} elsif  ($field eq "exp_occ") {
		    $return_fields{exp_occ} = 1;
		} elsif  ($field eq "ratio") {
		    $return_fields{ratio} = 1;
		} elsif  ($field eq "rank") {
		    $return_fields{rank} = 1;
		} elsif  ($field eq "monad_freq") {
		    $return_fields{monad_freq} = 1;
		} else {
		    &RSAT::error::FatalError("Invalid return field $field.");
		}
	    }

	    ### prevent overlapping matches
	} elsif ($ARGV[$a] =~ /^-noov/) {
	    $noov = 1;

	    #### sequence type
	} elsif ($ARGV[$a] =~ /^-seqtype/i) {
	    $a++;
	    if ($ARGV[$a] =~ /^prot/i) {
		$sum_rc = 0;
		$group_rc = 0;
		$seq_type = "protein";
		
		#### DNA sequences
	    } elsif ($ARGV[$a] =~ /^dna/i) {
		$seq_type = "DNA";
		
		#### any other sequence type
	    } elsif ($ARGV[$a] =~ /^other/i) {
		$sum_rc = 0;
		$group_rc = 0;
		$seq_type = "other";


	    } else {
		die "\tError: sequence type '$ARGV[$a]' is not supported\n";
	    }
	}
    }
}



################################################################
## Calculate the number of distinct dyads that can be formed with the
## current parameters.
##
## Takes no argument and works on package globals.
##   Reads: $proteic, $seq_type, $oligo_length, $sum_rc, $dyad_type,
##          $min_spacing, $max_spacing
##   Sets:  $nb_possible_oligo, $nb_possible_dr, $nb_possible_ir,
##          $main::nb_possible_dyads
sub CalcPossibleDyads {
    &RSAT::message::TimeWarn("Calculating all possible dyads") if ($main::verbose >= 2);

    ## Number of distinct monads: alphabet size to the power of the
    ## monad length. Protein sequences can be requested either with
    ## -prot (which sets $proteic) or with -seqtype prot (which sets
    ## $seq_type to "protein"), so both flags must be checked.
    if (($proteic) || ($seq_type eq "protein")) {
	$nb_possible_oligo = 20**$oligo_length;
    } else {
	$nb_possible_oligo = 4**$oligo_length;
    }

    $nb_possible_dr = $nb_possible_oligo;
    $nb_possible_ir = $nb_possible_oligo;

    ## Beware: grouping with the reverse complement reduces the number
    ## of distinct patterns, but this is only true for direct repeats,
    ## since inverted repeats are by definition identical to their
    ## reverse complement.
    $nb_possible_dr /= 2 if ($sum_rc);

    if ($dyad_type eq "dr") {
	$main::nb_possible_dyads = $nb_possible_dr;
    } elsif ($dyad_type eq "ir") {
	$main::nb_possible_dyads = $nb_possible_ir;
    } elsif ($dyad_type eq "rep") {
	$main::nb_possible_dyads = $nb_possible_dr + $nb_possible_ir;
    } else {
	## Any dyad: all ordered pairs of monads
	$main::nb_possible_dyads = $nb_possible_oligo**2;
	### reduce according to reverse-complement grouping
	if ($sum_rc) {
	    $main::nb_possible_dyads -= ($nb_possible_oligo**2 - $nb_possible_oligo)/2;
	}
    }

    ### multiply by spacing range (a single spacing value leaves the
    ### number unchanged)
    if ($max_spacing > $min_spacing) {
	$main::nb_possible_dyads *= $max_spacing - $min_spacing + 1;
    }
}


################################################################
## Given a first monad, return the list of monads that can be
## combined with it to form a dyad. The list depends on the global
## variable $dyad_type:
##   dr   the monad itself
##   ir   the lowercase reverse complement of the monad
##   rep  both of the above
##   any other value: all the keys of the global hash %oligo
sub SecondElement {
    my ($first_monad) = @_;

    ## Direct repeat
    return $first_monad if ($dyad_type eq "dr");

    ## Inverted repeat
    return lc(&ReverseComplement($first_monad)) if ($dyad_type eq "ir");

    ## Direct or inverted repeat
    return ($first_monad, lc(&ReverseComplement($first_monad))) if ($dyad_type eq "rep");

    ## No restriction on the second monad
    return keys %oligo;
}

################################################################
## Read the input sequences one by one and count monad and dyad
## occurrences.
##
## Takes no argument; all inputs and results are package globals.
##   Reads:   $infile{input}, $infile{accepted_patterns}, $input_format,
##            $seq_type, $mask, $oligo_length, $min_spacing,
##            $max_spacing, @spacing_list, %allowed_monads, $dyad_type,
##            $group_sp, $noov, $sum_rc, %accepted_patterns
##   Updates: $sequence_number, @id_list, @seq_length, $sum_seq_length,
##            %nb_possible_pos, %oligo, $sum_oligo_count, %patterns,
##            $discarded_monad_positions, %discarded_dyad_positions,
##            %valid_dyad_positions, %min_overlap_dist
##
## Beware: the lexical scalar $last_pos (last position where a monad
## can start in the current sequence) and the global hash %last_pos
## (last position where each dyad was counted, used with -noov) are
## two distinct variables.
sub CountDyads {
  &RSAT::message::TimeWarn("Counting dyad occurrences") if ($main::verbose >= 2);
  ($in, $input_dir) = &OpenInputFile($infile{input});
  $sequence_number = 0;
  ## Loop until ReadNextSequence returns neither a sequence nor an identifier
  while ((($current_seq, $current_id, @comments) = &ReadNextSequence($in, $input_format, $input_dir, $seq_type, $mask)) &&
	 (($current_seq ne "") || ($current_id ne ""))) {

    ### Remove tabs and blank spaces
    $current_seq = &FoldSequence($current_seq,0);
    $current_seq = &CleanDNA($current_seq) if ($seq_type eq "DNA");

    ### Index sequence lengths
    $sequence_number++;  
    $id_list[$sequence_number] = $current_id;
    $seq_length[$sequence_number] = length($current_seq);
    ## Cumulated length before the current sequence (only used for
    ## the chunk-based progress report below)
    my $prev_sum_seq_len = $sum_seq_length;
    $sum_seq_length += $seq_length[$sequence_number];


    ################################################################
    ## Calculate the number of possible positions, as a function of
    ## dyad spacing (the number of forbidden positions at the end
    ## of the sequence depends on the spacing)
    foreach my $spacing (@spacing_list) {
      my $new_pos_nb =  max(0,$seq_length[$sequence_number] - 2*$oligo_length - $spacing + 1); 
      $nb_possible_pos{$spacing} += $new_pos_nb if ($new_pos_nb > 0); 
    }

    ## Calculate the last position at which a monad can start
    my $last_pos = $seq_length[$sequence_number] - $oligo_length;

    ## chunks for high verbosity
    my $chunk = 10000;

    ## Verbosity: report each sequence at level 3 or more, one
    ## sequence out of 100 at level 2
    &RSAT::message::TimeWarn(join ("\t", "\tsequence",
				   $sequence_number,
				   "seq_id=".$current_id,
				   "len=".$seq_length[$sequence_number],
				   "last_pos=".$last_pos,
				   "sum_len=".$sum_seq_length,
				  )) if (($main::verbose >= 3) || ($main::verbose >= 2 && $sequence_number%100==0));

    ################################################################
    ## Iteration to get the first monad of the dyad
    my $pos1 = 0;
    while ($pos1 <= $last_pos) {
      #	    &RSAT::message::Debug("p1= ".$pos1, "last= ".$last_pos) if ($main::verbose >= 10);

      ## High verbosity: each time the cumulated position is a
      ## multiple of $chunk, report the progress together with the
      ## output of the "ps" command for the current process
      if ($main::verbose >= 3) {
	if ((($prev_sum_seq_len+$pos1)%$chunk ==0 ) && ($pos1 > 0)) {
	  $chunk_nb++;
	  $ps = `ps v -c -p $$ | grep -v TIME`;
	  chomp $ps;
	  &RSAT::message::TimeWarn (
				    "seq=".$sequence_number,
				    "pos1=".$pos1,
				    "last_pos=".$last_pos,
				    $ps), "\n";
	}
      }

      ################################################################
      ## Get the first monad
      my $oligo1 = lc(substr($current_seq,$pos1,$oligo_length));

      ################################################################
      ## For DNA, filter out dyads containing unspecified residues
      if ($seq_type eq "DNA") {
	unless ($allowed_monads{$oligo1}) {
	  $discarded_monad_positions++;
		
	  ## Count dyad positions which are discarded
	  ## because the first monad contains unspecified
	  ## residues. Only the spacings for which the second monad
	  ## would still start within the sequence are counted.
	  foreach my $spacing ($min_spacing..$max_spacing) {
	    if ($pos1 + $oligo_length + $spacing <= $last_pos) {
	      $discarded_dyad_positions{$spacing}++; 
	      #			    &RSAT::message::Debug("Discarded because of monad 1", ++$discarded_monad1, $pos1, $oligo1, $spacing, $last_pos, $current_id, $seq_length[$sequence_number]) if ($main::verbose >= 10);
	    } else {
	      #			    &RSAT::message::Debug("Not discarded because end of sequence reached", ++$not_discarded_too_late, $pos1, $oligo1, $spacing, $last_pos, , $current_id, $seq_length[$sequence_number]) if ($main::verbose >= 10);
	    }
	  }
		
	  # 		    &RSAT::message::Warning(
	  # 			"Invalid first monad",
	  # 			$pos1,
	  # 			$oligo1,
	  # 		    ) if ($main::verbose >= 5);
	  ## Skip to the next position (the increment at the end of
	  ## the loop body is bypassed by "next")
	  $pos1++;
	  next;
	}
      }

      ################################################################
      ## Count monad occurrences. 
      ## These will be used to estimate expected dyad frequencies. 
      $oligo{$oligo1}->{occ}++;
      $sum_oligo_count++;


      # 	    &RSAT::message::Debug(
      # 		"First monad",
      # 		"p1= ".$pos1,
      # 		"oligo1= ".$oligo1,
      # 		"occ= ".$oligo{oligo1}->{occ},
      # 		"sum_oligos= ".$sum_oligo_counts,
      # 	    ) if ($main::verbose >= 10);

      ################################################################
      ## Iterate spacings to obtain the second monad and the dyad
      foreach my $spacing ($min_spacing..$max_spacing) {
	my $pos2 = $pos1 + $oligo_length + $spacing;
	#      $min_overlap_dist = $oligo_length;
	## Two occurrences of a dyad overlap if their start positions
	## are closer than the total dyad length (two monads + spacing)
	$min_overlap_dist{$spacing} = 2*$oligo_length + $spacing;
		
	## The second monad must start within the sequence
	if ($pos2 <= $last_pos) {
	  my $oligo2 = lc(substr($current_seq,$pos2,$oligo_length));
		    	
	  if ($seq_type eq "DNA") {
	    unless ($allowed_monads{$oligo2}) {
		
	      ## Dyads which are discarded because the second monad contains unspecified residues
	      $discarded_dyad_positions{$spacing}++;
	      #			    &RSAT::message::Debug("Discarded because of monad 2", ++$discarded_monad2, $pos1, $oligo1, $spacing, $pos2, $oligo2, $last_pos, $quick_test, $current_id, $seq_length[$sequence_number]) if ($main::verbose >= 10);

	      ## Count this monad position as discarded only if
	      ## the same position cannot be discarded as first
	      ## monad (avoid discarding twice the same
	      ## position)
	      $discarded_monad_positions++ 
		if ($last_pos - $pos2 < $spacing); 
	      # 			    &RSAT::message::Debug(
	      # 				    "Invalid second monad",
	      # 				    "p1=".$pos1,
	      # 				    "sp=".$spacing,
	      # 				    "p2=".$pos2,
	      # 				    $oligo2,
	      # 				) if ($main::verbose >= 10);
	      #			    $spacing++;
	      next;
	    }
	  }

	  ################################################################
	  ## Check for specific dyad types

	  ## count only direct repeats
	  next if (($dyad_type eq "dr") && ($oligo1 ne $oligo2)); 

	  ## count only inverted repeats
	  next if (($dyad_type eq "ir") && ($oligo2 ne &SmartRC($oligo1))); 

	  ## count only repeats
	  next if (($dyad_type eq "rep") && ($oligo1 ne $oligo2) &&($oligo2 ne &SmartRC($oligo1))); 

	  ################################################################
	  ## Generate the dyad name (grouping all spacings or not)
	  if ($group_sp) {
	    $pattern_seq = "${oligo1}n\{$min_spacing,$max_spacing\}${oligo2}";
	  } else {
	    $pattern_seq = "${oligo1}n\{$spacing\}${oligo2}";
	  }

	  ################################################################
	  ## Check accepted patterns (only when a file of accepted
	  ## patterns has been specified)
	  next if (($infile{accepted_patterns}) && (!($accepted_patterns{$pattern_seq})));

	  ################################################################
	  ## Treatment of self-overlapping occurrences
	  if ($noov) {

	    ################################################################
	    ## Prevent self-overlap
	    my $dist = $pos1 - $last_pos{$pattern_seq};

	    if ((&IsNatural($last_pos{$pattern_seq})) &&
		($dist < $min_overlap_dist{$spacing})) {
	      ### new position overlaps with the previous occurrence of the same dyad
	      $patterns{$pattern_seq}->{overlaps}++;
	      #			    $spacing++;
	      next;
	    }

	    ################################################################
	    ## Index the last position where the current
	    ## dyad has been found (i.e. the current
	    ## position). When counting on both strands, the same
	    ## position is also indexed for the reverse complement.
	    $last_pos{$pattern_seq} = $pos1;
	    if ($sum_rc) {
	      $rc_pattern_seq = &SmartRC($pattern_seq);
	      $last_pos{$rc_pattern_seq} = $pos1;
	    }
	  }
	  $patterns{$pattern_seq}->{occ}++;

	}
      }
      $pos1++;
    }
    ## Overlaps are only checked within a sequence: forget the
    ## indexed positions before reading the next sequence
    undef %last_pos;
  }


  foreach my $spacing (@spacing_list) {
    ## Make sure the nb of possible positions per spacing is
    ## defined (it can be undefined if all sequences are smaller than
    ## the max dyad length)
    $nb_possible_pos{$spacing} = 0 
      unless (defined($nb_possible_pos{$spacing})) ;

    ## Calculate the number of valid positions per spacing
    #	$discarded_dyad_positions{$spacing} += $discarded_dyad_positions;
    $valid_dyad_positions{$spacing} = $nb_possible_pos{$spacing} - $discarded_dyad_positions{$spacing};
  }

  &RSAT::message::TimeWarn("Counted occurrences", scalar(keys(%patterns)), "distinct dyads") if ($main::verbose >= 2);
}


################################################################
## Detect patterns with 0 occurrences (single strand counts).
##
## Enumerates every dyad that can be built from two monads and one of
## the spacings of @spacing_list. Each dyad without any occurrence in
## %patterns gets an explicit occ value of 0 and is indexed in the
## global hash %zero_occ.
##
## Takes no argument and works on package globals.
##   Reads:   $oligo_length, @alphabet, @spacing_list
##   Updates: %patterns, %zero_occ
##
## NOTE(review): the dyad names built here always use a single
## spacing ("n{spacing}"), whereas CountDyads uses "n{min,max}" when
## $group_sp is set -- confirm whether this sub is expected to be
## called together with -groupsp.
sub CountZeroOcc {
    &RSAT::message::TimeWarn("Counting patterns with 0 occurrences on single strand") if ($main::verbose >= 2);

    ## The list of monads only depends on the monad length and on the
    ## alphabet: compute it once instead of once per (first monad,
    ## spacing) pair.
    my @monads = &all_oligos($oligo_length, @alphabet);

    foreach my $oligo1 (@monads) {
	foreach my $spacing (@spacing_list) {
	    foreach my $oligo2 (@monads) {
		### build the dyad
		my $pattern_seq= "${oligo1}n\{$spacing\}${oligo2}";

		## Check if no occurrences were found
		unless ($patterns{$pattern_seq}->{occ}) {
		    $patterns{$pattern_seq}->{occ} = 0;
		    $zero_occ{$pattern_seq}++;
		}
	    }
	}
    }

    &RSAT::message::TimeWarn(join ("\t", scalar(keys(%zero_occ)), "patterns with 0 occurrences (before grouping RC)")) if ($main::verbose >= 2);
}

################################################################
## Detect patterns with 0 occurrences on both strands.
##
## Every dyad already indexed in the global hash %patterns whose occ
## value is false (undefined or 0) gets an explicit occ value of 0
## and is indexed in the global hash %zero_occ_2str.
sub CountZeroOcc2str {
    if ($main::verbose >= 2) {
	&RSAT::message::TimeWarn("Counting patterns with 0 occurrences on both strands");
    }

    ## Select the dyads for which no occurrence was found
    my @absent_dyads = grep { !($patterns{$_}->{occ}) } keys %patterns;

    ## Set their count to 0 and index them
    foreach my $dyad (@absent_dyads) {
	$patterns{$dyad}->{occ} = 0;
	$zero_occ_2str{$dyad}++;
    }

    &RSAT::message::TimeWarn(join ("\t", scalar(keys(%zero_occ_2str)), "patterns with 0 occurrences (after grouping RC)")) if ($main::verbose >= 2);
}


################################################################
## Verbose: print the analysis parameters and counting statistics as
## comment lines (starting with ";") on the output stream $out.
##
## Takes no argument and works on package globals (input/output file
## names, sequence statistics, counting mode, monad and dyad
## parameters, thresholds, background model). The report contains, in
## this order: the command arguments, input and output description,
## counting mode, monad statistics, dyad statistics per spacing,
## threshold values, background model and the list of sequences (only
## if there are no more than $max_seq_verbose sequences).
sub PrintVerbose {
    print $out "; dyad-analysis ";
    &PrintArguments($out);
    print $out "; Citation: van Helden et al. (2000). Nucleic Acids Res. 28(8):1808-18.\n";
    printf $out "; %-29s\t%s\n", "Input file", $infile{input} if ($infile{input});
    if ($infile{accepted_patterns}) {
	printf $out "; %-29s\t%s\n", "Accepted dyad file", $infile{accepted_patterns};
	printf $out "; %-29s\t%s\n", "Accepted dyads", scalar(keys(%accepted_patterns));
    }
    printf $out "; %-29s\t%s\n", "Sequence type", $seq_type;
    printf $out "; %-29s\t%s\n", "Nb of sequences", $sequence_number;
    printf $out "; %-29s\t%s\n", "Masked characters", $mask if ($mask);
    printf $out "; %-29s\t%s\n", "Sum of sequence lengths", $sum_seq_length;
    printf $out "; %-29s\t%s\n", "Output file", $outputfile if ($outputfile);
    printf $out "; %-29s\t%s\n", "default return values", join(",", keys(%default_return_fields));
    printf $out "; %-29s\t%s\n", "return values", join(",", keys(%return_fields));

    ################################################################
    ## Counting mode

    ## Overlaps. The option -noov sets $noov to 1 (not to the string
    ## "-noov"), so the flag is tested for truth rather than compared
    ## to the option name.
    if ($noov) {
	print $out "; Discard overlapping matches\n";
    } else {
	print $out "; Count overlapping matches\n";
    }

    ## Strands
    if ($sum_rc) {
	print $out "; Counted on both strands\n";
	if ($group_rc) {
	    print $out "; \tgrouped by pairs of reverse complements\n";
	}
    } else {
	print $out "; Counted on a single strand\n";
    }

    ################################################################
    ## Monads
    printf $out "; %-29s\n", "Monad parameters";
    printf $out "; %-29s\t%d\n", "\tmonad size", $oligo_length;
    if ($verbose >= 3) {
	print $out ("; allowed monads\n;\t". join("\n;\t", sort keys %allowed_monads), "\n") ;
    }
    printf $out "; \t%-29s%d\n", "monad positions", $sum_oligo_count + $discarded_monad_positions;
    if ($seq_type eq "DNA") {
	printf $out "; \t%-29s\t%d\n", "    valid", $sum_oligo_count;
	printf $out "; \t%-29s\t%d%s\n", "    discarded", $discarded_monad_positions, " (contain other letters than ACGT)";
    }
    printf $out "; \t%-29s%d\n", "distinct monads", $nb_possible_oligo;

    ## Table of monad frequencies, with cumulated sums (only on request)
    if ($return_fields{monad_freq}) {
	print $out ";\tMonad frequencies\n";
	print $out join("\t", ";\t\t", "monad", "occ_1s", "occ_sum", "freq", "frq_sum");
	print $out "\t", join("\t", "rc_pair", "occ_2s", "freq_2s") if ($sum_rc);
	print $out "\n";
	my $monad_occ_sum = 0;
	my $monad_freq_sum = 0;
	foreach my $oligo (sort keys %oligo) {
	    $monad_occ_sum += $oligo{$oligo}->{occ};
	    $monad_freq_sum += $oligo{$oligo}->{freq};
	    print $out join("\t", ";\t", "mon_f",
			    $oligo,
			    $oligo{$oligo}->{occ},
			    $monad_occ_sum,
			    sprintf("%7.3f", $oligo{$oligo}->{freq}),
			    sprintf("%7.3f", $monad_freq_sum),
			   );

	    if ($sum_rc) {
		print $out ("\t",
			    join ("\t",
				  $oligo."|".&SmartRC($oligo),
				  $oligo{$oligo}->{occ_2str},
				  sprintf("%7.3f", $oligo{$oligo}->{freq_2str})));
	    }
	    print $out "\n";
	}
    }


    ################################################################
    #### dyads
    printf $out "; %-29s\n", "Dyad parameters";
    my %extended_type = (dr=>"direct repeats",
			 ir=>"inverted repeats",
			 rep=>"direct or inverted repeats",
			 any=>"any dyad");
    printf $out "; \t%-29s%s\n", "dyad type", $extended_type{$dyad_type};
    printf $out "; \t%-29s%d\n", "minimal spacing", $min_spacing;
    printf $out "; \t%-29s%d\n", "maximal spacing", $max_spacing;
    printf $out "; \t%-29s\t%d\n", "distinct dyads", $main::nb_possible_dyads;
    printf $out "; \t%-29s\t%d\n", "dyads tested for significance", $main::nb_tested_patterns;

    ## Count of dyads with zero occurrences
    if ($zeroocc) {
	if ($sum_rc) {
	    printf $out "; \t%-29s\t%d\n", "dyads with zero occurrences on both strands", scalar(keys(%zero_occ_2str));
	} else {
	    printf $out "; \t%-29s\t%d\n", "dyads with zero occurrences on direct strands", scalar(keys(%zero_occ));
	}
    }

    ## Dyad positions: one row per spacing
    printf $out ";\tDyad counts per spacing\n";

    print $out join("\t", ";\t", "spacing", "max_pos", "occ");
    if ($noov) {
	print $out "\t", "ovlps";
	print $out "\t", "ovl+occ";
    }
    if ($seq_type eq "DNA") {
	print $out "\t", join("\t",
			      "ACGT",
			      "nonACGT",
			     );
    }
    print $out "\n";
    foreach my $spacing (@spacing_list) {
	print $out join("\t", ";\t", "sp ".$spacing,
			$nb_possible_pos{$spacing},
			$dyad_occ_sum{$spacing});
	if ($noov) {
	    print $out "\t", $dyad_ovl_sum{$spacing};
	    print $out "\t", $dyad_ovl_sum{$spacing} + $dyad_occ_sum{$spacing};
	}
	if ($seq_type eq "DNA") {
	    print $out "\t", join ("\t",
				   $valid_dyad_positions{$spacing},
				   $discarded_dyad_positions{$spacing},
				  );
	}
	print $out "\n";
    }


    print $out &PrintThresholdValues();

    #### expected frequencies
    printf $out "; %s\n", "Estimation of expected dyad frequencies";
    print $out ";\tBackground model\t", $background_model, "\n";
    ## The organism (or taxon) is only reported for the background
    ## models other than "monads" and "exp_freq_file"
    unless (($background_model eq "monads") ||
	    ($background_model eq "exp_freq_file")) {
	printf $out ";\t%-29s\t%s\n", "    organism", ($supported_organism{$org_or_taxon}->{name} || $org_or_taxon);
    }
    if ($mncf) {
	print $out ";\tMonad frequencies in intergenic region\n";
	printf $out ";\t%-29s\t%s\n", "organism", $org_or_taxon;
	printf $out ";\t%-29s\t%s\n", "exp. monad freq. file", $infile{monad_exp_freq};
    } elsif ($infile{exp_freq}) {
	printf $out ";\t%-22s\t%s\n", "exp. freq. file", $infile{exp_freq};
    } else {
	print $out ";\tMonad calibration from input sequences\n";
    }

    ## List of sequences (identifier and length), skipped when there
    ## are too many sequences. Sequences are indexed from 1.
    unless ($#id_list > $max_seq_verbose) {
	print $out "; Sequences:\n";
	foreach my $s (1..$#id_list) {
	    print $out join("\t", ";", "seq ".$s, $id_list[$s], $seq_length[$s]), "\n";
	}
    }
}


################################################################
## Print the header line for the output file.
##
## Builds the list of output columns (global @out_col) according to
## the requested return fields (%return_fields), to the strand mode
## ($sum_rc) and to the overlap mode ($noov). If $verbose is set, a
## numbered description of each column is first printed as comment
## lines. The header line itself starts with "#" and contains the
## tab-separated column names.
sub PrintHeaderLine {
    $col_description{"seq"} = "dyad sequence";
    $col_description{"sequence"} = "dyad sequence";
    $col_description{"dyad_seque"} = "dyad sequence";
    $col_description{"dyad_ident"} = "dyad identifier";
    $col_description{"identifier"} = "dyad identifier";
    $col_description{"observed_freq"} = "observed frequency";
    $col_description{"expected_frq"} = "expected frequency";
    ## The column pushed below is named "expected_freq": the
    ## description must be indexed under that name as well.
    $col_description{"expected_freq"} = "expected frequency";
    $col_description{"occ"} = "observed occurrences";
    $col_description{"exp_occ"} = "expected occurrences";
    $col_description{"occ_P"} = "occurrence probability (binomial)";
    $col_description{"occ_E"} = "E-value for occurrences (binomial)";
    $col_description{"occ_sig"} = "occurrence significance (binomial)";
    $col_description{"zscore"} = "z-score (normal)";
    $col_description{"occ_var"} = "occurrence variance (estimate)";
#    $col_description{"occ_lkh"} = "occurrence likelihood";
    $col_description{"all_occ"} = "number of non-overlapping + overlapping occurrences";
    $col_description{"ovl_occ"} = "number of overlapping occurrences";
    $col_description{"ratio"} = "observed/expected ratio";
    $col_description{"rank"} = "rank";
    $col_description{"ov_coef"} = "overlap coefficient";
    $col_description{"remark"} = "remark";

    @out_col = ();

    ################################################################
    ## Dyad sequence and identifier. When counting on both strands,
    ## the identifier contains the dyad and its reverse complement
    ## separated by a "|" (see PrintResult), and the two column names
    ## are padded with spaces to the width of their content.
    my $dyad_len = 2*$oligo_length+5;
    if ($sum_rc) {
	my $dyad_id_len = 2*$dyad_len + 1;
	if ($dyad_len < 8) {
	    push @out_col, sprintf("%-".($dyad_len - 1)."s", "seq");
	} else {
	    push @out_col, sprintf("%-".($dyad_len - 1)."s", "sequence");
	}
	push @out_col, sprintf("%-${dyad_id_len}s", "identifier");
    } else {
	push @out_col, "dyad_seque";
	push @out_col, "dyad_ident";
    }

    ## Frequencies
    push @out_col, "observed_freq" if ($return_fields{freq});
    push @out_col, "expected_freq" if ($return_fields{exp_freq});

    ## Occurrences
    push @out_col, "occ" if ($return_fields{occ});
    push @out_col, "exp_occ"  if ($return_fields{exp_occ});

    ## Binomial probability
    if ($return_fields{proba}) {
	push @out_col, "occ_P";
	push @out_col, "occ_E";
	push @out_col, "occ_sig";
    }

    ## Rank
    push @out_col, "rank"  if ($return_fields{rank});

    ## Overlapping occurrences
    if ($noov) {
	push @out_col, "ovl_occ";
	push @out_col, "all_occ";
    }

    ## z-score
    if ($return_fields{zscore}) {
	push @out_col, "zscore";
	push @out_col, "occ_var";
    }

    ## Ratio
    push @out_col, "ratio"  if ($return_fields{ratio});

    ## Overlap coefficient and remark
    if (($return_fields{proba})|| ($return_fields{zscore})) {
	push @out_col, "ov_coef" ;
	push @out_col, "remark";
    }

    ## Column descriptions
    if ($verbose) {
	print $out "; column headers\n";
	foreach my $c (0..$#out_col) {
	    ## Some column names are padded with trailing spaces:
	    ## remove the padding before searching the description,
	    ## otherwise the key is not found.
	    (my $col_name = $out_col[$c]) =~ s/\s+$//;
	    printf $out ";\t%d\t%-15s\t%s\n", $c+1, $col_name, $col_description{$col_name};
	}
    }

    ################################################################
    ## Print header line
    print $out "#", join("\t",@out_col), "\n";

}



################################################################
## Print the result: the header line, then one row per dyad.
##
## Takes no argument and works on package globals.
##   Reads:   %patterns, %return_fields, %overlap_coeff,
##            $sort_results, $sum_rc, $noov, $out
##   Updates: @sorted_patterns, $patterns{...}->{rank}
##
## With $sort_results, dyads are sorted by decreasing significance
## (if proba is returned), else by decreasing z-score (if zscore is
## returned), else by decreasing number of occurrences. Without it,
## dyads are printed in alphabetical order, unless @sorted_patterns
## has already been filled.
sub PrintResult {
    &RSAT::message::TimeWarn("Printing the result") if ($main::verbose >= 2);

    #### print header line
    &PrintHeaderLine();

    #### sort dyads according to significance ####
    if ($sort_results) {
	if ($return_fields{proba}) { ### sort by decreasing order of significance
	    @sorted_patterns = sort { $patterns{$b}->{occ_sig} <=> $patterns{$a}->{occ_sig}} keys %patterns;
	} elsif ($return_fields{zscore}) { ### sort by decreasing order of zscore
	    @sorted_patterns = sort {$patterns{$b}->{zscore} <=> $patterns{$a}->{zscore}} keys %patterns;
	} else { ### sort by decreasing order of occurrences
	    @sorted_patterns = sort {$patterns{$b}->{occ} <=> $patterns{$a}->{occ}} keys %patterns;
	}
    }

    ## Default order: alphabetical. The array is tested for emptiness
    ## rather than with defined(), because defined(@array) is a fatal
    ## error since Perl 5.22.
    unless (@sorted_patterns) {
	@sorted_patterns = sort keys %patterns;
    }


    ################################################################
    ## Calculate rank and check thresholds
    &RSAT::message::TimeWarn("Calculating ranks") if ($main::verbose >= 2);
    my $rank = 0;
    foreach my $pattern_seq (@sorted_patterns) {
	$rank = $rank +1;
	$patterns{$pattern_seq}->{rank} = $rank;
    }
    &CheckThresholds('rank');

    ################################################################
    ## Print the result line for each dyad
    foreach my $p (0..$#sorted_patterns) {
	my $pattern_seq = $sorted_patterns[$p];

	## Skip the dyads without defined occurrences
	next unless (defined($patterns{$pattern_seq}->{occ}));

	## Dyad sequence and identifier. When counting on both
	## strands, the identifier also contains the reverse complement.
	print $out "$pattern_seq";
	print $out "\t$pattern_seq";
	if ($sum_rc){
	    $rc_pattern = &SmartRC($pattern_seq);
	    print $out "|$rc_pattern";
	}

	## Frequencies
	printf $out "\t%15.13f", $patterns{$pattern_seq}->{obs_freq} if ($return_fields{freq});
	printf $out "\t%15.13f", $patterns{$pattern_seq}->{exp_freq} if ($return_fields{exp_freq});

	## Occurrences
	printf $out "\t%7d", $patterns{$pattern_seq}->{occ} if ($return_fields{occ});
	printf $out "\t%7.2f", $patterns{$pattern_seq}->{exp_occ} if ($return_fields{exp_occ});

	## Binomial probability (small values are printed in
	## scientific notation)
	if ($return_fields{proba}) {
	    if ($patterns{$pattern_seq}->{occ_P} >= 0.0001) {
		printf $out "\t%7.5f", $patterns{$pattern_seq}->{occ_P};
	    } else {
		printf $out "\t%7.2g", $patterns{$pattern_seq}->{occ_P};
	    }
	    printf $out "\t%7.1e", $patterns{$pattern_seq}->{occ_E};
	    printf $out "\t%7.2f", $patterns{$pattern_seq}->{occ_sig};
	}

	## Rank
	printf $out "\t%7d", $patterns{$pattern_seq}->{rank} if ($return_fields{rank});

	## Overlapping occurrences
	if ($noov) {
	    printf $out "\t%d", $patterns{$pattern_seq}->{overlaps};
	    printf $out "\t%d", $patterns{$pattern_seq}->{overlaps} + $patterns{$pattern_seq}->{occ};
	}

	## z-score
	if ($return_fields{zscore}) {
	    print $out "\t", $patterns{$pattern_seq}->{zscore};
	    printf $out "\t%7.2f", $patterns{$pattern_seq}->{var_est};
	}

	## Ratio (not available when no occurrence is expected)
	if ($return_fields{ratio}) {
	    if ($patterns{$pattern_seq}->{exp_occ} > 0) {
		printf $out "\t%7.2f", $patterns{$pattern_seq}->{occ}/$patterns{$pattern_seq}->{exp_occ};
	    } else {
		print $out "\tNA";
	    }
	}

	## Miscellaneous
	if (($return_fields{proba}) || ($return_fields{zscore})) {
	    printf $out "\t%7.4f", $overlap_coeff{$pattern_seq};
	    print $out "\t", $patterns{$pattern_seq}->{remark};
	}
	print $out "\n";
    }
}

################################################################
## Print detailed help message
sub PrintHelp {
  open HELP, "| more"; 
  print HELP <<End_of_help; 

NAME dyad-analysis

	1998 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be) 

DESCRIPTION
	Detects overrepresented dyads (spaced pairs) in a set of DNA
	sequences. A dyad is defined here as a pair of
	oligonucleotides of the same size separated by a fixed number
	of bases.

	This algorithm is able to detect binding sites that escape detection by
	oligo-analysis, because of the sequence degeneracy within the spacer 
	region. A typical example of patterns that are efficiently detected by 
	the dyad analysis is the binding site for the yeast Gal4p 
	transcription factor, which has the consensus CGGNNNNNWNNNNNCCG.

	The dyad-analysis is generally efficient to detect binding sites 
	for HTH factors from E.coli, and for C6 Zinc cluster proteins in yeast.

CATEGORY
	statistics
	sequences
	pattern-discovery

OPTIONS
	-h      (must be first argument) display full help message

	-help   (must be first argument) display options

	-o output_file

	-i input_file
		The sequence to be analyzed. Multiple sequences can 
		be entered at once with most sequence formats (see below).

	-mask upper|lower
		Mask lower or uppercases, respecively, i.e. replace
		selected case by N characters.

	-format format
		Input sequence format. Various standards are 
		supported.
		   raw: the raw sequence without any identifier or comment.
		   multi: several raw sequences concatenated. 
		   IG: IntelliGenetics format. 
		   FastA: the sequence format used by FastA, BLAST, Gibbs
			  sampler and a lot of other bioinformatic programs. 
		   Wconsensus: the format defined by Jerry Hertz for 
			       his programs (patser, consensus, wconsensus). 
	-l #	oligo_length
		Oligonucleotide size (default 3)
		This is the size of a single element (a half dyad). 
	-sp #-#	spacing (default 0)
		Spacing between the elements of the dyad. 
		The spacing is the number of bases between the end of 
		the first element and the start of the second one. 
		Spacing formats
		---------------
		A single integer value means that the spacing is fixed. 
		Variable spacing can be introdued by entering the min and 
		max values separated by a hyphen. 
		For example 8-12 means that all occurrences of the dyad 
		with a spacing between 8 and 12 qill be counted together 
		and their significance estimated globally. 
		Warning, this is different from scanning one by one the
		 spacing values 8 to 12. 
	-type dyad_type (dr|ir|any)
		In order to fasten execution, the program can be asked 
		to restrict its analysis to symmetric dyads. 
		Three types are accepted
		   dr	direct repeats: the second element is the same as the 
			first one 
		   ir	inverted repeats: the second element is the reverse
			complement of the first one. 
		   rep  repeats: direct and inverted repeats are evaluated
		   any	(default)
			When selecting the option any, the analysis is 
			performed on all non-symmetric dyads as well. 

	-accept accepted_dyad_file
		Specify a file containing a list of accepted
		dyads. The analysis is then restricted to these
		dyads. Since the number of tested dyads is reduced by
		this selection, the multitesting correction is
		generally lower, which increases the significance of
		the accepted dyads, compared to the default situation
		where all dyads are analyzed.

		File format: the first word of each row specifies a
		dyad. Subsequent words are ignored.

	-groupsp
		Group dyads made of the same words (monads) but with
		different spacings.

	-2str	count on oth strands
		The occurrences of each oligonucleotide are summed on both 
		strands. This allows to detect elements which act in an
		orientation-insensitive way (as is generally the case for 
		yeast upstream elements). 

	-1str	single strand count
		only the direct strand is considered for oligonucleotide and 
		dyad occurrence counting.

	-prot	input sequence is proteic. In this case, the analysis
		concerns pairs of oligopeptides instead of oligonucleotides

	-expfreq	file with an expected frequency table
		By default, the frequency expected for each dyad is the 
		product of the frequency expected for each element 
		(oligonucleotide): 

		  exp(dyad) = exp(oligo1)*exp(oligo2)

		By default, the oligonucleotide frequencies observed in the 
		input sequences are used to estimate the expected oligo 
		frequencies.
		Alternatively, predefined frequency tables can be used. 
		These tables can for instance be calculated on basis of
		- the whole yeast genome 
		- all yeast intergenic regions 
		- all yeast gene regions 
		This allows to correct the bias due to the highly variable
		distribution of oligonucleotides observed in the yeast genome. 

	-ncf	(deprecated, use "-bg intergenic" instead)
		use intergenic frequencies as background frequencies

	-bg	background model
		Type of sequences used as background model for
		estimating expected dyad frequencies.

		Either -org or -taxon is required with the option -bg.

		Supported models: 
		-bg upstream 
			 all upstream sequences, allowing overlap with
		         upstream ORFs

		-bg upstream-noorf
			  all upstream sequences, preventing overlap
			  with upstream ORFs

		-bg intergenic
		         intergenic frequencies 
			 Whole set of intergenic regions, including
			 upstream and downstream sequences

		-bg monads (default)
		    Calcualte expected dyad frequency from the monad
		    frequencies observed in the input sequences.

		-bg input
		    Same as -bg monads, allowed for consistency with
		    oligo-analysis.


	-org	organism
	-taxon	taxon
		Organism or taxon that used as reference for the
		estimation of a background model based on a genome
		subset (option -bg).  Either -org or -taxon is
		required with the option -bg.

   		Options -org and -taxon are mutually exclusive.

	-thosig #
		Threshold on occurrence significance.
		(obsolete: use -lth occ_sig instead)

	-lth param value
		Lower threshold on some parameter. All patterns with a
		parameter value smaller than the threshold are
		discarded.
		Supported parameters: $supported_thresholds
	      	Example: select patterns with a positive value for the
	      	occurrence significance.

			 -lth occ_sig 0

	-uth param value
		Upper threshold on some parameter. All patterns with a
		parameter value higher than the threshold are
		discarded.
		Supported parameters: $supported_thresholds
		Example: to select no more than 50 patterns
		        -uth rank 50

	-sort	sort results by decreasing order of significance

	-return	output_fields
		output fields may contain one or several of the following 
		words:
			freq
			occ
			proba (binomial probability)
			zscore
			ratio
			rank
		the fields have to be separated by commas
		By default, occurrences and binomial probabilities
		(occ,proba) are returned.


	-under  detect under-represented instead of over-represented
         	dyads (left tail significance test).


	-two_tails
		detect under-represented and over-represented dyads
		(two-tail significance test).

	-zeroocc
		Report also dyads with zero occurrences (provided they
		fit the other thresholds).  By default, the program
		reports only patterns present in the sequence.  If the
		left tail or two-tail test is applied, patterns with
		zero occurrences are automatically taken into
		account.

		In some other cases, one would also like to detect
		patterns absent from the sequence. This is the
		function of the option -zeroocc.

	-noov	do not allow overlapping matches of the same word

	-timeout #
		timeout (in seconds). Default = 3600.
		dyad-analysis can be time consuming. In order to
		protect the server from endless queries, the program
		will automatically stop after 1 hour (default) of
		calculation. The time out value can be changed for
		heavy tasks.

	-seqtype  dna|prot|other
		Input sequence type
		. DNA (default)
		    Only A, C, G, and T residues are
		    accepted. Oligomers that contain partly defined
		    (IUPAC code) or undefined nucleotides (N) are
		    discarded from the countings.
		. protein
		    Oligopeptide analysis instead of oligonucleotide.
		    This inactivates the grouping of oligomers with
		    their reverse complements, and modifies the
		    alphabet size.
		. other
		    Any type of letters found in the input sequence is
		    considered valid. This allows to analyze texts in
		    human language.

OUTPUT COLUMNS
	Dyad pattern
	Dyad identifier. Same as pattern, with the reverse complement added
		when the counting was performed on both strands.

	Expected frequency (exp_frq): the probability to observe the dyad at
		each position. This value is calculated on basis of the
		expected frequency table (see above), or of the 
		oligonucleotide frequencies observed in the input sequences. 

	Observed occurrences (occ): the number of occurrences
		observed for each dyad. 
		Overlapping matches are detected and summed in the counting. 

	Expected number of occurrences (exp_occ): the number of occurrences
		expected for each dyad. This value is calculated on basis of
		the oligonucleotide frequency table selected. 

	Occurrence P-value (occ_P): the probability to have N or more
		occurrences, given the expected number of occurrences 
		(where N is the observed number of occurrences). 

	Occurrence E-value (occ_E): the expected number of false
		positives, given the number of tested patterns:
			E-value = P-value * nb_tested_patterns
		This is a correction for multi-testing, taking into
		account the number of patterns for which a test of
		significance has been performed (which varies with the
		size of the monad, and with the number of spacings
		sampled).

	Occurrence Significance (occ_sig): 
		A logarithmic transformation of the E-value. 
		  	      occ_sig = -log10(occ_E)
		The highest sig corresponds to the most overrepresented
		dyad.  Sig values higher than 0 
		indicate a significant overrepresentation (E-value < 1).

PROBABILITIES
	Various calibration models can be used to estimate the probability of
	each oligonucleotide (see above). From there, an expected number of
	occurrences is calculated and compared to the observed number of
	occurrences. The significance of the observed number of occurrences
	is calculated with the binomial formulae. 

              
    EXPECTED DYAD FREQUENCY
        If exp(oligo1) is the expected frequency for the first element, and
           exp(oligo2) is the expected frequency for the second element
        
        Then
           exp(dyad) = exp(oligo1)*exp(oligo2)

    NUMBER OF POSSIBLE DYADS
        This number depends on the dyad type selected by the user. 
        When the analysis is restricted to inverted repeats, or to direct 
        repeats, the first element univocally determines the second one, 
        thus:
                nb_poss_dyads = nb_poss_oligo
                              = 4^w
                where w is the oligonucleotide length.
        When any dyad is allowed, each oligonucleotide can combine with any 
        other or itself, thus:
                nb_poss_dyads = nb_poss_oligo * nb_poss_oligo 
                              = 4^(2w)
    EXPECTED OCCURRENCES
                              n
           Exp_occ = p * 2 * SUM (Lj + 1 - d) = p * T
                             j=1
        
        where   p  = expected dyad frequency
                n  = number of input sequences
                Lj = length of the jth input sequence
                d  = length of the dyad, calculated as follows:
                        d = 2w + s
                        where w is the oligonucleotide length
                              s is the spacer length
                T  = the number of possible matching positions in the 
                     whole set of input sequences.
                The factor 2 stands for the fact that occurrences are summed
                on both strands (it is omitted when the option -1str 
                is active).

    PROBABILITY OF THE OBSERVED NUMBER OF OCCURRENCES
        The probability to observe exactly obs occurrences in the whole set
        of sequences is calculated by the binomial
        
                                                      obs      T-obs
            P(obs) = bin(p,T,obs) =       T!         p    (1-p)
                                     ---------------
                                     obs! * (T-obs)! 
        
        where   obs is the observed number of dyad occurrences,
                p   is the expected dyad frequency,
                T   is the number of possible matching positions,
                    as defined above. 
        
        The probability to observe obs or more occurrences in the whole set
        of sequences is calculated by the sum of binomials:
             
                            obs-1
             P(>=obs) =  1 - SUM P(j)
                             j=0
                                      
    SIGNIFICANCE INDEX
        The significance index is a conversion of the occurrence probability, 
        calculated as follows:
              
              Sig_occ = -log10(NPD * P(>=obs));
        where   NPD     is the number of possible dyads, calculated as above.

AVAILABILITY
      The program can be used through its web interface at:
      http://rsat.bigre.ulb.ac.be/rsat/

      dyad-analysis is a perl script running on unix machines (SUN,
      SGI, DEC Alpha, Mac OSX have been tested). The web interface is
      a perl-cgi script.

End_of_help
  close HELP;
  exit;
}


################################################################
#### Display the short help message (one line per option), then exit.
####
#### The option summary is piped through the "more" pager. If the pager
#### cannot be started, the text is written directly to STDOUT instead,
#### so that the summary is never silently lost.
####
#### The text interpolates the package variables $supported_bg and
#### $supported_thresholds, which are set outside this sub
#### ($supported_thresholds is presumably a list of threshold names --
#### TODO confirm where it is defined).
####
#### Takes no argument and never returns: the program exits with status 0.
sub PrintOptions {
  ## Three-argument open on a lexical handle; the result is checked so
  ## that a missing pager does not make the help text disappear.
  my $help;
  my $use_pager = open($help, '|-', 'more');
  unless ($use_pager) {
    ## Fall back on the standard output when the pager is unavailable
    $help = \*STDOUT;
  }

  print $help <<End_short_help;
dyad-analysis options
----------------
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-o		output file
-i		input file
-mask upper|lower	mask upper- or lowercases, respectively
-format		input sequence format
-seqtype       	sequence type (dna|prot|other)
-v		verbose
-l		oligonucleotide length
-sp		spacing
-expfreq	expected frequency file
-ncf		(deprecated, use '-bg intergenic' instead) 
-bg		background model (supported: $supported_bg)
-org		organism (-org and -taxon are mutually exclusive)
-taxon		taxon (-org and -taxon are mutually exclusive)
-type		ir|dr|rep|any
-accept 	accepted_dyad_file
-groupsp	Group dyads made of the same words (monads) but with different spacings.
-sort		sort results by decreasing significance
-under		detect under-represented instead of over-represented words 
-two_tails      detect under-represented and over-represented words 
-zeroocc	return also patterns with zero occurrences
-thosig		threshold on occurrence significance (obsolete)
-lth param \#	lower threshold on parameter. Supported: $supported_thresholds
-uth param \#	upper threshold on parameter. Supported: $supported_thresholds
-1str		count occurrences on a direct strand only
-2str		count occurrences on both strands
-prot		input sequence is proteic
-return		occ,freq,proba,zscore,ratio,rank
-noov		do not allow overlapping matches of the same word
-timeout #	timeout (seconds). Default = 3600.
End_short_help

  ## Closing the pipe waits for the pager to terminate before exiting;
  ## STDOUT (fallback case) is left for perl to flush at exit.
  close $help if ($use_pager);
  exit(0);
}
