#!/usr/bin/perl -w
############################################################
#
# $Id: calibrate-oligos,v 1.15 2011/02/17 04:54:48 rsat Exp $
#
# Time-stamp: <2003-07-04 12:48:55 jvanheld>
#
############################################################
#use strict;;
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";
require RSAT::util;

################################################################
#### initialise parameters
## Record the script start time; CloseVerbose() hands it to
## RSAT::util::ReportExecutionTime().
local $start_time = &RSAT::util::StartScript();

## Default parameter values. These file-scoped variables are overwritten
## by ReadArguments() according to the command line.
my $organism_name = "Saccharomyces_cerevisiae";	## -org
$repet = 1000;			## -r : repetitions
my $seq_nb = 10;		## -sn : sequence number
my $seq_len = 800;		## -sl : sequence length
my $oligo_len = 6;		## -ol : oligo length
my $str = "-2str";		## strand option passed to oligo-analysis
my $ov = "-noov";		## overlap option passed to oligo-analysis
my %dir = ();			## $dir{output} can be set with -outdir
my $start = 1;			## -start : first repetition of the oligos task
my $end = undef;		## -end : last repetition (defaults to $repet)

## File names; the actual values are assigned further down, once the
## arguments have been read.
$seq_file = "";
$family_file = "";

## Supported tasks: ordered list, lookup table, and comma-separated
## string (the string is used in the help and error messages).
@supported_tasks = qw (all upstream random oligos distrib fit clean_seq clean_oligos);
%supported_task = map { $_ => 1 } @supported_tasks;
$supported_tasks = join ",", @supported_tasks;

local %infile = ();
local %outfile = ();

## Arguments forwarded to every &doit() call
$verbose = 0;
$dry_run = 0;
$die_on_error = 1;

&ReadArguments();

################################################################
#### check argument values


#### check selected tasks
unless (%task) {
    &RSAT::error::FatalError("You should select at least one task.");
}

## "all" activates every supported task except clean_oligos, which must
## be requested explicitly.
if ($task{all}) {
    foreach my $t (@supported_tasks) {
	next if ($t eq "clean_oligos");
	$task{$t} = 1;
    }
}

## organism name
&CheckOrganismName($organism_name);

## output directory: unless specified with -outdir, derive it from the
## analysis parameters
unless (defined($dir{output})) {
    $dir{output} = "results/".$organism_name."/rand_gene_selections/".$oligo_len."nt".$str.$ov."_N".$seq_nb."_L".$seq_len."_R".$repet;
}
&RSAT::util::CheckOutDir($dir{output});
$dir{oligos} = $dir{output}."/oligos";
&RSAT::util::CheckOutDir($dir{oligos});

## All the file names used below are relative to the output directory,
## including the "rm -rf oligos" of the clean_oligos task. Stop here if
## the directory cannot be entered, rather than running the tasks in
## the wrong place.
unless (chdir($dir{output})) {
    &RSAT::error::FatalError("Cannot change to output directory ".$dir{output}." : ".$!);
}

################################################################
#### output files
$file_prefix =  $organism_name."_".$oligo_len."nt_".$str.$ov."_n".$seq_nb."_l".$seq_len."_r".$repet;
$outfile{distrib} =$file_prefix."_distrib.tab";
#$outfile{stats} =$file_prefix."_stats.tab";
$outfile{negbin} =$file_prefix."_negbin.tab";
$outfile{poisson} =$file_prefix."_poisson.tab";

## Open the output stream for the distribution. $out is only opened
## when the distrib task is selected.
if ($task{distrib}) {
    $out = &OpenOutputFile($outfile{distrib});
    ################################################################
    #### print verbose
    ## NOTE(review): at this point $seq_file and $family_file still hold
    ## the empty strings assigned at initialisation (their real values
    ## are assigned further down), so Verbose() reports them as empty.
    &Verbose() if ($verbose);
}


################################################################
#### Retrieve all upstream sequences in wc format. This allows to
#### load them rapidly for further analysis
## The sequence file name is assigned even when the upstream task is not
## selected, because the oligos and clean_seq tasks use it as well.
$seq_file = $organism_name."_allup".$seq_len.".wc";
if ($task{upstream}) {
    warn "; ", &AlphaDate(), "\tRetrieving upstream sequences\n" if ($verbose >= 1);
    my $command = "retrieve-seq -org $organism_name ";
    $command .= " -all -type upstream -nocomment -lw 0";
    $command .= " -from -".$seq_len;
    $command .= "  -to -1";
    $command .= "  -format wc";
    $command .= " -label id";
    $command .= " -o $seq_file";

    ## $out is only opened for the distrib task: printing to the
    ## undefined handle would abort the script when the upstream task
    ## is run without distrib.
    print $out "; $command\n" if ($out);
    &doit($command, $dry_run, $die_on_error, $verbose);
}

################################################################
##### Select random gene families
## The family file name is assigned even when the random task is not
## selected, because the oligos task reads this file.
$family_file = "random_genes.tab";
if ($task{random}) {
    warn "; ", &AlphaDate(), "\tSelecting random gene families\t".$family_file."\n" if ($verbose >= 1);
    my $command = "random-genes -org ".$organism_name;
    $command .= " -g ".$repet;
    $command .= " -n ".$seq_nb;
    $command .= " -o ".$family_file;

    ## $out is only opened for the distrib task: printing to the
    ## undefined handle would abort the script when the random task is
    ## run without distrib.
    print $out "; $command\n" if ($out);
    &doit($command, $dry_run, $die_on_error, $verbose);
}

################################################################
## Run oligo-analysis to count oligonucleotide occurrences in each
## random sequence set.

if ($task{oligos}) {
    ## Without -end, run until the last repetition
    $end = $repet unless (defined($end));

    ## The commands are reported in the distribution file for the first
    ## repetition only, and only if that file is open ($out is opened
    ## for the distrib task only; printing to the undefined handle would
    ## abort the script).
    for my $r ($start..$end) {
	warn "; ", &AlphaDate(), "\toligo-analysis\trepetition\t",$r,"\n" if ($verbose >= 1);
	my $report_command = ($out && ($r == 1));

	#### select one gene family: lines of the family file ending
	#### with RAND<r>, of which the first column is retained
	my $one_family_file = "oligos/RAND_n".$seq_nb."_r".$r.".fam";
	my $command = "grep 'RAND".$r."\$' ".$family_file;
	$command .= " | cut -f 1 > ".$one_family_file;
	print $out "; $command\n" if ($report_command);
	&doit($command, $dry_run, $die_on_error, $verbose);

	#### select the corresponding upstream sequences and count
	#### oligonucleotide occurrences
	my $oligo_file = "oligos/oligos_".$oligo_len."nt".$str.$ov."_L".$seq_len."_n".$seq_nb."_r".$r;
	$command = "grep -f ".$one_family_file." ".$seq_file;
	$command .= " | oligo-analysis -format wc -return occ ".$str;
	$command .= " -v 1" if ($r == 1);
	$command .= " ".$ov;
	$command .= " -l ".$oligo_len;
	$command .= " -o ".$oligo_file;
	print $out "; $command\n" if ($report_command);
	&doit($command, $dry_run, $die_on_error, $verbose);
    }
}

################################################################
#### Regroup oligo-counts in a single table
#  my $oligo_table = $organism_name."_oligo_counts_".$oligo_len."nt_r".$repet.".tab";
#  warn "; ", &AlphaDate(), "\toligo table\t",$oligo_table,"\n" if ($verbose >= 1);
#  $command = "compare-scores -o ".$oligo_table;
#  $command .= " -null 0 -sc 3 -files oligos/oligos_".$oligo_len."nt_r*";
# print $out "; $command\n";
#&doit($command, $dry_run, $die_on_error, $verbose);

################################################################
#### Calculate distribution of occurrences for each oligonucleotide
local $max_occ = 0;		## highest occurrence value found in any file
local %counts = ();		## $counts{pattern}{occ} = number of repetitions
local %count_sum = ();		## number of files in which the pattern was found
local %overlaps = ();		## $overlaps{pattern}{ovl} = number of repetitions
if ($task{distrib}) {
    warn "; ", &AlphaDate(), "\toligo count distribution\t",$outfile{distrib},"\n" if ($verbose >= 1);

    ## Read one oligo count file per repetition (1 to $repet)
    for my $r (1..$repet) {
	my $oligo_file = "oligos/oligos_".$oligo_len."nt".$str.$ov."_L".$seq_len."_n".$seq_nb."_r".$r;
	&RSAT::error::FatalError("File ".$oligo_file." does not exist in directory ".$dir{output}."\n") unless (-e $oligo_file);
	my ($oligos) = &OpenInputFile($oligo_file);
	while (<$oligos>) {
	    next if (/^;/);	## comment lines
	    next unless (/\S/);	## empty lines would create a bogus empty pattern
	    chomp;
	    my ($pattern, $id, $occ, $ovl) = split "\t", $_;
	    $counts{$pattern}{$occ}++;
	    ## the fourth column is not necessarily present
	    $overlaps{$pattern}{$ovl}++ if (defined($ovl));
	    $max_occ = $occ if ($max_occ < $occ);
	    $count_sum{$pattern}++;
	}
	close $oligos;
    }

    #### print the distribution in the output file:
    #### one row per pattern, one column per occurrence value
    print $out "; oligo count distribution\n";
    print $out "; ", join ("\t", "pattern", 0..$max_occ), "\n";
    foreach my $pattern (sort keys %count_sum) {
	## Number of families without any occurrence of this pattern:
	## repetitions minus the files in which the pattern was found.
	$counts{$pattern}{0} = $repet - $count_sum{$pattern};
	print $out $pattern;
	for my $occ (0..$max_occ) {
	    ## occurrence values never observed for this pattern count as 0
	    $counts{$pattern}{$occ} ||= 0;
	    printf $out "\t%d", $counts{$pattern}{$occ};
	}
	print $out "\n";
    }

    ## Report the execution time (if verbose) and close the output stream
    &CloseVerbose();
}

################################################################
#### Clean data files (just hold the distrib files).

## Delete sequence files. 
if ($task{clean_seq}) {
    warn "; ", &AlphaDate(), "\tDeleting sequence file\n" if ($verbose >= 1);
    my $command = "rm -f ".$seq_file;
    &doit($command, $dry_run, $die_on_error, $verbose);
}

## Delete files with oligonucleotide counts. The whole oligos directory
## is removed; the path is relative to the current working directory.
if ($task{clean_oligos}) {
    warn "; ", &AlphaDate(), "\tDeleting oligos from directory\t", $dir{output},"\n" if ($verbose >= 1);
    my $command = "\\rm -rf oligos";
#    $command .= " oligos/oligos_".$oligo_len."nt".$str.$ov."_L".$seq_len."_n".$seq_nb."_r*";
#    $command .= " oligos/RAND*.fam";
    &doit($command, $dry_run, $die_on_error, $verbose);
}

################################################################
#### Fit theoretical distributions on each pattern count distribution.
#### The former statistics task is disabled:
#&CalculateStatistics() if ($task{stats});
&FitDistribution() if ($task{fit});

warn "; Result stored in directory\t", $dir{output}, "\n";

exit(0);


################################################################
################### subroutine definition ######################
################################################################


################################################################
#### display full help message 
## Print the full help message through the pager "more", then exit.
## If the pager cannot be started, the message is printed on STDOUT
## instead of being silently lost.
sub PrintHelp {
  my $help;
  unless (open($help, "|-", "more")) {
      $help = \*STDOUT;
  }
  print $help <<End_of_help;
NAME
	calibrate-oligos

	2002 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

DESCRIPTION

	Calibrate oligonucleotide frequencies in upstream sequences of
	a selected organism. The calibration can be gene-wise (count
	oligo frequencies in the upstream sequence of each gene) or
	cluster-wise (count oligo frequencies in upstream sequences of
	random gene selections).

CATEGORY
	util

USAGE
	calibrate-oligos -org organism -task task[,task...]
	    [-r #] [-start #] [-end #] [-sl #] [-sn #] [-ol #]
	    [-1str|-2str] [-noov|-ovlp] [-outdir outputdir] [-v [#]]

OPTIONS
	-h	(must be first argument) display full help message
	-help	(must be first argument) display options
	-v	verbose
	-outdir outputdir
	-task selected_task
		Select the tasks to be performed.
		Supported tasks: $supported_tasks

		Can be used iteratively on the same command line to
		select multiple tasks. Several tasks can also be
		specified as a comma-separated list.

		Example:
		    -task upstream -task oligos -task distrib
		    -task upstream,oligos,distrib
		For a full analysis, simply type
		    -task all
    Repetitions
	-r #	repetitions
	-start #
		starting iteration (to pursue an interrupted test)
	-end #	ending iteration (to pursue an interrupted test)

    Upstream sequences
	-org organism
	-sl	sequence length
	-sn	sequence number

    oligo-analysis
	-ol	oligo length
	-1str   strand-sensitive analysis
	-2str   strand-insensitive analysis
	-noov	prevent overlapping matches for self-overlapping patterns
		(default)
	-ovlp	allow overlapping matches for self-overlapping patterns

TASKS
    all
	Perform all the supported tasks, except clean_oligos.

    upstream
	Retrieve all upstream sequences in wc format. This allows
	loading them rapidly for further analysis.

    random
	Select random gene families.

    oligos
	Count oligonucleotide occurrences in each random family.

    distrib
	Calculate distribution of occurrences for each
	oligonucleotide.

    fit
	Fit theoretical distributions (Poisson and negbin) on the
	observed distribution.

    clean_seq
	Delete sequence files.

    clean_oligos
	Delete oligonucleotide count files.

End_of_help
  close $help;
  exit;
}

################################################################
#### display short help message
## Print the short list of options through the pager "more", then exit.
## If the pager cannot be started, the list is printed on STDOUT instead
## of being silently lost.
sub PrintOptions {
  my $help;
  unless (open($help, "|-", "more")) {
      $help = \*STDOUT;
  }
  print $help <<End_short_help;
calibrate-oligos options
----------------
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-outdir		output dir
-v		verbose
-task		selected task (supported: $supported_tasks)
-r #		repetitions
-start #	starting iteration (to pursue an interrupted test)
-end #  	ending iteration (to pursue an interrupted test)
-org		organism
-sl		sequence length
-sn		sequence number
-ol		oligo length
-1str   	strand-sensitive analysis
-2str   	strand-insensitive analysis
-noov		prevent overlapping matches for self-overlapping patterns
-ovlp		allow overlapping matches for self-overlapping patterns
End_short_help
  close $help;
  exit;
}


################################################################
#### read arguments 
## Scan @ARGV and set the script parameters accordingly.
##
## Every position of @ARGV is inspected in turn; options taking a value
## read it from the next position. Unrecognized arguments are ignored.
## An unsupported task name is reported with RSAT::error::FatalError().
## The options -h and -help print a help message and exit.
sub ReadArguments {
    foreach my $i (0..$#ARGV) {
	my $arg = $ARGV[$i];
	my $value = $ARGV[$i+1]; ## undefined if $arg is the last argument

	### verbose: the level is optional (default 1)
	if ($arg eq "-v") {
	    if (&IsNatural($value)) {
		$verbose = $value;
	    } else {
		$verbose = 1;
	    }

	    ### detailed help
	} elsif ($arg eq "-h") {
	    &PrintHelp();

	    ### list of options
	} elsif ($arg eq "-help") {
	    &PrintOptions();

	    #### organism
	} elsif ($arg eq "-org") {
	    $organism_name = $value;

	    ### repetitions
	} elsif ($arg eq "-r") {
	    $repet = $value;

	    ### sequence length
	} elsif ($arg eq "-sl") {
	    $seq_len = $value;

	    ### sequence number
	} elsif ($arg eq "-sn") {
	    $seq_nb = $value;

	    ### oligo length
	} elsif ($arg eq "-ol") {
	    $oligo_len = $value;

	    ### strands
	    ### $str is the option passed to oligo-analysis and used in
	    ### the file names. $force{strands} alone has no effect,
	    ### since nothing in this script reads it; the assignment is
	    ### only maintained for backward compatibility.
	} elsif ($arg eq "-1str") {
	    $str = "-1str";
	    $force{strands} = "1str";
	} elsif ($arg eq "-2str") {
	    $str = "-2str";
	    $force{strands} = "2str";

	    #### prevent self-overlap
	    #### $ov is the option passed to oligo-analysis and used in
	    #### the file names. $noov alone has no effect, since
	    #### nothing in this script reads it; the assignment is only
	    #### maintained for backward compatibility.
	} elsif ($arg eq "-noov") {
	    $ov = "-noov";
	    $noov = 1;

	    #### allow self-overlap
	} elsif ($arg eq "-ovlp") {
	    $ov = "-ovlp";
	    $noov = 0;

	    ### output directory
	} elsif ($arg eq "-outdir") {
	    $dir{output} = $value;

	    ### starting iteration
	} elsif ($arg eq "-start") {
	    $start = $value;

	    ### ending iteration
	} elsif ($arg eq "-end") {
	    $end = $value;

	    #### task selection: comma-separated list, empty items ignored
	} elsif ($arg eq "-task") {
	    my @requested_tasks = split ",", $value;
	    foreach my $task (@requested_tasks) {
		next unless $task;
		if ($supported_task{$task}) {
		    $task{$task} = 1;
		} else {
		    &RSAT::error::FatalError("Unsupported task '$task'. \n\tSupported: $supported_tasks");
		}
	    }

	}
    }
}

################################################################
#### verbose message
## Write the analysis parameters, as comment lines, on the output
## stream $out: command-line arguments, one line per parameter, then
## the content of the %infile and %outfile tables when not empty.
sub Verbose {
    print $out "; calibrate-oligos ";
    &PrintArguments($out);

    ## Parameter rows (label, value), in the order of the report. Some
    ## rows depend on the selected tasks.
    my @rows = (["Output directory", $dir{output}]);
    if ($task{distrib}) {
	push @rows, ["Distribution", $outfile{distrib}];
    }
#    push @rows, ["Stats", $outfile{stats}] if ($task{stats});
    if ($task{fit}) {
	push @rows, ["Poisson fitting", $outfile{poisson}];
	push @rows, ["negbin fitting", $outfile{negbin}];
    }
    push @rows, (
	["Organism", $organism_name],
	["Sequence length", $seq_len],
	["Sequence number", $seq_nb],
	["Repetitions", $repet],
	["Sequence file", $seq_file],
	["Random gene selections", $family_file],
	["Oligonucleotide length", $oligo_len],
	["Strands", $str],
	["Overlap mode", $ov],
    );
    foreach my $row (@rows) {
	my ($label, $content) = @{$row};
	printf $out "; %-29s\t%s\n", $label, $content;
    }

    ## Input and output files, one line per entry
    if (%main::infile) {
	print $out "; Input files\n";
	foreach my $name (keys %infile) {
	    print $out ";\t$name\t$infile{$name}\n";
	}
    }
    if (%main::outfile) {
	print $out "; Output files\n";
	foreach my $name (keys %outfile) {
	    print $out ";\t$name\t$outfile{$name}\n";
	}
    }
}

################################################################
###### finish verbose
## Print the execution time report on the output stream (only when the
## verbosity is at least 1), then close the output stream.
sub CloseVerbose {
    my $time_report = &RSAT::util::ReportExecutionTime($start_time);
    if ($main::verbose >= 1) {
	print $main::out $time_report;
    }
    close $out;
}


################################################################
## Fit theoretical distributions (Poisson and negbin) on the observed
## distribution.
## Run fit-distribution on the observed distribution file, once per
## theoretical distribution (negbin, then poisson). Each result is
## stored in the corresponding file of %outfile.
sub FitDistribution {
    foreach my $model (qw(negbin poisson)) {
	my $command = join(" ",
			   "fit-distribution -v 1",
			   "-distrib", $model,
			   "-i", $outfile{distrib},
			   "-o", $outfile{$model},
			  );
	&doit($command, $dry_run, $die_on_error, $verbose);
    }
}

################################################################
### OBSOLETE
### Previous method for calculating statistics. This is now done with
### the script fit-distribution
## OBSOLETE: not called anywhere in this script (the only call is
## commented out).
##
## Read the distribution file ($outfile{distrib}) and print, for each
## pattern, the sum, average, variance and standard deviation of the
## occurrences, followed by the result of a chi-square test against a
## Poisson distribution having the same average.
##
## NOTE(review): $outfile{stats} is not assigned in this script (the
## assignment is commented out), so &OpenOutputFile() receives an
## undefined value here -- confirm its behaviour before reviving this
## routine.
sub CalculateStatistics {
    warn "; ", &AlphaDate(), "\toligo count statistics\t",$outfile{stats},"\n" if ($verbose >= 1);

    $out = &OpenOutputFile($outfile{stats});
    &Verbose();

    #### header
    print $out join ("\t", 
		     "; pattern",
		     "sum",
#		     "ssq",
		     "avg",
		     "var",
		     "std",
		     "chi2",
		     "df",
		     "Lgroup",
		     "Rgroup",
		     ), "\n";


    ## Open the file with count distributions
    my ($distrib) = &OpenInputFile($outfile{distrib});

    my $pattern_count = 0;
    while (<$distrib>) {
	next unless (/\S/);
	next if (/^;/);
	chomp();
	$pattern_count++;

	## First column: pattern; column i+1: number of repetitions with
	## i occurrences
	my @counts = split "\t";
	my $pattern = shift @counts;
	my $max_occ = $#counts;
	my $sum = 0;
	my $ssq = 0;
	for my $occ (0..$max_occ) {
	    $sum += $occ*$counts[$occ];
	    $ssq += $occ*$occ*$counts[$occ];
	}
	my $avg = $sum/$repet;
	my $var = $ssq/$repet - $avg*$avg;
	## Rounding errors can give a slightly negative variance, on
	## which sqrt() would die.
	$var = 0 if ($var < 0);
	my $std = sqrt($var);

	## Fit a poisson distribution and calculate the goodness of fit.
	## The values returned by poisson() are multiplied by the number
	## of repetitions before the comparison with the counts.
	my @expected = poisson($max_occ, $avg, 1); 
	my $exp_sum = 0;
	foreach my $i (0..$#expected) {
	    $expected[$i] *= $repet;
	    $exp_sum += $expected[$i];
	}

	## Perform a chi-square test
	my ($chi2, $df, $left_group, $right_group) = &ChiSquare("goodness", 2, $max_occ+1, @counts, @expected);
	
	## Discard cases where applicability conditions are not met
	unless (&IsReal($chi2)) {
	    $chi2 = "NA";
	}

	warn join ("\t", $pattern, $sum, $ssq, $avg, $var, $std, $exp_sum, $chi2, $df, $left_group, $right_group), "\n" if ($verbose >= 3);

	print $out join ("\t", 
			 $pattern,
			 $sum,
#			 $ssq,
			 $avg,
			 $var,
			 $std,
			 $chi2,
			 $df,
			 $left_group, 
			 $right_group,
#			 @counts, 
#			 @expected,
			 ), "\n";
    }
    close $distrib;

    &CloseVerbose();
}
