#!/usr/bin/perl -w
############################################################
#
# $Id: footprint-discovery-quality,v 1.2 2011/03/04 16:10:53 rsat Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

footprint-discovery-quality

=head1 VERSION

$program_version

=head1 DESCRIPTION

This program computes the distribution of significance score values returned by dyad-analysis in promoters of orthologous clusters or in random selections of genes. 

=head1 AUTHORS

sylvain@bigre.ulb.ac.be

=head1 CATEGORY

UTIL

=over

=item util

=back

=head1 USAGE

footprint-discovery-quality -org query_organism -taxon taxon1[,taxon2,...] [-n nb_of_genegroups] [-o output] [-batch] [-v #] [...]

=head1 INPUT FORMAT

=head1 OUTPUT FORMAT

=head1 SEE ALSO

=over

=item footprint-discovery

=item get-orthologs

=item dyad-analysis

=back

=head1 WISH LIST

=cut


BEGIN {
    if ($0 =~ /([^(\/)]+)$/) {
	push (@INC, "$`lib/");
    }
}
require "RSA.lib";
require "RSA.disco.lib";
use List::Util 'shuffle';


                    


################################################################
## Main package
package main;
{
    ################################################################
    ## Overview of the main program
    ##
    ## 1. Select $orthogroups_nb random genes of the query organism and
    ##    run footprint-discovery on them in each reference taxon
    ##    ("real" data).
    ## 2. For each taxon, count the orthologs found for the selected
    ##    genes, and build $orthogroups_nb groups of randomly selected
    ##    genes on which dyad-analysis is run ("random" data).
    ## 3. For each taxon, collect the dyad scores of the real and of the
    ##    random data into two files of the dyad_scores directory.
    ##
    ## The commands may run asynchronously (-batch option), so that the
    ## program polls the file system until the expected result files
    ## exist. Each polling loop checks first and only sleeps when some
    ## file is still missing.

    ################################################################
    ## Initialise parameters
    local $start_time = &RSAT::util::StartScript();
    $program_version = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
#    $program_version = "0.00";

    %main::infile = ();
    %main::outfile = ();
    %main::outputdir = ();

    $main::verbose = 0;

    $main::taxon_list = "";
    @taxa = ();
    $main::organism = "";
    $main::orthogroups_nb = 100;
    $main::scripts = $ENV{RSAT}."/perl-scripts/";
    %main::bgmodel = ();

    ## Options for the &doit() command;
    local $dry = 0;	  ## Do not run the command, just echo them as warning
    local $batch = 0;		## Run the processes on a PC cluster
    local $die_on_error = 1;
    local $job_prefix = "footpring_qual";

    ################################################################
    ## Read argument values
    &ReadArguments();

    ################################################################
    ## Check argument values
    if ($taxon_list eq "") {
      &RSAT::error::FatalError("Please specify at least one taxon with the -taxon option");
    } else {
      @taxa = split /,/, $taxon_list;
      foreach my $taxon (@taxa) {
        &CheckTaxon($taxon);
      }
    }

    if ($organism eq "") {
      &RSAT::error::FatalError("Please specify an organism with the -org option");
    } else {
      &CheckOrganism($organism);
    }

    ## The number of groups is used as a divisor and as a loop bound
    ## below: refuse anything that is not a strictly positive integer.
    unless (defined($orthogroups_nb) && ($orthogroups_nb =~ /^\d+$/) && ($orthogroups_nb > 0)) {
      &RSAT::error::FatalError("The number of gene groups (option -n) must be a strictly positive integer");
    }

    my $batch_arg = "";
    if ($batch) {
      $batch_arg = " -batch ";
    }

    ################################################################
    ## Open the output stream used by &Verbose() and by the execution
    ## time report. Nothing else in this file opens $main::out, and
    ## printing to (or closing) an undefined handle is a fatal error.
    ## NOTE(review): assumes &OpenOutputFile() falls back to the standard
    ## output when no file name is given, as documented for the -o
    ## option -- confirm against RSA.lib.
    if (!defined($main::out) && (($main::verbose >= 1) || $main::outfile{output})) {
      ($main::out) = &OpenOutputFile($main::outfile{output});
    }

    ################################################################
    ## Create the output directories, files and prefixes
    $outputdir{root} = 'footprint_quality/';
    $outputdir{random_genes_dir} = 'footprint_quality/random_gene_selections/';
    $outputdir{dyad_scores} = $outputdir{root}."/dyad_scores/";
    $outfile{random_genes_selection} = $outputdir{random_genes_dir}."random_gene_selection.tab";

    system("mkdir -p $outputdir{root}");
    system("mkdir -p $outputdir{random_genes_dir}");
    system("mkdir -p $outputdir{dyad_scores}");

    ################################################################
    ## Data treatment

    ################################################################
    ## REAL DATA

    ## Select $orthogroups_nb random genes of the query organism with
    ## the program random-genes
    my $rdm_genes_cmd = $scripts."random-genes -feattype CDS -v $verbose -n $orthogroups_nb -org $organism -o $outfile{random_genes_selection}";
    &doit($rdm_genes_cmd);

    ## Index the selected gene names in a hash
    my %rdm_genes = ();
    ($main::genelist) = &OpenInputFile($outfile{random_genes_selection});
    while (<$main::genelist>) {
      chomp();
      s/\r/\n/g;		  ## Turn Windows-specific carriage returns into white space, so that they never end up in a gene name
      next if /^;/;		## Comment line
      next if /^\#/;		## Header line
      next if /^\--/;		## SQL comment line
      next unless /\S/;		## Empty line
      my ($gene) = split /\s/;
      $gene = &trim($gene); ## Remove leading and trailing spaces
      $rdm_genes{$gene}++;
    }
    my @rdm_genes_array = keys %rdm_genes;
    close $main::genelist;

    ## Without any gene, the result files awaited below would never be
    ## produced and the polling loops would never end.
    if (scalar(@rdm_genes_array) == 0) {
      &RSAT::error::FatalError("No gene found in the random gene selection file ".$outfile{random_genes_selection});
    }

    ## Run footprint-discovery on each requested taxon and discover the motifs
    foreach my $taxon (@taxa) {
      my $footprint_discovery_cmd = $scripts."footprint-discovery $batch_arg -task query_seq,orthologs,ortho_seq,purge,filter_dyads,dyads -taxon $taxon -genes $outfile{random_genes_selection} -v $verbose -org $organism -o $outputdir{root} -sep_genes";
      &doit($footprint_discovery_cmd);
    }

    ##################################################################
    ## RANDOM DATA
    ## Determine for each taxon the total number of orthologs found for
    ## the selected genes. Wait until the bbh search has been achieved
    ## for all genes, i.e. until all the ortholog sequence files exist.
    &RSAT::message::TimeWarn ("Estimating the average number of gene by orthology group") if ($main::verbose >= 3);
    my %ortho_nb = ();
    foreach my $taxon (@taxa) {
      my $achieved = 0;
      while (!$achieved) {
        $achieved = 1;
        my $ortho_nb_sum = 0;
        foreach my $gene (keys (%rdm_genes)) {
          my $taxon_gene_dir = $outputdir{root}."/".$taxon."/".$organism."/".$gene;
          my $ortho_seq_file = $taxon_gene_dir."/".$gene."_".$organism."_".$taxon."_ortho_seq.fasta";
          my $ortho_file = $taxon_gene_dir."/".$gene."_".$organism."_".$taxon."_ortho_bbh.tab";
          if (!-e $ortho_seq_file) {
            $achieved = 0;
            &RSAT::message::TimeWarn ("File $ortho_seq_file does not exist") if ($main::verbose >= 2);
          }
          ## As soon as one file is missing the sum is meaningless:
          ## only report the other missing files
          next if (!$achieved);
          my $ortho_nb = `cat $ortho_file | grep -v '^;' | grep -v '^#' | wc -l`;
          chomp $ortho_nb;
          &RSAT::message::TimeWarn ("$ortho_nb orthologs for gene $gene of $organism in taxon $taxon") if ($main::verbose >= 3);
          $ortho_nb_sum += $ortho_nb;
        }
        $ortho_nb{$taxon} = $ortho_nb_sum;
        unless ($achieved) {
          &RSAT::message::TimeWarn ("All data are not available yet ... wait 20 seconds") if ($main::verbose >= 2);
          sleep(20);
        }
      }
    }

    ## Get for each taxon the background model used by footprint-discovery:
    ## third field of the 'exp. freq.' line found in the dyad result file of
    ## the first selected gene. Wait until this information is available.
    foreach my $taxon (@taxa) {
      my $dyad_file_result = $outputdir{root}.$taxon."/".$organism."/".$rdm_genes_array[0]."/".$rdm_genes_array[0]."_".$organism."_".$taxon."_ortho_dyads_3nt_sp0-20-2str-noov_taxfreq_sig0.tab";
      my $bgmodel_taxon = "";
      while (1) {
        if (-e $dyad_file_result) {
          $bgmodel_taxon = `cat $dyad_file_result | grep 'exp. freq.' | cut -f 3`;
          chomp $bgmodel_taxon;
        }
        last if ($bgmodel_taxon ne "");
        sleep (30);
      }
      $bgmodel{$taxon} = $bgmodel_taxon;
    }

    ## Build the groups of random genes and run the motif discovery on them
    foreach my $taxon (@taxa) {
      my @taxon_organisms = &getAllTaxonOrganisms($taxon, \%supported_organism);
      my @taxon_genes = &getAllOrganismsGenes(\@taxon_organisms);

      ## Genes are drawn without replacement over all the groups of the
      ## taxon, by walking through a shuffled list of gene indexes
      my $cpt = 0;
      my @taxon_genes_size = 0 .. (scalar(@taxon_genes)-1);
      my @shuffled_taxon_genes_size = shuffle(@taxon_genes_size);

      my $nbgene_per_group = int ($ortho_nb{$taxon}  / $orthogroups_nb);
      &RSAT::message::TimeWarn ("$nbgene_per_group random genes per group of genes in taxon $taxon") if ($main::verbose >= 3);
      my $random_genes_dir_taxon =  $outputdir{random_genes_dir}."/".$taxon."/";
      system ("mkdir -p $random_genes_dir_taxon");
      my $absolute_path = `pwd`;
      chomp $absolute_path;

      for (my $i = 0; $i < $orthogroups_nb; $i ++) {
        # constitute $orthogroups_nb groups of random genes (input of retrieve-seq-multigenome)
        # the number of random genes per group is equal to the mean number of orthologs per taxon (determined in previous step).
        my $random_genes_taxon_dir_i = $absolute_path."/".$random_genes_dir_taxon."/"."random_genes_".$i;
        my $file_root = $random_genes_taxon_dir_i."/"."random_genes_".$taxon."_".$i;

        my $random_gene_taxon_file = $file_root."_ref_gene.tab";
        my $random_gene_seq_taxon_file = $file_root."_ref_gene.fasta";
        my $random_gene_dyad_taxon_file = $file_root."_ref_gene_dyads.tab";

        my $random_genes_taxon_file = $file_root."_random_ortho_genes.tab";
        my $random_genes_seq_taxon_file = $file_root."_random_ortho_genes.fasta";
        my $random_genes_seq_purgednc_file = $file_root."_random_ortho_genes_purged_notclean.fasta";
        my $random_genes_seq_purged_file = $file_root."_random_ortho_genes_purged.fasta";

        my $random_genes_dyad_taxon_file = $file_root."_random_ortho_genes_dyads.tab";
        my $random_genes_dyad_taxon_file_done = $file_root."_random_ortho_genes_dyads.done";

        system ("mkdir -p $random_genes_taxon_dir_i");

        ## Write the random genes of the group (one gene per row: name, organism)
        my ($random_genes_taxon_file_hdl) = &OpenOutputFile($random_genes_taxon_file);
        for (my $j = 0; $j < $nbgene_per_group; $j++) {
          ## Stop when the shuffled list is exhausted: an undefined index
          ## would silently select the first gene again
          last if ($cpt > $#shuffled_taxon_genes_size);
          print $random_genes_taxon_file_hdl join "\t", @{$taxon_genes[$shuffled_taxon_genes_size[$cpt++]]};
          print $random_genes_taxon_file_hdl "\n";
        }
        close $random_genes_taxon_file_hdl;

        ## The first gene of the group plays the role of the reference gene
        system ("cat $random_genes_taxon_file | head -n 1 > $random_gene_taxon_file");

        # write commands
        my $rdm_footprint_cmd = $scripts."retrieve-seq-multigenome -v $verbose -i $random_genes_taxon_file -o $random_genes_seq_taxon_file ";
        $rdm_footprint_cmd .= "; ".$scripts."retrieve-seq-multigenome -v $verbose -i $random_gene_taxon_file -o $random_gene_seq_taxon_file";
        $rdm_footprint_cmd .= "; ".$scripts."purge-sequence -nodie -i $random_genes_seq_taxon_file -ml 30 -mis 0 -mask_short 30 -2str -o $random_genes_seq_purgednc_file";
        $rdm_footprint_cmd .= "; ".$scripts."convert-seq -i $random_genes_seq_purgednc_file -mask non-dna -from fasta -to fasta -dna -o $random_genes_seq_purged_file";
        $rdm_footprint_cmd .= "; ".$scripts."dyad-analysis -v $verbose -2str -noov -l 3 -sp 0-20 -i $random_gene_seq_taxon_file -o $random_gene_dyad_taxon_file";
        $rdm_footprint_cmd .= "; ".$scripts."dyad-analysis -v $verbose -expfreq ".$bgmodel{$taxon}." -accept $random_gene_dyad_taxon_file -return occ,proba,rank -uth rank 50 -lth occ 1 -lth occ_sig 0 -l 3 -sp 0-20 -i $random_genes_seq_purged_file -o $random_genes_dyad_taxon_file -2str -noov";
        ## The .done file marks the end of the whole command chain
        $rdm_footprint_cmd .= "; touch $random_genes_dyad_taxon_file_done";

        &doit($rdm_footprint_cmd, $dry, $die_on_error, $verbose, $job_prefix);
      }
    }

    ##########################################################################################################
    ## Extract all dyad-analysis result files  (random and real data) and create one score file for each taxon
    ## Check that all jobs are done (real cases) ... it is done when the asmb file is produced
    foreach my $taxon (@taxa) {
      my $dyads = "";
      my $achieved = 0;
      while (!$achieved) {
        $achieved = 1;
        $dyads = "";  ## Restart from scratch at each pass, to never report a dyad twice
        foreach my $gene (keys (%rdm_genes)) {
          my $taxon_gene_dir = $outputdir{root}."/".$taxon."/".$organism."/".$gene;
          my $asmb_file = $taxon_gene_dir."/".$gene."_".$organism."_".$taxon."_ortho_dyads_3nt_sp0-20-2str-noov_taxfreq_sig0.asmb";
          my $dyad_file = $taxon_gene_dir."/".$gene."_".$organism."_".$taxon."_ortho_dyads_3nt_sp0-20-2str-noov_taxfreq_sig0.tab";

          if (!-e $asmb_file) {
            $achieved = 0;
            &RSAT::message::TimeWarn ("File $asmb_file does not exist yet") if ($main::verbose >= 2);
          }
          next if (!$achieved);
          $dyads .= &readDyads($dyad_file, $gene);
        }
        unless ($achieved) {
          &RSAT::message::TimeWarn ("All data are not available yet ... wait 20 seconds") if ($main::verbose >= 2);
          sleep(20);
        }
      }
      my $dyad_score_taxon_file = $outputdir{dyad_scores}."/".$taxon."_real_dyad_scores.tab";
      my ($dyad_score_taxon_file_hdl) = &OpenOutputFile($dyad_score_taxon_file);
      print $dyad_score_taxon_file_hdl $dyads;
      close $dyad_score_taxon_file_hdl;
    }

    ## Check that all jobs are done (random cases) ... it is done when the .done file is produced
    foreach my $taxon (@taxa) {
      my $dyads = "";
      my $random_genes_dir_taxon =  $outputdir{random_genes_dir}."/".$taxon."/";
      my $achieved = 0;
      while (!$achieved) {
        $achieved = 1;
        ## Restart from scratch at each pass: the dyads of the groups
        ## read during an incomplete pass must not be reported twice
        $dyads = "";
        for (my $i = 0; $i < $orthogroups_nb; $i ++) {
          my $random_genes_taxon_dir_i = $random_genes_dir_taxon."/"."random_genes_".$i;
          my $file_root = $random_genes_taxon_dir_i."/"."random_genes_".$taxon."_".$i;
          my $random_genes_dyad_taxon_file_done = $file_root."_random_ortho_genes_dyads.done";
          my $random_genes_dyad_taxon_file = $file_root."_random_ortho_genes_dyads.tab";
          if (!-e $random_genes_dyad_taxon_file_done) {
            $achieved = 0;
            &RSAT::message::TimeWarn ("File $random_genes_dyad_taxon_file_done does not exist yet") if ($main::verbose >= 2);
          }
          next if (!$achieved);
          $dyads .= &readDyads($random_genes_dyad_taxon_file, "rand_".$i);
        }
        unless ($achieved) {
          &RSAT::message::TimeWarn ("All data are not available yet ... wait 20 seconds") if ($main::verbose >= 2);
          sleep(20);
        }
      }
      my $dyad_score_taxon_file = $outputdir{dyad_scores}."/".$taxon."_rdm_dyad_scores.tab";
      my ($dyad_score_taxon_file_hdl) = &OpenOutputFile($dyad_score_taxon_file);
      print $dyad_score_taxon_file_hdl $dyads;
      close $dyad_score_taxon_file_hdl;
    }

    ################################################################
    ## Print verbose
    &Verbose() if ($main::verbose);

    ################################################################
    ## Report execution time and close output stream
    my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be executed by all scripts
    print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
    close $main::out if ($main::outfile{output} && defined($main::out));

    exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message: format the POD documentation of this
## script with pod2text, then exit.
##
## The command is run in list form, without any shell, so that a
## script path containing spaces or shell metacharacters is passed
## unaltered to pod2text.
sub PrintHelp {
    system("pod2text", "-c", $0);
    exit(0);
}

################################################################
## Display the help message for the -help option.
## There is no separate short help: the call is delegated to
## PrintHelp(), which prints the full help.
sub PrintOptions {
    PrintHelp();
}

################################################################
## Return the names of all the supported organisms belonging to a taxon.
##
## Arguments:
##   $taxon                   name of the taxon (case-insensitive)
##   $supported_organism_ref  reference to a hash indexed by organism
##                            name; each value is a hash reference whose
##                            "taxonomy" entry is a semicolon-separated
##                            list of taxon names
##
## An organism is selected when the taxon equals one element of its
## taxonomy, or its own name.
##
## Returns the list of selected organism names in alphabetical order.
## Each organism is reported at most once, even if the taxon matches
## several elements (e.g. an organism named like one of its taxa).
sub getAllTaxonOrganisms {
   my $taxon = shift;
   my $supported_organism_ref = shift;
   my $query = lc($taxon);
   my @ref_organisms = ();
   ## The hash is accessed through its reference (no copy), and its keys
   ## are sorted to return the organisms in a reproducible order
   foreach my $org (sort keys %{$supported_organism_ref}) {
     my $taxonomy = $supported_organism_ref->{$org}->{"taxonomy"};
     $taxonomy = "" unless (defined($taxonomy)); ## Organism without taxonomy: only its name can match
     foreach my $taxon_i (split(/;\s*/, $taxonomy), $org) {
       if ($query eq lc($taxon_i)) {
         push @ref_organisms, $org;
         last; ## One match is enough, do not report the organism twice
       }
     }
   }
   return @ref_organisms;
}

################################################################
## Returns an array containing all the genes ids of a given set of organisms
## $array[$i][0] = gene $i name
## $array[$i][1] = organism name
##
## Argument: reference to an array of organism names. For each organism,
## the gene names are read from the first column of the file
## genome/cds.tab found in the directory $supported_organism{$org}->{'data'}
## (global hash, not defined in this file).
##
## Lines starting with "--", blank lines and lines without identifier in
## the first column are ignored, so that they never produce a gene with
## an undefined or empty name.
sub getAllOrganismsGenes {
  my $organisms_ref = shift;
  my @genes = ();
  foreach my $org (@{$organisms_ref}) {
    my $org_dir = $supported_organism{$org}->{'data'};
    my $cds_name_file = $org_dir."/genome/cds.tab";
    &RSAT::message::TimeWarn ("Reading CDS from file $cds_name_file") if ($main::verbose >= 3);
    my ($genenames_file_handler) = &OpenInputFile($cds_name_file);
    while (my $ligne = <$genenames_file_handler>) {
      next if ($ligne =~ /^--/);      ## Comment line
      next unless ($ligne =~ /\S/);   ## Empty line
      chomp $ligne;
      my ($gene_id) = split /\t/, $ligne;
      next unless (defined($gene_id) && ($gene_id ne "")); ## No identifier in the first column
      push @genes, [$gene_id, $org];
    }
    close $genenames_file_handler;
  }
  return @genes;
}

################################################################
## Read a dyad file and return a string containing one row per dyad,
## of the form
## dyad<tab>score<tab>id
##
## Arguments:
##   $dyadfile  tab-delimited file to read
##   $id        identifier written in the third column of each row
##
## The dyad is taken from the first column of the file and the score
## from the eighth one.
## NOTE(review): assumes the significance is the eighth column in all
## the files passed to this function -- confirm for each set of
## dyad-analysis return fields.
##
## Lines starting with "#" or ";" and blank lines are ignored. A missing
## column gives an empty field in the result.
sub readDyads {
  my $dyadfile = shift;
  my $id = shift;
  my ($dyadfile_handler) = &OpenInputFile($dyadfile);
  my $result = "";
  while (my $ligne = <$dyadfile_handler>) {
    next if ($ligne =~ /^#/);      ## Header line
    next if ($ligne =~ /^;/);      ## Comment line
    next unless ($ligne =~ /\S/);  ## Empty line: would give a row without dyad nor score
    chomp $ligne;
    my @lignecp = split /\t/, $ligne;
    my $dyad = defined($lignecp[0]) ? $lignecp[0] : "";
    my $sig = defined($lignecp[7]) ? $lignecp[7] : "";
    $result .= join("\t", $dyad, $sig, $id)."\n";
  }
  close $dyadfile_handler;
  return $result;
}

################################################################
## Read arguments
##
## Parse the command-line options found in @ARGV and store their values
## in the corresponding global variables of the main package. The
## documentation of each option is written in POD format next to the
## code treating it.
##
## The program stops with an error message when an option is unknown,
## or when the value of the option -n is not a strictly positive integer.
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);

    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut

    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        $main::verbose = 1;
      }

      ## Help message

=pod

=item B<-h>

Display full help message

=cut

    } elsif ($arg eq "-h") {
      &PrintHelp();

      ## List of options

=pod

=item B<-help>

Same as -h

=cut

    } elsif ($arg eq "-help") {
      &PrintOptions();

      ## Organism

=pod

=item	B<-org query_organism>

Query organism, to which the query genes belong.

=cut

    } elsif ($arg eq "-org") {
      $main::organism = shift(@arguments);

      ## Taxon

=pod

=item	B<-taxon reference_taxon>

Reference taxon, in which orthologous genes have to be
collected. Several taxa can be given, separated by commas.

=cut

    } elsif ($arg eq "-taxon") {
      $main::taxon_list = shift(@arguments);

      ## Number of groups of genes

=pod

=item	B<-n number_of_controls>

Number of groups of orthologs (strictly positive integer). Default = 100

=cut

    } elsif ($arg eq "-n") {
      my $group_nb = shift(@arguments);
      ## The value is later used as a divisor and as a loop bound
      unless (defined($group_nb) && ($group_nb =~ /^\d+$/) && ($group_nb > 0)) {
        &FatalError(join("\t", "Option -n requires a strictly positive integer. Invalid value", (defined($group_nb) ? $group_nb : "")));
      }
      $main::orthogroups_nb = $group_nb;

      ## Batch mode

=pod

=item B<-batch>

Generate one command per query gene, and post it on the queue of a PC
cluster.

=cut

    } elsif ($arg eq "-batch") {
      $main::batch = 1;

      ## Input file

=pod

=item B<-i inputfile>

If no input file is specified, the standard input is used.  This
allows to use the command within a pipe.

=cut

    } elsif ($arg eq "-i") {
      $main::infile{input} = shift(@arguments);

      ## Output file

=pod

=item	B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut

    } elsif ($arg eq "-o") {
      $main::outfile{output} = shift(@arguments);

    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message
##
## Report on the output stream $main::out the name of the program, its
## arguments (printed by &PrintArguments()), its version, and the
## input and output files (one row per entry of %main::infile and
## %main::outfile). A section is only printed if its hash is not empty.
sub Verbose {
    my $stream = $main::out;

    print $stream "; footprint-discovery-quality";
    &PrintArguments($stream);
    printf $stream "; %-22s\t%s\n", "Program version", $program_version;

    ## Both file sections share the same layout: section title, then
    ## one row per file
    my @sections = (
	["; Input files\n", \%main::infile],
	["; Output files\n", \%main::outfile],
    );
    foreach my $section (@sections) {
	my ($title, $files) = @{$section};
	next unless (%{$files});
	print $stream $title;
	foreach my $key (keys %{$files}) {
	    printf $stream ";\t%-13s\t%s\n", $key, $files->{$key};
	}
    }
}


__END__
