#!/usr/bin/perl

if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";

#### initialise parameters ####
$out_format = "fasta";
$seq_nb = 1;
$line_width = 70;
$sequence_type = "dna";

################################################################
## Supported background models
%supported_bg = (
		 equi=>1,
		 intergenic=>1,
		 upstream=>1,
		 upstream-noorf=>1,
		 protein=>1,
		 );
$supported_bg = sort keys %supported_bg;

&ReadArguments();


################################################################
#
# Check alphabet
#
if ($sequence_type eq "dna") {
    @letters = qw ( A C G T );
} elsif ($sequence_type =~ /^prot/) {
    @letters = qw ( a c d e f g h i k l m n p q r s t v w y );
#    @letters = qw ( A C D E F G H I K L M N P Q R S T V W Y );
} elsif ($sequence_type =~ /^other/) {
    @letters = ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 
		"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", 
		" ", ".", ",", ":", ";");
} else {
    &RSAT::error::FatalError($sequence_type, "is not a valid sequence type. Supported: dna,protein,other");
}

if (defined(%alphabet)) {
    %freq = %alphabet
} else {
    foreach $letter (@letters) {
	$freq{$letter} = 1/($#letters + 1);
    }
}


################################################################
## Check argument values

## Output formaat
&CheckOutputSeqFormat($out_format);

################################################################
## File with the specification of sequence lengths
if ($length_file) {
    ($len_handle, $input_dir) = &OpenInputFile($length_file);
#    open LEN, $length_file;
    while (<$len_handle>) {
	chomp;
	next if (/^;/);
	next if (/^--/);
	next if (/^#/);
	next unless (/\S/);
	my @fields = split "\t";
	$length = $fields[1];
	if (&IsNatural($length)) {
	  push @lengths, $length;
	}
    }
    close $len_handle;
    $seq_nb = $#lengths +1;
} else {

    #### check sequence lengths and repetitions
    unless (&IsNatural($seq_length)) {
	&RSAT::error::FatalError ("You should specify the sequence length");
    } 
    
    unless ($seq_nb >=1) {
	&RSAT::error::FatalError( "Number of repetitions should be >=1");
    }
}
 
################################################################
## Background model
if ($background_model eq "equi") {
    $method = "equiprobable residues";

} elsif ($background_model) {
  
    ### Localize frequency file in the RSAT data directory for the specified organism
    $method = "Frequency file";
    $exp_freq_file = &ExpectedFreqFile($organism_name, $oligo_length, $background_model, 
				       $noov=>"-ovlp", str=>"-1str");
} 


################################################################
## Calculate residue probabilities (Markov or Bernoulli model)
if ($exp_freq_file) {

    ## Load Markov model from an oligonucleotide frequency file
    &CalcConditionalProbabilities($exp_freq_file);

} else {

    ## Calculate cumulated frequencies for individual residues
    $cum_letter_freq[0] = $freq{$letters[0]};
    for $l (1..$#letters) {
	$cum_letter_freq[$l] = $cum_letter_freq[$l-1]+$freq{$letters[$l]};
    }
    unless ($cum_letter_freq[$#letters] > 0) {
	&RSAT::error::FatalError( "Invalid alphabet specification.");
    }
    for $l (0..$#letters) {
	$freq{$letters[$l]} /= $cum_letter_freq[$#letters];
	$cum_letter_freq[$l] /= $cum_letter_freq[$#letters];
    }
}



################################################################
## open output stream
$out = &OpenOutputFile($outputfile);

################################################################
## Verbose
&Verbose() if ($verbose);

## Initialize the random seed
if (defined($seed)) {
    srand($seed);
} else {
    srand (time);
}


################################################################
## Generate the random sequence
for $r (1..$seq_nb) {
  ### initialization
  my $current_seq_length = $lengths[$r-1] || $seq_length;

  $sequence = "";
  $seq_id = "rand_$r";
  my @comments = ();
  my $comment = join ("; ", "random sequence ".$r."/".$seq_nb,
		      "type: ".$sequence_type,
		      "length: ".$current_seq_length,
		     );
  if ($background_model) {
      $comment .= "; bg model: ".$background_model;
      $comment .= "; organism: ".$organism_name if ($organism_name);
  }
  push @comments, $comment;

  if ($exp_freq_file) {
    $rand = int (rand $#subword_keys);


    ## Note: the selection of the first word of a sequence should
    ## be improved. Currently, the first prefix (corresponding to
    ## the markov order) is chosen in an equiprobable way. this
    ## does not really matter as far as the sequences are larger
    ## much larger than the markov order, which is usually the
    ## case
    $subword = $subword_keys[$rand]; ## Subword
    $sequence .= uc($subword); ## First subword is printed in uppercases

    for $i ((length($sequence)+1)..$current_seq_length) {
      #	    @cum_letter_freq = @{$cum_letter_freq{$subword}};
      $rand = rand;
      $l = 0;
      #      do {
      #	$l++;
      #      } until (($l > $#letters) || ($cum_letter_freq{$subword}[$l] >= $rand));

      while ($l <= $#letters) {
	if ($cum_letter_freq{$subword}[$l] >= $rand) {
	  last;
	} else {
	  $l++;
	}
      }
      $next_letter = $letters[$l];
      $sequence .= $next_letter;
      $subword = lc(substr($subword.$next_letter,1));
    }
    
  } else {
    for $i (1..$current_seq_length) {
      $rand = rand();
      $l = -1;
      do {
	$l++;
      } until (($l > $#letters) || ($cum_letter_freq[$l] >= $rand));
      $next_letter = $letters[$l];
      $sequence .= $next_letter;
    }
  }
  &PrintNextSequence($out, $out_format, $line_width, $sequence, $seq_id, @comments);
}

###### close output file ######
close $out if ($outputfile);


exit(0);


################################################################
## Read arguments
sub ReadArguments {
  foreach $a (0..$#ARGV) {
    if ($ARGV[$a] eq "-v") {
      $verbose = 1;

    } elsif ($ARGV[$a] eq "-l") {
      $seq_length = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-h") {
      &PrintHelp();

    } elsif ($ARGV[$a] eq "-help") {
      &PrintOptions();

    } elsif ($ARGV[$a] eq "-o") {
      $outputfile = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-format") {
      $out_format = lc($ARGV[$a+1]);

      ## Sequence type (supported=dna,protein)
    } elsif ($ARGV[$a] eq "-type") {
      $sequence_type = lc($ARGV[$a+1]);

      ## Number of sequences
    } elsif ($ARGV[$a] eq "-n") {
      $seq_nb = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-r") {
      $seq_nb = $ARGV[$a+1];
      &RSAT::message::Warning ("option -r is deprecated, you should use -n instead.");

    } elsif ($ARGV[$a] eq "-expfreq") {
      $exp_freq_file = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-lw") {
      $line_width = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-seed") {
      $seed = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-a") {
      if ($ARGV[$a+1] eq "yeast") {
	$alphabet{'A'} = $alphabet{'T'} = 0.308512197555054;
	$alphabet{'C'} = $alphabet{'G'} = 0.191487802444946;
      # if argument between quotes
      } elsif (($ARGV[$a+1] =~ /a:t\s(\S+)\sc:g\s(\S+)/i) && ($1 <= 1) && ($1 >=0) && ($2 <= 1) && ($2 >= 0)) {
        $alphabet{'A'} = $alphabet{'T'} = $1;
        $alphabet{'C'} = $alphabet{'G'} = $2;
      } elsif (("$ARGV[$a+1] $ARGV[$a+2]" =~ /a:t (\S+)/i) && ($1 <=1) && ($1 >=0)) {
	$alphabet{'A'} = $alphabet{'T'} = $1;
	if (("$ARGV[$a+3] $ARGV[$a+4]" =~ /c:g (\S+)/i) && ($1 <=1) && ($1 >=0)) {
	  $alphabet{'C'} = $alphabet{'G'} = $1;
	}
      } else {
	&RSAT::error::FatalError("Invalid alphabet specification");
    }

      ### use oligo intergenic frequencies as expected frequencies
    } elsif ($ARGV[$a] =~ /^-ncf/i) {
      &Warning ("option -ncf is deprecated, use '-bg intergenic' instead");
      $background_model = "intergenic";

      ### specify a background model for estimating expected frequencies
    } elsif ($ARGV[$a] =~ /^-bg/i) {
      $background_model = $ARGV[$a+1];
      $background_model =~ s/ncf/intergenic/;
      if ($background_model eq "protein") {
	  $sequence_type = "protein";
      } else {
	  $sequence_type = "dna";
      }

      ### organism (for selecting the appropriate expected frequencies)
    } elsif ($ARGV[$a] =~ /^-org/i) {
      $organism_name = $ARGV[$a+1];

      #### oligonucleotide length
    } elsif ($ARGV[$a] eq "-ol") {
      $oligo_length = $ARGV[$a+1];

      #### file containing a list of the desired lengths of random sequences 
    } elsif ($ARGV[$a] eq "-lf") {
      $length_file =$ARGV[$a+1];

    }
  }
}


################################################################
## Print detailed help message
sub PrintHelp {
    open HELP, "| more";
    print HELP <<End_of_help;
NAME
	random-seq

USAGE
	random-seq -l seqlength [-v] [-o outputfile]

DESCRIPTION
	Generates random sequences according to different
	probabilistic models (Bernoulli, Markov chains).

CATEGORY
	sequences

OPTIONS
    OUTPUT OPTIONS
	-l	sequence length
	-n	repetitions. Allows to generate a set r of sequences,
		each of length l.
	-v	verbose.
	-o outputfile
		if not specified, the standard output is used.
		This allows to place the comand within a pipe.
	-format	output format. 
		Two options are available:
		IG	IG suite from IntelliGenetics
		raw	(default) 
        -lw ##  Line width. A newline character will be inserted in the 
                sequence every ## bases. Default is 70. 
                -lw 0 will prevent newline insertion.

	-type	protein|DNA|other

	-seed #	seed for the random generator

    PROBABILITIES
	-a	alphabet
		must be followed by residue frequencies expressed precisely 
		this way:
		-a a:t # c:g #
		ex: 
		-a a:t 0.3 c:g 0.2

		The option
		-a yeast
		will automatically use the residue frequencies from yeast 
		genome.

	-expfreq
		indicate the file that contains expected oligomer 
		frequencies. When this option is used, the sequences are 
		generated according to a Markov chain. For instance if the 
		frequency file contains tetramers, each base will be 
		selected randomly with a probability depending on the 3 
		preceding bases:

		           p(Wj-3,j)
		p(Bj=B) = -----------
		          p(Wj-3,j-1)

		where	Bj	is the base at position j
			B	is a residue, ie A, C, G or T
			Wj-3,j-1
				is a word found in the sequence between 
				positions j-3 and j-1
			Wj-3,j	is the same word with the character B 
				concatenated at its end

	-bg	background model

		Automatically load a pre-calibrated exected frequency
		file from the RSAT genome distribution. When this
		option is used, the options -org and -ol are also
		required, to indicate the organism and the
		oligonucleotide length, respectively.

		This option is incompatible with the option -expfreq. 

		Type of sequences used as background model for
		estimating expected oligonucleotide frequencies.

		Supported models:
		-bg equi
		    equiprobable residue frequencies (default)

		-bg upstream-noorf
		    all upstream sequences, preventing overlap with
		    upstream open reading frames (ORFs). Requires to
		    speciy a model organism.

		-bg upstream
		    all upstream sequences, allowing overlap with
		    upstream ORFs. Requires to speciy a model
		    organism.

		-bg intergenic
		    intergenic frequencies Whole set of intergenic
		    regions, including upstream and downstream
		    sequences. Requires to speciy a model organism.

		-bg protein
		    generate random peptidic sequences using a Markov
		    chain calibrated on all peptidic sequences of the
		    selected organism.

	-org	organism
		This is necessary with the option -bg, to determine
		which frequency file has to be sued for expected
		frequency calibration.
	-ol	oligo-length
		This is necessary with the option -bg, to determine
		the oligonucleotide length for the Markov chain model.
	-lf	length file 
		Allows to generate random sequences with the same
		lengths as a set of reference sequences. The sequence
		length file can be obtained with the command
			sequence-lengths
		The length file contains two columns : 
		    - sequence ID (ignored)
		    - sequence length
WEB VERSION
	http://bigre.ulb.ac.be/bioinformatics/rsa-tools/

End_of_help
    close HELP;
    exit(0);
}

################################################################
## Print a summary of available options
sub PrintOptions {
    open HELP, "| more";
    print HELP <<End_short_help;
random-seq options
==================
-l		sequence length
-n		repetitions (number of sequences)
-v		verbose
-o		outputfile
-format		output format. 
-lw ##		Line width. 
-a		alphabet. ex: -a a:t 0.3 c:g 0.2
-expfreq	file containing expected oligomer frequencies
-type		protein|DNA|other
-seed #	seed for the random generator
-ncf		(deprecated, use '-bg intergenic' instead) 
-bg		background model (supported: $supported_bg)
-org		organism 
-ol		oligo-length
-lf		length file 
End_short_help
    close HELP;
    exit(0);
}

################################################################
## Verbosity
sub Verbose {
  print $out "; random-seq ";
  &PrintArguments($out);
  print $out "; sequence length	        $seq_length\n";
  print $out "; number of sequences     $seq_nb\n";
  print $out "; format			$out_format\n";
  if ($outputfile) {
    print $out "; Output file	$outputfile\n";
  }
  print $out "; residue frequencies: $method\n";
  if ($exp_freq_file) {
    print $out "; Calibrated oligomer frequencies\n";
    print $out "; 	oligomer length:         $oligo_length\n";
    print $out "; 	expected frequency file: $exp_freq_file\n";

    if ($main::verbose >= 3) {
      print $out "; oligomer frequencies\n";
      print $out "; subword frequencies\n";
      foreach $subword (sort keys %subword_freq) {
	printf $out ";\t%s\t%f\t", $subword, $subword_freq{$subword};
	for $l (0..$#letters) {
	  printf $out (" %s:%g",
				 $letters[$l],
				 $cum_letter_freq{$subword}[$l] - ($l>0)*$cum_letter_freq{$subword}[$l-1]
				);
		}
		print $out "\n";
	    }
	}
    } else {
	for $l (0..$#letters) {
	    print $out "; \t";
	    print $out "; $letters[$l]\t";
	    print $out "; $freq{$letters[$l]}\n";    
	}
    }
}
