#!/usr/bin/perl

if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";

################################################################
## Main package
package main;
{
    ## Start time
    $start_time = &RSAT::util::StartScript();

    ## Initialise parameters ####
    $out_format = "fasta";
    $seq_nb = 1;
    $repetitions = 1;
    $line_width = 70;
    $sequence_type = "dna";

    ################################################################
    ## Supported background models
    %supported_bg = (
	'equi'=>1,
	'intergenic'=>1,
	'upstream'=>1,
	'upstream-noorf'=>1,
	'protein'=>1,
	);
    $supported_bg = join(",", sort keys %supported_bg);

    &ReadArguments();

    ################################################################
    ## Check alphabet
    if ($sequence_type eq "dna") {
	@letters = qw ( A C G T );
    } elsif ($sequence_type =~ /^prot/) {
	@letters = qw ( a c d e f g h i k l m n p q r s t v w y );
#    @letters = qw ( A C D E F G H I K L M N P Q R S T V W Y );
    } elsif ($sequence_type =~ /^other/) {
	@letters = ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 
		    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", 
		    " ", ".", ",", ":", ";");
    } else {
	&RSAT::error::FatalError($sequence_type, "is not a valid sequence type. Supported: dna,protein,other");
    }

    if (%alphabet) {
	%freq = %alphabet;
    } else {
	foreach $letter (@letters) {
	    $freq{$letter} = 1/($#letters + 1);
	}
    }


    ################################################################
    ## Check argument values

    ## Output formaat
    &CheckOutputSeqFormat($out_format);

    ################################################################
    ## File with the specification of sequence lengths
    if ($length_file) {
	($len_handle, $input_dir) = &OpenInputFile($length_file);
	#    open LEN, $length_file;
	while (<$len_handle>) {
	    chomp;
	    next if (/^;/);
	    next if (/^--/);
	    next if (/^#/);
	    next unless (/\S/);
	    my @fields = split "\t";
	    $length = $fields[1];
	    if (&IsNatural($length)) {
		push @lengths, $length;
	    }
	}
	close $len_handle;

	## Repeat sequence length list if requested
	if ($repetitions > 1) {
	    my @rep_lengths = @lengths;
	    for my $r (1..$repetitions) {
		push @rep_lengths, @lengths;
	    }
	    @lengths = @rep_lengths;
	}
	

	$seq_nb = $#lengths +1;
    } else {

	#### check sequence lengths and number of sequences
	unless (&IsNatural($seq_length)) {
	    &RSAT::error::FatalError ("You should specify the sequence length");
	}
	unless ($seq_nb >=1) {
	    &RSAT::error::FatalError( "Number of sequences should be >=1 (option -n)");
	}
    }

    ################################################################
    ## Background model
    if ($background_model eq "equi") {
	$method = "equiprobable residues";

    } elsif ($background_model) {

	### Localize frequency file in the RSAT data directory for the specified organism
	$method = "Frequency file";
	$exp_freq_file = &ExpectedFreqFile($organism_name, $oligo_length, $background_model, 
					   $noov=>"-ovlp", str=>"-1str");
    }


    ################################################################
    ## Calculate residue probabilities (Markov or Bernoulli model)
    if ($exp_freq_file) {

	## Load Markov model from an oligonucleotide frequency file
	&CalcConditionalProbabilities($exp_freq_file);

    } else {

	## Calculate cumulated frequencies for individual residues
	$cum_letter_freq[0] = $freq{$letters[0]};
	for $l (1..$#letters) {
	    $cum_letter_freq[$l] = $cum_letter_freq[$l-1]+$freq{$letters[$l]};
	}
	unless ($cum_letter_freq[$#letters] > 0) {
	    &RSAT::error::FatalError( "Invalid alphabet specification.");
	}
	for $l (0..$#letters) {
	    $freq{$letters[$l]} /= $cum_letter_freq[$#letters];
	    $cum_letter_freq[$l] /= $cum_letter_freq[$#letters];
	}
    }



    ################################################################
    ## Open output stream
    $out = &OpenOutputFile($outputfile);

    ################################################################
    ## Verbose
    &Verbose() if ($verbose);

    ## Initialize the random seed
    if (defined($seed)) {
	srand($seed);
    } else {
	srand (time);
    }


    ################################################################
    ## Generate the random sequence
    for $r (1..$seq_nb) {
	### initialization
	my $current_seq_length = $lengths[$r-1] || $seq_length;
	
	$sequence = "";
	$seq_id = "rand_$r";
	my @comments = ();
	my $comment = join ("; ", "random sequence ".$r."/".$seq_nb,
			    "type: ".$sequence_type,
			    "length: ".$current_seq_length,
	    );
	if ($background_model) {
	    $comment .= "; bg model: ".$background_model;
	    $comment .= "; organism: ".$organism_name if ($organism_name);
	}
	push @comments, $comment;

	if ($exp_freq_file) {
	    $rand = int (rand $#subword_keys);


	    ## Note: the selection of the first word of a sequence should
	    ## be improved. Currently, the first prefix (corresponding to
	    ## the markov order) is chosen in an equiprobable way. this
	    ## does not really matter as far as the sequences are larger
	    ## much larger than the markov order, which is usually the
	    ## case
	    $subword = $subword_keys[$rand]; ## Subword
	    $sequence .= uc($subword); ## First subword is printed in uppercases

	    for $i ((length($sequence)+1)..$current_seq_length) {
		#	    @cum_letter_freq = @{$cum_letter_freq{$subword}};
		$rand = rand;
		$l = 0;
		#      do {
		#	$l++;
		#      } until (($l > $#letters) || ($cum_letter_freq{$subword}[$l] >= $rand));

		while ($l <= $#letters) {
		    if ($cum_letter_freq{$subword}[$l] >= $rand) {
			last;
		    } else {
			$l++;
		    }
		}
		$next_letter = $letters[$l];
		$sequence .= $next_letter;
		$subword = lc(substr($subword.$next_letter,1));
	    }
	    
	} else {
	    for $i (1..$current_seq_length) {
		$rand = rand();
		$l = -1;
		do {
		    $l++;
		} until (($l > $#letters) || ($cum_letter_freq[$l] >= $rand));
		$next_letter = $letters[$l];
		$sequence .= $next_letter;
	    }
	}
	&PrintNextSequence($out, $out_format, $line_width, $sequence, $seq_id, @comments);
    }

    ## Close output file
    my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
    print $main::out $exec_time if ($main::verbose >= 1);
    close $out if ($outputfile);

    exit(0);
}

################################################################
## Read arguments
sub ReadArguments {
  foreach $a (0..$#ARGV) {
    if ($ARGV[$a] eq "-v") {
      $verbose = 1;

    } elsif ($ARGV[$a] eq "-l") {
      $seq_length = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-h") {
      &PrintHelp();

    } elsif ($ARGV[$a] eq "-help") {
      &PrintOptions();

    } elsif ($ARGV[$a] eq "-o") {
      $outputfile = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-format") {
      $out_format = lc($ARGV[$a+1]);

      ## Sequence type (supported=dna,protein)
    } elsif ($ARGV[$a] eq "-type") {
      $sequence_type = lc($ARGV[$a+1]);

      ## Number of sequences
    } elsif ($ARGV[$a] eq "-n") {
      $seq_nb = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-expfreq") {
      $exp_freq_file = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-lw") {
      $line_width = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-seed") {
      $seed = $ARGV[$a+1];

    } elsif ($ARGV[$a] eq "-a") {
      if ($ARGV[$a+1] eq "yeast") {
	$alphabet{'A'} = $alphabet{'T'} = 0.308512197555054;
	$alphabet{'C'} = $alphabet{'G'} = 0.191487802444946;
      # if argument between quotes
      } elsif (($ARGV[$a+1] =~ /a:t\s(\S+)\sc:g\s(\S+)/i) && ($1 <= 1) && ($1 >=0) && ($2 <= 1) && ($2 >= 0)) {
        $alphabet{'A'} = $alphabet{'T'} = $1;
        $alphabet{'C'} = $alphabet{'G'} = $2;
      } elsif (("$ARGV[$a+1] $ARGV[$a+2]" =~ /a:t (\S+)/i) && ($1 <=1) && ($1 >=0)) {
	$alphabet{'A'} = $alphabet{'T'} = $1;
	if (("$ARGV[$a+3] $ARGV[$a+4]" =~ /c:g (\S+)/i) && ($1 <=1) && ($1 >=0)) {
	  $alphabet{'C'} = $alphabet{'G'} = $1;
	}
      } else {
	&RSAT::error::FatalError("Invalid alphabet specification");
    }

      ### use oligo intergenic frequencies as expected frequencies
    } elsif ($ARGV[$a] =~ /^-ncf/i) {
      &Warning ("option -ncf is deprecated, use '-bg intergenic' instead");
      $background_model = "intergenic";

      ### specify a background model for estimating expected frequencies
    } elsif ($ARGV[$a] =~ /^-bg/i) {
      $background_model = $ARGV[$a+1];
      $background_model =~ s/ncf/intergenic/;
      if ($background_model eq "protein") {
	  $sequence_type = "protein";
      } else {
	  $sequence_type = "dna";
      }

      ### organism (for selecting the appropriate expected frequencies)
    } elsif ($ARGV[$a] =~ /^-org/i) {
      $organism_name = $ARGV[$a+1];

      #### oligonucleotide length
    } elsif ($ARGV[$a] eq "-ol") {
      &RSAT::error::FatalError("Options -ol and -mkv are mutually exclusive") if ($markov_given);
      $oligo_length = $ARGV[$a+1];
      $length_given = 1;


      #### markov order
    } elsif ($ARGV[$a] eq "-markov") {
      &RSAT::error::FatalError("Options -ol and -mkv are mutually exclusive") if ($oligo_given);
      $oligo_length = $ARGV[$a+1] + 1;
      $markov_given = 1;

      #### file containing a list of the desired lengths of random sequences
    } elsif ($ARGV[$a] eq "-lf") {
      $length_file =$ARGV[$a+1];

      ## Number of repetition for length file
    } elsif ($ARGV[$a] eq "-rep") {
      $repetitions = $ARGV[$a+1];


    }
  }
}


################################################################
## Print detailed help message
sub PrintHelp {
    open HELP, "| more";
    print HELP <<End_of_help;
NAME
	random-seq

USAGE
	random-seq -l seqlength [-v] [-o outputfile]

DESCRIPTION
	Generates random sequences according to different
	probabilistic models (Bernoulli, Markov chains).

CATEGORY
	sequences

OPTIONS
    OUTPUT OPTIONS
	-l	sequence length

	-n	Number of sequences.
                Generate a set of n sequences, each of length l.

	-v	verbose.
	-o outputfile
		if not specified, the standard output is used.
		This allows to place the comand within a pipe.
	-format	output format. 
		Two options are available:
		IG	IG suite from IntelliGenetics
		raw	(default) 
        -lw ##  Line width. A newline character will be inserted in the 
                sequence every ## bases. Default is 70. 
                -lw 0 will prevent newline insertion.

	-type	protein|DNA|other

	-seed #	seed for the random generator

    PROBABILITIES
	-a	alphabet
		must be followed by residue frequencies expressed precisely 
		this way:
		-a a:t # c:g #
		ex: 
		-a a:t 0.3 c:g 0.2

		The option
		-a yeast
		will automatically use the residue frequencies from yeast 
		genome.

	-expfreq

		Specifyt a file describing the background model. This
		file must be in oligo format, i.e. a table with
		expected oligomer frequencies. See
		I<convert-background-model> for conversion of other
		background model formats into oligomer frequency
		tables.

                When this option is used, the sequences are generated
		according to a Markov chain. For instance if the
		frequency file contains tetramers, each base will be
		selected randomly with a probability depending on the
		3 preceding bases:

		           p(Wj-3,j)
		p(Bj=B) = -----------
		          p(Wj-3,j-1)

		where	Bj	is the base at position j
			B	is a residue, ie A, C, G or T
			Wj-3,j-1
				is a word found in the sequence between 
				positions j-3 and j-1
			Wj-3,j	is the same word with the character B 
				concatenated at its end

	-bg	background model

		Automatically load a pre-calibrated exected frequency
		file from the RSAT genome distribution. When this
		option is used, the options -org and -ol are also
		required, to indicate the organism and the
		oligonucleotide length, respectively.

		When the option -bg is used, it is necessary to
		define the order of the markov chain, either with the
		option -markov, or with the option -ol (oligo length).

		This option is incompatible with the option -expfreq. 

		Type of sequences used as background model for
		estimating expected oligonucleotide frequencies.

		Supported models:
		-bg equi
		    equiprobable residue frequencies (default)

		-bg upstream-noorf
		    all upstream sequences, preventing overlap with
		    upstream open reading frames (ORFs). Requires to
		    speciy a model organism.

		-bg upstream
		    all upstream sequences, allowing overlap with
		    upstream ORFs. Requires to speciy a model
		    organism.

		-bg intergenic
		    intergenic frequencies Whole set of intergenic
		    regions, including upstream and downstream
		    sequences. Requires to speciy a model organism.

		-bg protein
		    generate random peptidic sequences using a Markov
		    chain calibrated on all peptidic sequences of the
		    selected organism.

	-org	organism
		This is necessary with the option -bg, to determine
		which frequency file has to be sued for expected
		frequency calibration.

	-markov	markov order for the background model

		Note: options -markov and -ol are mutually exclusive. 

	-ol	oligo-length

		Oligonucleotide length used to train the Markov
		background model. If the oligo length is k, the markov
		order is m = k-1.

	-lf	length file 

		Generate random sequences with the same lengths as a
		set of reference sequences. 

		This option is incompatible with the options -l and -n.

		The sequence length file can be obtained with the
		command
			sequence-lengths

		The length file contains two columns : 
		    - sequence ID (ignored)
		    - sequence length

	-rep	Number of repetitions

		The list if sequence lengths is repeated rep times.

		This option only serves when a length file is used to
		specify sequence-sise lengths.

		This option is incompatible with the options -l and
		-n.

WEB VERSION
	http://bigre.ulb.ac.be/bioinformatics/rsa-tools/

End_of_help
    close HELP;
    exit(0);
}

################################################################
## Print a summary of available options
sub PrintOptions {
    open HELP, "| more";
    print HELP <<End_short_help;
random-seq options
==================
-l		sequence length
-n		number of sequences
-v		verbose
-o		outputfile
-format		output format
-lw ##		Line width
-a		alphabet. ex: -a a:t 0.3 c:g 0.2
-expfreq	file containing expected oligomer frequencies
-type		protein|DNA|other
-seed #	seed for the random generator
-ncf		(deprecated, use '-bg intergenic' instead)
-bg		background model (supported: $supported_bg)
-org		organism
-markov		markov order (incompatible with -ol)
-ol		Oligo length (incompatible with -markov)
-lf		Length file (incompatible with options -l and -n)
-rep		Number of repetitions (incompatible with options -l and -n)
End_short_help
    close HELP;
    exit(0);
}

################################################################
## Verbosity
sub Verbose {
  print $out "; random-seq ";
  &PrintArguments($out);
  if ($length_file) {
      print $out "; Sequence length file	        $length_file\n";
      print $out "; Repetitions          	        $repetitions\n";
  } else {
      print $out "; sequence length                     $seq_length\n";
  }
  print $out "; Number of sequences     $seq_nb\n";
  print $out "; Sequence format         $out_format\n";
  if ($outputfile) {
    print $out "; Output file	$outputfile\n";
  }
  print $out "; residue frequencies: $method\n";
  if ($exp_freq_file) {
    print $out "; Calibrated oligomer frequencies\n";
    print $out "; 	oligomer length:         $oligo_length\n";
    print $out "; 	expected frequency file: $exp_freq_file\n";

    if ($main::verbose >= 3) {
      print $out "; oligomer frequencies\n";
      print $out "; subword frequencies\n";
      foreach $subword (sort keys %subword_freq) {
	printf $out ";\t%s\t%f\t", $subword, $subword_freq{$subword};
	for $l (0..$#letters) {
	  printf $out (" %s:%g",
				 $letters[$l],
				 $cum_letter_freq{$subword}[$l] - ($l>0)*$cum_letter_freq{$subword}[$l-1]
				);
		}
		print $out "\n";
	    }
	}
    } else {
	for $l (0..$#letters) {
	    print $out "; \t";
	    print $out "; $letters[$l]\t";
	    print $out "; $freq{$letters[$l]}\n";    
	}
    }
}
