#!/usr/bin/env perl
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";



if ($ARGV[0] eq "-h") {
  open HELP, "| more";
  print HELP <<End_of_help;
NAME
	substit

DESCRIPTION
	Returns a list of sequences matching a nucleotide pattern with
        a given number of substitutions (only ATGC are supported). 

CATEGORY
	sequences

USAGE
        substitutions [-i inputfile | -p pattern] [-o outputfile] [-v] -s #
	
OPTIONS
	-v	verbose.
	-p pattern
                the pattern must be a DNA sequence. 
                if the -p option is not used, the standard input is
                used
        -s #    number of allowed substitutions.
        -IUPAC  use IUPAC code in the output (this makes shorter the 
                lists of substituted patterns)
	-o outputfile
		if not specified, the standard output is used.
		This allows to place the comand within a pipe.
		
PATTERN FORMAT
        The standard degenerate nucleotide code of the IUPAC-IUB commission 
        is supported (see http://www.chem.qmw.ac.uk/iupac/misc/naseq.html for 
        complete information).
        The pattern sequence should thus only contain the following characters:
                A                       (Adenine)
                C                       (Cytosine)
                G                       (Guanine)
                T                       (Thymine)
                R       = A or G        (puRines)
                Y       = C or T        (pYrimidines)
                W       = A or T        (Weak hydrogen bonding)
                S       = G or C        (Strong hydrogen bonding)
                M       = A or C        (aMino group at common position)
                K       = G or T        (Keto group at common position)
                H       = A, C or T     (not G)
                B       = G, C or T     (not A)
                V       = G, A, C       (not T)
                D       = G, A or T     (not C)
                N       = G, A, C or T  (aNy)
        Upper and lower case are considered equivalent.

	
OUTPUT FORMAT
	
	
EXAMPLES
	
End_of_help
    close HELP;
  exit;
}

if ($ARGV[0] eq "-help") {
  open HELP, "| more";
  print HELP <<End_short_help;
substitutions options
---------------------
-v	verbose.
-p      pattern
-s #    number of allowed substitutions.
-IUPAC  use IUPAC code in the output
-o      outputfile
End_short_help
    close HELP;
  exit;
}

$start_time = &RSAT::util::StartScript();

#### initialise parameters ####

#### read arguments ####
foreach $a (0..$#ARGV) {

  if ($ARGV[$a] eq "-v") {
    $verbose = 1;
    
  } elsif ($ARGV[$a] eq "-p") {
    $pattern = $ARGV[$a+1];

  } elsif ($ARGV[$a] eq "-s") {
    $allowed_subst = $ARGV[$a+1];

  } elsif ($ARGV[$a] eq "-o") {
    $outputfile = $ARGV[$a+1];

  } elsif (lc($ARGV[$a]) eq "-iupac") {
    $IUPAC_output = 1;;

  }
}



#### check argument values ####
unless ($allowed_subst =~ /\d+/) {
      print "\tYou should enter a valid number of substitutions.\n";
      print "\ttype substitutions -h for help\n";
      exit;  
}

### get the pattern ###
if ($pattern eq "") {
  if (<STDIN> =~ /(\S+)/) {
    $pattern = $1;
  } else {
      print "\tYou should enter a valid pattern.\n";
      print "\ttype substitutions -h for help\n";
      exit;
  }
}
if ($allowed_subst > length($pattern)) {
      print "\tSubstitution number cannot be higher than pattern length.\n";
      print "\ttype substitutions -h for help\n";
      exit;
}



### open output file ###
if ($outputfile ne "") {
  unless (open(OUTPUT, ">$outputfile")) {
    print "\tcannot open output file\n";
    print "\ttype substitutions -h for help\n";
    exit;
  }
  $out = OUTPUT;
} else {
  $out = STDOUT;
}

###### execute the command #########

%subst = Substitute($pattern, $allowed_subst, $IUPAC_output);

###### print output ######

@keys = sort {($subst{$a} > $subst{$b}) or (($subst{$a} == $subst{$b}) and (lc($a) cmp lc($b)))} keys %subst;

foreach $p (@keys) {
    print $out "$p\t$subst{$p}\n";  
}


#### verbose ####
if ($verbose) {
  print ";substitutions result\n";
  if ($outputfile ne "") {
    print $out ";Output file	$outputfile\n";
  }
  if ($IUPAC_output) {
      print ";IUPAC output\n";
  }
  print ";pattern\t$pattern\n";
  print "; $allowed_subst\tallowed substitutions\n";
  print "; ", $#keys +1,"\tmatching patterns\n";
}


###### close input file ######
if ($inputfile ne "") {
  close $in;
}


################################################################
## Report execution time and close output stream
my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
close $main::out if ($outputfile);



exit(0);


