#!/usr/bin/env perl

if (($ARGV[0] eq "-h") || ($ARGV[0] eq "-help")) {
  open HELP, "| more";
  print HELP <<End_of_help;
NAME
	mismatches

DESCRIPTION
	returns a list of sequences matching a DNA pattern 
        with a given number of mismatches
	
USAGE
        mismatches -p pattern [-o outputfile] [-v] -mis #
	
CATEGORY
	util
	sequences

OPTIONS
	-v	verbose.
	-p pattern
                the pattern must be a DNA sequence. 
                if the -p option is not used, the standard input is
                used
        -mis #    number of allowed mismatches.
        -IUPAC  use IUPAC code in the output (this makes shorter the 
                lists of result patterns)
	-o outputfile
		if not specified, the standard output is used.
		This allows to place the comand within a pipe.
        
		
PATTERN FORMAT
        The standard degenerate nucleotide code of the IUPAC-IUB commission 
        is supported (see http://www.chem.qmw.ac.uk/iupac/misc/naseq.html for 
        complete information).
        The pattern sequence should thus only contain the following characters:
                A                       (Adenine)
                C                       (Cytosine)
                G                       (Guanine)
                T                       (Thymine)
                R       = A or G        (puRines)
                Y       = C or T        (pYrimidines)
                W       = A or T        (Weak hydrogen bonding)
                S       = G or C        (Strong hydrogen bonding)
                M       = A or C        (aMino group at common position)
                K       = G or T        (Keto group at common position)
                H       = A, C or T     (not G)
                B       = G, C or T     (not A)
                V       = G, A, C       (not T)
                D       = G, A or T     (not C)
                N       = G, A, C or T  (aNy)
        Upper and lower case are considered equivalent.

	
OUTPUT FORMAT
	
	
EXAMPLES
	
End_of_help
    close HELP;
  exit;
}

$start_time = &RSAT::util::StartScript();

#### initialise parameters ####

#### read arguments ####
foreach $a (0..$#ARGV) {

  if ($ARGV[$a] eq "-v") {
    $verbose = 1;
    
  } elsif ($ARGV[$a] eq "-p") {
    $pattern = $ARGV[$a+1];

  } elsif ($ARGV[$a] eq "-mis") {
    $allowed_mis = $ARGV[$a+1];

  } elsif ($ARGV[$a] eq "-o") {
    $outputfile = $ARGV[$a+1];

  } elsif (lc($ARGV[$a]) eq "-iupac") {
    $IUPAC_output = 1;;

  }
}



#### check argument values ####
unless ($allowed_mis =~ /\d+/) {
      print "\tYou should enter a valid number of mismatches.\n";
      print "\ttype mismatches -h for help\n";
      exit;  
}

### get the pattern ###
if ($pattern eq "") {
  if (<STDIN> =~ /(\S+)/) {
    $pattern = $1;
  } else {
      print "\tYou should enter a valid pattern.\n";
      print "\ttype mismatches -h for help\n";
      exit;
  }
}



### open output file ###
if ($outputfile ne "") {
  unless (open(OUTPUT, ">$outputfile")) {
    print "\tcannot open output file\n";
    print "\ttype mismatches -h for help\n";
    exit;
  }
  $out = OUTPUT;
} else {
  $out = STDOUT;
}

#### verbose ####
if ($verbose) {
  print ";mismatches result\n";
  if ($outputfile ne "") {
    print $out ";Output file	$outputfile\n";
  }
  if ($IUPAC_output) {
      print ";IUPAC output\n";
  }
  print ";pattern\t$pattern\n";
  print "; $allowed_ins\tallowed mismatches\n";
  print "; ", $#keys +1,"\tmatching patterns\n";
}

###### execute the command #########
for $mis (0..$allowed_mis) {
    for $subst(0..$mis) {
	for $del(0..$mis-$subst) {
	    foreach $pat($to_delete) {
		for $ins(0..$mis-$subst-$del) {
		    foreach $pat (@to_insert) {
			%inserted = &InsertPattern($pat,$ins,$IUPAC_output);
			@inserted = keys %inserted;
		    }
		}
	    }
	}
    }
}

###### print output ######

@keys = sort {($mis{$a} > $mis{$b}) or (($mis{$a} == $mis{$b}) and (lc($
a) cmp lc($b)))} keys %mis;

foreach $p (@keys) {
    print $out "$p\t$mis{$p}\tsubst{$p}\t$ins{$p}\tdel{$p}\n";  
}


###### close input file ######
if ($inputfile ne "") {
  close $in;
}


################################################################
## Report execution time and close output stream
my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
close $main::out if ($outputfile);


exit(0);


########################## subroutine definition ############################

sub InsertPattern {
### usage 
### %ins_hash = InsertPattern($pattern, $allowed_ins, $IUPAC_output)
### returns a hash of DNA sequences matching $pattern 
### with >= $allowed_ins insertions
### the keys of the hash are the matching patterns,
### the values give the number of insertions

    local($l_pattern) = uc($_[0]);
    local($l_allowed_ins) = $_[1];
    local($l_IUPAC_output) = $_[2];
    local($l_pattern_length) = length($l_pattern);
    local(@to_insert) = ();
    local(@inserted) = ($l_pattern);
    local(%inserted);
    $inserted{$l_pattern} = 0;

    if ($l_IUPAC_output) {
	@insertable_nucl = (N);
    } else {
	@insertable_nucl = (A,C,G,T);
    }


    for $ins (1..$allowed_ins) {
	@to_insert = @inserted;
	@inserted = ();
	foreach $pat (@to_insert) {
	    for $pos (0..$l_pattern_length) {
		foreach $nucl (@insertable_nucl) {
		    if ($pos == 0) {
			$inserted_pattern = "";
		    } else {
			$inserted_pattern = substr($pat,0,$pos);
		    }
		    $inserted_pattern .= $nucl;
		    $inserted_pattern .= substr($pat,$pos);
		    $inserted{$inserted_pattern} = $ins;
		    @inserted = (@inserted, $inserted_pattern);
		}
	    }
	}
    }

    return(%inserted);
}

