#!/usr/bin/perl
############################################################
#
# $Id: purge-sequence,v 1.33 2009/11/05 00:32:07 jvanheld Exp $
#
# Time-stamp: <2003-10-05 23:01:33 jvanheld>
#
############################################################
#use strict;;
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";


my $skdir = "$ENV{RSAT}/bin";
my $delete_repeats=0;

#### initialise parameters ####
my $start_time = &AlphaDate();
my $convert_seq_command = "$SCRIPTS/convert-seq";
$both_strands = 1;
$inputfile = "";
$outputfile = "";
$match_len = 40;
$mismatches = 3;
$in_format = "fasta";
$verbose = 0;
$dry_run = 0;
$die_on_error = 1;

&ReadArguments();


#### check argument values ####
### check input format 
&CheckInputSeqFormat($in_format);

$dir{execution} = `pwd`;
chomp $dir{execution};

if ($outputfile) {
    $basename{out} = `basename ${outputfile}`;
    $dirname{out} = `dirname ${outputfile}`;
    chomp $dirname{out};
    chdir $dirname{out};
    $dir{output} = `pwd`;
    chomp($dir{output});
}

chdir($dir{execution});

if ($inputfile) {
    $basename = `basename ${inputfile}`;
    chomp $basename;
    $dirname = `dirname ${inputfile}`;
    chomp $dirname;
    chdir $dirname;
    $dir{input} = `pwd`;
    chomp($dir{input});
#    warn join "\t", $dirname, $dir{input}, $basename, "\n"
} else {
    $dirname = ".";
    $basename = $start_time;
}

#### create a temp file to filter out non-conform lines 
$tmp_file = "tmp_".$basename.".fasta";

#$command = "cd $TMP ; ";
$command = " $convert_seq_command -from $in_format -to fasta -o ${TMP}/$tmp_file -noempty ";
$command .= " -mask_short ".$mask_short if ($mask_short);
$command .= " -mask non-dna " if ($dna);
$command .= " -i $dir{input}/$basename" if ($inputfile);
&doit($command, $dry_run, $die_on_error, $verbose);


###### execute the command #########
@extensions = qw(al1 ssp bck bwt des lcp ois llv prj suf tis sds sti1);

#### build the tree
my $mkvtree_options = "";
$mkvtree_options .= " -bck"; # bucket boundaries ???
$mkvtree_options .= " -dna"; # input is DNA sequence
$mkvtree_options .= " -tis"; # output parsed and transformed input string to file
$mkvtree_options .= " -bwt"; # output Burrows & Wheeler Transform to file
$mkvtree_options .= " -suf"; # output suffix array to file
$mkvtree_options .= " -sti1"; # output reduced inverse suffix array (sti1tab) to file
$mkvtree_options .= " -lcp"; # output longest common prefix lengths to file
$mkvtree_options .= " -ois"; # output the parsed input string to a file
$mkvtree_options .= " -pl"; # length of prefix for bucket sort (default 0)
$mkvtree_options .= " -db $tmp_file"; # sequence file
$command = "cd $TMP; $skdir/mkvtree $mkvtree_options";
&doit($command, $dry_run, $die_on_error, $verbose);

#### locate the duplications
my $vmatch_options = "-s"; # show the sequence content of a match
if ($delete_repeats) {
    ## delete repeats
    $vmatch_options .= " -dbnomatch 1 "; # show all database sequence not containing a match
} else {
    ## mask repeats
    $vmatch_options .= " -dbmaskmatch n "; # show all database sequence not containing a match
}
$vmatch_options .= " keepleft"; # keep the left occurrence of each repeat
$vmatch_options .= " -l $match_len"; # specify that match must have the given length
$vmatch_options .= " -e $mismatches" if ($mismatches > 0); # specify the allowed edit distance > 0
$vmatch_options .= " -d"; # compute direct matches 
$vmatch_options .= " -showdesc 0"; # use the original sequence identifier 
if ($both_strands) {
    $vmatch_options .= " -p"; # compute reverse complemented (palindromic) matches
} 
$vmatch_options .= " $tmp_file ";
$vmatch_options .= " | grep -v '^#' ";
#$vmatch_options .= " | awk '\$1 ~ \"\>\" {print \$1 \"_\" \$2 \"_\" (\$2+\$3-1)}; \$1 !~ \"\>\" {print}'";
$vmatch_options .= " | perl -pe 's| (\\d+) |_\$1_|'";

$vmatch_options .= "> $dir{output}/$basename{out}" if ($outputfile);
$command = "cd $TMP ; $skdir/vmatch $vmatch_options";
&doit($command, $dry_run, $die_on_error, $verbose);
foreach $ext  (@extensions) {
    my $command = "rm -f ${TMP}/${tmp_file}.${ext}";
    &doit($command, $dry_run, $die_on_error, $verbose);
}

#### remove the temp file
$command = "rm -f ${TMP}/$tmp_file";
&doit($command, $dry_run, $die_on_error, $verbose);


###### verbose ######
if ($verbose) {
  my $done_time = &AlphaDate;
  warn "; Job started $start_time\n";
  warn "; Job done    $done_time\n";
}


exit(0);


########################## subroutine definition ############################

sub PrintHelp {
#### display full help message #####
  open HELP, "| more";
  print HELP <<End_of_help;
NAME
	purge-sequence

        1999 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

USAGE
        purge-sequence [-i inputfile] [-o outputfile] [-v] [-2str | -1str]

DESCRIPTION

	Mask redundant fragments in a sequence. This perl script is no
	more than a wrapper to facilitate this specific usage for two
	programs developed by Stefan Kurtz : mkvtree and vmatch.

	When a fragment is duplicated, the first ocurrence is
	conserved and the second masked (or optionally deleted). 

	By default, the program also searches similarities between the
	direct and reverse complement strand.

CATEGORY
	sequences

OPTIONS
	-h	(must be first argument) display full help message
	-help	(must be first argument) display options
	-v	verbose
	-i inputfile
	        The file containing the sequence to purge.
		If inputfile is not specified, the standard input is
		used.  This allows to place the command within a pipe.
	-format sequence format
	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.
	-n	dry run
		print commands without executing them
	-ml #	match length (default: $match_len)
	-mis #	mismatches (default: $mismatches)
	-1str	discard duplications on the direct strand only
	-2str	discard duplications on the reverse complement as well
	-del	delete repeats instead of masking them
		By default, repeats are masked, i.e. each nucleotide
		within a repeat is replaced by the letter n.  When the
		option -del is selected, repeats are deleted. This
		means that one sequence of input can be converted to
		several fragments in the output.

	-mask_short min_seq_len
		Mask (replace by N characters) sequences shorter than
		the specified length. This can be useful to discard
		short intergenic segments from the pattern discovery
		step, especially when working with bacterial genomes,
		where short intergenic sequences generally correspond
		to intra-operon segments.

	-nodie
		The perl script purge-sequence does not die in case
         	the encapsulated programs (mkvtree, vmatch) return an
         	error message.

End_of_help
  close HELP;
  exit;
}

sub PrintOptions {
#### display short help message #####
  open HELP, "| more";
  print HELP <<End_short_help;
purge-sequence options
----------------
-h			(must be first argument) display full help message
-help			(must be first argument) display options
-i			input file
-o			output file
-v			verbose
-n			dry run
-format 		sequence format
-ml #			match length (default: $match_len)
-mis #			mismatches (default: $mismatches)
-1str			discard duplications on the direct strand only
-2str			discard duplications on the reverse complement as well
-del			delete repeats instead of masking them
-mask_short min_seq_len	Mask sequences shorter than the specified length
End_short_help
  close HELP;
  exit;
}


################################################################
## Read arguments
sub ReadArguments {
    foreach my $a (0..$#ARGV) {
	### verbose ###
	if ($ARGV[$a] eq "-v") {
	    if (&IsNatural($ARGV[$a+1])) {
		$verbose = $ARGV[$a+1];
	    } else {
		$verbose = 1;
	    }
	    
	    ### dry run
	} elsif ($ARGV[$a] eq "-n") {
	    $dry_run  = 1;
	    
	    ### detailed help
	} elsif ($ARGV[$a] eq "-h") {
	    &PrintHelp;
	    
	    ### list of options
	} elsif ($ARGV[$a] eq "-help") {
	    &PrintOptions;
	    
	    ### input file ###
	} elsif ($ARGV[$a] eq "-i") {
	    $inputfile = $ARGV[$a+1];
	    
	    ### sequence format
	} elsif ($ARGV[$a] eq "-format") {
	    $in_format = lc($ARGV[$a+1]);
	    
	} elsif ($ARGV[$a] eq "-i") {
	    $inputfile = $ARGV[$a+1];
	    
	    ### output file ###
	} elsif ($ARGV[$a] eq "-o") {
	    $outputfile = $ARGV[$a+1];
	    
	    ### strands
	} elsif ($ARGV[$a] eq "-1str") {
	    $both_strands = 0;
	} elsif ($ARGV[$a] eq "-2str") {
	    $both_strands = 1;
	    
	    ### delete repeats instead of masking them
	} elsif ($ARGV[$a] eq "-del") {
	    $delete_repeats = 1;
	    
	    ### matching length
	} elsif (($ARGV[$a] eq "-ml") && (&IsInteger($ARGV[$a+1]))) {
	    $match_len = $ARGV[$a+1];
	    
	    ### mismatches
	} elsif (($ARGV[$a] eq "-mis") && (&IsInteger($ARGV[$a+1]))) {
	    $mismatches = $ARGV[$a+1];

	    ## mask non-DNA code
	} elsif ($ARGV[$a] eq "-dna") {
	    $dna = 1;

	    ## mask non-DNA code
	} elsif ($ARGV[$a] eq "-nodie") {
	    $die_on_error = 0;

	    ## mask short sequences
	} elsif ($ARGV[$a] eq "-mask_short") {
	    $mask_short = $ARGV[$a+1];
	    &RSAT::error::FatalError(join("\t", 
					  $mask_short,
					  "Invalid value for the minimal masking length. ". 
					  "Must be a strictly positive natural number"))
		unless &IsNatural($mask_short) && ($mask_short > 0);
	    
	    
	}
    }
}

