#!/usr/bin/perl

### CVS: added the option -mask

if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";


#### initialise parameters ####
$start_time = `date '+%d/%m/%y %H:%M:%S %Z'`;
$line_width = 0;
$in_format = "fasta";
$out_format = "fasta";
$null = "NA";
$from = $null;
$to =$null;
$strand = "D";

&ReadArguments();

#### check argument values ####

&CheckInputSeqFormat($in_format);
&CheckInputSeqFormat($out_format);
my %fragments = ();
my %fragments_per_seq = ();
if ($fragment_file) {
    ## Fragment file
    if (($from ne $null) || ($to ne $null)) {
	&RSAT::message::FatalError("The option -frag is incompatible with the options -from and -to");
    }
    my ($in, $input_dir) = &OpenInputFile($fragment_file);
    my $l = 0;
    while (<$in>) {
	$l++;
	chomp();
	s/\r//;
	next if (/^;/);
	next if (/^#/);
	next unless (/\S/);
	my ($frag_id, $seq_id, $frag_from, $frag_to, $frag_strand) = split "\t";
	push @{$fragments_per_seq{$seq_id}}, $frag_id;
	$fragment{$frag_id}->{seq_id} = $seq_id;
	$fragment{$frag_id}->{from} = $frag_from;
	$fragment{$frag_id}->{to} = $frag_to;
	$fragment{$frag_id}->{strand} = $frag_strand;
    }
    close $in;

} else {
    ## Same from and to for all the sequences
    &RSAT::error::FatalError("You must specify the starting position (option -from)") if ($from eq $null);
    &RSAT::error::FatalError("You must specify the ending position (option -to)") if ($to eq $null);
}

### open input file ###
($in, $input_dir) = &OpenInputFile($inputfile);

### open output file ###
$out = &OpenOutputFile($outputfile);

#### verbose ####
if ($main::verbose >= 1) {
    print $out "; sub-sequence result\n";
    if ($inputfile ne "") {
	print $out "; Input file	$inputfile\n";
    }
    if ($outputfile ne "") {
	print $out "; Output file	$outputfile\n";
    }
    print $out "; Input format\t$in_format\n";
    print $out "; Output format\t$out_format\n";
    print $out "; From\t$from\n";
    print $out "; To\t$to\n";
    if ($strand eq "R") {
	print $out "; Output is the reverse complement of input sequence\n";
    }
}


################################################################
###### execute the command #########
while ((($current_seq, $seq_id) = &ReadNextSequence($in, $in_format, $input_dir, "", $mask)) &&
       (($current_seq ne "") || ($seq_id ne ""))) {
    $current_seq =~ s/\s//g;
    $seq_length = length($current_seq);

    &RSAT::message::TimeWarn(join("\t", "Read sequence", $seq_id, $seq_length)) if ($main::verbose >= 2);

    if ($fragment_file) {
	my $frag_list = $fragments_per_seq{$seq_id};
	my @fragments = @$frag_list;
	foreach my $frag_id (@fragments) {
	    my $fragment = $fragment{$frag_id};
	    my $frag_from = $fragment->{from};
	    my $frag_to = $fragment->{to};
	    my $frag_strand = $fragment->{strand};
#	    &RSAT::message::Debug($seq_id, $frag_id, $frag_from, $frag_to, $frag_strand) if ($main::verbose >= 3);
	    &OneFragment($frag_from, $frag_to, $frag_strand, $frag_id);
	}
    } else {
	&OneFragment($from, $to, $strand, $seq_id);
    }

}

###### verbose ######
if ($main::verbose >= 1) {
    $done_time = `date '+%d/%m/%y %H:%M:%S %Z'`;
    print $out ";Job started $start_time";
    print $out ";Job done    $done_time";
}


###### close input file ######
close $in unless ($inputfile eq "");

###### close output file ######
close $out unless ($outputfile eq "");


exit(0);

################################################################
##################### SUBROUTINE DEFINITION ####################
################################################################

sub OneFragment {
    my ($current_from, $current_to, $current_strand, $current_id) = @_;
    if ($current_from > $current_to) {
	my $tmp = $current_from;
	$current_from = $current_to;
	$current_to = $tmp;
    }
    if ($current_from < 0) {
	$sequence_from = length($current_seq) + $current_from + 1;
    } else {
	$sequence_from = $current_from;
    }
    if ($current_to < 0) {
	$sequence_to = length($current_seq) + $current_to + 1;
    } else {
	$sequence_to = $current_to;
    }
    $sequence_from = &max($sequence_from, 1);
    $sequence_from = &min($sequence_from, $seq_length);
    $sequence_to = &max($sequence_to, 1);
    $sequence_to = &min($sequence_to, $seq_length);
    $current_length = $sequence_to - $sequence_from + 1;

    push @comments, join(" ", "sub-sequence from", $sequence_from, "to", $sequence_to);
    if ($limits) {
	$current_id .= "_".$current_from."_".$current_to;
    }
    $sub_seq = substr($current_seq, $sequence_from -1, $current_length);
    if ($current_strand eq "R") {
	$sub_seq = &ReverseComplement($sub_seq);
    }
    &RSAT::message::Debug("Fragment", $current_id, $seq_id, $current_from, $current_to, $sequence_from, $sequence_to, $current_strand, $out_format) if ($main::verbose >= 3);
    &PrintNextSequence($out, $out_format, $line_width, $sub_seq, $current_id, @comments);
    @comments = ();
}

################################################################
#### display full help message 
sub PrintHelp {
  open HELP, "| more";
  print HELP <<End_of_help;
NAME
	sub-sequence

        1998 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)
	
USAGE
        sub-sequence [-i inputfile] -from # -to # [-o outputfile] [-v]

DESCRIPTION
	Returns a segment from the input sequence, specified by its position. 
	
CATEGORY
	sequences

OPTIONS
        -h      (must be first argument) display full help message

        -help   (must be first argument) display options

	-v	verbose

	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-mask upper|lower
		Mask lower or uppercases, respecively, i.e. replace
		selected case by N characters.

	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.

	-frag	fragment_file

		This option allows to specify a list of fragments t be
		retrieved from each sequence of the input file.

		Each row contains the coordinates of a fragment in 4
		or 5 columns:

		     1) fragment ID
		     2) sequence ID (must be the same as in the sequence file)
		     3) fragment start
		     4) fragment end
		     5) strand (optional). If not specified, all
                        fragments are taken on the direct strand.

    		example: 
		frag1	chr2L	344641	348496	D
		frag2	chr2L	346419	350309	R
		frag3	chr2R	350781	354418	D

	-from #	starting position
		if not specified, the subsequence starts at 1st position.

	-to #	end position
		if not specified, the end of the sequence is used.

	-iformat	
		input format. Default is fasta

	-oformat	
		output format. Default is fasta

	-format	input-output format. Default is fasta

	-rc	return the reverse complement of the sub-sequences

	-limits	
		add a suffix to sequence IDs to indicate the limits of
		the sub-sequence.

POSITION SPECIFICATION
	Positive position are used to refer to the sequence start. +1 is 
	the first residue from the sequence.
	Negative positions refer to the sequence end (-1 is the last residue 
	from the sequence).
	
INPUT-OUTPUT FORMAT
	Various sequence formats are supported:
	- IG
	- Fasta
	- Wconsensus
	- raw
	- multi	
	When the input contains several sequences in the same file, the 
	programs extracts the fragment at the specified positions from 
	each of them.

EXAMPLES
	sub-sequence -v -i mydata -o myresult -from -353 -to -397
	
End_of_help
  close HELP;
  exit(0);
}


################################################################
#### Display short help message 
sub PrintOptions {
  open HELP, "| more";
  print HELP <<End_short_help;
sub-sequence options
----------------
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-i		input file
-mask upper|lower	mask upper- or lowercases, respectively
-o		output file
-v		verbose
-frag		fragment_file
-from #		start position
-to #		end position
-iformat	input format. Default is fasta
-oformat	output format. Default is fasta
-format		input/output format (fasta|wc|ig|raw|multi)
-rc		return the reverse complement of the sub-sequences
-limits		add a suffix to sequence IDs to indicate the limits of the sub-sequence
End_short_help
  close HELP;
  exit;
}

################################################################
#### Read arguments 
sub ReadArguments {
    foreach $a (0..$#ARGV) {
	### verbose ###
	if ($ARGV[$a] eq "-v") {
	    $main::verbose = 1;
	    
	    ### detailed help
	} elsif ($ARGV[$a] eq "-h") {
	    &PrintHelp();
	    
	    ### list of options
	} elsif ($ARGV[$a] eq "-help") {
	    &PrintOptions();
	    
	    ### Input file
	} elsif ($ARGV[$a] eq "-i") {
	    $inputfile = $ARGV[$a+1];

	    ## Mask
	} elsif ($ARGV[$a] eq "-mask") {
	    $mask = $ARGV[$a+1];
	    &CheckMask($mask);	    

	    ### Output file
	} elsif ($ARGV[$a] eq "-o") {
	    $outputfile = $ARGV[$a+1];


	    ## Fragment file
	} elsif ($ARGV[$a] eq "-frag") {
	    $fragment_file = $ARGV[$a+1];

	    ## starting position
	} elsif ($ARGV[$a] eq "-from") {
	    $from = $ARGV[$a+1];

	    ## ending position
	} elsif ($ARGV[$a] eq "-to") {
	    $to = $ARGV[$a+1];

	} elsif ($ARGV[$a] eq "-format") {
	    $in_format = $ARGV[$a+1];
	    $out_format = $ARGV[$a+1];

	} elsif ($ARGV[$a] eq "-iformat") {
	    $in_format = $ARGV[$a+1];

	} elsif ($ARGV[$a] eq "-oformat") {
	    $out_format = $ARGV[$a+1];

	} elsif ($ARGV[$a] eq "-rc") {
	    $strand = "R";

	} elsif ($ARGV[$a] eq "-limits") {
	    $limits = 1;

	}

    }
}
