#!/usr/bin/perl
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";

$start_time = &RSAT::util::StartScript(); 
$date = $start_time;

#### initialise parameters ####
$type="site";
$id = "patser";
$matrix_length = 1;
$seq_format = "wconsensus";  ### because this is the usual format for patser
$limits = 0; ## Report sequence limits for the feature map (requires the option -seq)
$flank_len = 0;
$null = "NA";

@supported_table_fields = qw (score start end strand position);
foreach my $field  (@supported_table_fields) {
    $supported_table_fields{$field} = 1;
}

$supported_table_fields = join ",", @supported_table_fields;

#### read arguments ####
&ReadArguments();

################################################################
## Check parameter values
if (($fixed_origin) && !($sequence_file)) {
  &RSAT::error::FatalError("The option -origin requires to specify a sequence file (option -seq).");
}

#### default return format
unless (%return) {
    $return{matches} = 1;
}
if (($return{table}) && !(%table_fields)){
    $table_fields{score} = 1;
}


#### origin file
if ($origin_file) {
    unless (open ORIG, $origin_file) {
	print "Error: could not open the oirigin file\n";
	exit;
    }
    while (<ORIG>) {
	if (/^\s*(\S+)\s*(\S+)/) {
	    $origin{$1} = $2;
	}
    }
    close ORIG;
}



#### read the sequences
if ($sequence_file) {
    ($seq, $seq_dir) = &OpenInputFile($sequence_file);
    while ((($current_seq, $current_id) = &ReadNextSequence($seq, $seq_format, $seq_dir)) &&
	   (($current_seq) || ($current_id))) {
	$current_seq =~s/\s//g;
	$sequence{$current_id} = $current_seq;
    }
    close $seq;
}


### open input file ###
($in, $input_dir) = &OpenInputFile($inputfile);

### open output file ###
$out = &OpenOutputFile($outputfile);

#### verbose ####
if ($verbose) {
    print $out ";features-from-patser ";
    &PrintArguments();
    print $out "\n";
    print $out ";Input file	$inputfile\n" if ($inputfile);
    print $out ";Output file	$outputfile\n" if ($outputfile);
    if ($sequence_file) {
	print $out ";Sequence file\t$sequence_file\n";
	print $out ";\tsequence id\tlength\n";
	my $i = 0;
	foreach $id (sort keys %sequence) {
	    $i++;
	    if ($i >= 50) {
		print $out join ("\t", ";", "...", scalar(keys %sequence)- $i +1, "additional sequences skipped ..."), "\n";
		last;
	    }
	    $length = length($sequence{$id});
	    print $out join ("\t", ";", $id, $length), "\n";
	}
    }
}


################################################################
## Print sequence limits to draw the backbone
if ($sequence_file) {
    print $out ";Sequence file\t$sequence_file\n";
    print $out ";\tsequence id\tlength\n";
    foreach my $id (sort keys %sequence) {
	my $length = length($sequence{$id});
	my $end;
	my $start;
	if ($origin eq "-0") {
	    $end = -1;
	    $start = -$length;
	} else {
	    $end = $length - $origin;
	    $start = 1 - $origin;
	}
	if ($limits) {
	    print $out join("\t", $id, "LIMIT", "SEQ_START", "DR", $start,$start, "-", 0), "\n";
	    print $out join("\t", $id, "LIMIT", "SEQ_END", "DR", $end,$end, "-", 0), "\n";
	}
    }
}

#### read and parse patser file
$header_printed=0;
my $l = 0; ## Line count
while (<$in>) {
  $l++;
    if (($matrix_length == 1) && (/width of the alignment matrix\:\s+(\d+)/)) {
	$matrix_length = $1;
	print $out "; $_" if ($verbose >= 1);

	#### the row contains a match description
    } elsif (/^\s*(\S+)\s+position=\s+(\d+)([c\s])\s+score=\s+(\S+)(.*)/i) {
	
	#### parse the match
	$map = $1;
	$position = $2;
	$strand = $3;
	$score = $4;
	$the_rest= $5; ## These fields are ocasionally present, depending on th options used with patser
	$descr = "";

	#### P-value
	if ($the_rest =~ /ln\(p-value\)=\s+(\S+)/) {
	    $lnP = $1;
	} else {
	  $lnP = $null;
	}

	#### sequence (only available with the option patser -s, for version >=3e)
	if ($the_rest =~ /sequence=\s+(\S+)/) {
	    $descr = $1;
	}

	#### adapt coordinates to the selected origin
	unless ($fixed_origin) {
	    $origin = $origin{$1};
	}
	if ($origin eq "-0") {
	    $start = $position - length($sequence{$map}) - 1;
	} else {
	    $start = $position - $origin;
	}
	$end = $start + $matrix_length -1;

	#### strand
	unless ($force_strand) {
	    if  ($strand =~ /^c$/i) {
		$strand = "R";
	    } else {
		$strand = "D";
	    }
	}


	#### parse information about the match
	if ($return{matches}) {

	  #### get the sub sequence
	  if ($sequence_file) {
	    my $offset = $position - 1;
	    my $sub_sequence = uc(sub_sequence($sequence{$map}, $offset, $matrix_length));
	    
	    #		&RSAT::message::Debug($sub_sequence, $offset, $matrix_length, length($sequence{$map}), $sequence{$map}) if ($main::verbose >= 0);
	    
	    my $left_flank =  lc(sub_sequence($sequence{$map}, $offset - $flank_len, $flank_len));
	    my $right_flank =  lc(sub_sequence($sequence{$map}, $offset + $matrix_length, $flank_len));
	    if ($strand eq "R") {
	      $sub_sequence = &ReverseComplement($sub_sequence);
	      $left_flank = lc(&ReverseComplement($left_flank));
	      $right_flank = lc(&ReverseComplement($right_flank));
	      $sub_sequence = $right_flank.$sub_sequence.$left_flank;
	    } else {
	      $sub_sequence = $left_flank.$sub_sequence.$right_flank;
	    }
	    
	    
	    #### description
	    $descr = "${sub_sequence}";
	  }

	    ################################################################
	    ### print the header line if it has not yet been done
	    unless ($header_printed) {
		print $out join ("\t", 
				 "; map", 
				 "type", 
				 "id", 
				 "strand", 
				 "start", 
				 "end", 
				 "sequence", 
				 "score",
				 "ln(P)",
				 ), "\n";
		$header_printed=1;
	    }

#	&RSAT::message::Debug("patser line parsed", $l,  $map, $position, $strand, $score, $descr, $the_rest, $lnP) if ($main::verbose >= 5);
	    #### print the match
	    &PrintFeature();
	}

	#### store score for the score table
	if ($return{table}) {
	    push @{$score{$map}}, $score;
	    push @{$start{$map}},  $start if ($table_fields{start});
	    push @{$end{$map}},  $end if ($table_fields{end});
	    push @{$strand{$map}},  $strand if ($table_fields{strand});
	    push @{$position{$map}},  join ("", $start,"..",$end,$strand) if ($table_fields{position});
	    $max_scores_per_map = &max($max_scores_per_map, $#{$score{$map}}+1) unless ($max_col_per_seq);
#	    warn join ("\t", $map, $score, $start, $end, $strand, $#{$score{$map}}), "\n" if ($verbose >= 3);
	}
	

    } else {
	#### Print the line as a comment
	print $out "; $_" if ($verbose >= 1);
    }
}

################################################################
#### print score table
if ($return{table}) {
    print $out "; Score table\n" if ($verbose >= 1);
    
    #### header
#    print $out join ("\t", "; seq_id", 1..$max_scores_per_map), "\n";
    print $out "; seq_id";
    if ($table_fields{score}) {
	for my $i (1..$max_scores_per_map) {
	    print $out "\tscore.${i}";
	} 
    }
    if ($table_fields{start}) {
	for my $i (1..$max_scores_per_map) {
	    print $out "\tstart.${i}";
	} 
    }
    if ($table_fields{end}) {
	for my $i (1..$max_scores_per_map) {
	    print $out "\tend.${i}";
	} 
    }
    if ($table_fields{strand}) {
	for my $i (1..$max_scores_per_map) {
	    print $out "\tstrand.${i}";
	} 
    }
    if ($table_fields{position}) {
	for my $i (1..$max_scores_per_map) {
	    print $out "\tposition.${i}";
	} 
    }
    print $out "\n";

    #### print matche and position
    foreach my $map (sort keys %score) {
#	warn join ("\t", $map, $#{$score{$map}}), "\n" if ($verbose >= 3);

	#### make sure all vectors contains the same number of entries
	if  ($#{$score{$map}} + 1 > $max_scores_per_map) {
		@{$score{$map}} = @{$score{$map}}[0..($max_scores_per_map-1)];
		@{$start{$map}} = @{$start{$map}}[0..($max_scores_per_map-1)] if ($table_fields{start});;
		@{$end{$map}} = @{$end{$map}}[0..($max_scores_per_map-1)] if ($table_fields{end});;
		@{$strand{$map}} = @{$strand{$map}}[0..($max_scores_per_map-1)] if ($table_fields{strand});;
		@{$position{$map}} = @{$position{$map}}[0..($max_scores_per_map-1)] if ($table_fields{position});;
	} else {
	    for my $i ($#{$score{$map}}..$max_scores_per_map-2) {
		push @{$score{$map}}, "";
		push @{$start{$map}}, "" if ($table_fields{start});
		push @{$end{$map}}, "" if ($table_fields{end});
		push @{$strand{$map}}, "" if ($table_fields{strand});
		push @{$position{$map}}, "" if ($table_fields{position});
	    }
	}

	print $out $map;
	
	my @sorted_index = sort {$score{$map}->[$b] <=> $score{$map}->[$a] } 0..$#{$score{$map}};
	print $out "\t", join ("\t", 
			       (sort {$b <=> $a} @{$score{$map}}), 
			       ) if ($table_fields{score});
	print $out "\t", join ("\t", 
			       @{$start{$map}}[@sorted_index],
			       ) if ($table_fields{start});
	print $out "\t", join ("\t", 
			       @{$end{$map}}[@sorted_index],
			       ) if ($table_fields{end});
	print $out "\t", join ("\t", 
			       @{$strand{$map}}[@sorted_index],
			       ) if ($table_fields{strand});
	print $out "\t", join ("\t", 
			       @{$position{$map}}[@sorted_index],
			       ) if ($table_fields{position});
	print $out "\n";
    }
}

###### close input file ######
close $in unless ($inputfile eq "");

###### close output file ######
my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
print $main::out $exec_time if ($main::verbose >= 1);
close $out unless ($outputfile eq "");


exit(0);


########################## subroutine definition ############################

sub PrintFeature {
    print $out join ("\t",  
		     $map, 
		     $type, 
		     $id, 
		     $strand, 
		     $start, 
		     $end, 
		     $descr, 
		     $score,
		     $lnP), "\n";
}

sub PrintHelp {
#### display full help message #####
  open HELP, "| more";
  print HELP <<End_of_help;
#### display full help message #####
  open HELP, "| more";
  print HELP <<End_of_help;
NAME
	features-from-patser

        1998 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)
	
USAGE
        features-from-patser [-i inputfile] [-o outputfile] [-v]

DESCRIPTION
	converts the output from the program patser into a file
	that can be used as input for the program feature-map.

CATEGORY
	util
	conversion
	drawing

OPTIONS
        -h      (must be first argument) display full help message
        -help   (must be first argument) display options

	-v	verbose

	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.

	-id	feature identifier
		the same identifier is used for all features

	-mlen #	
		matrix length

	-strand	D, R or DR. Strand on which to draw the feature.

	-origin	#
		each position is recalculated using this value as reference.

	-origfile	origin file	
		each line indicates the specific origin for one of the 
		sequences. It must contain 2 words separate by a tab.
		The first word is the sequence identifier (= the map name),
		the second word is the origin for that sequence.


	-seq seq_file
		indicates the sequence file which had been used as 
		input for patser. This file must be in wconsensus format.

	-limit
		return start and end positions for each input sequence.
		Requires to specify the sequence file with the option -seq. 

	-format
		sequence file format 
		(default is wcconsensus, since this is the usual format for patser)

	-N #	return matching sequences with # flanking nucleotides
		(only valid with the -seq option)

	-table return_fields
		Return a table with one row per sequence and one column per match. 

		The next argument indicates the information to
		return. Supported: $supported_table_fields

		Beware: this format is not supported by feature-map,
		but it is very convenient for post-processing.

	-maxcol	# 
		max number of columns per sequence for the table
		This prevents from obtaining more columns than the
		number of top scores. Indeed, when there are several
		matches with identical scores, patser return ll of
		them as a single top score.

	-null 	null string (default $default_null) displayed for
         	undefined values in the column ln(P).

INPUT FORMAT
	The input file should be the result of patser.
	
	
OUTPUT FORMAT
	By default, the program returns a table with one row per
	match, in feature-map format.

	The option -table motifies the behaviour, and a table is
	returned with one row per sequence and one column per match. 
	
EXAMPLES

       features-from-patser -v -i mydata -o myresult \
	    -table matches,table

End_of_help
  close HELP;
  exit;
}

sub PrintOptions {
#### display short help message #####
  open HELP, "| more";
  print HELP <<End_short_help;
#### display short help message #####
  open HELP, "| more";
  print HELP <<End_short_help;
features-from-patser options
----------------
-h      	(must be first argument) display full help message
-help   	(must be first argument) display options
-i      	input file
-o      	output file
-v      	verbose
-id		feature identifier
-mlen #		matrix length
-strand		D, R or DR. Strand on which to draw the feature
-origin #	origin
-N #		return matching sequences with # flanking nucleotides
-origfile	origin file
-seq		sequence file
-limit		return start and end positions for each input sequence.
-format		sequence file format (default wc)
-table		return a table with specified fields (Supported: $supported_table_fields).
-maxcol	# 	max number of columns per sequence for the table
-null 		null string (default $default_null) displayed for undefined values in the column ln(P).
End_short_help
  close HELP;
  exit;
}



################################################################
## Read arguments 
sub ReadArguments {
    foreach $a (0..$#ARGV) {
	### verbose ###
	if ($ARGV[$a] eq "-v") {
	    if (&IsNatural($ARGV[$a+1])) {
		$verbose = $ARGV[$a+1];
	    } else {
		$verbose = 1;
	    }
	    
	    ### detailed help
	} elsif ($ARGV[$a] eq "-h") {
	    &PrintHelp();
	    
	    ### list of options
	} elsif ($ARGV[0] eq "-help") {
	    &PrintOptions();
	    
	    ### input file ###
	} elsif ($ARGV[$a] eq "-i") {
	    $inputfile = $ARGV[$a+1];
	    ### output file ###
	} elsif ($ARGV[$a] eq "-o") {
	    $outputfile = $ARGV[$a+1];

	} elsif ($ARGV[$a] eq "-strand") {
	    $strand = $ARGV[$a+1];
	    $force_strand = 1;

	} elsif ($ARGV[$a] eq "-id") {
	    $id = $ARGV[$a+1];

	    #### neighborhood (return flanking residues)
	} elsif (($ARGV[$a] eq "-N") && (&IsNatural($ARGV[$a+1]))) {
	    $flank_len = $ARGV[$a+1];

	} elsif ($ARGV[$a] =~ /^-orig/i) {
	    $fixed_origin = 1;
	    $origin = $ARGV[$a+1];
	    &RSAT::error::FatalError(join("\t", $origin, "Invalid value for the origin: should be an integer number or -0"))
	      unless ((&RSAT::util::IsInteger($origin)) || ($origin eq "-0"));

	} elsif ($ARGV[$a] =~ /^-origfile/i) {
	    $origin_file = $ARGV[$a+1];

	} elsif (($ARGV[$a] =~ /^-mlen/i) && (&IsNatural($ARGV[$a+1]))) {
	    $matrix_length = $ARGV[$a+1];

	    ### sequence file 
	} elsif ($ARGV[$a] =~ /^-seq/) {
	    $sequence_file = $ARGV[$a+1];

	    ## Report sequence limits
	} elsif ($ARGV[$a] =~ /^-limit/) {
	    $limits = 1;

	    ### sequence file format 
	} elsif ($ARGV[$a] eq "-format") {
	    $seq_format = $ARGV[$a+1];

	    ### max # of colukns for a table
	} elsif ($ARGV[$a] eq "-maxcol") {
	    $max_col_per_seq = $ARGV[$a+1];
	    $max_scores_per_map  = $max_col_per_seq;
	    unless (&IsNatural($max_col_per_seq)) {
		&RSAT::error::FatalError("$max_col_per_seq invalid number of columns");
	    }

	    ### null string ###
	} elsif ($ARGV[$a] eq "-null") {
	    $null = $ARGV[$a+1];

	    ### return a table
	} elsif ($ARGV[$a] eq "-table") {
	    $return{table} = 1;
	    $return{matches} = 0;
	    @to_return = split ",", $ARGV[$a+1];
	    foreach my $field (@to_return) {
		unless ($supported_table_fields{$field}) {
		    &RSAT::error::FatalError("$field is not a supported return field");
		}
		$table_fields{$field} = 1;
	    }

	}
    }
}
