#!/usr/bin/perl -w
############################################################
#
# $Id: parse-RNAfold,v 1.6 2009/11/05 00:32:07 jvanheld Exp $
#
# Time-stamp: <2002-06-06 13:23:08 jvanheld>
#
############################################################
#use strict;;
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";


#### initialise parameters ####
my $start_time = &AlphaDate;

local %infile = ();
local %outfile = ();

local $verbose = 0;
local $in = STDIN;
local $out = STDOUT;
$out_format = "fasta";
$stem = 0;

&ReadArguments;


#### check argument values ####
&CheckOutputSeqFormat($out_format);


#### verbose ####
&Verbose if ($verbose);


### open output file ###
$out = &OpenOutputFile($outfile{output});

##### read input #####
($in) = &OpenInputFile($infile{input});
my $l = 0;
my $seq_id;
my $sequence;
my $prev_item;
my $fold;
my $score;
while (my $line = <$in>) {
    $l++;
    chomp($line);
    if ($line =~ />(\S+)/) {
	$seq_id = $1;
	$prev_item = "id";
	warn "; treating sequence\t$seq_id\n" if ($verbose >= 1);
    } elsif (($prev_item eq "id") && ($line =~ /^[augc]+$/i)) {
	$sequence = $line;
	$sequence =~ s/u/t/g;
	$sequence =~ s/U/T/g;
	$prev_item = "seq";
    } elsif (($prev_item eq "seq") && ($line =~ /^([\.\(\)]*)\s+\(\s*(\S+)\)$/))  {
	$fold = $1;
	$score = 42;
	$prev_item = "fold";
	if (length($fold) != length($sequence)) {
	    warn "; ERROR: fold has not the same lemgth as sequence\n";
	} else {
	    &extract_loops($seq_id, $sequence, $fold);
	}
    } else {
	warn "; line $l\tCannot interpret\t$line\n"; 
    }
}

close $in if ($infile{input});

###### execute the command #########


###### print output ######


###### verbose ######
if ($verbose) {
    my $done_time = &AlphaDate;
    print $out "; Job started $start_time\n";
    print $out "; Job done    $done_time\n";
}


###### close output file ######
close $out if ($outfile{output});


exit(0);


########################## subroutine definition ############################

sub PrintHelp {
#### display full help message #####
  open HELP, "| more";
  print HELP <<End_of_help;
NAME
	parse-RNAfold

        2001 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)
	
DESCRIPTION
	Parses the result of RNAfold, and extracts the sequences of
	the loops (in the future, it will also allow to extract other
	sub-structures, like stem-loops.

CATEGORY
	parser
	sequences

USAGE
        parse-RNAfold [-i inputfile] [-o outputfile] [-v]

OPTIONS
	-h	(must be first argument) display full help message
	-help	(must be first argument) display options
	-v	verbose
	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.
	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.

End_of_help
  close HELP;
  exit;
}

sub PrintOptions {
#### display short help message #####
  open HELP, "| more";
  print HELP <<End_short_help;
parse-RNAfold options
----------------
-h	(must be first argument) display full help message
-help	(must be first argument) display options
-i	input file
-o	output file
-v	verbose
End_short_help
  close HELP;
  exit;
}


sub ReadArguments {
#### read arguments ####
    foreach my $a (0..$#ARGV) {
	### verbose ###
	if ($ARGV[$a] eq "-v") {
	    if (&IsNatural($ARGV[$a+1])) {
		$verbose = $ARGV[$a+1];
	    } else {
		$verbose = 1;
	    }
	    
	    ### detailed help
	} elsif ($ARGV[$a] eq "-h") {
	    &PrintHelp;
	    
	    ### list of options
	} elsif ($ARGV[$a] eq "-help") {
	    &PrintOptions;
	    
	    ### input file ###
	} elsif ($ARGV[$a] eq "-i") {
	    $infile{input} = $ARGV[$a+1];
	    
	    ### output file ###
	} elsif ($ARGV[$a] eq "-o") {
	    $outfile{output} = $ARGV[$a+1];
	    
	    #### output sequence format
	} elsif ($ARGV[$a] eq "-format") {
	    $out_format = lc($ARGV[$a+1]);

	}
    }
}

sub Verbose {
    print $out "; parse-RNAfold ";
    &PrintArguments($out);
    if (defined(%infile)) {
	print $out "; Input files\n";
	while (($key,$value) = each %infile) {
	    print $out ";\t$key\t$value\n";
	}
    }
    if (defined(%outfile)) {
	print $out "; Output files\n";
	while (($key,$value) = each %outfile) {
	    print $out ";\t$key\t$value\n";
	}
    }
    print ";Output format\t$out_format\n";
}

sub extract_loops {
    my ($seq_id, $sequence, $fold) = @_;
    my @loop_sequences = ();
    my $offset = 1;
    my $pattern ="";

    if ($stem) {
	$pattern = '\(+\.+\)+'; #### extract loop + stem 
    } else {
	$pattern = '\(\.+\)'; #### extract loop only
    }
    while ($fold =~ /$pattern/) { 
	my $loop_start = length($`)+1;
	my $loop_len = length($&)-2;
	my $loop_seq = substr($sequence, $loop_start, $loop_len);
	my $loop_id = join ("_", 
			    $seq_id,  
			    $loop_start + $offset, 
			    $loop_start + $offset + $loop_len -1, 
			    $loop_len);

	if ($verbose >= 3) {
	    warn ("$sequence\n",
		  "$fold\n",
		  &print_meter(length($sequence),1), "\n"
		  );
	}
	
	warn "; Loop\t$seq_id\t$loop_start\t$loop_len\t\t$loop_id\t$loop_seq\n" if ($verbose >= 2);
	push @loop_sequences, $loop_seq;
	$fold = substr($fold, $loop_start+$loop_len);
	$sequence = substr($sequence, $loop_start+$loop_len);
	$offset += $loop_start+$loop_len;
	&PrintNextSequence($out, $out_format, 0, $loop_seq, $loop_id);
    }
    return @loop_sequences;
}

sub print_meter {
    my ($len, $levels) = @_;
    my $meter;
    for my $p (1..$len) {
	$meter .= $p%10;
    }

    for my $l (1..$levels) {
	my $chunk = 10**$l;
	$meter .= "\n";
	for my $p (1..$len) {
	    if ($p%${chunk} == 0) {
		$meter .= sprintf int($p/$chunk)%10;
	    } else {
		$meter .= ".";
	    }
	}
    }
    return $meter;
}
