#!/usr/bin/perl
#use strict;;
## Locate the library directory relative to the script itself.
## The regex matches the trailing run of characters of $0 that are neither
## "/" nor parentheses (i.e. the script file name), so the prematch variable
## $` holds the directory part of the path used to invoke the script.
## "<that directory>lib/" is appended to @INC before RSA.lib is loaded.
if ($0 =~ /([^(\/)]+)$/) {
  push (@INC, "$`lib/");
}
require "RSA.lib";


#### initialise parameters ####
## Defaults; each of them can be overridden by ReadArguments().
$start_time = &RSAT::util::StartScript();
$in_format = "fasta";
$verbose = 0;
local $units = "bp";

&ReadArguments();

################################################################
## Check argument values

## "bed" is handled by this script itself (lengths are derived from the
## coordinates), all other formats are checked by the library.
&CheckInputSeqFormat($in_format) unless ($in_format eq "bed");

## Units: refuse unsupported values (including a missing value after -units)
## rather than silently reporting unscaled lengths under a wrong unit label.
## ReadArguments() has already converted the value to lowercase.
unless ($units =~ /\A(?:bp|kb|mb)\z/) {
  die "sequence-lengths: invalid value for option -units ('$units'). Supported values: bp, kb, mb.\n";
}

## Open output stream (standard output unless -o was specified)
if (defined($outfile{output})) {
  $out = &OpenOutputFile($outfile{output});
} else {
  $out = STDOUT;
}

## Open input stream (standard input unless -i was specified)
if (defined($infile{input})) {
  ($in, $input_dir) = &OpenInputFile($infile{input});
} else {
  $in = STDIN;
}

## Verbose
&Verbose if ($verbose);

## Column header of the per-sequence table (omitted when only the sum is requested)
print $out "#seq", "\t", "length", "\n" unless ($sum_only);

## Running total of the lengths, expressed in the selected units
$sum = 0;

## Compute sequence lengths from coordinates in bed format
if ($in_format eq "bed") {
  while (<$in>) {
    next unless (/\S/);		## Skip empty rows
    next if (/^#/);		## Skip comment lines
    chomp;
    s/\r$//;			## Tolerate DOS/Windows line endings, which would otherwise stay attached to the last column

    ## Only the first three columns (chromosome, start, end) are used
    my ($chrom, $start, $end) = split(/\t/);
#    &RSAT::message::Debug("chrom=".$chrom, "start=".$start, "end=".$end) if ($main::verbose >= 10);

    ## Rows whose coordinates are not natural numbers are skipped. $. is the
    ## number of the line just read from the input stream.
    unless ((&RSAT::util::IsNatural($start)) &&
	    (&RSAT::util::IsNatural($end))) {
      &RSAT::message::Warning("Skipping line", $., "invalid start and end positions.") if ($main::verbose >= 2);
      next;
    }
    if ($start > $end) {
      &RSAT::message::Warning("Skipping line", $., "start ($start) larger than end ($end).") if ($main::verbose >= 2);
      next;
    }

    ## NOTE(review): the length is computed as end - start + 1, i.e. both
    ## coordinates are treated as inclusive. Standard BED coordinates are
    ## 0-based half-open (length = end - start) -- confirm which convention
    ## is intended before changing this.
    my ($length, $rounded_length) = &apply_units($end - $start + 1);
    my $current_id = $chrom.":".$start."..".$end.":1";
    $sum+= $length;
    print $out $current_id, "\t", $rounded_length , "\n"
      unless ($sum_only);
  }

## Compute sequence lengths by reading the sequences
} else {
  ## Read sequences one by one until both the sequence and its identifier
  ## come back empty.
  while ((($current_seq, $current_id) = &ReadNextSequence($in, $in_format, $input_dir)) &&
	 (($current_seq ne "") || ($current_id ne ""))) {
    ## The length is measured on the result of FoldSequence(seq, 0)
    ## (presumably the sequence stripped of layout characters -- TODO confirm).
    my ($length, $rounded_length) = &apply_units(length(&FoldSequence($current_seq,0)));
    $sum+= $length;
    print $out $current_id, "\t", $rounded_length , "\n"
      unless ($sum_only);
  }
}


################################################################
## Print sum of sequence lengths if requested
if ($sum_only) {
  ## Base pairs are printed as is, scaled units with two decimals
  my $printed_sum = ($units eq "bp") ? $sum : sprintf ("%.2f", $sum);
  print $out $printed_sum, "\t", $units, "\n";
}

## The input stream is only closed when it was opened from a file
if ($infile{input}) {
  close $in;
}


################################################################
## Report execution time and close output stream

## This call has to be executed by all scripts
my $exec_time = &RSAT::util::ReportExecutionTime($start_time);

## The execution time is only reported if verbosity is specified
if ($main::verbose >= 1) {
  print $main::out $exec_time;
}

## The output stream is only closed when it was opened on a file
if ($main::outfile{output}) {
  close $main::out;
}


exit(0);

################################################################
## Convert a length expressed in base pairs into the units selected by the
## global variable $units.
##
## Usage: my ($length, $rounded_length) = &apply_units($length_in_bp);
##
## Returns a list of two values:
##  - the converted length (divided by 1000 for "kb", by 1000000 for "mb",
##    unchanged for any other value of $units);
##  - its printable form: the value itself when $units is "bp", otherwise
##    the value formatted with two decimals.
sub apply_units {
  my ($length) = @_;

  ## Number of base pairs per unit, for the units that require scaling
  my %bp_per_unit = (kb => 1000,
                     mb => 1000000);
  if (exists($bp_per_unit{$units})) {
    $length /= $bp_per_unit{$units};
  }

  my $printable = ($units eq "bp") ? $length : sprintf("%.2f", $length);
  return ($length, $printable);
}

########################## subroutine definition ############################

################################################################
## Display the full help message, then exit.
##
## The message is sent through the "more" pager. If the pager cannot be
## started, the message is printed on the standard output instead of being
## lost on an unopened handle.
sub PrintHelp {
  my $help;
  my $paged = open($help, "|-", "more");
  $help = \*STDOUT unless ($paged);

  print {$help} <<End_of_help;
NAME
	sequence-lengths

        1999 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

DESCRIPTION
	Return the lengths of each sequence from an input
	file. Otionally, return the sum of lengths.

CATEGORY
	sequences

USAGE
        sequence-lengths [-i inputfile] [-o outputfile] [-v]

OPTIONS
	-h	(must be first argument) display full help message

	-help	(must be first argument) display options

	-v	verbose

	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-units bp|kb|mb
	      Units for sequence lengths.
	      Supported values:
	      bp	base pairs
	      kb	kilobases
	      mb	megabases

	-in_format   input format

		The input file can contain either sequences or genomic
		coordinates (-in_format bed).

		For a list of supported input sequences, type
		    convert-seq -help

	-o outputfile
		If not specified, the standard output is used.
	        This allows to place the command within a pipe.

	-sum	only return sum of sequene lengths
End_of_help

  ## Closing the pipe waits for the pager to terminate before exiting
  close $help if ($paged);
  exit;
}

################################################################
## Display the short help message (list of options), then exit.
##
## The message is sent through the "more" pager. If the pager cannot be
## started, the message is printed on the standard output instead of being
## lost on an unopened handle.
sub PrintOptions {
  my $help;
  my $paged = open($help, "|-", "more");
  $help = \*STDOUT unless ($paged);

  print {$help} <<End_short_help;
sequence-lengths options
----------------
-h		display full help message
-help		display options
-i		input file
-in_format    	input format (sequence or bed)
-o		output file
-v		verbose
-sum		only return sum of sequene lengths
-units		bp|kb|mb
End_short_help

  ## Closing the pipe waits for the pager to terminate before exiting
  close $help if ($paged);
  exit;
}


################################################################
## Read the command-line arguments and store them in global variables
## ($verbose, $infile{input}, $outfile{output}, $in_format, $sum_only, $units).
##
## Every element of @ARGV is compared to the option names, including the
## elements that serve as value for the preceding option. Unknown arguments
## are ignored.
sub ReadArguments {
  foreach my $pos (0..$#ARGV) {
    my $option = $ARGV[$pos];
    my $value = $ARGV[$pos + 1];	## undefined when the option is the last argument

    ## Verbosity: the level is optional, the default is 1
    if ($option eq "-v") {
      $verbose = &IsNatural($value) ? $value : 1;
      next;
    }

    ## Detailed help
    if ($option eq "-h") {
      &PrintHelp();
      next;
    }

    ## List of options
    if ($option eq "-help") {
      &PrintOptions();
      next;
    }

    ## Input file
    if ($option eq "-i") {
      $infile{input} = $value;
      next;
    }

    ## Output file
    if ($option eq "-o") {
      $outfile{output} = $value;
      next;
    }

    ## Input format
    if ($option eq "-in_format") {
      $in_format = $value;
      next;
    }

    ## Former name of the option -in_format, still accepted with a warning
    if ($option eq "-format") {
      $in_format = $value;
      &RSAT::message::Warning("sequence-lengths: option -format has been replaced by -in_format");
      next;
    }

    ## Only return the sum of lengths
    if ($option eq "-sum") {
      $sum_only = 1;
      next;
    }

    ## Units (case-insensitive)
    if ($option eq "-units") {
      $units = lc($value);
      next;
    }
  }
}

################################################################
## Print the verbose header on the output stream: the command with its
## arguments, the input and output files (each list is only printed when at
## least one file has been specified), and the input format.
sub Verbose {
  print $out "; sequence-lengths ";
  &PrintArguments($out);

  ## Both file lists are reported with the same layout
  foreach my $section (["Input files", \%infile],
                       ["Output files", \%outfile]) {
    my ($title, $files) = @{$section};
    next unless (%{$files});
    print $out "; $title\n";
    foreach my $type (keys(%{$files})) {
      print $out ";\t$type\t$files->{$type}\n";
    }
  }

  print $out "; seq format\t$in_format\n";
}
