#!/usr/bin/perl -w
############################################################
#
# $Id: fetch-sequences,v 1.19 2012/08/06 11:45:24 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

fetch-sequences

=head1 VERSION

$program_version

=head1 DESCRIPTION

Retrieve genome sequences for a set of coordinates specified in a bed
file.

=head1 AUTHORS

Jacques.van-Helden@univ-amu.fr

Adapted from a script developed by Carl Herrmann.

=head1 CATEGORY

=over

=item genomics

=item sequences

=back

=head1 USAGE

fetch-sequences [-i bedfile] [-o seqfile] [-v #] [...]


Examples

Retrieve peaks identified by the peak-calling program MACS.

 fetch-sequences -i MACS_output_peaks.bed -genome mm8


To retrieve regions of fixed width (200bp) centred on the peak
summitsreturned by MACS.

 fetch-sequences -i MACS_output_summits.bed -genome mm8 -extend 100


=head1 INPUT FORMAT

The definition of the BED format is provided on the UCSC Genome
Browser web site (http://genome.ucsc.edu/FAQ/FAQformat#format1).

The first three required BED fields are:

=over

=item 1. chrom

The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold
(e.g. scaffold10671).

=item 2. chromStart

The starting position of the feature in the chromosome or
scaffold. The first base in a chromosome is numbered 0.

=item 3. chromEnd

The ending position of the feature in the chromosome or scaffold. The
chromEnd base is not included in the display of the feature. For
example, the first 100 bases of a chromosome are defined as
chromStart=0, chromEnd=100, and span the bases numbered 0-99.

=back

The 9 additional optional BED fields are:

=over

=item 4. name

Defines the name of the BED line. This label is displayed to the left
of the BED line in the Genome Browser window when the track is open to
full display mode or directly to the left of the item in pack mode.

=item 5. score

A score between 0 and 1000. If the track line useScore attribute is
set to 1 for this annotation data set, the score value will determine
the level of gray in which this feature is displayed (higher numbers =
darker gray).

=item 6. strand

Defines the strand - either '+' or '-'.

=item 7. thickStart

The starting position at which the feature is drawn thickly (for
example, the start codon in gene displays).

=item 8. thickEnd

The ending position at which the feature is drawn thickly (for
example, the stop codon in gene displays).

=item 9. itemRgb

An RGB value of the form R,G,B (e.g. 255,0,0). If the track line
itemRgb attribute is set to "On", this RBG value will determine the
display color of the data contained in this BED line. NOTE: It is
recommended that a simple color scheme (eight colors or less) be used
with this attribute to avoid overwhelming the color resources of the
Genome Browser and your Internet browser.

=item 10. blockCount

The number of blocks (exons) in the BED line.

=item 11. blockSizes

A comma-separated list of the block sizes. The number of items in this
list should correspond to blockCount.

=item 12. blockStarts

A comma-separated list of block starts. All of the blockStart
positions should be calculated relative to chromStart. The number of
items in this list should correspond to blockCount.

=back

=head1 OUTPUT FORMAT

Sequences are exported in fasta format.

=head1 SEE ALSO

=over

=item I<peak-motifs>

A common utilization of fetch-sequences is to retrieve UCSC sequences
for the peak coordinates produced by a peak callinf program
(e.g. MACS, SICER, SWEMBL, ...).

=back

=head1 WISH LIST

=over


=item B<-maf>

Get multi-genome alignment files instead of single-genome sequence.
The maf output requires to specify either a taxon (option I<-taxon>)
or a list of organisms (option I<-org_list organism_file.txt>).

=item B<-taxon>

Taxonomic level for multi-genome alignment files.

=item B<-mask>

Add support for UCSC masking options (coding, repetitive, ...).

For the time being, all sequences are transformed to uppercases.

=item B<check chromosome size>

When the coordinates of one feature exceed chromosome size, the whole
UCSC query fails. In some cases, the UCSC Web server does not even
issue an error, but all the sequences following the erroneous feature
are empty.

This poses problems with the option -extend, since the extended
feature may reach chromosome ends. I should check the chromosome
lengths, and restrict the extended end to these values.

=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";

use Bio::Das; ## Required to access UCSC Genome Browser


################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  our %infile = ();
  our %outfile = ();

  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;

  our @supported_headers = qw(ucsc galaxy);
  our $supported_headers = join ",", @supported_headers;
  our %supported_header  =();
  foreach my $header (@supported_headers) {
    $supported_header{$header} = 1;
  }
  our $header_format = "ucsc";


  our $genome = "";

  ## Extend the coordinates on the left and right sides
  our $upstr_extension = 0;
  our $downstr_extension = 0;
  our $reference = "segment";

  ## Max number of sequences to retrieve
  our $top = 0;
  our $chunk = 10000;

  our @queries = (); ## List of queries for the DAS server
  our %header = (); ## Header for each sequence
  our @skipped_rows = (); ## Invalid rows skipped during parsing

  ## Parameters for connecting the DAS server
  our $das_server="http://genome.cse.ucsc.edu/cgi-bin/das";
  our $timeout = 20; ## seconds for the timeout at UCSC
  our $max_trials = 20; ## Number of trials in case of timeout

  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  unless ($genome) {
    &RSAT::error::FatalError("Genome version must be specified with option -genome (e.g. mm9, hg19)");
  }

  ## Define the URL fo the DAS server
  our $das_server_url = $das_server.'/'.$genome;
  &RSAT::message::Info("DAS server", $das_server_url) if ($main::verbose >= 2);

  ## Open DAS client
  my $das = Bio::Das->new($timeout);


  ################################################################
  ## Open output stream
  $out = &OpenOutputFile($outfile{output});

  ## Define log file
  if ($outfile{output}) {
    $outfile{log} = $outfile{output};
    $outfile{log} =~ s/\.fasta$//;
    $outfile{log} =~ s/\.fa$//;
    $outfile{log} .= "_log.txt";
  } else {
    $outfile{log} = "";
  }
  $log = &OpenOutputFile($outfile{log});

  ################################################################
  ##Initialising parameter for length of each chr
  our %chr_end  = ();

  ## Send request to DAS server
  my @response = $das->entry_points(-dsn=>[$das_server_url]);

  ## Treat request results
  for my $id (@response) {
     if ($id->is_success) {
      my @dsns = $id->results;
      foreach (@dsns) {
        $chr_end{"chr".$_->ref} = $_->end; #put result in hash table with key "chr" and value "chr_end"
      }
    }
  }
  if (scalar(keys(%chr_end)) == 0) {
    &RSAT::error::FatalError("Could not obtain entry points from UCSC DAS server", $das_server_url);
  }

  ################################################################
  ## Download input from remote URL
  if ($main::infile{input_url}) {
    &RSAT::message::TimeWarn("Transferring input file from URL", $main::infile{input_url}) if ($main::verbose >= 2);
    use LWP::Simple;
#    my $content = get($main::infile{input_url});
#    my @content_lines = split (/[\n\r]/, $content);
#    die "Could not get URL ".$infile{input_url} unless ($content);
    if (defined($outfile{output})) {
      $main::outfile{input} = $main::outfile{output};
      $main::outfile{input} =~ s/\.fasta$//;
      $main::outfile{input} =~ s/\.fa$//;
      $main::outfile{input} .= ".bed";
    } else {
      ## Define temporary directory
      if ($main::TMP) {
	$tmp_dir = $main::TMP;
      } else {
	$tmp_dir = ".";
      }
      $main::outfile{input} = &RSAT::util::make_temp_file($tmp_dir, "fetch-sequences");
    }
    getstore($main::infile{input_url}, $main::outfile{input});
    &RSAT::message::TimeWarn("Genomic coordinates transferred to local file", $main::outfile{input}) if ($main::verbose >= 2);
    ($main::in) = &OpenInputFile($main::outfile{input});
  } else {
    ($main::in) = &OpenInputFile($main::infile{input});
  }

  ################################################################
  ## Read input
  &RSAT::message::TimeWarn("Reading genomic coordinates") if ($main::verbose >= 2);
  my $i = 0;
  my $l = 0;
  while (<$main::in>) {
    $l++;
    next if (/^#/); ## Skip comment lines
    next unless (/\S/); ## Skip empty lines
    next unless (/\t/); ## Skip starting comment lines
    chomp();

    if (($top > 0) && ($i >= $top)) {
      &RSAT::message::Info("Stopped after $top top queries. Further lines are ignored.");
      last;
    }
    my ($chrom, $left, $right, $name, $score, $strand) = split(/\t/);

    &RSAT::message::Debug($l, $chrom, $left, $right, $strand) if ($main::verbose >= 5);

    ## Check validity of the chromosome
    unless (defined($chr_end{$chrom})) {
      if (defined($chr_end{"chr".$chrom})) {
	$chrom = "chr".$chrom;
	&RSAT::message::Warning($chrom, "chromosome name: missing 'chr' prefix, added.", $l) if ($main::verbose >= 2);
      } else {
	&RSAT::message::Warning($chrom, "Invalid chromosome name", "skipping line", $l) if ($main::verbose >= 2);
	push @skipped_rows, $_;
	next;
      }
    }

    ## Check validity of left and right positions
    unless (&RSAT::util::IsNatural($left)) {
      &RSAT::message::Warning($left, "Invalid left position", "skipping line", $l) if ($main::verbose >= 2);
      push @skipped_rows, $_;
      next;
    }
    unless (&RSAT::util::IsNatural($right)) {
      &RSAT::message::Warning($right, "Invalid right position", "skipping line", $l) if ($main::verbose >= 2);
      push @skipped_rows, $_;
      next;
    }

    ## Check that the left is smaller than the right
    if ($left > $right) {
      my $tmp = $left;
      $left = $right;
      $right = $tmp;
    }

    ## Make sure that strand is defined
    unless ((defined($strand)) && ($strand =~ /[+-]/)) {
      $strand = "+";
    }

    ## Treat reference
    if ($reference eq "start") {
      if ($strand eq "-") {
	$left = $right;
      } else {
	$right = $left;
      }
    } elsif ($reference eq "end") {
      if ($strand eq "-") {
	$right = $left;
      } else {
	$left = $right;
      }
    }

    ## Check if the right position is not outside the genome ################
    if ($right > $chr_end{$chrom}) {
      &RSAT::message::Warning("$chrom:$left-$right", "invalid position. Skipping line", $l) if ($main::verbose >= 2);
      push @skipped_rows, $_;
      next;
    }


    ## Treat left and right extensions
    if ($strand eq "-") {
      $left = &RSAT::stats::max(0, $left - $downstr_extension);
      $right = &RSAT::stats::min($chr_end{$chrom}, $right + $upstr_extension);
      #$right += $upstr_extension;
    } else {
      $left = &RSAT::stats::max(0, $left - $upstr_extension);
      $right = &RSAT::stats::min($chr_end{$chrom}, $right + $downstr_extension);
      #$right += $downstr_extension;
    }

#    &RSAT::message::Debug("chrom=".$chrom, "left=".$left, "right=".$right) aif ($main::verbose >= 5);


    ## Formulate the query
    my $query = $chrom.":".$left.",".$right;
    if ($header_format eq 'ucsc') {
      $header{$query} = ">".$genome."_".$chrom."_".$left."_".$right.$strand."\trange=".$chrom.":".$left."-".$right." 5'pad=0 3'pad=0 strand=".$strand." repeatMasking=none";
    } elsif ($header_format eq "galaxy") {
      $header{$query} = ">".$genome."_".$chrom."_".$left."_".$right."_".$strand;
    } else {
      $header{$query} = $name || $query;
    }
    push(@queries, $query);
    &RSAT::message::Debug("Query", $query, $header{$query}) if ($main::verbose >= 5);

    $i++;
  }
  close $main::in if ($main::infile{input});


  ################################################################
  ## Print verbose
  &Verbose() if ($main::verbose >= 1);

  ################################################################
  ## Report skipped lines in the log file
  if (scalar(@skipped_rows) > 0) {
    &RSAT::message::Warning("Skipped ".scalar(@skipped_rows)." invalid rows in query file") if ($main::verbose >= 2);
    printf $log "; %-22s\t%s\n", "Skipped rows in query file", scalar(@skipped_rows);
    print $log (join "\n", @skipped_rows), "\n";
  }

  ## Split queries in chunks to avoid timeout
  my @query_chunk = ();
  my $error = 0;
  my $next_chunk_size;
  my $trial_nb = 1;
  while ($remaining = scalar(@queries)) {
    if ($error) {
      &RSAT::message::TimeWarn("Retrying, trial number", $trial_nb);
    } else {
      ## Splice a chunk of queries from the list
      $next_chunk_size = &RSAT::stats::min($chunk, $remaining);
      @query_chunk = splice (@queries, 0, $next_chunk_size);
      &RSAT::message::Debug("Query chunk", join "; ", @query_chunk) if ($main::verbose >= 10);
      &RSAT::message::TimeWarn( "Remaining queries", $remaining, ,"Treating next chunk",$next_chunk_size, scalar(@query_chunk)) 	if ($main::verbose >= 2);
    }

    ## Send request to DAS server
    &RSAT::message::TimeWarn("Sending request to DAS server", $das_server_url) if ($main::verbose >= 3);
    my @request = $das->dna(-dsn=>[$das_server_url],-segment=> \@query_chunk);

    ################################################################
    ## Treat request results
    for my $request (@request) {
      &RSAT::message::TimeWarn("Treating result") if ($main::verbose >= 3);
      if ($request->is_success) {
	my %results = %{$request->results};
	foreach my $query (@query_chunk) {
	  my $sequence = $results{$query};
#	while (($query,$sequence) = each %$results) {
	  print $out $header{$query}, "\n";
	  print $out uc($sequence),"\n"; ## TEMPORARY: convert all sequences to uppercases
#	  print $out $sequence,"\n";
	}

	## Reset error and trial counter
	$error = 0;
	$trial_nb = 1;
      } else {
	## Report error
	$error = 1;
	&RSAT::message::Warning("DAS request returned error", $request->dsn,": ",$request->error);
	if ($trial_nb >= $max_trials) {
	  &RSAT::error::FatalError("Giving up after ".$trial_nb." unsuccessful requests to DAS server", $das_server_url);
	} else {
	  $trial_nb++;
	}
      }
    }
  }

  ################################################################
  ## Report execution time and close output stream
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
  print $log $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified
  close $out if ($outfile{output});
  close $log if ($outfile{log});

  exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message 
sub PrintHelp {
  system "pod2text -c $0";
  exit()
}

################################################################
## Display short help message
sub PrintOptions {
  &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);
    ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();


=pod

=item B<-i inputfile>

The input file should be in bed format (see section INPUT FORMATS
above).

If no input file is specified, the standard input is used.  This
allows to use the command within a pipe.

This option is mutually exclusive with option I<-u>.

=cut
    } elsif ($arg eq "-i") {
      &RSAT::error::FatalError("Options -i and -u are mutually exclusive") 
	if ($main::infile{input_url});
      $main::infile{input} = shift(@arguments);

=pod

=item B<-u input_file_URL>

Use as input a file available on a remote Web server (e.g. a bed file
on your Galaxy account).

This option is mutually exclusive with option I<-i>.

=cut
    } elsif ($arg eq "-u") {
      &RSAT::error::FatalError("Options -i and -u are mutually exclusive") 
	if ($main::infile{input});
      $main::infile{input_url} = shift(@arguments);

=pod

=item B<-genome genome_version>

Genome version (e.g. mm9, hg19).

This option is mandatory, since the bed files generally does not
contain information about the genome.


=cut
    } elsif ($arg eq "-genome") {
      $main::genome = shift(@arguments);

=pod

=item B<-header_format header_format>

Format for sequence headers.

Supported header formats.

=over

=item I<UCSC (default)>

=item I<galaxy>

=back

=cut
    } elsif ($arg eq "-header_format") {
      $main::header_format = lc(shift(@arguments));
      &RSAT::error::FatalError($main::header_format, "Invalid header format. Supported: ".$supported_headers) unless ($supported_header{$main::header_format});

=pod

=item B<-upstr_ext #>

Extend each region by # base pairs on the upstream side (i.e. left
side for + strands, right side for - strand).

Under others, this option is convenient to retrieve regions of fixed
width around the summits of peak calling results (e.g. summit file
produced by MACS).

=item B<-downstr_ext #>

Extend each region by # base pairs on the downstream side (i.e. right
for + strand, left for - strand).

=item B<-extend #>

Extend each region by # base pairs on both upstream and downstream
sides.

=cut

    } elsif ($arg eq "-upstr_ext") {
      $main::upstr_extension = shift(@arguments);
      &RSAT::error::FatalError($main::upstr_extension, "Invalid value for upstream extension; should be an Integer") 
	unless (&RSAT::util::IsInteger($upstr_extension));

    } elsif ($arg eq "-downstr_ext") {
      $main::downstr_extension = shift(@arguments);
      &RSAT::error::FatalError($main::downstr_extension, "Invalid value for downstream extension; should be an Integer") 
	unless (&RSAT::util::IsInteger($upstr_extension));

    } elsif ($arg eq "-extend") {
      $main::upstr_extension = $main::downstr_extension = shift(@arguments);
      &RSAT::error::FatalError($main::downstr_extension, "Invalid value for extension; should be an Integer") 
	unless (&RSAT::util::IsInteger($upstr_extension));

=pod

=item B<-reference segment|end|start>

Reference from which the sequences should be fetched.

=over

=item segment (default)

Retrieve sequences from the start to the end positions of each feature
(possibly extended with the options I<-upstr_ext>, I<-downstr_ext> or
I<-extend>).

=item start | end

Retrieve sequences relative to repsectively the start or the end
position of each feature. 

This option is generally combined with the options I<-upstr_ext>,
I<-downstr_ext> or I<-extend>, in order to retrieve sequences of a fixed
width around the end coordinate.

=back

=cut

    } elsif ($arg eq "-reference") {
      $main::reference = shift(@arguments);

=pod

=item B<-top #>

Only consider the # top features of the bed file as queries.

This option is convenient for restricting the number of peak
sequences, and for testing.

=cut

    } elsif ($arg eq "-top") {
      $main::top = shift(@arguments);
      &RSAT::error::FatalError($main::top, "Invalid value for top; should be a Natural number") unless (&RSAT::util::IsNatural($top));

=pod

=item B<-chunk #>

Send queries to UCSC by chunk of # features (default: chunk=10000).

This can be useful to fix problems of timeout, which can occur if too
many queries are sent to UCSC in one shot.

=cut

    } elsif ($arg eq "-chunk") {
      $main::chunk = shift(@arguments);
      &RSAT::error::FatalError($main::chunk, "Invalid value for chunk; should be a Natural number") unless (&RSAT::util::IsNatural($top));


=pod

=item	B<-o outputfile>

The output file is in fasta format.

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
    } elsif ($arg eq "-o") {
      $outfile{output} = shift(@arguments);

    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}

################################################################
## Verbose message is sent to the log file, because fasta files cannot
## contain comments
sub Verbose {
  print $log "; fetch-sequences ";
  &PrintArguments($log, 1);
  printf $log "; %-22s\t%s\n", "Program version", $program_version;
  printf $log "; %-22s\t%s\n", "Genome", $genome;
  printf $log "; %-22s\t%s\n", "DAS server URL", $das_server_url;
  printf $log "; %-22s\t%s\n", "Reference", $reference;
  printf $log "; %-22s\t%d\n", "Upstream extension", $upstr_extension;
  printf $log "; %-22s\t%d\n", "Downstream extension", $downstr_extension;
  printf $log "; %-22s\t%d\n", "Invalid rows in query file", scalar(@main::skipped_rows);
  if (%main::infile) {
    print $log "; Input files\n";
    while (my ($key,$value) = each %main::infile) {
      printf $log ";\t%-13s\t%s\n", $key, $value;
    }
  }
  if (%main::outfile) {
    print $log "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $log ";\t%-13s\t%s\n", $key, $value;
    }
  }
}


__END__
