#!/usr/bin/env perl
############################################################
#
# $Id: retrieve-seq-bed,v 1.48 2013/10/03 17:24:24 jvanheld Exp $
#
############################################################

## use strict;

=pod

=head1 NAME

retrieve-seq-bed

=head1 VERSION

$program_version

=head1 DESCRIPTION

Retrieve sequences for a set of genomic coordinates provided in bed,
gff or vcf format. Note that gff coordinates are zero-based.

This script is a wrapper around I<bedtools getfasta>, an efficient
tool to retrieve sequences from a fasta-formatted sequence file
(e.g. all genome sequences) and a file of coordinates defined on the
sequences of the fasta file.

The wrapper generates the I<bedtools getfasta> command in order to
retrieve genomic coordinates from one of the locally supported
genomes.

=head1 AUTHORS

Bruno Contreras Moreira <bcontreras\@eead.csic.es>
Jacques.van-Helden\@univ-amu.fr

=head1 CATEGORY

=over

=item genome

=back

=head1 USAGE

retrieve-seq-bed -org organism_name -i inputfile [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

The genomic coordinate file will be used as input by I<bedtools
getfasta>, and must be compliant with the supported formats:
BED/GFF/VCF.

=head1 OUTPUT FORMAT

A sequence file in fasta format (produced by I<bedtools getfasta>. 

=head1 SEE ALSO

=head1 WISH LIST

=over

=item B<-server http://some.rsat.server/rsat/>

Send the request to a remote RSAT server via the Web services. This
option enables to get fasta sequences from any RSAT server without
having to install them locally.

=item B<-extend length>
=item B<-extend_up up_length>
=item B<-extend_down down_length>

Extend the peaks by a given length on the upstream (I<-extend_up>),
downstream (I<-extend_down>) or both sides (I<-extend>). The side is
adapted according to the strand.

Flank extension is done via I<bedtools flank>. 

The extended coordinates are exported with the same name as the output
file, supplemented with the suffix _flanks.bed.

=back

=cut


BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}
require "RSA.lib";



################################################################
## Main package
package main;
{

  ################################################################
  ## Initialise parameters
  our $start_time = &RSAT::util::StartScript();
  our $program_version = do { my @r = (q$Revision: 1.48 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  #    $program_version = "0.00";

  our %infile = ();
  our %outfile = ();

  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;

  our $organism;
  our $organism_name = "";
  our $repeat_masked = 0; ## If set ot 1, use repeat-masked version of the genome
  $infile{coordinates} = ""; ## Mandatory argument

  our $common = 0;
  our $add_chr = 0;      ## Prepend chr prefix to chromosome column in bed file
  our $remove_chr = 0;   ## Remove chr prefix from chromosome column in bed file
  our $check_chr = 0;    ## Check consistency between chromosome names in bed and genome fasta
  our %fasta_chrom = (); ## Index of chromosome names found in the fasta genome file
  our %fasta_chr_prefix = (); ## Index of chromosome names starting with the chr prefix in the fasta genome file
  our %bed_chr_prefix = (); ## Index of chromosome names starting with the chr prefix in the bed file
  our %common_chr = ();  ## hash linking common chr names to real names in FASTA headers 

  ## Arguments for &RSAT::util::doit();
  our $dry = "";
  our $die_on_error = "";
  our $batch = "";
  our $job_prefix = "";


  ################################################################
  ## Read argument values
  &ReadArguments();

  ################################################################
  ## Check argument values

  ## Treatment of the "chr" prefix
  &RSAT::error::FatalError("The options -common_chr, -add_chr, -remove_chr and -check_chr are mutually exclusive") 
      if ($remove_chr + $add_chr + $check_chr > 1);

  ## Check that input coordinates file has been specified
  unless ($infile{coordinates}) {
      &RSAT::error::FatalError("You must specify the input file containing genomic coordinates (option -i).");
  }
  my $coordinate_file =  $infile{coordinates};

  ## Check that output sequence file has been specified
  unless ($outfile{sequences}) {
      &RSAT::error::FatalError("You must specify the output sequence file (option -o).");
  }

  ## Check that organism_name has been defined
  unless ($organism_name) {
      &RSAT::error::FatalError("You must define an organism name (option -org).");
  }

  ## Check that the organism is supported locally
  $organism = new RSAT::organism();
  $organism->check_name($organism_name);
  
  ################################################################
  ## Identify the genome sequence file (must be in fasta format)
  my $genome_dir = $ENV{RSAT}."/data/genomes/".$organism_name."/genome/";
  $infile{fasta_genome} = $genome_dir."/".$organism_name;
  $infile{fasta_genome} .= ".dna";
  $infile{fasta_genome} .= "_rm" if ($repeat_masked);
  $infile{fasta_genome} .= ".genome.fa";
  &RSAT::error::FatalError("Missing fasta file with genome sequence", $infile{fasta_genome}) 
      unless (-e $infile{fasta_genome}) ;
  &RSAT::message::Info("Genome sequence", $infile{fasta_genome}) if ($main::verbose >= 2);


  ################################################################
  ## Build a prefix based on output sequence file
  $prefix{outfile} =  $outfile{sequences};
  $prefix{outfile} =~ s/.fasta$//;
  $prefix{outfile} =~ s/.fna$//;
  $prefix{outfile} =~ s/.fa$//;


  ################################################################
  ## Open log stream
  if ($main::verbose >= 1) {
      $outfile{log} = $prefix{outfile}."_log.txt";
      $out = &OpenOutputFile($outfile{log});
  }


  ################################################################
  ## Check chr names
  if($infile{commonchrfile} || $check_chr) {
   
    &RSAT::message::TimeWarn("Checking chromosome names") if ($main::verbose >= 2);

    # Get the list of chromosome names from the genome sequence (fasta file)
    &RSAT::message::TimeWarn("Getting chromosome names from fasta genome", $infile{fasta_genome}) if ($main::verbose >= 2);
    $cmd =  'grep \'^>\' '.$infile{fasta_genome}.' |';
    open(CHR, $cmd);
    while (<CHR>) {
        chomp();
        &RSAT::message::Debug("\tSequence header", $_) if ($main::verbose >= 10);
        if (/^\>(\S+)/) {
          my $chrom = $1;
          if ($chrom =~ /^chr/i) {
            $fasta_chr_prefix{$chrom} = 1;
          }
          $fasta_chrom{$chrom} = 1;
          &RSAT::message::Debug("\tChromosome name", $chrom) if ($main::verbose >= 5);
        }
    }
    close (CHR);
  }

  ## check whether file with common chr names is available
  if($infile{commonchrfile}) {
   
    # Get the list of common chromosome names from optional infile
    &RSAT::message::TimeWarn("Reading common chr names from ", $infile{commonchrfile}) if ($main::verbose >= 2);

    if(open(COMMON,"<",$infile{commonchrfile})) { # no exception, this is optional
        while(<COMMON>) {
            if(/^(\S+)\s+(\S+)/) {
                $common_chr{$1} = $2;          
                &RSAT::message::Info("\tAccepted common chr name: ".$1." => ". $2);
            }
        }
        close(COMMON);
    }

    # Replace common names for fullnames so that bedtools works 
    my ($chrom,$rest);
    &RSAT::message::TimeWarn("Replacing common chr names in query bed file", $infile{coordinates}) if ($main::verbose >= 2);

    open(FULLBED,">",$prefix{outfile}."_full.bed") ||
        &RSAT::error::FatalError("Cannot create full bed file", $prefix{outfile}."_full.bed");

    open(BED,"<",$infile{coordinates}) || 
        &RSAT::error::FatalError("Cannot read query bed file", $infile{coordinates});
    while (<BED>) {
        chomp();
        if (/^(\S+)\s+(.*)/) {
          ($chrom,$rest) = ($1,$2);
          if($common_chr{$chrom}){ $chrom = $common_chr{$chrom} }
          print FULLBED "$chrom\t$rest\n";
        }
    }
    close(BED);

    close(FULLBED);

    $outfile{coordinates} = $prefix{outfile}."_full.bed";
    $coordinate_file = $outfile{coordinates};
  }
  elsif ($check_chr) {

    # Check chromosome names in the query file
    my $chrom;
    &RSAT::message::TimeWarn("Getting chromosome names from query bed file", $infile{coordinates}) if ($main::verbose >= 2);
    $cmd =  'cut -f 1 '.$infile{coordinates};
    $cmd .= ' | grep -v \'^#\' | sort | uniq |';
    open(CHR, $cmd);
    while (<CHR>) {
	    chomp();
	    if (/(\S+)/) {
	      $chrom = $1;
	      if ($chrom =~ /^chr/i) {
		    $bed_chr_prefix{$chrom} = 1;
	      }
	      $bed_chrom{$chrom} = 1;
	      &RSAT::message::Debug("\tBed chromosome name", $chrom) if ($main::verbose >= 5);
	    }
    }
    close (CHR);

    # Decide whether to add or remove chr prefixes
    # Not sure this logic is correct but I leave it as it was Bruno Mar2018
    my $bed_chr_prefixes = scalar(keys(%bed_chr_prefix));
    my $fasta_chr_prefixes = scalar(keys(%fasta_chr_prefix));
    if ($main::verbose >= 2) {
	    &RSAT::message::Info("Bed chromosome names with chr prefix", $bed_chr_prefixes, join(",", sort(keys(%bed_chrom))));
	    &RSAT::message::Info("Fasta chromosome names with chr prefix", $fasta_chr_prefixes, join(",", sort(keys(%fasta_chrom))));
    }
    if (($bed_chr_prefixes == 0) && ($fasta_chr_prefixes > 0)) {
	    $add_chr = 1;
	    &RSAT::message::Info("Adding chr prefixes to ensure compatibility between bed and fasta file") if ($main::verbose >= 1);
    } elsif (($bed_chr_prefixes > 0) && ($fasta_chr_prefixes == 0)) {
	    $remove_chr = 1;
	    &RSAT::message::Info("Removing chr prefixes to ensure compatiblity between bed and fasta file") if ($main::verbose >= 1);
    }

    # Define coordinate file name, depending on whether the bed file
    # must be converted or not.
    if (($add_chr + $remove_chr) >= 1) {
      $out = &OpenOutputFile($outfile{log});
      $outfile{coordinates} = $prefix{outfile}."_converted.bed";
      $coordinate_file = $outfile{coordinates};
    }

    # Suppress chr prefix from chromosome names if requested
    if ($add_chr) {
      &RSAT::message::TimeWarn("Adding chr prefix to chromosome column of bed file") if ($main::verbose >= 2);
      $cmd =  &RSAT::server::GetProgramPath("convert-features");
      $cmd .= " -add_chr -from bed -to bed";
      $cmd .= " -i ".$coordinate_file;
      $cmd .= " -o ".$outfile{coordinates};
      &RSAT::util::doit($cmd, $dry, $die_on_error, $verbose);
    }

    # Suppress chr prefix from chromosome names if requested
    if ($remove_chr) {
      &RSAT::message::TimeWarn("Removing chr prefix from chromosome column of bed file") if ($main::verbose >= 2);
      $cmd =  &RSAT::server::GetProgramPath("convert-features");
      $cmd .= " -remove_chr -from bed -to bed";
      $cmd .= " -i ".$infile{coordinates};
      $cmd .= " -o ".$outfile{coordinates};
      &RSAT::util::doit($cmd, $dry, $die_on_error, $verbose);
    }
  }


  ################################################################
  ## Generate the bedtools command
  my $bedtools = &RSAT::server::GetProgramPath("bedtools", $die_on_error);
  my $cmd = $bedtools." getfasta -fi ".$infile{fasta_genome};
  $cmd .= " -bed ".$coordinate_file;
  $cmd .= " -s"; ## Take strand in consideration (this is inactive by default in bedtools getfasta)
  $cmd .= " -fo ".$outfile{sequences} if ($outfile{sequences});
  $outfile{err_log} = $outfile{sequences}."_err.txt";
  $cmd .= " 2> ".$outfile{err_log};
  &RSAT::util::doit($cmd, $dry, $die_on_error, $verbose);
#  &RSAT::util::doit($cmd, $dry, $die_on_error, $verbose, $batch, $job_prefix, $out);


  ################################################################
  ## Print verbose
  if ($main::verbose >= 1){ 
    Verbose() 
  }
  
  if(-s $outfile{err_log}) {
    my $bad_chr_names = 0;
    open(ERR,'<',$outfile{err_log});
    while(my $err_line = <ERR>) {
        &RSAT::message::Warning($err_line);
        if($err_line =~ m/was not found in the FASTA file/) { $bad_chr_names++ }
    }
    close(ERR);  
    if($bad_chr_names > 0 && keys(%fasta_chrom)) {
            &RSAT::message::Warning("Valid chromosome names are: ". join(",", sort(keys(%fasta_chrom))));
        }

  }

  ################################################################
  ## Execute the command

  ################################################################
  ## Insert here output printing

  ################################################################
  ## Report execution time and close output stream
  &close_and_quit();
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Close output file and quit
sub close_and_quit {

  ## Report execution time
  my $exec_time = &RSAT::util::ReportExecutionTime($start_time); ## This has to be exectuted by all scripts
  print $main::out $exec_time if ($main::verbose >= 1); ## only report exec time if verbosity is specified

  ## Close output file
  if ($outfile{sequences}) {
    close $main::out;
    &RSAT::message::TimeWarn("Output file", $outfile{sequences}) if ($main::verbose >= 2);
  }

  ## CLOSE OTHER FILES HERE IF REQUIRED

  exit(0);
}


################################################################
## Display full help message 
sub PrintHelp {
  system "pod2text -c $0";
  exit()
}

################################################################
## Display short help message
sub PrintOptions {
  &PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
  my $arg;
  my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
  while (scalar(@arguments) >= 1) {
    $arg = shift (@arguments);


=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
	$main::verbose = shift(@arguments);
      } else {
	$main::verbose = 1;
      }


=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
      &PrintHelp();


=pod

=item B<-help>

Same as -h

=cut
    } elsif ($arg eq "-help") {
      &PrintOptions();


=pod

=item B<-i coordinate_file>

Genomic coordinates, in one of the formats supported by I<bedtools
getfasta>: BED, GFF, VCF.

=cut
    } elsif ($arg eq "-i") {
      $main::infile{coordinates} = shift(@arguments);

# =pod

# =item B<-in_format format>

# Format of the genomic coordinate file.  Supported (same as I<bedtools
# getfasta>): BED, GFF, VCF.

# =cut
#     } elsif ($arg eq "-in_format") {
#       $main::in_format= lc(shift(@arguments));


=pod

=item	B<-o outputfile>

Output file (in fasta format), where the sequences will be saved.
This argument is mandatory, since it is required by I<bedtools
getfasta>.

=cut
    } elsif ($arg eq "-o") {
      $outfile{sequences} = shift(@arguments);

=pod

=item	B<-org organism_name>

Organism name, which must correspond to one organism supported on the
local RSAT instance.

=cut
    } elsif ($arg eq "-org") {
      $main::organism_name = shift(@arguments);

=pod

=item B<-rm> 

Use repeat-masked version of the genome.

=cut

    } elsif ($arg eq "-rm") {
      $main::repeat_masked = 1;

=pod

=item B<-common_chr file>, B<-add_chr>, B<-remove_chr>, B<-check_chr>

These four options allow to ensure consistency between chromosome
naming in the query bed file and in the genome annotations. Indeed,
the chromosome naming varies depending on the genome reference center,
so that a bed file obtained with one convention may have chromosome
names incompatible with the convention used in the database that
served as source to install the genome in RSAT.

I<-common_chr file> takes a tab-separated file matching common, short 
chr names to full names actually used in underlying FASTA file.

I<-add_chr> adds a prefix "chr" to chromosome names of the query bed
file before running the analysis.

I<-remove_chr> removes the prefix "chr" from chromosome names.

The option I<-check_chr> analyses the input bed file and the RSAT
annotation table, and checks the consistency between naming
conventions.

=cut
    } elsif ($arg eq "-common_chr") {
      $main::infile{commonchrfile} = shift(@arguments);
      $common = 1;

    } elsif ($arg eq "-add_chr") {
      $add_chr = 1;

    } elsif ($arg eq "-remove_chr") {
      $remove_chr = 1;

    } elsif ($arg eq "-check_chr") {
      $check_chr = 1;


    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }

=pod

=back

=cut

}


################################################################
## Verbose message
sub Verbose {
  print $out "; retrieve-seq-bed ";
  &PrintArguments($out);
  printf $out "; %-22s\t%s\n", "Program version", $program_version;
  if (%main::infile) {
    print $out "; Input files\n";
    while (my ($key,$value) = each %main::infile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
  if (%main::outfile) {
    print $out "; Output files\n";
    while (my ($key,$value) = each %main::outfile) {
      printf $out ";\t%-13s\t%s\n", $key, $value;
    }
  }
}


__END__
