#!/usr/bin/perl -w

=pod

=head1 NAME

check-retrieve-seq-rest

=head1 DESCRIPTION

Compares upstream sequences obtained with retrieve-seq with the sequences
returned for the same genomic coordinates by the Ensembl Genomes REST service.

=head1 AUTHORS

=over

=item Bruno Contreras-Moreira <bcontreras@eead.csic.es>

=item Jacques van Helden <Jacques.van-Helden@univ-amu.fr>

=back

=head1 CATEGORY

genomics

=head1 USAGE

check-retrieve-seq-rest [-v #] [-n 10] [-rm] -org organism_name

=head1 INPUT FORMAT

Reference organisms can be entered on the command line (option -org). 

=head1 OUTPUT FORMAT

A tab-separated file with five columns. Each row of the output describes one upstream sequence.
Output contains the following columns:

=over

=item 1. gene name

=item 2. genomic coordinates of upstream region

=item 3. result of sequence comparison (equal=1, unequal=0)

=item 4. RSAT sequence (NA if equal)

=item 5. REST sequence (NA if equal)

=back

=cut

BEGIN {
  # When the script path ends in a file name, everything that precedes
  # this name is taken as the script directory, and its lib/ subdirectory
  # is appended to @INC (presumably where RSA.lib is installed).
  if ($0 =~ /([^(\/)]+)$/) {
    # $-[0] is the offset where the match starts, i.e. the length of the prefix
    my $script_dir = substr($0, 0, $-[0]);
    push(@INC, $script_dir . "lib/");
  }
}

require "RSA.lib";
use JSON;
use HTTP::Tiny;

################################################################
## Main package
package main;
{

  ################################################################
  #### initialise parameters and vars
  our $start_time = &RSAT::util::StartScript();

  # base URL of the REST service used as reference
  my $server = 'http://rest.ensemblgenomes.org'; # could be in %ENV

  # n: number of random genes to check; rm: 1 to compare repeat-masked sequences
  our %params = ( 'n' => 10, 'rm' => 0 );

  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;

  # parse arguments
  &ReadArguments();

  # check organism and other params
  &RSAT::error::FatalError("Please select an organism")
    unless ($params{'org'});

  # n is handed to random-genes, so it must be a plain non-negative integer
  &RSAT::error::FatalError("Parameter n should be a natural number")
    unless (defined($params{'n'}) && $params{'n'} =~ /^\d+\z/);

  &Verbose() if ($main::verbose >= 1);

  # start actual job
  my $identical = 0;  # sequences found identical in RSAT and REST
  my $compared  = 0;  # sequences actually compared
  my %genes;          # REST region string -> gene id
  my %rsat_seqs;      # REST region string -> sequence returned by retrieve-seq

  RSAT::message::Info("Checking ",$params{'n'},$params{'org'},"upstream sequences") if($verbose >=2);

  # Pick random genes of the selected organism.
  # The external commands are run with the list form of open: arguments are
  # passed as they are, without a shell, so that organism names or gene
  # identifiers containing blanks or shell metacharacters cannot alter the command.
  my @random_command = ('random-genes', '-org', $params{'org'}, '-n', $params{'n'});
  my @gene_queries;
  open(my $rnd, '-|', @random_command)
    or die "# failed running @random_command\n";
  while (my $line = <$rnd>)
  {
    # the gene identifier is the first field of each row
    my $gene_id = (split(' ', $line))[0];

    # skip blank lines and lines starting with ';' (presumably comment rows,
    # as in the header printed by this script)
    next unless (defined($gene_id) && $gene_id !~ /^;/);
    push(@gene_queries, '-q', $gene_id);
  }
  # the exit status of a piped command is only known when the pipe is closed
  close($rnd)
    or die "# failed running @random_command\n";

  # without any query there is nothing to retrieve
  &RSAT::error::FatalError("No random gene obtained for organism", $params{'org'})
    unless (@gene_queries);

  # get the corresponding DNA sequences and parse the coords from the header
  my @ret_command = ('retrieve-seq', '-org', $params{'org'}, @gene_queries);
  push(@ret_command, '-rm') if ($params{'rm'});

  my $coords;  # region of the record being read, undef if its header could not be parsed
  open(my $seq, '-|', @ret_command)
    or die "# failed running @ret_command\n";
  while (my $line = <$seq>)
  {
    chomp($line);
    if ($line =~ /^>/)
    {
      # a new record starts: forget the previous region, so that residues are
      # never appended to another gene when this header cannot be parsed
      undef($coords);

      #>AT5G36350  ... location: arabidopsis_thaliana:5:14333830:14335829:R
      if ($line =~ /^>(\S+).*?location: (\S+)/)
      {
        my ($gene_id, $location) = ($1, $2);

        # fields: organism, sequence name, start, end, strand
        my @loc = split(/:/, $location);
        if (scalar(@loc) >= 5)
        {
          # build the region string of the REST query: name:start..end:strand,
          # where strand D is written 1 and any other strand -1
          $coords = $loc[1].':'.$loc[2].'..'.$loc[3].':';
          $coords .= ($loc[4] eq 'D') ? '1' : '-1';
          $genes{$coords} = $gene_id;
        }
      }
      warn "# skipping sequence with unparsable header: $line\n"
        unless (defined($coords));
    }
    elsif (defined($coords))
    {
      $rsat_seqs{$coords} .= $line;
    }
  }
  close($seq)
    or die "# failed running @ret_command\n";

  # start output with header
  my $header = join("\t",
      'gene_id',
      'coordinates',
      'equal',
      'RSAT',
      'REST'
  );
  print $out "; $header\n";

  my $http = HTTP::Tiny->new();

  # regions are sorted so that the order of the output rows is reproducible
  foreach my $region (sort(keys(%rsat_seqs)))
  {
    my $url = $server."/sequence/region/$params{'org'}/".$region.'?';
    if ($params{'rm'}) { $url .= 'mask=hard' }

    my $response = $http->get($url, { headers => { 'Content-type' => 'application/json' } });
    RSAT::error::FatalError("Failed connecting to", $url, $response->{status}, $response->{reason})
      unless $response->{success};

    # an empty body cannot be compared
    next unless (length $response->{content});

    # decode_json dies on malformed input: report the URL instead
    my $json = eval { decode_json($response->{content}) };
    RSAT::error::FatalError("Could not parse the response of", $url)
      unless (ref($json) eq 'HASH');

    # a response without sequence is reported as unequal
    my $rest_seq = defined($json->{'seq'}) ? $json->{'seq'} : '';

    $compared++;
    if ($rest_seq ne $rsat_seqs{$region})
    {
      print $out "$genes{$region}\t$region\t0\t$rsat_seqs{$region}\t$rest_seq\n";
    }
    else
    {
      print $out "$genes{$region}\t$region\t1\tNA\tNA\n";
      $identical++;
    }
  }

  # report against the number of sequences compared, which can be lower
  # than the number of genes requested
  RSAT::message::Info("Identical sequences:",$identical,"out of",$compared) if($verbose >=2);

  close_and_quit();
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

################################################################
### Report the execution time (verbose mode only) and quit with status 0
sub close_and_quit {
  if ($main::verbose >= 1)
  {
    # the report is computed from the start time recorded by the main block
    my $time_report = &RSAT::util::ReportExecutionTime($main::start_time);
    print $main::out $time_report;
  }

  exit(0);
}

################################################################
#### display full help message: renders the POD of this script with
#### pod2text, then exits
sub PrintHelp {
    # List form of system: the script path is passed to pod2text as a single
    # argument, without a shell, so that paths containing blanks or shell
    # metacharacters are handled properly.
    system('pod2text', '-c', $0);
    exit()
}

################################################################
#### display short help message
#### (delegates to PrintHelp, so the full help is shown)
sub PrintOptions {
    return PrintHelp();
}

################################################################
#### Read arguments
#### Parses a copy of @ARGV and stores the options in $main::verbose and
#### %main::params (keys org, n, rm). Unknown arguments are ignored.
sub ReadArguments {
  my @arguments = @ARGV;

  # Test definedness rather than truth, so that an argument such as "0" or
  # an empty string does not silently stop the parsing.
  while (defined(my $arg = shift(@arguments))) {

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution).
When no number follows B<-v>, the level is set to 1.

=cut

    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])) {
        $main::verbose = shift(@arguments);
      } else {
        # -v alone still switches verbosity on
        $main::verbose = 1;
      }

=pod

=item B<-h>

Display full help message

=cut

    } elsif ($arg eq "-h") {
      &PrintHelp();

=pod

=item B<-help>

Same as -h

=cut

    } elsif ($arg eq "-help") {
      &PrintOptions();

=pod

=item B<-org organism>

Check sequences from this organism.

=cut

    } elsif ($arg eq '-org') {
      $main::params{'org'} = shift(@arguments);

=pod

=item B<-n number of sequences to check>

Total number of sequences to be checked. Default: 10 .

=cut

    } elsif ($arg eq '-n') {
      $main::params{'n'} = shift(@arguments);

      # -n given as last argument: fail here rather than run with an undefined number
      &RSAT::error::FatalError("Option -n requires a number of sequences")
        unless (defined($main::params{'n'}));

=pod

=item B<-rm use repeat-masked sequences>

Request repeat-masked sequences (hard-masked, with Ns) instead of default raw sequences.

=cut

    } elsif ($arg eq '-rm') {
      $main::params{'rm'} = 1;

=pod

=back

=cut

    }
  }
}

################################################################
#### verbose message: writes the script name, its command-line arguments
#### and the org and n parameters to the output stream, as ';' comment rows
sub Verbose {
  my $stream = $main::out;

  print $stream "; check-retrieve-seq-rest ";
  &PrintArguments($stream);

  # one row per reported parameter
  foreach my $name ('org', 'n') {
    printf $stream "; %-21s\t%s\n", $name, $main::params{$name};
  }
}

__END__

