#!/usr/bin/perl -w
############################################################
#
# $Id: get_ensembl_xref,v 1.0 2009/12/10 11:10:00 oly Exp $
#
# Time-stamp
#
############################################################
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";

#use strict;
use DBI();
require Bio::EnsEMBL::Registry;

## Script start time, as returned by RSAT::util::StartScript()
local $start_time = &RSAT::util::StartScript();

## EnsEMBL connection parameters
my $ensembl_host = 'ensembldb.ensembl.org';
my $ensembl_user = 'anonymous';
my $dbversion = '';		## EnsEMBL release; derived from the database name when left empty
my $port = '';			## MySQL port; chosen once the arguments have been read

## Organism name and verbosity level
my $org = '';
my $verbose = 0;

## Queries: file of query IDs and list of individual query IDs
my ($query_file, @queries);

## Genomic region: limits, strand and chromosome
my ($left_limit, $right_limit);
my $strand = 1;
my $chrom;

## Feature file and its format
my $ft_file;
my $ft_file_format = 'gft';

## Output file (standard output is used when undefined)
my $output_file;

## Flag: process all genes
my $all = 0;

################################################################
## Read arguments
&ReadArguments();

################################################################
## Choose the EnsEMBL MySQL port from the db version: 3306 when an
## explicit version below 48 was requested, 5306 in every other case
## (including when no version was given)
$port = ($dbversion && ($dbversion < '48')) ? '3306' : '5306';

################################################################
## Option -org is mandatory: connect to ensembldb to get the list of
## databases and pick, among the core databases of the chosen organism,
## the one with the highest release number.
## Sets the global $dbname. Dies when no organism was given or when no
## core database matches the organism name.
if ($org) {
#  &RSAT::message::TimeWarn (join("\t", "Connecting EnsEMBL to get the dbname for organism ", $org, 
#				 "host=".$ensembl_host, 
#				 "user=".$ensembl_user )) if ($main::verbose >= 1);
  my $dbh = DBI->connect("DBI:mysql:host=$ensembl_host:port=$port", "$ensembl_user", "", {'RaiseError' => 1});
  my $sth = $dbh->prepare("SHOW DATABASES");
  $sth->execute();
  my $latest_release = -1;
  while (my $ref = $sth->fetchrow_hashref()) {
    ## The organism name is quoted and anchored so that it is matched
    ## literally and cannot match inside a longer database name
    next unless ($ref->{'Database'} =~ /^\Q$org\E_core_(\d+)/);
    my $release = $1;
    ## Compare release numbers numerically: the order in which the
    ## server lists the databases does not tell which one is the latest
    ## (">=" keeps the last listed database when releases are equal)
    if ($release >= $latest_release) {
      $latest_release = $release;
      $dbname = $ref->{'Database'};
    }
  }
  $sth->finish();
  $dbh->disconnect();
  unless ($dbname) {
    die "Error: there is no organism named $org in the EnsEMBL database. Use the command supported-organisms-ensembl to obtain a full list of supported organisms.\n";
  }
} else {
    die "; You must provide an organism name (-org)\n";
}

################################################################
## A query must be provided: gene IDs, a query file, a feature file,
## the -all option, or a complete region.
## A region needs the chromosome as well as both limits: without the
## chromosome the region cannot be fetched and nothing would be printed.
my $has_region = ($chrom && $left_limit && $right_limit);
unless (@queries || $query_file || $all || $ft_file || $has_region) {
  die "; You must either provide an EnsEMBL gene ID (-q), a query file (-i), a feature file (-ftfile), a region (-chrom, -left and -right) or use the '-all' option\n";
}

################################################################
## When no db version was given, extract it from the db name:
## remove the "<org>_core_" part, then everything from the next
## underscore onwards
if (!$dbversion) {
  ($dbversion = $dbname) =~ s/($org)_core_//;
  $dbversion =~ s/_.+//;
}

################################################################
## Load the EnsEMBL registry for the selected host, port and release,
## then get the core database adaptor of the organism
my $registry = "Bio::EnsEMBL::Registry";

$registry->load_registry_from_db(
				 -host => $ensembl_host,
				 -user => $ensembl_user,
				 -db_version => $dbversion,
				 -port => $port,
				 -verbose => "0" );

my $db = $registry->get_DBAdaptor($org, "core");

## Stop with an explicit message instead of failing further down on a
## method call on an undefined value
unless ($db) {
  die "Error: no core database adaptor found for organism $org (db version $dbversion) on $ensembl_host:$port\n";
}

################################################################
### Open output stream
## $fh is a global: it is also the handle print_DBEntries writes to.
if ($output_file) {
  ## "or" (not "||") so that the die applies to open() itself and not
  ## to the file name expression
  open(my $out, '>', $output_file) or die "cannot open file ".$output_file.": $!\n";
  $fh = $out;
} else {
  $fh = *STDOUT;
}
print $fh "# Ensembl_id\tDB\tDB_id\n";

my $slice_adaptor = $db->get_SliceAdaptor();

################################################################
## Dispatch on the type of request: genomic region, feature file,
## all genes, or explicit queries (file or -q list).

## Left and right limits
if ($left_limit && $right_limit && $chrom) {
  ## NOTE(review): $strand (-strand) is not forwarded to fetch_by_region
  ## in this branch -- confirm whether that is intended
  my $slice = $slice_adaptor -> fetch_by_region('chromosome', $chrom, $left_limit, $right_limit);
  die "Error: cannot fetch region $chrom:$left_limit-$right_limit\n" unless ($slice);
  &print_slice_xrefs($slice);

## Feature file
} elsif ($ft_file) {
  ## Reject unknown formats up front: the columns could not be assigned
  unless (($ft_file_format eq 'ft') || ($ft_file_format eq 'gft')) {
    die "Error: unsupported feature file format $ft_file_format. Supported: ft, gft\n";
  }
  open(my $feat, '<', $ft_file) or die "cannot open feature file ".$ft_file.": $!\n";
  while (my $line = <$feat>) {
    chomp($line);
    next if (($line =~/^[#|;]/)||($line eq ""));
    my @fields = split (/\t/,$line);
    if ($ft_file_format eq "ft") {
      ## Columns: chrom, type, id, strand, left, right, comments
      ($chrom, $strand, $left_limit, $right_limit) = @fields[0, 3, 4, 5];
    } else {
      ## Columns: id, type, name, chrom, left, right, strand, comments
      ($chrom, $left_limit, $right_limit, $strand) = @fields[3, 4, 5, 6];
    }

    ## Skip lines that do not describe a complete region
    unless (defined($chrom) && defined($left_limit) && defined($right_limit)) {
      warn "; Skipping incomplete feature line: $line\n";
      next;
    }
    ## A missing strand column is treated as the forward strand
    $strand = 1 unless (defined($strand));

    ## Extract only chromosome number if necessary
    $chrom =~ s/chromosome:[\w\.]*?://;
    $chrom =~ s/:.*//;

    ## Transforms strand in ensembl format
    $strand =~ s/F/1/;
    $strand =~ s/R/-1/;
    $strand =~ s/D/1/;
    $strand =~ s/W/1/;
    $strand =~ s/C/-1/;
    $strand =~ s/>/1/;
    $strand =~ s/</-1/;

    my $slice = $slice_adaptor -> fetch_by_region('chromosome', $chrom, $left_limit, $right_limit, $strand);
    unless ($slice) {
      warn "; Cannot fetch region $chrom:$left_limit-$right_limit, feature skipped\n";
      next;
    }
    &print_slice_xrefs($slice);
  }
  close $feat;

## All genes
} elsif ($all) {
  ## from one chromosome
  if ($chrom) {
    my $slice = $slice_adaptor -> fetch_by_region('chromosome', $chrom);
    die "Error: cannot fetch chromosome $chrom\n" unless ($slice);
    &print_slice_xrefs($slice);
    ## From all chromosomes
  } else {
    foreach my $slice (@{$slice_adaptor->fetch_all("chromosome")}) {
      &print_slice_xrefs($slice);
    }
  }
## Query
} else {
  my $gene_adaptor = $db->get_GeneAdaptor();
  ## Input file of query IDs
  if ($query_file) {
    open(my $in, '<', $query_file) or die "cannot open query file ".$query_file.": $!\n";
    while (my $line = <$in>) {
      ## Only the text before the first tab is used as query
      $line =~s/\t.*//;
      chomp($line);
      next if ($line eq "");
      &print_query_xrefs($gene_adaptor, $line);
    }
    close $in;

  ## List of query IDs
  } else {
    foreach my $id (@queries) {
      next unless (defined($id) && ($id ne ""));
      &print_query_xrefs($gene_adaptor, $id);
    }
  }
}

################################################################
## Print the cross-references of every gene found on a slice.
## Argument: a slice object, as returned by the slice adaptor.
sub print_slice_xrefs {
  my ($slice) = @_;
  foreach my $gene (@{$slice->get_all_Genes()}) {
    &print_DBEntries($gene, $gene -> get_all_DBLinks());
  }
}

################################################################
## Print the cross-references of the gene(s) matching one query.
## Arguments: the gene adaptor and the query string.
## The query is looked up as a transcript, translation or gene stable
## ID depending on its pattern; any other query is first tried as a
## gene stable ID, then as an external name.
sub print_query_xrefs {
  my ($gene_adaptor, $query) = @_;
  my @genes;
  if (($query =~ /ENST\d/) || ($query =~ /ENS...T/)) {
    @genes = ($gene_adaptor -> fetch_by_transcript_stable_id($query));
  } elsif (($query =~ /ENSP\d/) || ($query =~ /ENS...P/)) {
    @genes = ($gene_adaptor -> fetch_by_translation_stable_id($query));
  } elsif (($query =~ /ENSG\d/) || ($query =~ /ENS...G/)) {
    @genes = ($gene_adaptor -> fetch_by_stable_id($query));
  } else {
    ## Fetch once and reuse the result instead of querying twice
    my $gene = $gene_adaptor -> fetch_by_stable_id($query);
    if ($gene) {
      @genes = ($gene);
    } else {
      @genes = @{$gene_adaptor -> fetch_all_by_external_name($query)};
    }
  }

  foreach my $gene (@genes) {
    ## A failed lookup leaves an undefined entry in the list
    next unless ($gene);
    &print_DBEntries($gene, $gene -> get_all_DBLinks());
  }
}

################################################################
## Report execution time
## The call itself has to be executed by all scripts; its result is
## only shown when verbosity was requested
my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
if ($verbose >= 1) {
  warn $exec_time;
}

## Print the database entries (cross-references) of one gene.
## Arguments: the gene object and a reference to the list of its
## database entries.
## One tab-separated line is written per entry to the global handle
## $fh: gene stable ID, database name, display ID.
sub print_DBEntries {
  my ($gene, $db_entries) = @_;
  for my $xref (@{$db_entries}) {
    print {$fh} (
      $gene->stable_id(), "\t",
      $xref->dbname(), "\t",
      $xref->display_id(), "\n",
    );
  }
}

## Read arguments
## Scans @ARGV and stores the option values in the script-level
## variables. Every position of @ARGV is examined in turn; an option
## that takes a value reads it from the following position.
## Unknown arguments are ignored. -h and -help print the help and exit.
sub ReadArguments {
  foreach my $i (0..$#ARGV) {
    my $option = $ARGV[$i];
    my $value = $ARGV[$i+1];	## undefined when the option is the last argument

    ### verbose: the level is optional and defaults to 1
    if ($option eq "-v") {
      if (&IsNatural($value)) {
	$verbose = $value;
      } else {
	$verbose = 1;
      }

      ### detailed help
    } elsif ($option eq "-h") {
      &PrintHelp();

      ### list of options
    } elsif ($option eq "-help") {
      &PrintShortHelp();

      ### output file
    } elsif ($option eq "-o") {
      $output_file = $value;

      ### EnsEMBL database server (host)
    } elsif ($option eq "-ensemblhost") {
      $ensembl_host = $value;

      ### EnsEMBL database version
    } elsif ($option eq "-dbversion") {
      $dbversion = $value;

      ### organism (stored in lower case)
    } elsif ($option eq "-org") {
      $org = lc($value);

      ### Query (can be used several times)
    } elsif ($option eq "-q") {
      push(@queries, $value);

      ### Query file
    } elsif ($option eq "-i") {
      $query_file = $value;

      ### chromosome name
    } elsif ($option eq "-chrom") {
      $chrom = $value;

      ### Left limit
    } elsif ($option eq "-left") {
      $left_limit = $value;

      ### Right limit
    } elsif ($option eq "-right") {
      $right_limit = $value;

      ### Strand
    } elsif ($option eq "-strand") {
      $strand = $value;

      ### Feature file
    } elsif ($option eq "-ftfile") {
      $ft_file = $value;

      ### Feature file format
    } elsif ($option eq "-ftfileformat") {
      $ft_file_format = $value;

      ### All genes
    } elsif ($option eq "-all") {
      $all = 1;
    }
  }
}

################################################################
#### detailed help message
## Prints the full help through the pager "less" and exits.
## When the pager cannot be started, the help is printed on the
## standard output instead of being lost.
sub PrintHelp {
    my $help;
    unless (open($help, '|-', 'less')) {
	$help = \*STDOUT;
    }
    print {$help} <<End_help;
USAGE
	get_ensembl_xref -org organism [-o outputfile] -q query | -i query file | -all

DESCRIPTION
	Returns cross-references for a list of query genes.

CATEGORY
	utility

REMARK  Requires local install of the EnsEMBL Perl Core API (see http://www.ensembl.org/info/using/api/api_installation.html)

OPTIONS
	-h	display the full help message

	-help	display the list of options

	-v [#]	verbosity level (1 when no level is given)

	-org organism
	        underscore between words (eg 'homo_sapiens')

	        This option is mandatory.

	        (type 'supported-organisms | grep EnsEMBL' to obtain the list of supported
	         organisms)

        -ensemblhost
                address of ensembl database server (default is EBI server)

        -dbversion
	        version of ensembl database (example: 47)

	-q query
		The query should be an EnsEMBL gene identifier (eg 'ENSG00000177799').
		Multiple queries can be entered by reiteratively using the -q
		option.

        -i     query file. The first word of each line is taken as a query.
                This option is incompatible with -q.

	-all	return x-references for all genes

	-o	name of the output file

        -chrom  Chromosome name or number (to use with -left and -right or -all)

        -left   Left limit of region to get x-references for

        -right  Right limit of region to get x-references for

        -strand Strand of region to get x-references for, when using -left and -right. Values: 1, -1

        -ftfile Feature file

        -ftfileformat
                Feature file format. Supported: ft, gft

End_help
    close $help;
    exit;
}

################################################################
#### list of options
## Prints the one-line-per-option summary through the pager "less" and
## exits. When the pager cannot be started, the summary is printed on
## the standard output instead of being lost.
sub PrintShortHelp {
  my $help;
  unless (open($help, '|-', 'less')) {
    $help = \*STDOUT;
  }
  print {$help} <<End_short_help;
get_ensembl_xref options
------------------------
-h              display the full help message
-help           display this list of options
-v              verbosity level
-org		organism
-ensemblhost    address of ensembl database server (default is EBI server)
-dbversion      version of ensembl database
-q		query
-i              query file
-all		returns x-references for all genes
-o		followed by the name of the outputfile.
-chrom          chromosome name or number (to use with -left and -right)
-left           left limit of region to get x-references for
-right          right limit of region to get x-references for
-strand         strand of region to get x-references for, when using -left and -right. Values: 1, -1
-ftfile         feature file
-ftfileformat   feature file format. Supported: ft, gft

End_short_help
  close $help;
  exit;
}
