#!/usr/bin/env perl

=pod

=head1 NAME

get-orthologs-compara

=head1 DESCRIPTION

Returns orthologues, plus optionally paralogues and homeologues, for
a set of genes in one or more organisms. Relies on primary data from
Ensembl Compara.

=head1 AUTHORS

=over

=item Bruno Contreras-Moreira <bcontreras\@eead.csic.es>

=item Jacques van Helden <Jacques.van-Helden\@univ-amu.fr>

=back

=head1 CATEGORY

comparative genomics

=head1 USAGE

get-orthologs-compara -q GENE1 -q GENE2 -ref_org organism_name [...]

=head1 HOMOLOGY CRITERIA

Ensembl Compara gene orthology and paralogy predictions are generated
by a pipeline where maximum likelihood phylogenetic gene trees play a
central role. They attempt to represent the evolutionary history of
gene families, i.e. genes that diverged from a common ancestor. These
gene trees reconciled with their species tree have their internal
nodes annotated to distinguish duplication or speciation events, and
thus support the annotation of orthologous and paralagous genes, which
can be part of complex one-to-many and many-to-many relations.

Pairs of homologous sequences are scored in terms of % amino acid identity,
which is calculated with respect to query and target in order to capture 
length and possibly domain content differentes. For example, if the 
species selected is Arabidopsis thaliana, and the homologue is in maize, 
the query sequence is the A.thaliana protein and the target is the maize 
protein. I<ident_query> in this case is the % of the query identical 
to the maize protein, and I<ident_target> is the % of the maize protein 
identical to the A.thaliana protein. These % identities will only be the 
same if the length (number of amino acids) of both sequences are the same. 

[Adapted from:
L<http://www.ensembl.org/info/genome/compara/homology_method.html>].

This program queries a BerkeleyDB db of Ensembl Compara records,
created with I<parse-compara>, which should be installed as
$RSAT/data/genomes/compara.dbd .

=head1 INPUT FORMAT

Query genes can be directly entered on the command line (option I<-q>)
or in an input file (option I<-i>). The first word of each row of such
a file is handled as a gene. Any additional information on the same
row is ignored.

As for genes, the reference organisms can also be entered on the
command line (option I<-ref_org>) or in an input file (option
I<-org_list>).

=head1 OUTPUT FORMAT

A tab-separated file with seven columns. Each row of the output file
describes a homology relation between a query gene and a target gene.
Output contains the following columns:

=over

=item 1. ID of target homologous gene (Ensembl gene_stable_id).

=item 2. Name of reference organism (of target gene).

=item 3. type of homology relationship.

=item 4. ID of the query gene.

=item 5. Name of query organism (of query gene).

=item 6. % protein sequence identity with respect to target length.

=item 7. % protein sequence identity with respect to query length.

=back

=head1 EXAMPLES

=head2 Quick demo

The option I<-demo> selects orthologs for a single gene in a single
genome.

 get-orthologs-compara -v 1 -demo                                                                                                                                                            

=head2 Get the list of suppported organisms

Beware, the list of organisms supported for compara is a subset of
those installed in the RSAT server. For this reason,
get-orthologs-compara has a specific option I<-supported_organisms> to
get the relevant list of organisms.

 get-orthologs-compara -v 1 -supported_organisms \
      -o compara_organisms.tab

=head2 Get orthologs from a query gene in a list of reference genomes

 get-orthologs-compara -v 1 -org_list compara_organisms.tab \
   -type ortholog -q AT5G45730.1 

=head2 Get all orthologs from a query genome, shortening genome names 

 get-orthologs-compara -query_org brachypodium_distachyon \
   -ref_org arabidopsis_thaliana -short -o brachy.thaliana.tab

=head2 Get paralogs for all genes of Arabidopsis thaliana in its own genome (inparalogues)

 get-orthologs-compara -query_org arabidopsis_thaliana \
   -ref_org arabidopsis_thaliana -type paralog

=head2 Get homeologs within the wheat genome with stringent identity cut-off values

 get-orthologs-compara -query_org triticum_aestivum \
   -ref_org triticum_aestivum -type homeolog \
   -ident_query 70 -ident_target 70

=cut

# get-orthologs-compara -v 1 \
#    -i $RSAT/public_html/data/genomes/Arabidopsis_thaliana.TAIR10.29/genome/genes.tab \
#       -type paralog -ref_org Arabidopsis_thaliana.TAIR10.29 \
#          -o Arabidopsis_thaliana.TAIR10.29_all_paralogs.tab
#
# get-orthologs-compara -v 1 \
#    -i $RSAT/public_html/data/genomes/Arabidopsis_thaliana.TAIR10.29/genome/genes.tab \
#       -type paralog -ref_org Arabidopsis_thaliana.TAIR10.29 \
#          -o Arabidopsis_thaliana.TAIR10.29_all_paralogs.tab
#
#          You can then count the number of paralogs per gene.
#
#           grep -v '^;' Arabidopsis_thaliana.TAIR10.29_all_paralogs.tab \
#              | cut -f 1 | sort | uniq -c  | sort -nr \
#                 > Arabidopsis_thaliana.TAIR10.29_all_paralogs_nb_per_gene.tab
#



BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
}

require "RSA.lib";
use DB_File;

################################################################
## Main package
package main;
{
  
  ################################################################
  #### initialise parameters and vars
  our $start_time = &RSAT::util::StartScript();
  our $comparabdb = "$ENV{RSAT}/data/genomes/compara.bdb";
  #$comparabdb = "$ENV{RSAT}/data/ensemblgenomes/plants/release-37/compara.bdb"; # debug
  
  our %infile = ();
  our %outfile = ();
  our %params = ( 'ident_target' => 0, 
		  'ident_query' => 0, 
		  'type' => 'ortholog',  
		  'supported_organisms' => 0, 
		  'demo' => 0 );

  our $verbose = 0;
  our $in = STDIN;
  our $out = STDOUT;
  
  our @query_genes = ();
  our @ref_organisms = ();
  our %organisms = ();
  our %homology_types = ();
  our (%db,%stats,%int2organisms,%int2organisms_fullname,%org2int_fullname);

  our %homology_subtypes = (
    'ortholog', { 'ortholog_many2many',1,'ortholog_one2many',1,'ortholog_one2one',1 },
    'paralog', { 'within_species_paralog',1,'other_paralog',1,'gene_split',1 },
    'homeolog',  { 'homoeolog_many2many',1,'homoeolog_one2many',1,'homoeolog_one2one',1 } );
  $homology_subtypes{'all'} = { 
      %{$homology_subtypes{'ortholog'}}, %{$homology_subtypes{'paralog'}}, %{$homology_subtypes{'homeolog'}}  };
  our $homology_subtypes = join ",", sort(keys(%homology_subtypes)); ## For the error message

  my ($n_of_results,$count,$idOK,$rows,$sp,$query_sp,$header,$orig_query) = (0);
  my ($ref_org,$query,$real_query,$id,$type,$subtype,$identity1,$identity2);

  # parse arguments
  &ReadArguments();
  
  # open BerkeleyDB database
  if($main::infile{'bdbfile'})
  {
     $comparabdb = $main::infile{'bdbfile'};
  }
  
  tie(%db,"DB_File",$comparabdb, O_RDONLY, 0666, $DB_HASH)
    || die "# $0 : cannot tie $comparabdb:$!\n";
  
  # get supported organisms and homology types, encoded as integers
  %int2organisms = split(/,/,$db{'supported_organisms'});
  %organisms = reverse(%int2organisms);
  %int2organisms_fullname = split(/,/,$db{'supported_organisms_fullname'});
  %org2int_fullname = reverse(%int2organisms_fullname);
  foreach $sp (keys(%org2int_fullname))
  {
    $organisms{$sp} = $org2int_fullname{$sp};
  }
   
  %homology_types = split(/,/,$db{'homology_types'});

  # open output stream
  $main::out = &OpenOutputFile($main::outfile{'output'});

  # check arguments
  if($main::params{'ref_org'} && !defined($organisms{$main::params{'ref_org'}})) {
      &RSAT::error::FatalError(join("\t", "Organism", $main::params{'ref_org'},
         "is not supported.","Use option -supported_organisms to get a list. "));
  }
  elsif($main::params{'query_org'} && ! defined($organisms{$main::params{'query_org'}})) {
     &RSAT::error::FatalError(join("\t", "Organism", $main::params{'query_org'},
        "is not supported.","Use option -supported_organisms to get a list. "));
  }
  
  if($params{'demo'})
  {
#    @query_genes = qw( BRADI4G31367.1 );
#    @ref_organisms = qw( triticum_aestivum );
      &doit("get-orthologs-compara -v ".$main::verbose." -q BRADI_1g48830v3 -ref_org triticum_aestivum");
      exit(0);
  }
  elsif($params{'supported_organisms'})
  {
#    &RSAT::message::Info("Supported organisms:");
#    &RSAT::message::Info(join(',',sort(keys(%organisms))));
      &Verbose() if ($main::verbose >= 1);
      
      print $main::out "; Supported organisms\n";
      foreach my $organism (sort(keys(%organisms))) {
	  print $main::out $organism, "\n";
      }
      close_and_quit();    
  }
  else
  {
    # query genes
    if($main::infile{'input'})
    {
      RSAT::message::Info("Reading genes from file",$main::infile{'input'}) if($verbose >= 2);

      my ($genefile,$listdir) = OpenInputFile($main::infile{'input'});
      while(<$genefile>)
      {
        next if(/^[#;]/);
        s/\r/\n/g;
        $id = (split)[0];
        push(@query_genes,$id);
      }
      close($genefile);
    }
    elsif($params{'query_org'})
    {
      $query_sp = 'gene_ids_'. $organisms{$params{'query_org'}};

      if($db{ $query_sp }){
        foreach $id (split(',', $db{ $query_sp })){
          push(@query_genes,$id); #print "$id\n";
        }
        RSAT::message::Info("Querying all genes in ",$params{'query_org'}, scalar(@query_genes)) if($verbose >= 2);
      }
      else{
        RSAT::error::FatalError("Cannot find gene IDs of organism:",$params{'query_org'})
      }
    }


    &RSAT::error::FatalError("You should indicate at least a query gene")
      unless(scalar(@query_genes) > 0);

    # homology type
    if(!$homology_subtypes{$params{'type'}})
    {
      &RSAT::error::FatalError("Unrecognized homology type. Supported: ", $homology_subtypes);
    }

    if($verbose >=2)
    {
      &RSAT::message::Info("Compara subtypes included in homology type", $params{'type'},":");
      my $all_subtypes = '';
      foreach $subtype (keys(%{$homology_subtypes{$params{'type'}}}))
      {
        $all_subtypes .= "$subtype,";
      }
      &RSAT::message::Info($all_subtypes);
    }

    # reference organisms
    &RSAT::error::FatalError("Please select a single reference organism or provide a list")
      unless ($params{'ref_org'} || $infile{'org_list'});
    
    &RSAT::error::FatalError("Options -ref_org and -org_list are mutually exclusive")
      if($params{'ref_org'} && $infile{'org_list'});
 
    if($params{'ref_org'}) # single ref organisms, already validate in ReadArguments
    {
      @ref_organisms = ( $params{'ref_org'} );  
    } 
    else # list of orgs in a file
    {
      RSAT::message::Info("Reading organisms from file",$infile{'org_list'}) if($verbose >= 2);

      my ($listfile,$listdir) = OpenInputFile($infile{'org_list'});
      while(<$listfile>)
      {
        next if(/^[#;]/);
        s/\r/\n/g;
        $sp = (split)[0];

        unless (defined($organisms{$sp})) 
        {
          &RSAT::error::FatalError(join("\t","Organism",$sp,
            "is not supported.","Use option -supported_organisms to get a list. "));
        }
        
        push(@ref_organisms,$sp);
      }
      close($listfile);

      if(scalar(@ref_organisms) < 1)
      {
        &RSAT::error::FatalError("File -org_list ", $infile{'org_list'}, 
          "should contain at least one valid organism name.",
          "Use option -supported_organisms to obtain the list.");
      }
    }

    # identity cutoffs
    if($params{'ident_target'} || $params{'ident_query'})
    {
      if($params{'ident_target'} < 0 || $params{'ident_target'} > 100)
      {
        &RSAT::error::FatalError("Parameter ident_target should take values in the range [0,100]");
      }
      if($params{'ident_query'} < 0 || $params{'ident_query'} > 100)
      {
        &RSAT::error::FatalError("Parameter ident_query should take values in the range [0,100]");
      }
    }
  }

  ## Print verbosity
  &Verbose() if ($main::verbose >= 1);


  # add header line to output  
  $header =  join("\t",
      'target_id',
      'ref_organism',
      'subtype',
      'query_id',
      'query_organism',
      'ident_target',
      'ident_query'
  );
  print $out "; $header\n";

  # look for homologues reported in Compara
  foreach $query (@query_genes)
  {
    # check query id and strip it if necessary
    $orig_query = $query;
    $real_query = $query;

    $idOK = id_exists_in_db($query);
    if(!$idOK)
    {
      my $trimmed_id = strip_gene_id($query);
      if($trimmed_id eq $query) 
      {
        $trimmed_id = uc($trimmed_id); # try upper-case
        $idOK = id_exists_in_db($trimmed_id);
      }
      else
      {  
        $idOK = id_exists_in_db($trimmed_id);
        if(!$idOK) 
        {
          $trimmed_id = uc($trimmed_id); # try upper-case
          $idOK = id_exists_in_db($trimmed_id);
        }
      }
      
      # TO BE DONE: here is where synonyms should be retrieved ...

      if(!$idOK)
      {
        # Bad IDs
        print $out "_cannot\tfind\tquery\t$query/$trimmed_id\tNA\tNA\tNA\n";
        $stats{$query}{'bad_gene_id'} = 1;
        next;
      }
      else
      {
        RSAT::message::Info("Change query",$query,"to",$trimmed_id) if($verbose >=2);
        $real_query = $trimmed_id;
      }
    }

    # query Compara for all requested organisms
    foreach $ref_org (@ref_organisms)
    {
      $count = 0;
      $rows = $db{$real_query.'_'.$organisms{$ref_org}} || '';

      foreach my $row (split(/\n/,$rows))
      {
        #Traes_5BL_8B50EC68A,36,6,76,74,5
        ($id,$species,$subtype,$identity1,$identity2,$query_sp) = split(/,/,$row);

        next if($identity1 < $params{'ident_query'} || 
          $identity2 < $params{'ident_target'});

        $subtype = $homology_types{$subtype};
        next if(!$homology_subtypes{$params{'type'}}{$subtype});

        if($params{'short'}){
          $species = $int2organisms{$species};
          $query_sp = $int2organisms{$query_sp};
        }
        else{ 
          $species = $int2organisms_fullname{$species};
          $query_sp = $int2organisms_fullname{$query_sp};
        }

        print $out "$id\t$species\t$subtype\t$orig_query\t$query_sp\t$identity2\t$identity1\n"; 

        $count++;
        $n_of_results++;
      }

      # collect stats
      $stats{$orig_query}{'total'} += $count;
    }
  }

  $stats{'all'}{'total'} = $n_of_results;  

  close_and_quit();
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################

################################################################
### Close output file and quit
sub close_and_quit {
    if ($main::verbose >= 1) 
    {
	if (defined($main::stats{'all'}{'total'})) {
	    print $main::out "; Total homologues reported\t", $main::stats{'all'}{'total'}, "\n";
	}
	foreach my $query (@main::query_genes) {
	    if($stats{$query}{'bad_gene_id'}) {
		print $main::out "; Bad query id ",$query,"\t","\n"; 
	    }
	    else {
		print $main::out "; Total homologues for ",$query,"\t",$main::stats{$query}{'total'},"\n";
	    }
	}
	
	
    my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
	print $main::out $exec_time;
    }
    
    if($main::outfile{'output'})
    {
	&RSAT::message::TimeWarn("Output file:", $main::outfile{'output'}) if ($main::verbose >= 0);
	close($main::out);
    }
    exit(0);
}

################################################################
#### display full help message 
sub PrintHelp {
    system "pod2text -c $0";
    exit()
}

################################################################
#### display short help message
sub PrintOptions {
    &PrintHelp();
}

################################################################
#### Read arguments 
sub ReadArguments {
  my $arg = "";
  
  my @arguments = @ARGV; 
 
  while ($arg = shift(@arguments)) {

=pod
    
=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut

    if ($arg eq "-v") {
      if (&IsNatural($arguments[0])){ $main::verbose = shift(@arguments) } 
    
=pod

=item B<-h>

Display full help message

=cut
    } elsif ($arg eq "-h") {
	  &PrintHelp();
=pod

=item B<-help>

Same as -h

=cut
	} elsif ($arg eq "-help") {
	  &PrintOptions();
	
=pod

=item B<-demo>

Run demo, ignores all other options.

=cut
    } elsif ($arg eq "-demo") {
      $main::params{'demo'} = 1;
    
=pod

=item B<-i inputfile>

If no input file is specified, the standard input is used.  This
allows to use the command within a pipe.

=cut
	} elsif ($arg eq "-i") {
	  &RSAT::error::FatalError("option -i is incompatible with option -q")
	    if (scalar(@main::query_genes) > 0);
	  $main::infile{'input'} = shift(@arguments);    
=pod

=item	B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
	} elsif ($arg eq "-o") {
	  $main::outfile{'output'} = shift(@arguments);
	    
=pod

=item B<-ref_org organism>

Reports homologues of this reference organism.

This option can be used iteratively to specify multiple reference
organisms. An alternative is to use the option I<-org_list> in order
to specify a file containing the list of reference organisms.

=cut

	} elsif ($arg eq "-ref_org") {
	  $main::params{'ref_org'} = shift (@arguments);
	  
      # May2017
      #unless (defined($main::organisms{$main::params{'ref_org'}})) {
	  #  &RSAT::error::FatalError(join("\t", "Organism", $main::params{'ref_org'}, 
	  #    "is not supported.","Use option -supported_organisms to get a list. "));
	  #}

=pod

=item B<-org_list>

This option gives the posibility to specify a set of reference
organisms instead of a single one. Homologous genes will only be searched in 
listed organisms.

File format: each row should contain the identifier of one
organism. Lines starting with # or ; are ignored.

=cut

	} elsif ($arg eq "-org_list") {
	  $main::infile{'org_list'} = shift(@arguments);

=pod

=item B<-bdbfile>

This option gives the posibility to specify a user-selected BerkeleyDB file
produced with parse-compara. A valid path to that file should be indicated.

=cut

    } elsif ($arg eq "-bdbfile") {
      $main::infile{'bdbfile'} = shift(@arguments);

=pod

=item B<-supported_organisms>

Lists supported organisms in local Compara database.

This option is incompatible with the option I<ref_org>.

To obtain a list of supported organisms, use the option
I<-supported_organisms>.

=cut

    } elsif ($arg eq "-supported_organisms") {
      $main::params{'supported_organisms'} = 1;
=pod

=item B<-short>

Print short organism names on output.

=cut

    } elsif ($arg eq "-short") {
      $main::params{'short'} = 1;
=pod

=item B<-q query_gene>

Query gene ID. 

This option can be used iteratively on the same command to specify
multiple query genes.

Alternatively, a list of query genes can be provided in a text file,
specified with the option I<-i>.

=cut
    } elsif ($arg eq "-q") {
      &RSAT::error::FatalError("Option -q is incompatible with option -i")
        if ($main::infile{input});
      push(@main::query_genes, shift(@arguments));

=pod

=item B<-query_org organism>

Name of query organism.

This option allows to search homologues for all genes in a query organism.
Genes with no homologues are not listed in output.

=cut
    } elsif ($arg eq "-query_org") {
      $main::params{'query_org'} = shift (@arguments);

      # May2017
      #unless (defined($main::organisms{$main::params{'query_org'}})) {
      #  &RSAT::error::FatalError(join("\t", "Organism", $main::params{'query_org'},
      #    "is not supported.","Use option -supported_organisms to get a list. "));
      #}

      &RSAT::error::FatalError("Option -query_org is incompatible with option -q")
        if (@main::query_genes);

      &RSAT::error::FatalError("Option -query_org is incompatible with option -i")
        if ($main::infile{input});

=pod

=item B<-type homology_type>

Supported homology types: ortholog, paralog, homeolog, all

Default: ortholog

=over

=item I<all>

Returns all types of homologues annotated in Compara.

=item I<ortholog> 

Returns only orthologous genes, including many2many, one2many & one2one

=item I<paralog>

Returns only paralogues, including within_species_paralog,
other_paralog & gene_split

=item I<homeolog>

Returns only homeologues, including many2many, one2many & one2one

I<Definition>: the term homoeologous, also spelled homeologous, is
used to describe the relationship of similar chromosomes or parts of
chromosomes brought together following inter-species hybridization and
allopolyploidization, and whose relationship was completely homologous
in an ancestral species [L<https://en.m.wikipedia.org/wiki/Polyploid>].

=back

=cut
	} elsif ($arg eq "-type") {
	  $main::params{'type'} = shift(@arguments);

=pod

=item B<-ident_target % amino acid identity over target length>

Homologues with lower ident_target values are skipped. Default: 0 .

=cut

    } elsif ($arg eq "-ident_target") {
      $main::params{'ident_target'} = shift(@arguments);

=pod

=item B<-ident_query % amino acid identity over query length>

Homologues with lower ident_query values are skipped. Default: 0 .

=cut

    } elsif ($arg eq "-ident_query") {
      $main::params{'ident_query'} = shift(@arguments);

=pod

=back

=cut
    } else {
      &FatalError(join("\t", "Invalid option", $arg));

    }
  }
}

################################################################
#### verbose message
sub Verbose {
  print $main::out "; get-orthologs-compara ";
  &PrintArguments($main::out);
  
  print $main::out "; BerkeleyDBD file: $comparabdb\n";

  if (%main::infile) {
    print $main::out "; Input files\n";
	while (my ($key,$value) = each %main::infile) {
	print $main::out ";\t$key\t$value\n";
    }
  }
  
  if (%main::outfile) {
    print $main::out "; Output files\n";
	while (my ($key,$value) = each %main::outfile) {
	  print $main::out ";\t$key\t$value\n";
    }
  }

  print $main::out "; Compara options\n";
  printf $main::out ";\t%-14s\t%s\n", "type", $main::params{'type'};
  printf $main::out ";\t%-14s\t%s\n", "ident_target", $main::params{'ident_target'};
  printf $main::out ";\t%-14s\t%s\n", "ident_query", $main::params{'ident_query'};

  print $main::out ("; Query genes\t",scalar(@main::query_genes),"\n");
  if (scalar(@main::query_genes) <= 100) {
    foreach my $query (@main::query_genes) {
	  print $main::out ";\t$query\n";
    }
  }

  print $main::out "; Reference organisms\t", scalar(@main::ref_organisms), "\n";
  foreach my $org (@main::ref_organisms) {
	print $main::out ";\t$org\n";
  }
}

#####################################################
## check whether ID exists, uses main vars
sub id_exists_in_db
{
  my ($id) = @_;
  
  my $count = 0;
  foreach my $sp (keys(%main::organisms))
  {
    if($main::db{$id.'_'.$main::organisms{$sp}}){ $count++; last; }
  }

  return $count;
}

#######################################################
## strips an ID of transcript number
sub strip_gene_id
{
  my ($id) = @_;

  my $trimmed_id = $id;
  $trimmed_id =~ s/\.\d+$//;

  return $trimmed_id;
}

__END__

=pod

=head1 DEMO

    get-orthologs-compara -q BRADI4G31367.1 -ref_org triticum_aestivum 

=cut
