#!/usr/bin/perl -w
############################################################
#
# $Id: graph-neighbours,v 1.25 2011/02/17 05:15:54 rsat Exp $
#
# Time-stamp: <2003-07-04 12:48:55 jvanheld>
#
############################################################

## use strict;

=pod

=head1 NAME

graph-neighbours

=head1 DESCRIPTION

Extract from a graph the neighbourhood of a set of seed nodes. 

=head1 AUTHORS

=over

=item Jacques van Helden <jvanheld@bigre.ulb.ac.be>

=item Sylvain Brohee <sylvain@bigre.ulb.ac.be>

=back

=head1 CATEGORY

graph analysis

=head1 USAGE
    
graph-neighbours [-i graph_file]  -steps # -seed seed_node1 [-seed seed_node2 ...] [-o outputfile] [-v #] [...]

graph-neighbours [-i graph_file]  -steps # -seedf seed_file [-o outputfile] [-v #] [...]

graph-neighbours [-i graph_file]  -steps # -all [-o outputfile] [-v #] [...]

=head1 INPUT FORMAT

See convert-graph for a description of the supported input formats.

=head1 OUTPUT FORMAT

Two types of output are available. 

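By default, the program returns a four-column tab-delimited file with the columns "neighbour", "seed", "number of steps" and "direction". A fifth column ("label") is added when the number of steps is 1 and a weight column is specified (option -wcol).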

Using the -stats option, the program will return one line for each seed node and display some information about the nodes. For the moment, it only works with weighted graphs.

=cut


BEGIN {
    ## Split the script path ($0) at its last component: the prematch of
    ## the regular expression is the directory part of the path (empty
    ## when the script was called without any directory). The lib/
    ## sub-directory of that location is appended to the module search
    ## path, so that the "require" statements below can find their files.
    if ($0 =~ /([^(\/)]+)$/) {
        my $script_dir = $`;
        push @INC, $script_dir . "lib/";
    }
}
require "RSA.lib";
require RSAT::Graph2;

################################################################
## Main package
package main;
{

    ################################################################
    ## Initialise parameters
    ##
    ## The variables below are deliberately package globals (some of
    ## them declared with "local"): &ReadArguments() and &Verbose()
    ## access them by name, so they must not be turned into lexicals.
    local $start_time = &RSAT::util::StartScript();

    %unknown_nodes = (); ## Names of the requested seeds that are not found in the graph
    $all_seeds = 0; ## Use all nodes as seeds
    $steps = 1; ## Number of steps passed to get_neighbours_id()
    $self_included = 0; ## Set to 1 by the option -self
    $direction = "all"; ## Direction filter applied to the reported neighbours
    %supported_direction = (
        all=>1,
        out=>1,
        in=>1
    );
    ## Sorted, so that the list displayed in error messages is reproducible
    $supported_directions = join (",", sort keys %supported_direction);

    ################################################################
    ## Initialize the graph
    $graph = RSAT::Graph2->new();
    $graph->set_attribute("label", "graph");

    ## Columns of the tab-delimited graph file
    local $source_col = 1;
    local $target_col = 2;
    local $weight_col = 0;

    #local $out_format = "classes";

    ## Input formats
    local $input_format = "tab";
    %supported_input_format =(
        tab=>1,
        gml=>1
    );
    ## Sorted, so that the list displayed in error messages is reproducible
    $supported_input_formats = join (",", sort keys %supported_input_format);


    %main::infile = ();
    %main::outfile = ();

    $main::verbose = 0;
#    $main::in = STDIN;
    $main::out = STDOUT;

    $stats = 0;

    ################################################################
    ## Read argument values
    &ReadArguments();

    ################################################################
    ## Check argument values: -stats, -self, -direction and a number of
    ## steps different from 1 are mutually incompatible.
    if ($stats && $steps != 1) {
        &RSAT::error::FatalError("-stats option cannot be used with number of steps > 1");
    }
    if ($stats && $self_included == 1) {
        &RSAT::error::FatalError("-stats option cannot be used with -self option");
    }
    if ($direction ne "all" && $self_included) {
        &RSAT::error::FatalError("You cannot choose the -direction $direction option when including self nodes");
    }
    if ($direction ne "all" && $stats) {
        &RSAT::error::FatalError("You cannot choose the -direction $direction option with the -stats output format");
    }
    if ($direction ne "all" && $steps > 1) {
        &RSAT::error::FatalError("You cannot choose the -direction $direction option with a number of steps higher than one");
    }

    ################################################################
    ## Open output stream
    $main::out = &OpenOutputFile($main::outfile{output});

    ################################################################
    ## Read input graph
    $graph->graph_from_text($input_format,$main::infile{graph}, $source_col, $target_col, $weight_col);

    ################################################################
    ## Build the list of seed nodes

    if ($all_seeds) {
        ## Use all nodes as seeds: put each node of the graph in the list
        my @seed_nodes = $graph->get_nodes();
        push @seed_names, @seed_nodes;
        &RSAT::message::TimeWarn("Using all",scalar(@seed_nodes), "nodes of the graph as seeds") if ($main::verbose >= 2);
    } elsif ($infile{seeds}) {
        ## Read seed nodes from a file (first word of each row)
        my $line_nb = 0;
        &RSAT::message::TimeWarn("Reading seed nodes from file", $infile{seeds}) if ($main::verbose >= 2);
        my ($seed_handle) = &OpenInputFile($infile{seeds});
        while (my $line = <$seed_handle>) {
            $line_nb++;
            next if ($line =~ /^\#/); ## Skip header lines
            next if ($line =~ /^--/); ## Skip comment lines
            next if ($line =~ /^;/); ## Skip comment lines
            next unless ($line =~ /\S/); ## Skip empty lines
            chomp($line);
            my @fields = split /\s+/, $line;
            my $name = $fields[0];
            ## The first field is empty when the line starts with a
            ## space. The test is done on the string length rather than
            ## on its truth value, because a node named "0" is a valid
            ## seed although "0" is false in Perl.
            if (defined($name) && $name ne "") {
                push @seed_names, $name;
            } else {
                &RSAT::message::Warning("Line", $line_nb, "starts with space. Skipped.");
            }
        }
        close $seed_handle;
    }

    ## Identify seed nodes in the graph
    if (!$all_seeds) {
        RSAT::message::TimeWarn("Identifying",scalar(@seed_names), "seed nodes in the graph") if ($main::verbose >= 2);
        foreach my $name (@seed_names) {
            my $node_id = $graph->node_by_name($name);
            if (defined($node_id)) {
                $seed_nodes_id{$node_id} = $name;
                &RSAT::message::Info("Identified node with name", $name, $node_id) if ($main::verbose >= 3);
            } else {
                $unknown_nodes{$name}++;
                &RSAT::message::Warning("The graph does not contain any node with name", $name);
            }
        }
    } else {
        ## NOTE(review): assumes that get_attribute() returns the
        ## id => name table as a flat list in list context; confirm
        ## against RSAT::Graph2.
        %seed_nodes_id = $graph->get_attribute("nodes_id_name");
    }

    ################################################################
    ## Print verbose
    &Verbose() if ($main::verbose);

    ################################################################
    ## Extract the neighborhood and prints the result

    ## Index of the last field printed for each neighbour (default output)
    my $range = 3;
    if ($stats == 0) {
        print $out '#',  join ("\t", "neighb", "seed", "steps", "dir");
        if ($steps == 1 && $weight_col) {
            ## One additional field is printed in this case
            print $out "\tlabel";
            $range = 4;
        }
        print $out "\n";
    } else {
        print $out '#'.join ("\t", "seed", "n", "mean","median", "min", "max","cluscoef","neighb")."\n";
    }

    ## Seeds are treated in sorted order (string sort on the node
    ## identifier) so that two runs on the same input produce the same
    ## output; hash order would differ from one run to the other.
    foreach my $seed_node_id (sort keys %seed_nodes_id) {
        &RSAT::message::Info("\t","Looking for neighbours of node", $seed_node_id, $seed_nodes_id{$seed_node_id}) if $main::verbose >= 2;
        my @neighbours = $graph->get_neighbours_id($seed_node_id, $steps, $self_included, $weight_col);

        if (!$stats) {
            ## Normal OUTPUT: one row per neighbour
            foreach my $neighbour (@neighbours) {
                next unless (@{$neighbour}); ## Skip empty records
                my @neighbours_array = @{$neighbour};
                if (($direction eq "all") || ($neighbours_array[3] eq $direction)) {
                    print $out join("\t", @neighbours_array[0..$range])."\n";
                }
            }

        } else {
            ## STATS OUTPUT: one row per seed

            ## The seed name is taken from the first neighbour record.
            ## The list is tested before indexing it, to avoid the
            ## autovivification of an element in an empty list.
            my $seed = @neighbours ? $neighbours[0][1] : undef;

            ## A seed named "0" is valid: test the string, not its truth value
            if (defined($seed) && $seed ne "") {
                my @weights = ();
                my @neighbours_list = ();
                foreach my $neighbour (@neighbours) {
                    push @neighbours_list, $neighbour->[0];
                    ## NOTE(review): $weight_col is validated as > 0 when
                    ## specified and defaults to 0, so ($weight_col >= 0)
                    ## looks always true; kept as is, confirm the intent.
                    if ($weight_col >= 0 && &IsReal($neighbour->[4])) {
                        push @weights, $neighbour->[4];
                    }
                }

                ## The seed is temporarily added to the node list passed
                ## to get_clust_coef(), then removed before printing.
                push @neighbours_list, $seed;
                my $clustering_coef = $graph->get_clust_coef(@neighbours_list);
                pop @neighbours_list;

                ## Summary statistics on the weights. With a single
                ## neighbour, all the statistics equal its weight.
                my %weight_summary = ();
                if (scalar(@neighbours) == 1) {
                    $weight_summary{'n'} = 1;
                    $weight_summary{'mean'} = $weights[0];
                    $weight_summary{'median'} = $weights[0];
                    $weight_summary{'min'} = $weights[0];
                    $weight_summary{'max'} = $weights[0];
                } else {
                    %weight_summary = &summary(@weights);
                }

                print $out join("\t",
                                $seed,
                                $weight_summary{'n'},
                                $weight_summary{'mean'},
                                $weight_summary{'median'},
                                $weight_summary{'min'},
                                $weight_summary{'max'},
                                sprintf("%.4f", $clustering_coef),
                                join(", ", @neighbours_list)), "\n";
            }
        }
    }

    ################################################################
    ## Close output stream
    my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
    print $main::out $exec_time if ($main::verbose >= 1);
    close $main::out if ($main::outfile{output});

    exit(0);
}

################################################################
################### SUBROUTINE DEFINITION ######################
################################################################


################################################################
## Display full help message 
sub PrintHelp {
    ## Format the POD documentation embedded in this script with
    ## pod2text, then stop the program.
    ##
    ## The list form of system() is used so that the script path ($0)
    ## is passed as a single argument without going through the shell:
    ## a path containing spaces or shell metacharacters is then handled
    ## correctly.
    system("pod2text", "-c", $0);
    exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
    ## There is no separate short help for this program: the option
    ## -help displays the same message as -h.
    return PrintHelp();
}

################################################################
## Read arguments 
sub ReadArguments {
    ## Parse the command-line arguments and store the option values in
    ## the package variables of the main program ($main::verbose,
    ## %main::infile, %main::outfile, @seed_names, $steps, ...).
    ##
    ## Takes no parameter and returns no meaningful value. Invalid
    ## options or option values are reported with
    ## &RSAT::error::FatalError().
    ##
    ## The documentation of each option is kept in a POD section next
    ## to the code that treats it.
    my $arg;
    my @arguments = @ARGV; ## create a copy to shift, because we need ARGV to report command line in &Verbose()
    while (scalar(@arguments) >= 1) {
        $arg = shift (@arguments);
        ## Verbosity

=pod

=head1 OPTIONS

=over 4

=item B<-v #>

Level of verbosity (detail in the warning messages during execution)

=cut
        if ($arg eq "-v") {
            ## The level is optional: it is only consumed if the next
            ## argument exists and is a natural number.
            if (scalar(@arguments) && &IsNatural($arguments[0])) {
                $main::verbose = shift(@arguments);
            } else {
                $main::verbose = 1;
            }

            ## Help message

=pod

=item B<-h>

Display full help message

=cut
        } elsif ($arg eq "-h") {
            &PrintHelp();

            ## List of options

=pod

=item B<-help>

Same as -h

=cut
        } elsif ($arg eq "-help") {
            &PrintOptions();


            ### Input format

=pod

=item B<-in_format input_format>

Input format. Supported: tab, gml

=cut
        } elsif ($arg eq "-in_format") {
            $input_format = shift(@arguments);
            &RSAT::error::FatalError("$input_format\tInvalid input format. Supported: $supported_input_formats")
                unless ($supported_input_format{$input_format});

            ## Graph file

=pod

=item B<-i graphfile>

If no graph file is specified, the standard input is used.  This
allows to use the command within a pipe.

=cut
        } elsif ($arg eq "-i") {
            $main::infile{graph} = shift(@arguments);

            ## Source column

=pod

=item B<-scol>

Source column. Column containing the source nodes in the tab-delimited
graph file.

=cut
        } elsif ($arg eq "-scol") {
            $source_col = shift (@arguments);
            unless (&IsNatural($source_col) && ($source_col > 0)) {
                &RSAT::error::FatalError(join("\t", $source_col, "Invalid value for the source column. Must be a strictly positive natural number"));
            }

            ## Target column

=pod

=item B<-tcol>

Target column. Column containing the target nodes in the tab-delimited
graph file.

=cut
        } elsif ($arg eq "-tcol") {
            $target_col = shift (@arguments);
            unless (&IsNatural($target_col) && ($target_col > 0)) {
                &RSAT::error::FatalError(join("\t", $target_col, "Invalid value for the target column. Must be a strictly positive natural number"));
            }

            ## Weight column

=pod

=item B<-wcol>

Weight column. Column containing the weights in the tab-delimited
graph file.

=cut
        } elsif ($arg eq "-wcol") {
            $weight_col = shift (@arguments);
            unless (&IsNatural($weight_col) && ($weight_col > 0)) {
                &RSAT::error::FatalError(join("\t", $weight_col, "Invalid value for the weight column. Must be a strictly positive natural number"));
            }

            ## Seed file

=pod

=item B<-seedf seed_file>

Seed file. The seed file specifies a list of seed nodes.

Seed file format: the first word of each row specifies one seed. The
rest of the row is ignored.

=cut
        } elsif ($arg eq "-seedf") {
            $main::infile{seeds} = shift(@arguments);

            ## All nodes as seeds

=pod

=item B<-all>

Use all the nodes of the input graph as seed. Each node is the seed of
one cluster indicating its neighbours.

=cut
        } elsif ($arg eq "-all") {
            $all_seeds = 1;

            ## Single seed node

=pod

=item B<-seed seed_node>

Specify one seed node. This option can be used iteratively to specify
several seed nodes.

=cut
        } elsif ($arg eq "-seed") {
            push @seed_names, shift(@arguments);

            ## Number of steps

=pod

=item B<-steps nb_of_steps>

Maximal number of steps between a seed node and its
neighbours. Must be a natural number. Default: 1.

=cut
        } elsif ($arg eq "-steps") {
            $main::steps = shift(@arguments);
            ## The main program compares the number of steps numerically,
            ## so anything else than a natural number is rejected here.
            unless (defined($main::steps) && &IsNatural($main::steps)) {
                &RSAT::error::FatalError(join("\t",
                                              (defined($main::steps) ? $main::steps : ""),
                                              "Invalid value for the number of steps. Must be a natural number"));
            }

            ## Self inclusion

=pod

=item B<-self>

Include each node in its neighborhood, with a distance of 0, even if
there is no self-loop at this node. This allows to extract the node
together with its neighborhood, rather than the neighborhood only
(default).


=cut
        } elsif ($arg eq "-self") {
            $main::self_included = 1;

            ## Direction of the neighbours

=pod

=item B<-direction>

Returns only the in or the out neighbours. The accepted values are all, in or out (default all). 

This option cannot be used with the -stats option, -self and with a number of steps higher than 1. 

=cut
        } elsif ($arg eq "-direction") {
            $main::direction = shift(@arguments);
            &RSAT::error::FatalError("$direction\tInvalid neighbours directions. Supported: $supported_directions")
                unless ($supported_direction{$direction});



            ## Output file

=pod

=item	B<-o outputfile>

If no output file is specified, the standard output is used.  This
allows to use the command within a pipe.

=cut
        } elsif ($arg eq "-o") {
            $main::outfile{output} = shift(@arguments);


            ### stats

=pod

=item	B<-stats>

Only valid when the number of steps is equal to 1. 
The output file is presented differently, with one line for each seed node.


=cut
        } elsif ($arg eq "-stats") {
            $stats = 1;

        } else {
            ## Same error routine as for all the other errors of this script
            &RSAT::error::FatalError(join("\t", "Invalid option", $arg));

        }

    }


=pod

=back

=cut

}

################################################################
#### verbose message
sub Verbose {
    ## Print a report of the parameters on the output stream
    ## ($main::out), as comment lines starting with a semicolon: command
    ## line, input and output files, graph size and seed nodes.
    ##
    ## Takes no parameter; reads the package variables set by the main
    ## program (%main::infile, %main::outfile, $graph, @seed_names,
    ## %seed_nodes_id, %unknown_nodes).
    print $main::out "; graph-neighbours ";
    &PrintArguments($main::out);

    ## Files are listed in sorted key order, to obtain the same report
    ## from one run to the other.
    if (%main::infile) {
        print $main::out "; Input files\n";
        foreach my $key (sort keys %main::infile) {
            print $main::out ";\t$key\t$main::infile{$key}\n";
        }
    }
    if (%main::outfile) {
        print $main::out "; Output files\n";
        foreach my $key (sort keys %main::outfile) {
            print $main::out ";\t$key\t$main::outfile{$key}\n";
        }
    }

    ## Report graph size
    my ($nodes, $arcs) = $graph->get_size();
    print $main::out "; Graph size\n";
    print $main::out ";\tnodes\t",$nodes,"\n";
    print $main::out ";\tarcs\t",$arcs,"\n";

    ## Report seed nodes. The detail (identified / unknown) is only
    ## printed when some of the requested seeds were not found.
    print $main::out "; Seed nodes\t",scalar(@seed_names),"\n";
    if (%unknown_nodes) {
        print $main::out ";\tidentified\t",scalar(keys (%seed_nodes_id)),"\n";
        print $main::out ";\tunknown nodes\t",scalar(keys (%unknown_nodes)),"\n";
        foreach my $name (sort keys %unknown_nodes) {
            print $main::out join ("\t", ";\t", "unknown", $name), "\n";
        }
    }
}


__END__

=pod

=head1 SEE ALSO

=over

=item I<convert-graph>

=item I<graph-get-clusters>

=item I<graph-node-degree>

=item I<compare-graphs>

=item I<random-graph>

=back

=cut

