#!/usr/bin/perl -w

## TO DO: calculate GC content of the motif in parameters

############################################################
#
# $Id: convert-matrix,v 1.88 2009/11/16 21:21:55 jvanheld Exp $
#
# Time-stamp: <2002-06-06 13:14:17 jvanheld>
#
############################################################
#use strict;
BEGIN {
  if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
  }
  require "RSA.lib";
}
use RSAT::MarkovModel;
use RSAT::matrix;
use RSAT::MatrixReader;
use File::Basename;
use Data::Dumper;

#### initialise parameters ####
my $start_time = &AlphaDate();
local $seqlogo_path = $ENV{seqlogo} || $ENV{RSAT}."/bin/seqlogo";
$seqlogo_path = &trim($seqlogo_path);
$decimals = 1;
$pseudo_weight = 1;
$equi_pseudo = 0;
$max_profile = 24;
$sep="\t";
$null = "NA";
$perm = 0;
$prefix = ""; ## Prefix to be added before motif names

local %infile = ();
local %outfile = ();
local @matrix_files = ();
local @matrices = ();
local $input_format = "";
local $output_format = "patser";
local $verbose = 0;
local $out = STDOUT;
local $info_log_base = exp(1);

local $bg_model = new RSAT::MarkovModel();

local $sort_key = "";
local $sort_order = "";

## Return type(s)
@return_fields = ();
%supported_return_type = (
			  "profile"=>1,
			  "counts"=>1,
			  "frequencies"=>1,
			  "weights"=>1,
			  "info"=>1,
			  "information"=>1,
			  "parameters"=>1,
			  "consensus"=>1,
			  "margins"=>1,
			  "sites"=>1,
			  "wdistrib"=>1,
			  "logo"=>1,
			 );
$supported_return_fields = join ",", sort keys %supported_return_type;

## input formats
%supported_input_format = %RSAT::MatrixReader::supported_input_format;
$supported_input_formats = join ",", sort keys %supported_input_format;

## output formats
%supported_output_format = %RSAT::matrix::supported_output_format;
$supported_output_formats = join ",", sort keys %supported_output_format;

## site formats
%supported_site_format = ('fasta'=>1,
			  "wc"=>1,
			  "multi"=>1,
			 );
$supported_site_formats = join ",", sort keys %supported_site_format;
$site_format="fasta";

## bg formats
$bg_format = "oligo-analysis";
%supported_bg_format = $bg_model->get_supported_input_formats();
$supported_bg_formats = join ",", sort keys %supported_bg_format;

## logo formats
local $logo_format = "png";
local $logo_opt = "";
local $logo_dir = "";
@logo_formats=();
%supported_logo_format =  ('png'=>1,
			  "eps"=>1,
#			  "gif"=>1, # log from seqlogo : check logo.conf
			  "pdf"=>1,
			 );
$supported_logo_formats = join ",", sort keys %supported_logo_format;



################################################################
## Read command-line arguments
&ReadArguments();

################################################################
## Check argument values ####

## Input format
unless ($input_format) {
    &RSAT::error::FatalError("You should specify the input matrix format.");
}

## Matrix provided with option -i
if ($infile{input}) {
    push @matrix_files, $infile{input};
}

## Matrix list has been provided
if ($infile{matrix_list}) {
    my ($mlist, $input_dir) = &OpenInputFile($infile{matrix_list});
    while (<$mlist>) {
	next if (/'^;'/);		# skip comment lines
	next if (/'^#'/);		# skip header lines
	next if (/'^--'/);	# skip mysql-type comment lines
	next unless (/\S/);	# skip empty lines
	my @fields = split /\s+/;
	my $matrix_file = $fields[0];
	push @matrix_files, $matrix_file;
    }
    close $mlist;
    &RSAT::message::Info("Read matrix list from file", $infile{matrix_list}, scalar(@matrix_files), "matrices") if ($main::verbose >= 2);
}

## Check that there is at least one input matrix
unless (scalar(@matrix_files >= 1)) {
    # &RSAT::error::FatalError("You must specify at least one matrix file.(option -i or -mlist)");
}

&RSAT::message::Info(scalar(@matrix_files), "input matrix files") if ($main::verbose >= 2);

## Check return type(s)
local %return_fields = ();
unless (scalar(@return_fields)) {
    if ((lc($output_format ) eq "transfac") ||
	(lc($output_format ) eq "consensus")) {
	push @return_fields, "counts";
    } else {
	&RSAT::error::FatalError("You should define at least one return type");
  }
}

## Permutations are only compatible with -return counts
if ($perm) {
  foreach my $field (@return_fields) {
    unless ($field eq "counts") {
      &RSAT::error::FatalError("The option -perm is only compatible with -return counts");
    }
  }
}


foreach my $field (@return_fields) {
    if ($supported_return_type{$field}) {
	$return_fields{$field}++;
    } else {
	&RSAT::error::FatalError("Invalid return type $field. Supported: ".$supported_return_fields);
    }
}

## Information
if ($return_fields{information}) {
    $return_fields{info} = 1;
    &RSAT::message::Warning("Option -return information is obsolete. Please use -return info.");
}

## Prior residue frequencies
local %prior = ();
if ($infile{prior}) {
    if (defined($bg_pseudo)) {
	$bg_model->force_attribute("bg_pseudo" => $bg_pseudo);
    }
    $bg_model->load_from_file($infile{prior}, $bg_format);
    %prior = $bg_model->get_attribute("suffix_proba");
    foreach my $key (sort keys %prior) {
	my $residue = lc($key);
	&RSAT::message::Debug("residue", $residue, "prior", $prior{$residue}) if ($main::verbose >= 2);
    }
}

## logo format
if ($return_fields{logo}) {
  if (scalar(@logo_formats)>=1){
    foreach my $logo_format (@logo_formats){
      unless ($supported_logo_format{$logo_format}) {
	&RSAT::error::FatalError("$logo_format\tInvalid format for logo\tSupported: $supported_logo_formats");
      }
    }
  }else{
    @logo_formats = ($logo_format);
  }

  ## Check that seqlogo is installed at the expected location
  if (-e $seqlogo_path) {
      if(-x $seqlogo_path) {
	&RSAT::message::Debug("seqlogo path", $seqlogo_path) if ($main::verbose >= 4);
      } else {
	  &RSAT::message::Warning("Cannot execute the program seqlogo", 
				  $seqlogo_path);
	  $seqlogo_path="";
      }
  } else {
      &RSAT::message::Warning("Cannot generate logos because the program seqlogo is not found in the expected path", 
			      $seqlogo_path);
	  $seqlogo_path="";
  }
}

################################################################
#### Perform the conversion

### open output file ###
$out = &OpenOutputFile($outfile{output});

##### print output ######
&Verbose if ($verbose);


## Read the input matrices and collect all matrices
if (scalar(@matrix_files >= 1)) {
  $m = 0;
  foreach my $matrix_file (@matrix_files) {
    my @matrices_from_file = &RSAT::MatrixReader::readFromFile($matrix_file, $input_format);
    foreach my $matrix (@matrices_from_file) {
      $m++;
      &SetMatrixName($matrix, $m, $matrix_file, $input_format);
      #       my ($matrix_name) = &RSAT::util::ShortFileName($matrix_file);
#       #	    $matrix_name = basename($matrix_file);
#       #	    $matrix_name =~ s/\.\w+$//; ## suppress the extension from the file name
#       $matrix_name =~ s/\.${input_format}$//; ## suppress the extension from the file name if it corresponds to the matrix format
#       unless (defined($matrix->get_attribute("name"))){
# 	$matrix->set_attribute("name", $matrix_name);
#       }
#       ## if we have more than one matrix in the same file a number is added to the attribute "name"
#       if (scalar(@matrices_from_file) > 1) {
# 	$matrix->force_attribute("name" => $matrix_name."_".$matrix->get_attribute("matrix.nb"));
#       }
      push @matrices, $matrix;
    }
  }
} else {
  my @matrices_from_file = &RSAT::MatrixReader::readFromFile($infile{input}, $input_format);
  my $m = 0;
  foreach my $matrix (@matrices_from_file) {
    $m++;
    &SetMatrixName($matrix, $m, "", $input_format);
    push @matrices, $matrix;
  }
}

################################################################
## Process each matrix
foreach my $matrix (@matrices) {

    $matrix->reverse_complement() if ($rc);

    $matrix->set_parameter("pseudo", $pseudo_weight);
    $matrix->set_attribute("equi_pseudo", $equi_pseudo);
    #  $matrix->push_attribute("parameters", "pseudo");
    $matrix->force_attribute("decimals", $decimals);
    $matrix->force_attribute("max_profile", $max_profile);
    $matrix->force_attribute("sep", $sep);
    # $matrix->force_attribute("col_width", $col_width);
    $matrix->force_attribute("margins", $return_fields{margins});
    $matrix->setInfoLogBase($info_log_base);
    $matrix->set_parameter("bg_markov_order", 0);	
    if ($infile{prior}) {
	$matrix->setPrior(%prior);
    }
    ## Calculate parameters before sorting because sorting can be done on
    ## a computed parameter rather than defined in the input file.
    if ($return_fields{parameters}) {
	$matrix->calcWeights();
	$matrix->calcInformation();
	$matrix->calcConsensus();
	$matrix->calcGCcontent();
    }
    ## Generate logo
    if (($return_fields{logo}) && ($seqlogo_path)) {
      unless ($logo_dir) {
	if ($outfile{output}) {
	  $logo_dir = dirname($outfile{output});
	} else {
	  $logo_dir = "logos";
	}
      }
      $logo_dir .= "/";
      $logo_dir =~ s|/+|/|g;
#      &RSAT::message::Debug("LOGO DIR", $logo_dir) if ($main::verbose >= 0);

      ## Create logo directory if required
      &RSAT::util::CheckOutDir($logo_dir);

      my $logo_file=$logo_dir.$matrix->get_attribute("name")."_logo";
      #	$logo_file =~ s/\.\w+$//;
      $matrix->set_attribute("logo_file" => $logo_file);

#      if ($logo_dir){
	$matrix->makeLogo($logo_file,\@logo_formats,$logo_dir, $logo_opt);
#      } else {
#	$matrix->makeLogo($logo_file,\@logo_formats,"",$logo_opt);
#      }
    }
}

## Sort the matrices if requested
if ($sort_key) {
    @matrices = &RSAT::MatrixReader::SortMatrices($sort_key, $sort_order, @matrices);
}

#### export the matrices
my $m = 0;
foreach my $matrix (@matrices) {
  $m++;
  if ((defined($top_matrices)) && ($m > $top_matrices)) {
    last;
  }
  &RSAT::message::Info("Exporting matrix", $m."/".scalar(@matrices))
    if ($main::verbose >= 3);

  if ($m > 1) {
    my $fields = join ('', keys (%return_fields));
    # unless ($fields eq "logo") {
    #print $out $RSAT::matrix::matrix_terminator{$output_format}, "\n";
    #}
  }
  if ($main::verbose >= 1) {
    print $out  ";\n; MATRIX ", $m."/".scalar(@matrices), " : " , $matrix->get_attribute("name"), "\n;\n";
  }


  ## Matrix permutations
  if ($perm) {
    my $ID = $matrix->get_attribute("identifier")
      || $matrix->get_attribute("name")
	|| $matrix->get_attribute("ID")
	  || "matrix_".$m;
    my $AC = $matrix->get_attribute("accession") || $ID;
    &RSAT::message::Debug("ExportMatrix", "ID", $ID, "AC", $AC) if ($main::verbose >= 4);
    &RSAT::message::Info("Permutating matrix", $matrix->get_attribute("name"))
      if ($main::verbose >= 2);
    for my $i (1..$perm) {

      # if ($i > 1) {
      #print $out $RSAT::matrix::matrix_terminator{$output_format}, "\n";
      #}

      ## Permute the matrix
      $matrix->permute_columns();
      my $ID_perm = join("_", $ID, "perm".$i);
      my $AC_perm = join("_", $AC, "perm".$i);
      $matrix->force_attribute("name", $ID_perm);
      $matrix->force_attribute("ID", $ID_perm);
      $matrix->force_attribute("identifier", $ID_perm);
      $matrix->force_attribute("accession", $AC_perm);
      &RSAT::message::Debug("Permutation", $i, "ID_perm", $ID_perm, "AC_perm", $AC_perm) if ($main::verbose >= 5);


      ## print result
      if ($main::verbose >= 1) {
	print $out "; ID\t$ID_perm\n" if ($verbose >=1);
	print $out "; AC\t$AC_perm\n" if ($verbose >=1);
	print $out "; permutation $i/$perm\n";
      }

#       print $out $matrix->toString(sep=>"\t",
# 				   type=>"counts",
# 				   format=>$output_format,
# 				   #				   format=>"tab",
# 				   pipe=>"" ## We suppress the pipe for permute-table
# 				  );
      &ExportMatrix($matrix, $m);
    }
  } else {
    &ExportMatrix($matrix, $m);
  }
}


## closing verbose
if ($verbose) {
  my $done_time = &AlphaDate();
  print $out "; Job started $start_time\n";
  print $out "; Job done    $done_time\n";
}

##### close output file
close $out if ($outfile{output});


exit(0);

################################################################
#################### subroutine definition #####################
################################################################

################################################################
## Display full help message 
sub PrintHelp {
  open HELP, "| more";
  print HELP <<End_of_help;
NAME
	convert-matrix

        1999 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

USAGE
        convert-matrix [-i inputfile] [-o outputfile] [-v]

DESCRIPTION

	Performs inter-conversions between various formats of
	position-specific scoring matrices (PSSM).

	The program also performs a statistical analysis of the
	original matrix to provide different position-specific scores
	(weight, frequencies, information contents), general
	statistics (E-value, total information content), and synthetic
	descriptions (consensus).

	PSSM can be used to represent the binding specificity of a
	transcription factor or the conserved residues of a protein
	domain.

	Each row of the matrix corresponds to one residue (nucleotide
	or amino-acid depending on the sequence type).  Each column
	corresponds to one position in the alignment.  The value
	within each cell represents the frequency of each residue at
	each position.

CATEGORY
	util
	conversion
	PSSM

OPTIONS
	-h	display full help message
	-help	display options
	-v	verbose

	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-mlist matrix_list
		Indicate a file containing a list of matrices to be used 
		for scanning the region. This facilitates the scanning of 
		a sequence with a library of matrices (e.g. all the matrices from RegulonDB, or TRANSFAC). 

		Format: the matrix list file is a text file. The first word of each
		row is suppose to indicate a file name. Any further information on the
		same row is ignored.

	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.

	-prior [deprecated, use -bgfile] prior frequency file
	
	-bgfile [deprecated] prior frequency file
		File indicating prior residue frequencies
		If no prior frequency file is specified, prior
		frequencies are read from the input file. In case this
		file does not contain any indication of prior
		frequencies, equal priors are assumed.
		
	-bg_format bg_format
		Format for the background model (prior) files.
		Supported formats: all the input formats supported by 
		convert-background-model.
        
	-bg_pseudo #
		Pseudo frequency for the background models. Value must be a real
		between 0 and 1 (default: $bg_model->{bg_pseudo})

	-from
		input matrix format
		Supported: $supported_input_formats

	-to
		output matrix format. 
		Supported: $supported_output_formats

		The option -out_format allows to export the matrix in
		different formats in order to use it as input for various
		pattern matching programs (e.g. patser, MotifScanner, ...).

		The option -return allows to specify the matrix content, the
		option -out_format its format. 

		Beware: each of these programs expectes to find a specific
		type of information in the matrix. For instance, patser uses a
		matrix of counts (or optionnally weights), whereas
		MotifScanner uses a frequency matrix. 

		In order to use a matrix as input for another program, the
		verbosity should be set to 0, in order to avoid comment lines
		(which would be misinterpreted by other programs).

		Recommended combinations of parameters

		Input for patser
		      -return counts -out_format patser -v 0

		Input for MotifScanner
		      -return frequencies -out_format MotifScanner -v 0

	-return	return type
		Supported: $supported_return_fields

		convert-matrix allows to perform various conversions, starting
		from an input occurrence matrix, in order to obtain various
		statistics such as frequencies, weights, information contents,
		.... The supported return types are described in detail below.

	-sort desc|asc|alpha key

		Sort matrices according to the specified attribute
		(sort_key). The sorting can be done on numerical
		values, either in descending (desc) or ascending (asc)
		order. It can also be done in alphabetical order
		(alpha).

		The key mst be one of the numeric parameters of the
		matrices (e.g. information.content, E-value, ...).

		This option is convenient, for example, to sort
		matrices from MotifSampler according to their
		information content:
			-sort desc MS.ic

	-top 
	        Maximal number of matrices to return. 
		Some of the input formats can contain several matrices
		in a single file (e.g. consensus, meme,
		MotifSampler). By default, all the matrices are parsed
		and exported. The option -top allows to restrict the
		number of matrices to be exported.

	-pseudo	
		pseudo-weight used for the calculation of the weight
		matrix (default: $pseudo_weight)

	-equi_pseudo
		If this option is called, the pseudo-weight is
		distributed in an equiprobable way between residues.
		By default, the pseudo-weight is distributed
		proportionally to residue priors.

	-base #
	      Base for the logarithms used in the scores involving a
 	      log-likelihood (weight and information
 	      content). Default: exp(1) (natural logarithms).

	      A common alternative to natural logarithms is to use
	      logarithms in base 2, in which case the information
	      content is computed in bits.

	-decimals
		Number of decimals to print for real matrices
		(frequencies, weights, information) or to compute
		score distributions.

		Warning: for the computation of score distributions,
		the computing time increases exponentially with the
		number of decimals. We recommend to restrict the
		precision to 2 decimals for the weight, this is
		generally more than sufficient.

	-prefix
		Prefix to be added before identifier(s) of the input
		matrix/matrices. This can be convenient for converting
		formats where matrices have no associated name
		(e.g. tab) to formats with ames (e.g. transfac).

	-perm #
		Number of permuted matrices to return. Matrix columns are permuted
		so that the total information content remains identical to the original
		matrix. Note that the output format for permuted matrix is tab.

	-max_profile
		Maximal width of the profile histogram (units = number
		of characters).

	-rc
		Convert the matrix to its reverse complement.

	-logo_format
		Format for logo image file. supported : $supported_logo_formats (default:png).
		Beware: the logo file will be created in current directory by default, unless
		the -logo_dir option is specified or the -o option. In the last case the logo
                will be in the same directory than the output file (-o ).
                Several formats can be sepcified at the same time by using comma as separator
                (eg. -logo_format png,pdf)

	-logo_dir
		Specifies a directory for the output of the logo
		file. By default, the logos are exported in the
		directory from which is called the convert-matrix
		command.

	-logo_opt 

		Any other option to be passed to seqlogo for
		generating the logo. This option can be used
		iteratively to specify multiple options.

		For instance
		    -logo_opt '-w 12' -logo_opt '-t "CRP matrix" -a'
		will add the following options to the commande seqlogo:
		     -w 12 -t "CRP matrix" -a

		To obtain the list of options supported by seqlogo, type
			    seqlogo

PRIOR FREQUENCY FILE

      The prior frequency file is a tab-delimited text file with one
      row per residue, and two columns. The first column indicates the
      residue, the second column its prior frequency.

      Prior frequency files can be generated with the program
      oligo-analysis.

      oligo-analysis -i reference-seq.fasta -l 1 -return freq -1str -v 1

INPUT/OUTPUT FORMATS

    Some formats are supported only for input, others for
    output. There are more formats accepted for input, because the
    general use of this program is to convert a PSSM obtained from a
    database (e.g. TRANSFAC) or a pattern-discovery program
    (e.g. consensus, gibbs, meme, MotifSampler, ...) and obtain a
    matrix either for scanning (with matrix-scan) or for computing
    statistical parameters (see the return fields below).

    tab (input/output)
	tab-delimited file. One row per residue, one column per
	position. The first column of each row indicates the residue,
	the following columns give the frequency of that residue
	at the corresponding position of the matrix.
	e.g.:
	
	a	|	7	9	0	0	16	0	1	0	0	11	6	9	6	1	8
	c	|	5	1	4	16	0	15	0	0	0	3	5	5	0	2	0
	g	|	4	4	1	0	0	0	15	0	16	0	3	0	0	2	0
	t	|	0	2	11	0	0	1	0	16	0	2	2	2	10	11	8
	

        The tab format accepts a user-specific set of return fields
        (option -return), provigind different statistics on the matrix
        (counts, frequencies, weights, information, other parameters:
        see description below).

    patser (output)
        This format can be used as input to scan sequences with
        patser, the pattern-matching program developed by Jerry Hertz.

        This is actually the same format as tab (described above), but
        the only return field is the count matrix.

    assembly (input)
	Output file from the program pattern-assembly. One assembly
	file can contain zero, one or several assemblies. Each
	assembly is converted to a position-specific scoring matrix by
	taking, for each residue at each position, the score of the
	most significant pattern (oligonucleotide) containing that
	residue in this position of the assembly.

    consensus (input/output)
	Output file from consensus, the pattern-discovery program
	developed by Jerry Hertz (Hertz et al., Comput Appl Biosci,
	1990:6, 81-92). This file contains one or several matrices, +
	additional information on the parameters used for pattern
	discovery (e.g. prior residue frequencies).

    gibbs (input)
	Output file from gibbs, the pattern-discovery program
	developed by Andrew Neuwald (Lawrence et al. Science, 1993:
	262, 208-214; Neuwald, et al. Protein Sci, 1995: 4, 1618-1632)

    meme (input)
	Output file from MEME, the pattern-discovery program developed by
	tim Bailey.This file contains one or several matrices, +
	additional information on the parameters used for pattern
	discovery (e.g. prior residue frequencies).

    MotifSampler (input/output)
	Output file from MotifSampler, the pattern-discovery program
	developed by Gert Thijs (Thijs et al. Bioinformatics, 2001:17,
	1113-1122).

    TRANSFAC (input/output)
        Format used in the TRANSFAC database;
        (http://www.gene-regulation.com/pub/databases.html)

    InfoGibbs (input)
        Output file from InfoGiibbs.

        InfoGibbs is a gibbs sampler based on the optimization of the
        information content of the matrix (rather than the weight of
        the sampled segments). InfoGibbs is currently under
        development in the RSAT team (Grgory Gathy, unpublished).

    cb /trap (input/output)
	Cluster-Buster output file (usual extention .cb). The header
	line starts with a > (like in fasta format). The matrix is
	then printed "vertically" on the following lines: each column
	corresponds to one residue, and each row to a position in the
	alignment.
	For TRAP (Roider et al, Bioinformatics, 2007), the "/name=" is necessary for the program to work.
	
	>element1 /name=element1
	0  4 2 14
	12 0 0 8
	8  0 1 11
	20 0 0 0
	....
	

    feature (input)
        Output file from convert-features. 

        This format allows to obtain a PSSM from a list of (supposedly
        pre-aligned) sites. These sites can themselves have been
        collected by scanning sequences with a matrix (matrix-scan) or
        by searching string-based patterns in a sequence
        (dna-pattern). 

        Converting features to matrices can for example be useful for
        iterative refinment of a matrix (colecting sites from a
        matrix, and building a matrix from those sites).

        Another application is to detect oligomers or dyads in a
        sequence set, and build a matrix from these.

    clustal (input)
	The popular multiple alignemnt program clustalw. 

RETURN FIELDS FOR THE TAB-DELIMITED OUTPUT FORMAT

   The tab output format supports user-specified return fields.

    counts
        Each cell of the matrix indicates the number of occurrences of the
        residue at a given position of the alignment.

    profile
	The matrix is printed vertically (each matrix column becomes a
	row in the output text). Additional parameters are indicated
	besides each position, and a histogram is drawed.

    crude frequencies
        Relative frequencies are calculated as the counts of residues
        divided by the total count of the column.

        Fij=Cij/SUMi(Cij)

        where

        Cij is the absolute frequency (counts) of residue i at position j of
            the alignment

        Fij is the relative frequency of residue i at position j of the
            alignment

    frequencies corrected with pseudo-weights
        Relative frequencies can be corrected by a pseudo-weight (b) to
        reduce the bias due to the small number of observations.

        F\'ij=Cij+b*Pi/[SUMi(Cij)+b]

        where

        Pi  is the prior frequency for residue i

        b   is the pseudo-weight, which is "shared" between residues
            according to their prior frequencies.

    weights
        Weights are calculated according to the formula from Hertz (1999),
        as the natural logarithm of the ratio between the relative frequency
        (corrected for pseudo-weights) and the prior residue probability.

        Wij=ln(F\'ij/Pi)

    information matrix
        The crude information content is calculated according to the formula
        from Hertz (1999).

        Iij = Fij*ln(Fij/Pi)

        In addition, we calculate a "corrected" information content which
        takes pseudo-weights into account.

        I\'ij = F\'ij*ln(F\'ij/Pi)

	One advantage of the corrected information content is that it
	assigns finite values when Fij=0.

    margins
	Calculate marginal values (column and row sum, min, max) for each 
        matrix.

    parameters
        Returns a series of parameters associated to the matrix. The
        list of parameters to be exported depends on the input formats
        (each pattern discovery program returns specific parameters,
        which are more or less related to each others but not
        identical).

        Some additional parameters are optionally calculated

        consensus
            The degenerate consensus is calculated by collecting, at each
            position, the list of residues with a positive weight.
            Contrarily to most applications, this consensus is thus weighted
            by prior residue frequencies: a residue with a high frequency
            might not be represented in the consensus if this frequency does
            not significantly exceed the expected frequency. Uppercases are
            used to highlight weights >= 1.

            The consensus is exported as regular expression, and with the
            IUPAC code for ambiguous nucleotides
            (http://www.chem.qmw.ac.uk/iupac/misc/naseq.html).

                    A                       (Adenine) 
                    C                       (Cytosine)
                    G                       (Guanine)
                    T                       (Thymine)
                    R       = A or G        (puRines)
                    Y       = C or T        (pYrimidines)
                    W       = A or T        (Weak hydrogen bonding)
                    S       = G or C        (Strong hydrogen bonding)
                    M       = A or C        (aMino group at common position)
                    K       = G or T        (Keto group at common position)
                    H       = A, C or T     (not G)
                    B       = G, C or T     (not A)
                    V       = G, A, C       (not T)
                    D       = G, A or T     (not C)
                    N       = G, A, C or T  (aNy)

            The strict consensus indicates, at each position, the residue
            with the highest positive weight.

        information
            The total information is calculated by summing the
            information content of all the cells of the matrix. This
            parameters is already returned by the program consensus
            (Hertz), but not by other programs.

	wdistrib (deprecated!)
	    Theoretical distribution of weight probabilities (computed
	    as in Bailey, Bioinformatics, 1999).
            WARNING: use matrix-distrib instead

	logo 
	    Sequence logo, a visual representation of the motif, where
	    each column of the matrix is represented as a stack of
	    letters whose size is proportional to the corresponding
	    residue frequency. The total height of each column is
	    proportional to its information content.

	    Sequence logo are generated using the freeware program Weblogo
	    (http://weblogo.berkeley.edu/).

REFERENCES

    Matrix theory
        Hertz, G.Z. and G.D. Stormo (1999). Identifying DNA and
        protein patterns with statistically significant alignments of
        multiple sequences. Bioinformatics, 15(7-8): p. 563-77.

    Sequence logos
        Schneider, T. D. & Stephens, R. M. Sequence logos: a new way
        to display consensus sequences. Nucleic Acids Res 18,
        6097-6100 (1990).

    Weblogo
        Crooks, G. E., Hon, G., Chandonia, J. M. & Brenner,
        S. E. WebLogo: a sequence logo generator. Genome Res 14,
        1188-1190, doi:10.1101/gr.849004 14/6/1188 (2004).


End_of_help
  close HELP;
  exit(0);
}

################################################################
## Display short help message
sub PrintOptions {
  open HELP, "| more";
  print HELP <<End_short_help;
convert-matrix options
----------------
-h		(must be first argument) display full help message
-help		(must be first argument) display options
-i		input file
-mlist 	matrix_list
-o		output file
-v		verbose
-from		input matrix format. Supported: $supported_input_formats
-to		output matrix format. Supported: $supported_output_formats
-return		return type(s). Supported: $supported_return_fields
-sort desc|asc|alpha key     Sort matrices according to the specified key.
-top		maximal number of matrices to return. 
-prior [deprecated, use -bgfile] 		background model (prior frequency file)
-bgfile		background model (prior frequency file)
-bg_format	background model format. Supported: $supported_bg_formats
-bg_pseudo	pseudo-frequency for background model. (default:$bg_model->{bg_pseudo})
-pseudo		pseudo-weight (default: $pseudo_weight)
-equi_pseudo	equiprobable distribution of pseudo-weights between residues
-base		base for the logarithms used to compute the weight and info content
-decimals	number of decimals digits to print for real matrices
-prefix		prefix to add before matrix identifiers
-perm		number of permuted matrices to return.
-max_profile	Maximal width of the profile histogram 
-rc		Convert the matrix to its reverse complement.
-logo_format	logo image format. Supported $supported_logo_formats 
-logo_dir	specifies a directory to output the logo files
-logo_opt	options to be passed to seqlogo
End_short_help
  close HELP;
  exit(0);
}


################################################################
## Read arguments
sub ReadArguments {
    foreach my $a (0..$#ARGV) {
	### verbose ###
	if ($ARGV[$a] eq "-v") {
	    if (&IsNatural($ARGV[$a+1])) {
		$verbose = $ARGV[$a+1];
	    } else {
		$verbose = 1;
	    }

	    ### detailed help
	} elsif ($ARGV[$a] eq "-h") {
	    &PrintHelp();

	    ### list of options
	} elsif ($ARGV[$a] eq "-help") {
	    &PrintOptions();

	    ### input file 
	} elsif ($ARGV[$a] eq "-i") {
	    $infile{input} = $ARGV[$a+1];
	    
	    ### list of input file 
	} elsif ($ARGV[$a] eq "-mlist") {
	    $infile{matrix_list} = $ARGV[$a+1];;
	    
	    ### Maximal number of matrices to return
	} elsif ($ARGV[$a] eq "-top") {
	    $top_matrices = $ARGV[$a+1];
	    &FatalError(join("\t", $top_matrices, "Invalid value for the option -top. Should be a strictly positive Natural number.")) 
		unless ((&IsNatural($top_matrices)) && ($top_matrices >= 1));

	    ## Sort matrices
	} elsif ($ARGV[$a] eq "-sort") {
	    $sort_order = lc($ARGV[$a+1]);
	    unless (($sort_order eq "desc")
		    || ($sort_order eq "asc")
		    || ($sort_order eq "alpha")
		) {
		&RSAT::error::FatalError($sort_order, "is not a valid sorting order. Supported: desc,asc,alpha.");
	    }
	    $sort_key = $ARGV[$a+2];

	    ### prior frequency file
	} elsif ($ARGV[$a] eq "-prior") {
	    $infile{prior} = $ARGV[$a+1];

	} elsif ($ARGV[$a] eq "-bgfile") {
	    $infile{prior} = $ARGV[$a+1];

	    ## bg format
	} elsif ($ARGV[$a] eq "-bg_format") {
	    $main::bg_format = lc($ARGV[$a+1]);
	    &RSAT::error::FatalError(join("\t", $main::bg_format,
					  "Invalid input format.",
					  "Supported: ", $main::bg_format))
		unless ($main::supported_bg_format{$main::bg_format});  
	    
	    ##bg_pseudo
	} elsif ($ARGV[$a] eq "-bg_pseudo") {
	    $main::bg_pseudo = $ARGV[$a+1];
	    &RSAT::error::FatalError(join("\t", $main::bg_pseudo,
					  "Invalid value for bg_pseudo, should be a Real number between 0 and 1."))
		unless ((&IsReal($main::bg_pseudo)) && (0 <= $main::bg_pseudo) && ($main::bg_pseudo <= 1));

	    ### output file 
	} elsif ($ARGV[$a] eq "-o") {
	    $outfile{output} = $ARGV[$a+1];

	    ### return type(s)
	} elsif ($ARGV[$a] eq "-return") {
	    my $return_fields = $ARGV[$a+1];
	    push @return_fields, split(",", $return_fields);


	    ### input format 
	} elsif ($ARGV[$a] eq "-from") {
	    $input_format = lc($ARGV[$a+1]);
	    unless ($supported_input_format{$input_format}) {
		&RSAT::error::FatalError("$input_format\tInvalid input format for matrix\tSupported: $supported_input_formats");
	    }
	} elsif ($ARGV[$a] eq "-in_format") {
	    &RSAT::message::Warning("The option -in_format is obsolete, please use -from instead.");
	    $input_format = lc($ARGV[$a+1]);
	    unless ($supported_input_format{$input_format}) {
		&RSAT::error::FatalError("$input_format\tInvalid output format for matrix\tSupported: $supported_input_formats");
	    }

	    ### Obsolete option for the input format 
	} elsif ($ARGV[$a] eq "-format") {
	    &RSAT::message::Warning("Option -format is obsolete. Use -in_format instead.");
	    $input_format = $ARGV[$a+1];
	    unless ($supported_input_format{$input_format}) {
		&RSAT::error::FatalError("$input_format\tInvalid format for input matrix\tSupported: $supported_input_formats");
	    }

	    ### output format 
	} elsif ($ARGV[$a] eq "-to") {
	    $output_format = $ARGV[$a+1];
	    unless ($supported_output_format{$output_format}) {
		&RSAT::error::FatalError("$output_format\tInvalid format for output matrix\tSupported: $supported_output_formats");
	    }
	} elsif ($ARGV[$a] eq "-out_format") {
	    &RSAT::message::Warning("The option -out_format is obsolete, please use -to instead.");
	    $output_format = $ARGV[$a+1];
	    unless ($supported_output_format{$output_format}) {
		&RSAT::error::FatalError("$output_format\tInvalid format for output matrix\tSupported: $supported_output_formats");
	    }

	    ### site format 
	} elsif ($ARGV[$a] eq "-site_format") {
	    $site_format = $ARGV[$a+1];
	    unless ($supported_site_format{$site_format}) {
		&RSAT::error::FatalError("$site_format\tInvalid format for sites\tSupported: $supported_site_formats");
	    }

	    ### pseudo-weight
	} elsif ($ARGV[$a] eq "-pseudo") {
	    $pseudo_weight = $ARGV[$a+1];
	    unless (&IsReal($pseudo_weight)) {
		&RSAT::error::FatalError("Invalid pseudo-weight $pseudo_weight. Must be a real value");
	    }

	    ## Equiprobable distribution of the pseudo-weight
	} elsif ($ARGV[$a] eq "-equi_pseudo") {
	    $equi_pseudo = 1;


	    ## base for the logarihtms in the weight and info content
	} elsif ($ARGV[$a] eq "-base") {
	    $info_log_base = $ARGV[$a+1];
	    &RSAT::error::FatalError("base should be a real number") unless (&IsReal($info_log_base));
	    &RSAT::error::FatalError("base should be larger than 1") if ($info_log_base <= 1);

	    ### decimals
	} elsif ($ARGV[$a] eq "-decimals") {
	    $decimals = $ARGV[$a+1];
	    unless (&IsNatural($decimals)) {
		&RSAT::error::FatalError("Invalid decimals $decimals. Must be a natural value");
	    }

	    ### prefix
	} elsif ($ARGV[$a] eq "-prefix") {
	    $main::prefix = $ARGV[$a+1];
	    $main::prefix =~s/\s/_/g;

	    ### permutations
	} elsif ($ARGV[$a] eq "-perm") {
	    $perm = $ARGV[$a+1];
	    unless (&IsNatural($perm)) {
		&RSAT::error::FatalError("Invalid permutation number $perm. Must be a natural value");
	    }

	    ### max_profile
	} elsif ($ARGV[$a] eq "-max_profile") {
	    $max_profile = $ARGV[$a+1];
	    unless (&IsNatural($max_profile)) {
		&RSAT::error::FatalError("Invalid max_profile $max_profile. Must be a natural value");
	    }

	    ## Compute reverse complement

	    ## Equiprobable distribution of the pseudo-weight
	} elsif ($ARGV[$a] eq "-rc") {
	    $rc = 1;

	    ## Export logos
	} elsif ($ARGV[$a] eq "-logo_format") {
	    $logo_formats = $ARGV[$a+1];
	    push @logo_formats, split(",", $logo_formats);

	} elsif ($ARGV[$a] eq "-logo_dir") {
	    $logo_dir = $ARGV[$a+1];

	} elsif ($ARGV[$a] eq "-logo_opt") {
	    $logo_opt .= " ".$ARGV[$a+1];

	}
    }
}

################################################################
## Print verbosity
sub Verbose {
  print $out "; convert-matrix ";
  &PrintArguments($out);
  if (defined(%infile)) {
    print $out "; Input files\n";
    while (($key,$value) = each %infile) {
      print $out ";\t$key\t$value\n";
    }
  }
  printf $out "%-22s\t%s\n", "; Input format", $input_format;
  if (defined(%outfile)) {
    print $out "; Output files\n";
    while (($key,$value) = each %outfile) {
      print $out ";\t$key\t$value\n";
    }
  }
  printf $out "%-22s\t%s\n", "; Output format", $output_format;

  printf $out "%-22s\t%d\n", "; pseudo-weight", $pseudo_weight;
      ## Background model
    if ($infile{prior}) {
   	printf $main::out "; Background model\n";
	my $order = $bg_model->get_attribute("order");
    if ($order == 0) {
	printf $main::out ";\t%-14s\n", "Bernoulli model (order=0)";
    }
   	printf $main::out ";\t%-14s\t%s\n", "Strand", $bg_model->get_attribute("strand");
    printf $main::out ";\t%-14s\t%s\n", "Background pseudo-frequency", $bg_model->get_attribute("bg_pseudo");

    my %bg_prior = $bg_model->get_attribute("suffix_proba");
    print $main::out ";\tResidue probabilities\n";
    foreach my $residue (sort keys %bg_prior) {
	printf $main::out ";\t\t%s\t%.5f\n", $residue, $bg_prior{$residue};
    }
    }
}

################################################################
## Choose the name of a matrix
sub SetMatrixName {
  my ($matrix, $m, $matrix_file, $input_format) = @_;
  my $matrix_name;
  my $matrix_ac = $matrix->get_attribute("accession");
  my $matrix_id = $matrix->get_attribute("id");
  if ($matrix_ac) {
    $matrix_name = $matrix_ac;
  } elsif ($matrix_file) {
    $matrix_name = &RSAT::util::ShortFileName($matrix_file);
    $matrix_name =~ s/\.${input_format}$//; ## suppress the extension from the file name if it corresponds to the matrix format
    if (scalar(@matrices) > 0) {
      $matrix_name .= "_m".$m;
    }
  } else {
    $matrix_name = "matrix";
    if (scalar(@matrices) > 0) {
      $matrix_name .= "_m".$m;
    }
  }
  unless (defined($matrix->get_attribute("name"))){
    $matrix->set_attribute("name", $matrix_name);
  }
  &RSAT::message::Info($m,
		       "name=".$matrix->get_attribute("name"),
		       "id=".$matrix->get_attribute("id"),
		       "format=".$input_format,
		       "file=".$matrix_file,
		      ) if ($main::verbose >= 5);
}

################################################################
## Export one matrix
sub ExportMatrix {
  my ($matrix, $m) = @_;
  #  $matrix->readFromFile($infile{input}, $input_format);
  if ($infile{prior}) {
    $matrix->setPrior(%prior);
  }

  my $ID = $matrix->get_attribute("identifier")
      || $matrix->get_attribute("name")
      || $matrix->get_attribute("id")
      || $matrix->get_attribute("AC")
      || "matrix_".$m;
  my $AC = $matrix->get_attribute("accession") || $ID;
  &RSAT::message::Debug("ExportMatrix", "ID", $ID, "AC", $AC) if ($main::verbose >= 4);
  
  if ($prefix) {
      $matrix->force_attribute("name", $prefix.$matrix->get_attribute("name"));
      $matrix->force_attribute("id", $prefix.$matrix->get_attribute("id"));
      $matrix->force_attribute("AC", $prefix.$matrix->get_attribute("AC"));
  }
#   ## Matrix permutations
#   if ($perm) {
#     &RSAT::message::Info("Permutating matrix", $matrix->get_attribute("name"))
#       if ($main::verbose >= 0);
#     ## permute matrix
#     for my $i (1..$perm) {
#      # if ($i > 1) {
# 	#print $out $RSAT::matrix::matrix_terminator{$output_format}, "\n";
#       #}
#       ## Permute the matrix
#       $matrix->permute_columns();
#       my $ID_perm = join("_", $ID, "perm".$i);
#       my $AC_perm = join("_", $AC, "perm".$i);
#       $matrix->force_attribute("name", $ID_perm);
#       $matrix->force_attribute("ID", $ID_perm);
#       $matrix->force_attribute("identifier", $ID_perm);
#       $matrix->force_attribute("accession", $AC_perm);
#       &RSAT::message::Debug("Permutation", $i, "ID_perm", $ID_perm, "AC_perm", $AC_perm) if ($main::verbose >= 5);
#       ## print result
#       if ($main::verbose >= 1) {
# 	print $out "; ID\t$ID_perm\n" if ($verbose >=1);
# 	print $out "; AC\t$AC_perm\n" if ($verbose >=1);
# 	print $out "; permutation $i/$perm\n";
#       }
#       print $out $matrix->toString(sep=>"\t",
# 				   type=>"counts",
# 				   format=>$output_format,
# #				   format=>"tab",
# 				   pipe=>"" ## We suppress the pipe for permute-table
# 				  );
#     }
#  } else {

    ## Print the counts matrix
    if ($return_fields{counts}) {
      print $out $matrix->toString(sep=>"\t",
				   type=>"counts",
				   format=>$output_format,
				  );
    }

    ## Calculate frequency matrix
    if ($return_fields{frequencies}) {
      $matrix->calcFrequencies();
      print $out $matrix->toString(col_width=>($decimals+4), 
				   decimals=>$decimals, 
				   type=>"frequencies",
				   format=>$output_format);
    }

    ## Calculate weight matrix
    if ($return_fields{weights}) {
      $matrix->calcWeights();
      print $out $matrix->toString(col_width=>($decimals+4), 
				   decimals=>$decimals, 
				   type=>"weights",
				   format=>$output_format);
    }

    ## Print the profile matrix
    if ($return_fields{profile}) {
      print $out $matrix->toString(sep=>"\t",
				   type=>"profile",
				   format=>$output_format,
				  );
    }

    ## Calculate information content matrix
    if ($return_fields{info}) {
      $matrix->calcInformation();
      print $out $matrix->toString(col_width=>($decimals+4), 
				   decimals=>$decimals, 
				   type=>"information",
				   format=>$output_format);
    }


    ## Return sites
    if ($return_fields{sites}) {
      print $out "; Sites\t",$matrix->get_attribute("sites"),"\n" if ($main::verbose >= 1);
      my $s = 0;
      my @site_ids = $matrix->get_attribute("site_ids");
      foreach my $site_seq ($matrix->get_attribute("sequences")) {
	$s++;
	my $site_id =  $site_ids[$s-1] || $s;
	&PrintNextSequence($out, $site_format, 0, $site_seq, $site_id);
	#      print $out $s, "\t\\", $site_seq, "\\\n";
      }
    }

    ## Calculate consensus
    if ($return_fields{consensus}) {
      $matrix->calcConsensus();
      print $out $matrix->toString(type=>'consensus');
    }

    ## Return parameters
    if ($return_fields{parameters}) {
      $matrix->calcConsensus();
      print $out $matrix->toString(type=>"parameters");
    }

    ## Return parameters
    if ($return_fields{logo}) {
      if ($main::verbose >= 1) {
	foreach my $logo_format (@logo_formats) {
	  print $out  "; logo file:" , $matrix->get_attribute("logo_file").".".$logo_format, "\n";
	}
      }
    }
  }

  ## Return weight distribution
  if ($return_fields{wdistrib}) {
    print ";WARNING: return wdistrib is deprecated in convert-matrix! Use matrix-dsitrib instead\n";
    $matrix->calcTheorScoreDistrib("weights", decimals=>$decimals);
    my %weight_proba = $matrix->getTheorScoreDistrib("weights");
    my %weight_proba_cum = $matrix->getTheorScoreDistrib("weights", "cum");
    my %weight_proba_inv_cum = $matrix->getTheorScoreDistrib("weights", "inv_cum");

    ## Print the description of column contents
    my @columns = ("weight", "proba", "cum", "Pval", "ln_Pval", "log_P", "sig");
    if ($main::verbose >= 1) {
      print $out ";\n; Theoretical distribution of weight probabilities\n";
      my %descr = ();
      $descr{"weight"} = "log-likelihood score: w=P(S|M)/P(S|B)";
      $descr{"proba"} = "probability density function: P(W=w)";
      $descr{"cum"} = "cumulative density function: P(W <= w)";
      $descr{"Pval"} = "P-value = inverse cumulative density function: Pval = P(W >= w)";
      $descr{"ln_Pval"} = "natural logarithm of the P-value";
      $descr{"log_P"} = "base 10 logarithm of the P-value";
      $descr{"sig"} = "significance: sig = -log10(Pval)";
      $c =0;
      foreach my $col (@columns) {
	$c++;
	print $out sprintf(";\t%d\t%-12s\t%s", $c, $col, $descr{$col}), "\n";
      }
    }

    ## Print header
    print $out "#", join ("\t", @columns), "\n";

    ## Print the score distribution
    my $log10 = log(10);
    foreach my $weight (sort {$a <=> $b} keys (%weight_proba)) {
      $weight = sprintf("%.${decimals}f", $weight);
      my $weight_proba = $null;
      my $weight_proba_cum = $null;
      my $weight_proba_inv_cum = $null;
      my $ln_pval = $null;
      my $log_P = $null;
      my $sig = $null;
      if (defined($weight_proba{$weight})) {
	$weight_proba = sprintf("%.1e", $weight_proba{$weight});
      }
      if (defined($weight_proba_cum{$weight})) {
	$weight_proba_cum = sprintf("%.1e", $weight_proba_cum{$weight});
      }
      if (defined($weight_proba_inv_cum{$weight})) {
	$weight_proba_inv_cum = sprintf("%.1e", $weight_proba_inv_cum{$weight});
	if ($weight_proba_inv_cum{$weight} > 0) {
	  $ln_pval =  sprintf("%.3f",log($weight_proba_inv_cum{$weight}));
	  $sig =  sprintf("%.3f",-log($weight_proba_inv_cum{$weight})/$log10);
	  $sig =~ s/^-(0.0+)$/$1/;
	  $log_P = -$sig;
	} else {
	  $ln_pval = "-Inf";
	  $log_P = "-Inf";
	  $sig = "Inf";
	}
      }
      print $out join("\t", $weight, 
		      $weight_proba,
		      $weight_proba_cum,
		      $weight_proba_inv_cum,
		      $ln_pval,
		      $log_P,
		      $sig,
		     ), "\n";
    }
#  }

}

