/***************************************************************************
 *                                                                         *
 *  info-gibbs
 *  Gibbs sampler for DNA motif discovery based on information content
 *   
 *  2008/07/09 
 *                                                                         *
 ***************************************************************************/
#include <getopt.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Kept ahead of the project headers: they may rely on unqualified std names
// (TODO confirm, then move this directive out of file scope).
using namespace std;

#include "fasta.h"
#include "markov.h"
#include "gibbs.h"

// Print a printf-style message prefixed with "ERROR: " on stderr, then
// terminate the process with exit status 1.
// NOTE: expands to a plain brace block (not do/while(0)) because existing call
// sites invoke these macros without a trailing semicolon; do not use them as
// the body of an `if` that has an `else`.
#define ERROR(...) {fprintf(stderr, "ERROR: "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); exit(1);}
// Abort through ERROR(msg) when `a` is outside the inclusive range
// [minval, maxval]. Arguments are parenthesized so that compound expressions
// (e.g. `c ? x : y`) are compared as a whole.
#define CHECK_VALUE(a, minval, maxval, msg) {if ((a) < (minval) || (a) > (maxval)){ERROR(msg);}}
int VERSION = 20080900;      // version stamp printed by help() and by -V
char* COMMAND_LINE = NULL;   // full command line string, filled in by main()
/*
 *
 * USAGE & HELP
 *
 */
/*
 * Print the one-line command synopsis on standard output.
 */
void usage(){
	static const char synopsis[] = "usage: info-gibbs -l length -w expected_occ -i seq.fa\n\n";
	fputs(synopsis, stdout);
}

/*
 * Print the full manual page (name, version, author, description and the
 * list of command line options) on standard output.
 *
 * The option descriptions are kept in sync with the option table and the
 * defaults set in main(): the long form of -v is --verbose, -i is mandatory,
 * and the number of sites (-w) is derived from -e when it is not given.
 */
void help(){
	// Static text is written with fputs so that it is never interpreted as a
	// printf format string.
	fputs(
"\n"
"NAME\n"
"        info-gibbs\n"
"\n"
"VERSION\n"
"        ", stdout);
	printf("%d\n", VERSION);
	fputs(
"\n"
"AUTHOR\n"
"        Matthieu Defrance <defrance@scmbb.ulb.ac.be>\n"
"\n"
"DESCRIPTION\n"
"        Gibbs sampling algorithm for motif discovery.\n"
"        Searches for highly conserved motifs in a set of DNA sequences.\n"
"        Conservation is based on the motif information content (Relative Entropy).\n"
"\n"
"CATEGORY\n"
"        sequences\n"
"        motif discovery\n"
"        \n"
"USAGE        \n"
"        info-gibbs -l motiflength -i inputfile [-h | --help]\n"
"\n"
"ARGUMENTS\n"
"  GENERAL OPTIONS\n"
"    -h, --help            show this help message and exit\n"
"    -v #, --verbose=#     set verbosity to level #\n"
"                              0 no verbosity\n"
"                              1 low verbosity\n"
"                              2 high verbosity\n"
"                              3 maximal verbosity + debug + trace\n"
"    -i #, --input=#       read sequence from # (must be in FASTA format)\n"
"                          this option is mandatory\n"
"\n"
"    -l #, --length=#      set oligomer length to #\n"
"                          when the option dyad is used # represents the length of one monad\n"
"                          EXAMPLE: --length=7\n"
"\n"
"    -s #, --strand=#      search in forward strand + or in both strands +-\n"
"                          DEFAULT: +-\n"
"                          EXAMPLE: --strand=+\n"
"\n"
"    -n #, --iter=#        maximum number of Gibbs sampling iterations\n"
"                          DEFAULT: 1000\n"
"\n"
"    -w #, --words=#       number of motif occurrences that are expected to\n"
"                          be found (incompatible with -e)\n"
"                          DEFAULT: derived from -e\n"
"    -e #, --expected=#    expected number of motif occurrences per sequence\n"
"                          that are expected to be found (incompatible with -w)\n"
"                          DEFAULT: 1\n"
"    -m #, --motifs=#      number of motifs to extract (one by default)\n"
"                          DEFAULT: 1\n"
"    -b #, --bgfile=#      use # predefined INCLUSive background model\n"
"                          [http://homes.esat.kuleuven.be/~thijs/help/help_motifsampler.html#background]    \n"
"                          EXAMPLE --bgfile=mybgfile\n"
"    -d #, --dmin=#        set minimal distance between 2 motif occurrences to #\n"
"\n"
"    -t #, --temperature=# set the temperature (should be in range [0.6 1.4])\n"
"                          DEFAULT: 1.0\n"
"    -r #, --nrun=#        try to run the Gibbs sampling search # times\n"
"                          DEFAULT: 10\n"
"    --rseed=#             set random seed to #\n"
"    -V, --version         print version\n"
"\n", stdout);
}

/*
 *
 * MAIN
 *
 */
/*
 * Entry point of info-gibbs.
 *
 * Parses the command line, reads the FASTA sequences, sets up the background
 * model (loaded with load_inclusive() when -b is given, otherwise built by
 * bernoulli() from compute_priori(sequences)) and calls gibbs().
 *
 * Returns 0 after the sampler has run, and also after printing the usage,
 * help or version text. Returns 1 when an unrecognized option is given.
 * Out-of-range option values, a missing input file and loading failures
 * terminate the process through ERROR() with exit status 1.
 */
int main(int argc, char * argv[]){

	VERBOSITY = 0;

	// default sampler parameters
	Parameters params;
	params.l = 6;              // motif length
	params.iter = 1000;        // iterations
	params.n = 0;              // number of sites (words) in a motif; 0 = derive from e below
	params.nrun = 10;          // run gibbs main loop n times
	params.temperature = 1.0;
	params.rc = true;          // search also on reverse strand
	params.dmin = 0;           // minimal distance between 2 sites
	params.motifs = 1;
	params.update = 1;

	float e = 1.0;             // expected number of motif occurrences per sequence
	char *seqfile = NULL;
	char *bgfile = NULL;

	if (argc <= 1){
		usage();
		return 0;
	}

	// construct command line string (each argument followed by one space);
	// it is duplicated once, after the loop, so nothing is leaked
	string cmdline = "";
	for (int i = 0; i < argc; i++){
		cmdline += argv[i];
		cmdline += " ";
	}
	COMMAND_LINE = strdup(cmdline.c_str());

	// options
	static const char *optString = "l:w:i:n:t:r:hs:v:b:u:d:m:e:V";
	static const struct option longOpts[] = {
		{ "input",       required_argument, NULL, 'i' },
		{ "strand",      required_argument, NULL, 's' },
		{ "length",      required_argument, NULL, 'l' },
		{ "words",       required_argument, NULL, 'w' },
		{ "expected",    required_argument, NULL, 'e' },
		{ "nrun",        required_argument, NULL, 'r' },
		{ "iter",        required_argument, NULL, 'n' },
		{ "temperature", required_argument, NULL, 't' },
		{ "bgfile",      required_argument, NULL, 'b' },
		{ "verbose",     required_argument, NULL, 'v' },
		{ "verbosity",   required_argument, NULL, 'v' },  // alias: spelling used in the help text
		{ "update",      required_argument, NULL, 'u' },
		{ "dmin",        required_argument, NULL, 'd' },
		{ "motifs",      required_argument, NULL, 'm' },
		{ "rseed",       required_argument, NULL, 'z' },  // long form only ('z' is not in optString)
		{ "help",        no_argument,       NULL, 'h' },
		{ "version",     no_argument,       NULL, 'V' },
		{ NULL,          no_argument,       NULL, 0   }
	};
	int longIndex;
	int optchar;
	while ((optchar = getopt_long(argc, argv, optString, longOpts, &longIndex)) != -1)
	{
		switch (optchar)
		{
			case 'h':
			help();
			return 0;

			case 'V':
			printf("%d\n", VERSION);
			return 0;

			case 'l':
			params.l = atoi(optarg);
			CHECK_VALUE(params.l, 1, 100, "invalid value for length (should be between 1 and 100)");
			break;

			case 'w':
			params.n = atoi(optarg);
			CHECK_VALUE(params.n, 1, 1000, "invalid value for w (should be between 1 and 1000)");
			break;

			case 'e':
			e = atof(optarg);
			CHECK_VALUE(e, 0.001, 1000, "invalid value for e (should be between 0.001 and 1000)");
			break;

			case 'r':
			params.nrun = atoi(optarg);
			CHECK_VALUE(params.nrun, 1, 100, "invalid number of run (should be between 1 and 100)");
			break;

			case 'u':
			{
				// parse once, then store in both the global and the parameter set
				const int update = atoi(optarg);
				UPDATE = update;
				params.update = update;
			}
			break;

			case 't':
			params.temperature = atof(optarg);
			CHECK_VALUE(params.temperature, 0.1, 2.0, "invalid value for temperature (should be between 0.1 and 2.0)");
			break;

			case 'n':
			params.iter = atoi(optarg);
			CHECK_VALUE(params.iter, 0, 1000000, "invalid number of iterations (should be between 0 and 1.000.000)");
			break;

			case 'i':
			// optarg points into argv, which outlives main(): no copy needed
			seqfile = optarg;
			break;

			case 'b':
			bgfile = optarg;
			break;

			case 'm':
			params.motifs = atoi(optarg);
			CHECK_VALUE(params.motifs, 1, 10, "invalid number of motifs (should be between 1 and 10)");
			break;

			case 'v':
			VERBOSITY = atoi(optarg);
			CHECK_VALUE(VERBOSITY, 0, 3, "invalid verbosity level (0,1,2,3)");
			break;

			case 'z':
			SEED = atoi(optarg);
			break;

			case 'd':
			params.dmin = atoi(optarg);
			CHECK_VALUE(params.dmin, 0, 10000000, "invalid distance");
			break;

			case 's':
			// only the exact value "+-" selects both strands
			params.rc = (strcmp(optarg, "+-") == 0);
			break;

			default:
			// unrecognized option or missing argument: report failure to the caller
			usage();
			return 1;
		}
	}

	// read the sequences
	if (seqfile == NULL){
		ERROR("must provide at least a fasta file");
	}
	vector<string> raw_sequences;
	if (read_fasta(raw_sequences, seqfile, params.rc) == 0){
		ERROR("can not load sequences");
	}
	Sequences sequences = convert_sequences(raw_sequences);

	// set expected number of sites when -w was not given; the count is halved
	// when both strands are searched (presumably sequences.len then also counts
	// the reverse complements -- TODO confirm against read_fasta)
	if (params.n == 0){
		if (params.rc == true)
			params.n = (int) (0.5 * sequences.len * e);
		else
			params.n = (int) (sequences.len * e);
	}

	// set bg model
	Markov markov;
	if (bgfile != NULL){
		if (load_inclusive(markov, bgfile) == 0){
			ERROR("can not load bg model");
		}
	}else{
		bernoulli(markov, compute_priori(sequences));
	}

	// run gibbs sampler
	gibbs(raw_sequences, sequences, markov, params);

	return 0;
}








