/***************************************************************************
 *                                                                         *
 *  quick-scan
 *  
 *   
 *
 *                                                                         *
 ***************************************************************************/

#include <iostream> 
#include <vector>
#include <string>
#include <cstring>

using namespace std;

#include <time.h>
#include <string.h>

#include "utils.h"
#include "markov.h"
#include "matrix.h"
#include "scan.h"
#include "cfasta.h"
#include "seq.h"
#include "dist.h"
#include "pval.h"

// Program version number; printed by the --version option in main().
// NOTE(review): looks like a YYYYMMDD release date -- confirm.
int VERSION = 20131211;
// Copy of the full command line (arguments joined by single spaces),
// filled in by main() and echoed when the verbosity level is >= 1.
char *COMMAND_LINE;

/*
 *
 * USAGE & HELP
 *
 */
/*
 * Write the short usage summary (a single line followed by a blank line)
 * to the standard output.
 */
void usage()
{
    static const char summary[] =
        "usage: matrix-scan-quick -m matrix -i seq.fa\n\n";

    fputs(summary, stdout);
}

/*
 * Write the full help page (name, version, description, input/output
 * formats and the list of command-line arguments) to the standard output.
 *
 * The version shown here must be kept in sync with the VERSION global that
 * the --version option prints.
 *
 * The text is emitted with fputs() rather than printf() so that it is never
 * interpreted as a format string (a literal '%' in the help text is safe).
 */
void help()
{
    fputs(
"\n"
"NAME\n"
"        matrix-scan-quick\n"
"\n"
"VERSION\n"
"        20131211\n"
"\n"
"AUTHOR\n"
"        Matthieu Defrance <defrance@bigre.ulb.ac.be>\n"
"\n"
"DESCRIPTION\n"
"        Faster and limited version of matrix-scan.\n"
"\n"
"CATEGORY\n"
"        sequences\n"
"        pattern matching\n"
"        PSSM\n"
"        \n"
"USAGE        \n"
"        matrix-scan-quick -m matrix [-i sequences] [-bgfile background] [-h | --help]\n"
"\n"
"INPUT FORMATS\n"
"  Sequence file\n"
"    Only sequences in FASTA format are supported.\n"
"\n"
"  Matrix file\n"
"    Only the tab format is supported.\n"
"    see convert-matrix for details.\n"
"\n"
"  Background file\n"
"    Only the INCLUSive format is supported.\n"
"    see convert-background-model for details.\n"
"\n"
"OUTPUT FORMAT\n"
"    The output is a tab-delimited file, with one row per match.\n"
"\n"
"SCORING SCHEME\n"
"    See matrix-scan -h for details.\n"
"\n"
"ARGUMENTS\n"
"    -h, --help            show this help message and exit.\n"
"\n"
"    -i #                  read sequence from filename # (FASTA format).\n"
"                          if not specified, the standard input is used.\n"
"\n"
"    -o #                  print the output to filename #.\n"
"                          if not specified, the standard output is used.\n"
"\n"
"    -m #                  read the matrix # (must be in tab format).\n"
" \n"
"    -bgfile #             use # as background model (must be in INCLUSive format).\n"
"                          by default an equiprobable model is used.\n"
"\n"
"    -2str                 scan both DNA strands.\n"
"\n"
"    -1str                 scan only one DNA strand.\n"
"\n"
"    -t #                  capture sites with a score >= #.\n"
"                          capture sites with a P-value <= # if -distrib is used.\n"
"\n"
"    -name #               set the matrix name to #.\n"
"\n"
"    -return distrib       output the weight score distribution.\n"
"\n"
"    -return sites         output the list of sites (default).\n"
"\n"
"    -distrib #            read score distrib file # (generated by matrix-distrib).\n"
"\n"
"    -decimals #           precision parameter for the -return distrib option.\n"
"\n"
"    -pseudo #             pseudo-count for the matrix (1.0 by default).\n"
"\n"
"    -first_hit_per_seq    only report the first hit per sequence.\n"
"\n"
"    -origin [start|end|center]\n"
"                           Specify the origin for the calculation of positions\n"
"                           (see matrix-scan manual for details).\n"
"\n",
        stdout);
}

/*
 *
 * MAIN
 *
 */
int main(int argc, char *argv[])
{
    VERBOSITY = 0;

    char *outfile = NULL;
    char *seqfile = NULL;
    char *bgfile  = NULL;
    char *matfile = NULL;
    char *distribfile = NULL;
    int distrib = 0;
    int rc = TRUE;
    char *matrix_name = (char *) "matrix";
    double precision = 0.1;
    double theshold = -1000.0;
    double pseudo = 1.0;
    int    origin = -1;
    int    first_hit = FALSE;
    FILE *fout;
    pvalues_t *pvalues = NULL;

    // construct command line string
    string cmdline = "";
    for (int i = 0; i < argc; i++)
    {
        cmdline += argv[i];
        cmdline += " ";
        COMMAND_LINE = (char *) strdup(cmdline.c_str());
    }

    int i;
    for (i = 1; i < argc; i++) 
    {
        if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) 
        {
            help();
            exit(0);
        } else if (strcmp(argv[i], "--version") == 0) 
        {
            printf("%d\n", VERSION);
            exit(0);
        } else if (strcmp(argv[i], "-v") == 0) 
        {
            ASSERT(argc > i + 1, "-v requires a nummber (0, 1 or 2)");
            VERBOSITY = atoi(argv[++i]);
            ASSERT(VERBOSITY >= 0 && VERBOSITY <= 2, "invalid verbosity level (should be 0, 1 or 2)");
        } 
        else if (strcmp(argv[i], "-1str") == 0) 
        {
            rc = FALSE;
        } 
        else if (strcmp(argv[i], "-2str") == 0) 
        {
            rc = TRUE;
        } 
        else if (strcmp(argv[i], "-first_hit_per_seq") == 0) 
        {
            first_hit = TRUE;
        } 

        else if (strcmp(argv[i], "-name") == 0) 
        {
            ASSERT(argc > i + 1, "-name requires a value");
            matrix_name = argv[++i];
        } 
        else if (strcmp(argv[i], "-i") == 0) 
        {
            ASSERT(argc > i + 1, "-i requires a filename");
            seqfile = argv[++i];
        } 
        else if (strcmp(argv[i], "-origin") == 0) 
        {
            ASSERT(argc > i + 1, "-origin requires a value");
            char *value = argv[++i];
            if (strcmp(value, "center") == 0)
                origin = 0;
            else if (strcmp(value, "start") == 0)
                origin = -1;
            else if (strcmp(value, "end") == 0)
                origin = 1;
            else
                ERROR("invalid value for option -origin");
        } 
        else if (strcmp(argv[i], "-o") == 0) 
        {
            ASSERT(argc > i + 1, "-o requires a filename");
            outfile = argv[++i];
        } 
        else if (strcmp(argv[i], "-m") == 0) 
        {
            ASSERT(argc > i + 1, "-m requires a filename");
            matfile = argv[++i];
        } 
        else if (strcmp(argv[i], "-bgfile") == 0) 
        {
            ASSERT(argc > i + 1, "-bgfile requires a filename");
            bgfile = argv[++i];
        } 
        else if (strcmp(argv[i], "-distrib") == 0) 
        {
            ASSERT(argc > i + 1, "-distrib requires a filename");
            distribfile = argv[++i];
        } 
        else if (strcmp(argv[i], "-return") == 0) 
        {
            ASSERT(argc > i + 1, "-return requires name");
            char *roption = argv[++i];
            if (strcmp(roption, "distrib") == 0)
                distrib = TRUE;
            // TODO: add more options
        } 
        else if (strcmp(argv[i], "-decimals") == 0) 
        {
            ASSERT(argc > i + 1, "-decimals requires a number");
            int decimals = atoi(argv[++i]);
            precision = pow(10.0, -decimals);
            ASSERT(precision >= 0.0001 && precision <= 10, "invalid precision");
        } 
        else if (strcmp(argv[i], "-e") == 0) 
        {
            ASSERT(argc > i + 1, "-e requires a number");
            precision = atof(argv[++i]);
            ASSERT(precision >= 0.0001 && precision <= 10, "invalid precision");
        } 
        else if (strcmp(argv[i], "-t") == 0) 
        {
            ASSERT(argc > i + 1, "-t requires a number");
            theshold = atof(argv[++i]);
        }
        else if (strcmp(argv[i], "-pseudo") == 0) 
        {
            ASSERT(argc > i + 1, "-pseudo requires a number");
            pseudo = atof(argv[++i]);
        }
        else
        {
            WARNING("invalid option %s", argv[i]);
        }
    }

    if (argc <= 1)
    {
        usage();
        return 0;
    }

    // monitor start & end time
    time_t rawtime;
    time(&rawtime);
    struct tm * start_time;
    start_time = localtime(&rawtime);

    // output
    if (outfile == NULL)
        fout = stdout;
    else
        fout = fopen(outfile, "w");
    
    ASSERT(fout != NULL, "invalid output");

   if (VERBOSITY >= 1)
        fprintf(stdout, "; %s\n", COMMAND_LINE);

    // set bg model
    Markov markov;
    if (bgfile != NULL)
    {   
        if (!load_inclusive(markov, bgfile))
            ERROR("can not load bg model");
    }
    else
    {
        WARNING("using default bernoulli model (computed using input)");
        double priori[4] = {0.25, 0.25, 0.25, 0.25};
        bernoulli(markov, priori);
    }

    if (matfile == NULL)
        ERROR("You should specify at least a matrix file and a DNA sequence file");
    
    // input distrib
    if (distribfile != NULL)
        pvalues = read_distrib(distribfile);

    // matrix
    Array matrix;
    read_matrix(matrix, matfile, pseudo);
    matrix.transform2logfreq(markov);

    // values (distrib)
    values_t *values = NULL;
    if (distrib)
        values = new_values(-1000, 10000.0, precision);

    // sequences
    FILE *fp;
    if (seqfile == NULL)
        fp = stdin;
    else
        fp = fopen(seqfile, "r");

    if (fp == NULL)
        ERROR("unable to open '%s'", seqfile);

    fasta_reader_t *reader = new_fasta_reader(fp);

    if (!distrib)
    {
        fprintf(fout, "#seq_id\tft_type\tft_name\tstrand\tstart\tend\tsequence\tweight");
        if (distribfile)
            fprintf(fout, "\tPval\n");
        else
            fprintf(fout, "\n");
    }
    
    // scan all sequences
    int s = 1;
    int scanned_pos = 0;
    while (1)
    {
        seq_t *seq = fasta_reader_next(reader);
        if (seq == NULL)
            break;
        scan_seq(fout, seq, s++, matrix, markov, values, theshold, rc, pvalues, origin, matrix_name, &scanned_pos, first_hit);
        free_seq(seq);
    }
    
    if (distrib)
        values_print(fout, values);

    if (distribfile)
        free_pvalues(pvalues);

    // time info
    char time_buffer[256];
    time(&rawtime);
    struct tm * end_time;
    end_time = localtime(&rawtime);
    if (VERBOSITY >= 1)
    {
        strftime (time_buffer, 256, "%Y_%m_%d.%H%M%S", start_time);
        printf("; Scanned positions %d\n", scanned_pos);
        printf("; Job started %s\n", time_buffer);
        strftime (time_buffer, 256, "%Y_%m_%d.%H%M%S", end_time);
        printf("; Job done    %s\n", time_buffer);
    }
    return 0;
}
