#!/usr/bin/perl

### CVS: added tab as supported input format

############################################################
#
# $Id: convert-seq,v 1.39 2010/08/06 09:50:50 jvanheld Exp $
#
# Time-stamp: <2002-09-24 21:24:13 jvanheld>
#
############################################################

## Append the "lib/" directory located next to this script to the
## module search path before loading the RSA library (presumably the
## place where RSA.lib is installed).
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
require "RSA.lib";

## Core modules used to locate the output directory without running
## external commands (nothing is imported into this namespace).
use File::Basename ();
use Cwd ();

#### initialise parameters ####
$start_time = &RSAT::util::StartScript();
$line_width = 60; ## default line width of the output sequences (option -lw)
$check_id = 1; ## sanitize sequence IDs used as file names (disabled by -nocheckid)
$dotmask = 0; ## replace N characters by dots (option -dotmask)

$top = 0; ## number of top sequences to export; 0 means no limit (option -top)

## Lists of supported formats, displayed by the help messages. They must
## be built before ReadArguments(), which can call the help routines.
## %accepted_input_seq and %accepted_output_seq are not defined in this
## file (presumably they come from RSA.lib).
$accepted_input_string = join "\n\t\t\t", "", sort(keys %accepted_input_seq);
$accepted_output_string = join "\n\t\t\t", "", sort(keys %accepted_output_seq);

## Column specifications passed to ReadNextSequence().
## NOTE(review): the default below is stored under the key "comment_col",
## whereas the option -comment_col and the help message use the key
## "comment_column" -- confirm which key ReadNextSequence() expects.
%args=(id_column=>1,
       seq_column=>2,
       comment_col=>0);

&ReadArguments();


#### check argument values ####
unless ($line_width >= 0) {
    print "\tinvalid line width\n";
    print "\ttype convert-seq -h for help\n";
    exit;
}
&CheckInputSeqFormat($in_format);
&CheckOutputSeqFormat($out_format);

## The formats "wc" and "multi" are always exported without line wrapping
if ($out_format eq "wc") {
    $line_width = 0;
}
if ($out_format eq "multi") {
    $line_width = 0;
}

### open input file ###
($in, $input_dir) = &OpenInputFile($inputfile);

### open output file ###
$out = &OpenOutputFile($outputfile);

## Directory in which the individual sequence files are written for the
## output format "filelist": the directory of the output file, or the
## current directory when no usable output file was specified.
## The path is never handed to a shell, so that file names containing
## spaces or shell metacharacters are handled correctly.
if ($outputfile) {
    $output_dir = File::Basename::dirname($outputfile);
}
unless ($output_dir) {
    $output_dir = Cwd::getcwd();
}
chomp $output_dir;

#### verbose: report the conversion parameters as ";"-prefixed lines ####
if ($verbose) {
    ## Collect the report lines first, then print them one by one
    my @report_lines = (";convert-seq result\n");
    unless ($inputfile eq "") {
	push @report_lines, ";Input file	$inputfile\n";
    }
    unless ($outputfile eq "") {
	push @report_lines, ";Output file	$outputfile\n";
    }
    push @report_lines, ";Input format\t$in_format\n";
    push @report_lines, ";Output format\t$out_format\n";
    foreach my $report_line (@report_lines) {
	print $report_line;
    }
}

## ##############################################################
## Read the sequences and convert them
##
## Each record returned by ReadNextSequence() goes through the
## following steps: length filters (-skip_short, -skip_long), masking
## of short sequences (-mask_short), case conversion (-lc, -uc), DNA
## cleaning (-dna), replacement of N by dots (-dotmask), identifier
## assignment (-id, -prefix), then export in the output format. With
## the option -addrc, the reverse complement is exported right after
## each sequence.
##
## The loop stops at the first record having neither a sequence nor an
## identifier, or after $top sequences when the option -top is used.
while ((($current_seq, $current_id, @comments) = &ReadNextSequence($in, $in_format, $input_dir, "",$mask, %args)) &&
       (($current_seq ne "") || ($current_id ne ""))) {
    $seq_nb++;
    my $seq_len = length($current_seq);

    if (($top > 0) && ($seq_nb > $top)) {
	&RSAT::message::Info("Exported the $top top sequences only (option -top). Remaining sequences are ignored.") if ($main::verbose >= 1);
	last;
    }

    &RSAT::message::Info("Converting sequence", $seq_nb, $current_id, $seq_len."bp", $comments[0]) if ($main::verbose >= 2);


#    ## Remove empty sequences 
#    if (($noempty) && ($current_seq !~ /\S/)) {
#	next;
#    }

    ## Skip short sequences
    if ($skip_short) {
	if ($seq_len < $skip_short) {
	    &RSAT::message::Warning(join("\t", "Skipping short sequence", $current_id, "length", $seq_len)) if ($main::verbose >= 1);
	    next;
	}
    }

    ## Skip long sequences
    if ($skip_long) {
	if ($seq_len > $skip_long) {
	    &RSAT::message::Warning(join("\t", "Skipping long sequence", $current_id, "length", $seq_len)) if ($main::verbose >= 1);
	    next;
	}
    }

    ## Mask short sequences: the sequence is kept in the output, but
    ## all its residues are replaced by N
    if ($mask_short) {
	if ($seq_len < $mask_short) {
	    $current_seq = "N"x$seq_len;
	}
    }

    #### case conversion
    if ($lowercases) {
	$current_seq = lc($current_seq);
    } elsif ($uppercases) {
	$current_seq = uc($current_seq);
    }


    #### DNA only
    if ($dna) {
	&RSAT::message::Info("Cleaning DNA") if ($main::verbose >= 2);
	$current_seq = &CleanDNA($current_seq, "n");
	## The DNA cleaning forces the dot-masking for the output format "wc"
	if ($out_format eq "wc") {
	    $dotmask = 1;
	}
    }

    ## Replace N by dots
    if ($dotmask) {
	$current_seq =~ s/n/./gi; ## consensus does not recognize the N character
    }

    if ($seq_identifier) {
	#### identifier specified with the -id option
	$current_id = $seq_identifier;
    } elsif ($in_format eq "raw") {
	#### sequence id from file name
	$current_id =~ s/\.raw$//;
    }


    ## Check that a sequence identifier is defined. By default, use
    ## the rank of the sequence in the input.
    unless ($current_id) {
	$current_id = $seq_nb;
    }

    ## Add a prefix to the sequence ID
    if ($seq_prefix) {
	$current_id = join ("", $seq_prefix, $current_id);
    }

    if ($out_format eq "filelist") {
	#$current_id = `basename $current_id .$in_format`;
	#if ($current_id =~ /\.$in_format/) {
	#    $current_id = $';
	#}

	#### Make sure that the id is a valid filename
	chomp $current_id;
	if ($check_id) {
	    $current_id =~ s/\|/_/g;
	    $current_id =~ s/\;/_/g;
	    $current_id =~ s/\:/_/g;
	    $current_id =~ s/\&/_/g;
	}

	#### Write the sequence in its own raw file, in the output
	#### directory, and add this file to the list.
	$one_seq_file = "$current_id.raw";
	my $one_seq_path = "${output_dir}/${one_seq_file}";
	## The low-precedence "or" is required here: with "||", the test
	## would apply to the file name and a failure of open() would
	## remain unnoticed.
	open(RAW, ">", $one_seq_path)
	    or die "Error: cannot write file $one_seq_path: $!\n";
	&PrintNextSequence(RAW, "raw", 0, $current_seq, $current_id);
	close RAW;
	print $out "$one_seq_file\t$current_id\n";
    } else {
#      &RSAT::message::Debug("Printing sequence", $current_id, $comments[0]) if ($main::verbose >= 10);
	&PrintNextSequence($out, $out_format, $line_width, $current_seq, $current_id, @comments);
    }

    #### Export the reverse complement of the current sequence
    if ($add_RC) {
	$current_id .= "_reverse_complement";
	@comments = ();
	$rev_seq = &ReverseComplement($current_seq);
	if ($lowercases) {
	    $rev_seq = lc($rev_seq);
	} elsif ($uppercases) {
	    $rev_seq = uc($rev_seq);
	} 
	if ($out_format eq "filelist") {
	    #### The reverse complement file is written in the same
	    #### directory as the file of the direct sequence.
	    $one_seq_file = "${current_id}.raw";
	    my $one_seq_path = "${output_dir}/${one_seq_file}";
	    open(RAW, ">", $one_seq_path)
		or die "Error: cannot write file $one_seq_path: $!\n";
	    &PrintNextSequence(RAW, "raw", 0, $rev_seq, $current_id);
	    close RAW;
	    print $out "$one_seq_file\n";
	} else {
	    &PrintNextSequence($out, $out_format, $line_width, $rev_seq, $current_id, @comments);
	}
    }
}

###### verbose: report the starting and ending times of the job ######
if ($verbose) {
    $done_time = `date`;
    foreach my $time_stamp (";Job started $start_time", ";Job done    $done_time") {
	print $time_stamp;
    }
}


###### close input file (only when an input file name was given) ######
close $in unless ($inputfile eq "");

###### report the execution time, then close output file ######
my $exec_time = &RSAT::util::ReportExecutionTime($start_time);
if ($main::verbose >= 1) {
    print $main::out $exec_time;
}
if ($outputfile) {
    close $out;
}


exit(0);



################################################################
######################### SUBROUTINES ##########################
################################################################


## Display the detailed help message (option -h), then exit.
##
## The message is sent through the pager "more"; when the pager cannot
## be started, it is printed on the standard output.
##
## The message interpolates the global variables $accepted_input_string,
## $accepted_output_string and %args, which must be set by the caller.
##
## This routine does not return: it terminates the program with the
## exit status 0.
sub PrintHelp {
    my $paged = open(HELP, "| more");
    my $help = $paged ? \*HELP : \*STDOUT;

    ## Default comment column. The option -comment_col stores its value
    ## under the key "comment_column", whereas the defaults of %args are
    ## initialised under the key "comment_col": look at both keys.
    my $comment_col_default = $args{comment_column} || $args{comment_col} || 0;

    ## The terminator of the here-document must stand alone at the very
    ## beginning of its line, otherwise the script does not compile.
    print $help <<End_of_help;
NAME
	convert-seq

	v1.0, 1997 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

DESCRIPTION
	Converts sequences between different formats. Optionally, also
	returns the reverse-complement of the input sequences, or
	perform some cleaning operations (skip short sequences,
	suppress Ns, ...).

CATEGORY
	util
	sequences

USAGE
	convert-seq [-i inputfile] [-o outputfile] [-v]
		-id identifier
		-from inputformat -to outputformat
		[-lw line_width]

OPTIONS
	-v #	verbose level

	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-mask upper|lower|non-dna
		Mask uppercases, lowercases, or non-dna characters,
		respectively.

		Masked characters are replaced by N characters, or
		by a dot (option -dotmask).

	-noempty
		Remove empty sequences from the set (same as -skip_short 1)

	-mask_short min_seq_len
		Mask (replace by N characters) sequences shorter than
		the specified length. This can be useful to discard
		short intergenic segments from the pattern discovery
		step, especially when working with bacterial genomes,
		where short intergenic sequences generally correspond
		to intra-operon segments.

	-skip_short min_seq_len
		Skip sequences shorter than the specified length. Same
		functionality as -mask_short, except that short
		sequences are not returned at all in the output.

	-skip_long max_seq_len
		Skip sequences longer than the specified length. These
		sequences are not returned at all in the output.

	-top #
		Only export the # top sequences of the input file. The
		remaining sequences are ignored.

	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.

	-from	input format
		Supported input formats : $accepted_input_string

	-id_col
		column containing sequence identifiers in tab format
		(default: $args{id_column}).

	-seq_col
		column containing sequences in tab format
		(default: $args{seq_column}).

	-comment_col
		column containing sequence comments (sequence
		description) in tab format
		(default: $comment_col_default).

	-to	output format
		Supported output formats : $accepted_output_string

	-lw #	line width. A carriage return is inserted every #
		characters within the output sequence.
		Default is 60. A 0 value indicates that no carriage
		return must be inserted.

	-addrc	adds the reverse complement of each input sequence
		to the output file. This is useful for programs that
		cannot handle reverse complement (like the Gibbs Sampler).

	-lc	lowercase. the sequence is printed in lowercase.

	-uc	uppercase. the sequence is printed in uppercase.

	-dna	convert any non-acgt character into "n" characters.
		This is to filter out partly specified nucleotides.

	-dotmask
		convert masked characters into dots.
		This is useful for programs like consensus, or
		AlignACE, which do not support N characters in the
		sequences.

	-id identifier
		sequence identifier (useful for converting a raw
		sequence from the STDIN)

	-prefix prefix
		sequence prefix (useful for converting from a multi
		sequence)

	-nocheckid
		Prevent to check sequence IDs for conversion to file list

SUPPORTED FORMATS
	INPUT $accepted_input_string

	OUTPUT $accepted_output_string

FORMAT DESCRIPTION
	raw	The input file should contain raw sequences without any
		comment or other text. Tabs (\\t), blank spaces and newline
		characters (\\n) are accepted (they will be automatically
		removed before analysis). The sequence must be terminated by
		a newline character.

	multi	same as raw except that each new line is considered to contain
		a new independent sequence in raw format.

	filelist
		file list. Each line of the input file contains the
		name of a file containing a single sequence.

	fasta	FastA format.

	IG	IntelliGenetics format.
		The first non-comment line must be the sequence identifier
		(a single word without spaces).
		The sequence follows the identifier line. It can
		include spaces, tabs or newlines, that will be removed for
		sequence analysis.
		The end of one sequence is indicated by termination character:
		1 for linear, 2 for circular sequences.
		A single file may contain several sequences.

	EXAMPLE of IG suite:

	; sequence of the region upstream from NIL1
	; Locus GAT1
	; ORF YFL021W  coord:   6 95964 97496
	; upstream region size: 100
	; upstream region coord:        6 95864 95963
	GAT1
	ACAGAGCAACAATAATAACAGCACTATGAGTCGCACACTT
	GCGGTGCCCGGCCCAGCCACATATATATAGGTGTGTGCCA
	CTCCCGGCCCCGGTATTAGC
	1
	; sequence of the region upstream from PUT4
	; Locus PUT4
	; ORF YOR348C  coord:   15 988773 986890
	; upstream region size: 100
	; upstream region coord:        15 988873 988774
	PUT4
	GGGTTTGTGTTCCTCTTCCTTTCCTTTTTTTTTCTCTCTT
	CCCTTCCAGTTTCTTTTATTCTTTGCTGTTTCGAAGAATC
	ACACCATCAATGAATAAATC
	1

EXAMPLES
	convert-seq -i myseq.multi -from multi -to FASTA -o myseq.FASTA

End_of_help
    close HELP if ($paged);
    exit(0);
}

## Display the short help message (option -help), i.e. the list of
## options with a one-line description, then exit.
##
## The message is sent through the pager "more"; when the pager cannot
## be started, it is printed on the standard output.
##
## The message interpolates the global variables $accepted_input_string
## and $accepted_output_string, which must be set by the caller.
##
## This routine does not return: it terminates the program with the
## exit status 0.
sub PrintOptions {
    my $paged = open(HELP, "| more");
    my $help = $paged ? \*HELP : \*STDOUT;

    ## The terminator of the here-document must stand alone at the very
    ## beginning of its line, otherwise the script does not compile.
    print $help <<End_shorthelp;
convert-seq options
--------------------
-h			detailed help message
-help			short help message (shows the present message)
-v #			verbose level
-i			inputfile
-mask upper|lower	mask upper- or lowercases, respectively
-mask non-dna		mask non-dna characters
-dotmask		convert masked characters into dots. 
-dna			convert any non-acgt character into "n"
-mask_short min_seq_len	Mask sequences shorter than the specified length
-skip_short min_seq_len	Skip sequences shorter than the specified length
-skip_long max_seq_len	Skip sequences longer than the specified length
-noempty		Remove empty sequences from the set (same as -skip_short 1)
-top #			only export the # top sequences of the input file
-o			outputfile
-from			input format (see below for accepted formats)
-id_col			column containing the ID in tab-delimited format
-seq_col		column containing the sequence in tab-delimited format
-comment_col		column containing comments (sequence description) in tab-delimited format
-to			output format (see below for accepted formats)
-lw #			line width. 
-addrc			add the reverse complement of each input sequence
-lc			convert all letters to lowercase
-uc			convert all letters to uppercase
-id			sequence identifier
-prefix			sequence prefix
-nocheckid		Prevent to check sequence IDs for conversion to the format filelist.
SUPPORTED FORMATS
	INPUT $accepted_input_string

	OUTPUT $accepted_output_string
End_shorthelp
    close HELP if ($paged);
    exit(0);
}


################################################################
## Check that the value of an option is a strictly positive natural
## number, and report a fatal error otherwise.
##
## Arguments:
##   $value        value given on the command line (may be undefined)
##   $description  what the value stands for; it is inserted in the
##                 message "Invalid <description>. Must be a strictly
##                 positive natural number"
##
## Relies on &IsNatural() and &RSAT::error::FatalError(), which are
## not defined in this file.
sub CheckPositiveNaturalOption {
    my ($value, $description) = @_;
    unless (&IsNatural($value) && ($value > 0)) {
	&RSAT::error::FatalError(join("\t",
				      $value,
				      "Invalid ".$description.". ".
				      "Must be a strictly positive natural number"));
    }
}

################################################################
## Read arguments
##
## Scan @ARGV and set the corresponding global variables ($verbose,
## $inputfile, $outputfile, $in_format, $out_format, $mask, %args, ...).
## Takes no argument and returns no meaningful value.
##
## Every position of @ARGV is tested against the option names, and the
## value of an option is the element that follows it. Consequently, a
## value identical to an option name is also interpreted as an option,
## and unknown arguments are silently ignored.
##
## The options -h and -help display a help message and terminate the
## program.
sub ReadArguments {
    foreach my $i (0..$#ARGV) {
	my $option = $ARGV[$i];
	my $value = $ARGV[$i+1]; ## undefined after the last argument

	if ($option eq "-v") {
	    ## The verbosity level is optional: -v alone means level 1
	    if (&IsNatural($value)) {
		$verbose = $value;
	    } else {
		$verbose = 1;
	    }

	    ### detailed help
	} elsif ($option eq "-h") {
	    &PrintHelp();

	    ### list of options
	} elsif ($option eq "-help") {
	    &PrintOptions();

	    ## Input file
	} elsif ($option eq "-i") {
	    $inputfile = $value;

	    ## mask upper or lower cases
	} elsif ($option eq "-mask") {
	    $mask = $value;
	    &CheckMask($mask);

	    ## Remove empty sequences from the set (same as -skip_short 1).
	    ## A minimal length already set with -skip_short is preserved,
	    ## since it also discards the empty sequences.
	} elsif ($option eq "-noempty") {
#	    $noempty = 1;
	    $skip_short = 1 unless ($skip_short);

	    ## mask short sequences
	} elsif ($option eq "-mask_short") {
	    $mask_short = $value;
	    &CheckPositiveNaturalOption($mask_short, "value for the minimal masking length");

	    ## skip short sequences
	} elsif ($option eq "-skip_short") {
	    $skip_short = $value;
	    &CheckPositiveNaturalOption($skip_short, "value for the minimal skipping length");

	    ## skip long sequences
	} elsif ($option eq "-skip_long") {
	    $skip_long = $value;
	    &CheckPositiveNaturalOption($skip_long, "value for the maximal skipping length");

	    ## Only export the N top sequences of the input file
	} elsif ($option eq "-top") {
	    $top = $value;
	    &CheckPositiveNaturalOption($top, "value for the number of top sequences");

	    ## Sequence identifier
	} elsif ($option eq "-id") {
	    $seq_identifier = $value;

	    ## Prefix added to the sequence identifiers
	} elsif ($option eq "-prefix") {
	    $seq_prefix = $value;

	    ## Output file
	} elsif ($option eq "-o") {
	    $outputfile = $value;

	    ## Input format
	} elsif ($option eq "-from") {
	    $in_format = lc($value);

	    ## Output format
	} elsif ($option eq "-to") {
	    $out_format = lc($value);

	    ## ID column for the tab format
	} elsif ($option eq "-id_col") {
	    $args{'id_column'} = $value;
	    &CheckPositiveNaturalOption($args{'id_column'}, "column specification for sequence ID");

	    ## Sequence column for the tab format
	} elsif ($option eq "-seq_col") {
	    $args{'seq_column'} = $value;
	    &CheckPositiveNaturalOption($args{'seq_column'}, "column specification for sequence");

	    ## Comment column for the tab format
	} elsif ($option eq "-comment_col") {
	    $args{'comment_column'} = $value;
	    &CheckPositiveNaturalOption($args{'comment_column'}, "column specification for sequence COMMENT");

	    ## Line width (its value is checked by the caller)
	} elsif ($option eq "-lw") {
	    $line_width = $value;

	    ## Case conversion: the last of -lc and -uc prevails
	} elsif ($option eq "-lc") {
	    $uppercases = 0;
	    $lowercases = 1;

	} elsif ($option eq "-uc") {
	    $uppercases = 1;
	    $lowercases = 0;

	    ## Add the reverse complement of each sequence
	} elsif ($option eq "-addrc") {
	    $add_RC = 1;

	    ## Convert non-acgt characters into "n"
	} elsif ($option eq "-dna") {
	    $dna = 1;

	    ## Convert masked characters into dots
	} elsif ($option eq "-dotmask") {
	    $dotmask = 1;

	    ## Prevent to check sequence IDs for conversion to file list
	} elsif ($option eq "-nocheckid") {
	    $check_id = 0;

	}
    }
}
