#!/usr/bin/perl

### CVS: added tab as supported input format

############################################################
#
# $Id: convert-seq,v 1.32 2009/11/05 00:32:07 jvanheld Exp $
#
# Time-stamp: <2002-09-24 21:24:13 jvanheld>
#
############################################################

## Locate the library directory relative to this script.
## The regexp matches the trailing run of characters of $0 that are not
## "/", "(" or ")" (normally the script name), so that the prematch
## variable $` holds whatever precedes it (normally the directory of the
## script). "<that prefix>lib/" is appended to the module search path.
if ($0 =~ /([^(\/)]+)$/) {
    push (@INC, "$`lib/");
}
## Load the shared library, searched along @INC (including the directory
## appended above). The helpers called with & below that are not defined in
## this file are expected to come from there.
require "RSA.lib";

#### initialise parameters ####

## Start time, reported in the verbose footer at the end of the script.
$start_time = `date`;

## Default output line width (0 means that no carriage return is inserted).
$line_width = 60;

## Sanitize sequence IDs before using them as file names (format filelist).
$check_id = 1;

## Do not replace N characters by dots unless requested.
$dotmask = 0;

## Lists of supported formats, displayed in the help messages. Each format
## is preceded by a newline and three tabs. The hashes %accepted_input_seq
## and %accepted_output_seq are not defined in this file.
$accepted_input_string = join "\n\t\t\t", "", sort(keys %accepted_input_seq);
$accepted_output_string = join "\n\t\t\t", "", sort(keys %accepted_output_seq);

## Column specifications for the tab format, passed to ReadNextSequence.
## The option -comment_col and the help message both use the key
## "comment_column", so the default is stored under that key.
## NOTE(review): the former key "comment_col" is kept in case RSA.lib reads
## it -- confirm against ReadNextSequence and drop it if unused.
%args = (id_column => 1,
         seq_column => 2,
         comment_column => 0,
         comment_col => 0);

&ReadArguments();


#### check argument values ####
unless ($line_width >= 0) {
    print "\tinvalid line width\n";
    print "\ttype convert-seq -h for help\n";
    ## Invalid arguments are an error: report it in the exit status
    exit(1);
}
&CheckInputSeqFormat($in_format);
&CheckOutputSeqFormat($out_format);

## The formats wc and multi are written without inserting carriage
## returns within the sequences (line width 0).
if (($out_format eq "wc") || ($out_format eq "multi")) {
    $line_width = 0;
}

### open input file ###
($in, $input_dir) = &OpenInputFile($inputfile);

### open output file ###
$out = &OpenOutputFile($outputfile);

## Directory where the individual sequence files are written when the
## output format is filelist. It is computed with core modules rather than
## by sending the user-supplied file name to external commands: file names
## containing spaces or shell metacharacters are thus handled safely.
require File::Basename;
require Cwd;
if ($outputfile) {
    $output_dir = File::Basename::dirname($outputfile);
}
## Explicit emptiness test: a directory named "0" is a valid directory
unless (defined($output_dir) && ($output_dir ne "")) {
    $output_dir = Cwd::getcwd();
}

#### verbose ####
if ($verbose) {
    print ";convert-seq result\n";
    if ($inputfile ne "") {
        print ";Input file\t$inputfile\n";
    }
    if ($outputfile ne "") {
        print ";Output file\t$outputfile\n";
    }
    print ";Input format\t$in_format\n";
    print ";Output format\t$out_format\n";
}

## ##############################################################
## Read the sequences and convert them.
##
## The loop stops as soon as ReadNextSequence returns neither a sequence
## nor an identifier.
while ((($current_seq, $current_id, @comments) = &ReadNextSequence($in, $in_format, $input_dir, "",$mask, %args)) &&
       (($current_seq ne "") || ($current_id ne ""))) {

    ## Sequences are counted before filtering, so that the default
    ## identifiers reflect the position in the input.
    $seq_nb++;

    ## The sequence is not modified before the three length-based filters,
    ## so its length is computed only once.
    my $seq_len = length($current_seq);

    ## Skip short sequences (the option -noempty is treated as -skip_short 1)
    if (($skip_short) && ($seq_len < $skip_short)) {
        &RSAT::message::Warning(join("\t", "Skipping short sequence", $current_id, "length", $seq_len)) if ($main::verbose >= 1);
        next;
    }

    ## Skip long sequences
    if (($skip_long) && ($seq_len > $skip_long)) {
        &RSAT::message::Warning(join("\t", "Skipping long sequence", $current_id, "length", $seq_len)) if ($main::verbose >= 1);
        next;
    }

    ## Mask short sequences: replace them by N characters, same length
    if (($mask_short) && ($seq_len < $mask_short)) {
        $current_seq = "N"x$seq_len;
    }

    #### case conversion
    if ($lowercases) {
        $current_seq = lc($current_seq);
    } elsif ($uppercases) {
        $current_seq = uc($current_seq);
    }

    #### DNA only
    if ($dna) {
        $current_seq = &CleanDNA($current_seq, "n");
        if ($out_format eq "wc") {
            $dotmask = 1;
        }
    }

    ## Replace N by dots
    if ($dotmask) {
        $current_seq =~ s/n/./gi; ## consensus does not recognize the N character
    }

    if ($seq_identifier) {
        #### identifier specified with the -id option
        $current_id = $seq_identifier;
    } elsif ($in_format eq "raw") {
        #### sequence id from file name
        $current_id =~ s/\.raw$//;
    }

    ## Check that a sequence identifier is defined; by default, use the
    ## sequence number.
    unless ($current_id) {
        $current_id = $seq_nb;
    }

    ## Add a prefix to the sequence ID
    if ($seq_prefix) {
        $current_id = join ("", $seq_prefix, $current_id);
    }

    if ($out_format eq "filelist") {
        #### Make sure that the id is a valid filename
        chomp $current_id;
        if ($check_id) {
            $current_id =~ s/\|/_/g;
            $current_id =~ s/\;/_/g;
            $current_id =~ s/\:/_/g;
            $current_id =~ s/\&/_/g;
        }

        ## One raw file per sequence, written in the output directory.
        ## The low-precedence "or" is required here: with "||" the die
        ## would apply to the file name string and never be triggered.
        $one_seq_file = "$current_id.raw";
        open(RAW, ">", "${output_dir}/$one_seq_file")
            or die "Error: cannot write file ${output_dir}/$one_seq_file\n";
        &PrintNextSequence(RAW, "raw", 0, $current_seq, $current_id);
        close RAW;
        print $out "$one_seq_file\t$current_id\n";
    } else {
        &PrintNextSequence($out, $out_format, $line_width, $current_seq, $current_id, @comments);
    }

    ## Optionally export the reverse complement as an additional sequence
    if ($add_RC) {
        $current_id .= "_reverse_complement";
        @comments = ();
        $rev_seq = &ReverseComplement($current_seq);
        if ($lowercases) {
            $rev_seq = lc($rev_seq);
        } elsif ($uppercases) {
            $rev_seq = uc($rev_seq);
        }
        if ($out_format eq "filelist") {
            ## Written in the output directory, next to the file of the
            ## direct strand, and with the same error checking.
            $one_seq_file = "${current_id}.raw";
            open(RAW, ">", "${output_dir}/$one_seq_file")
                or die "Error: cannot write file ${output_dir}/$one_seq_file\n";
            &PrintNextSequence(RAW, "raw", 0, $rev_seq, $current_id);
            close RAW;
            print $out "$one_seq_file\n";
        } else {
            &PrintNextSequence($out, $out_format, $line_width, $rev_seq, $current_id, @comments);
        }
    }
}

###### verbose ######
## Report when the job was started and when it was completed
if ($verbose) {
    $done_time = `date`;
    print ";Job started ".$start_time;
    print ";Job done    ".$done_time;
}

###### close the input and output files ######
## Only the handles corresponding to a file specified on the command line
## are closed here.
close $in unless ($inputfile eq "");
close $out unless ($outputfile eq "");

exit 0;



################################################################
######################### SUBROUTINES ##########################
################################################################


sub PrintHelp {
    ## Print the detailed help message (option -h), then exit.
    ##
    ## The message is sent to the pager "more". If the pager cannot be
    ## started, the message is printed on the standard output rather than
    ## being silently lost.
    open(HELP, "| more") || open(HELP, ">&STDOUT");

    ## Default comment column: the option -comment_col stores its value
    ## under the key "comment_column", whereas the default is initialised
    ## under the key "comment_col".
    my $comment_column_default = defined($args{comment_column}) ? $args{comment_column} : $args{comment_col};

    print HELP <<End_of_help;
NAME
	convert-seq

        v1.0, 1997 by Jacques van Helden (jvanheld\@bigre.ulb.ac.be)

DESCRIPTION
	Converts sequences between different formats. Optionally, also
	returns the reverse-complement of the input sequences, or
	perform some cleaning operations (skip short sequences,
	suppress Ns, ...).

CATEGORY
	util
	sequences

USAGE
        convert-seq [-i inputfile] [-o outputfile] [-v] 
	          -id identifier
                  -from inputformat -to outputformat
                  [-lw line_width]

OPTIONS
	-v #	verbose level

	-i inputfile
		if not specified, the standard input is used.
		This allows to place the command within a pipe.

	-mask upper|lower|non-dna
		Mask lowercases, uppercases, or non-dna characters, respectively. 

		Masked characters are replaced by N characters, or
		by a dot (option -dotmask).

	-noempty
		Remove empty sequences from the set (same as -skip_short 1)

	-mask_short min_seq_len
		Mask (replace by N characters) sequences shorter than
		the specified length. This can be useful to discard
		short intergenic segments from the pattern discovery
		step, especially when working with bacterial genomes,
		where short intergenic sequences generally correspond
		to intra-operon segments.

	-skip_short min_seq_len 
		Skip sequences shorter than the specified length. Same
		functionality as -mask_short, except that short
		sequences are not returned at all in the output.

	-skip_long max_seq_len
		Skip sequences longer than the specified length. These
		sequences are not returned at all in the output.

	-o outputfile
		if not specified, the standard output is used.
		This allows to place the command within a pipe.

	-from   input format
		    Supported input formats : $accepted_input_string

	-id_col	   
		column containing sequence identifiers in tab format
		(default: $args{id_column}).

	-seq_col	   
		column containing sequence sequences in tab format
		(default: $args{seq_column}).

	-comment_col	   
		column containing sequence comments (sequence
		description) in tab format (default:
		$comment_column_default).

        -to     output format
		    Supported output formats : $accepted_output_string

        -lw #   line width. A carriage return is inserted every # 
                characters within the output sequence.
                Default is 60. A 0 value indicates that no carriage 
                return must be inserted.

        -addrc  adds the reverse complement of each input sequence
                to the output file. This is useful for programs that 
                cannot handle reverse complement (like the Gibbs Sampler).

	-lc	lowercase. the sequence is printed in lowercase.

	-uc	uppercase. the sequence is printed in uppercase.

	-dna	convert any non-acgt character into "n" characters.
		This is to filter out partly specified nucleotides.

 	-dotmask
		convert masked characters into dots. 
		This is useful for programs like consensus, or
		AlignACE, which do not support N characters in the
		sequences.

	-id identifier
		sequence identifier (useful for converting a raw
		sequence from the STDIN)

	-prefix prefix
		sequence prefix (useful for converting from a multi
		sequence)

	-nocheckid
		Prevent to check sequence IDs for conversion to file list

SUPPORTED FORMATS
    INPUT $accepted_input_string

    OUTPUT $accepted_output_string

FORMAT DESCRIPTION
	raw	The input file should contain raw sequences without any
                comment or other text. Tabs (\\t), blank spaces and newline
                characters (\\n) are accepted (they will be automatically
                removed before analysis). The sequence must be terminated by
                a newline character.

	multi	same as raw except than each new line is considered to contain
 		a new independent sequence in raw format.

	filelist
		file list. Each line of the input file contains the
		name of a file containing a single sequence.

	fasta	FastA format.

	IG	IntelliGenetics format.
 		The first non-comment line must be the sequence identifier
		(a single word without spaces).
		The sequence follows the identifier line identifier. It can
                include spaces, tabs or newlines, that will be removed for
                sequence analysis.
                The end of one sequence is indicated by termination character:
		1 for linear, 2 for circular sequences.
		A single file may contain several sequences.

		EXAMPLE of IG suite:

		; sequence of the region upstream from NIL1
	        ; Locus GAT1
 	        ; ORF YFL021W  coord:   6 95964 97496
 	        ; upstream region size: 100
 	        ; upstream region coord:        6 95864 95963
 	        GAT1
 	        ACAGAGCAACAATAATAACAGCACTATGAGTCGCACACTT
  	        GCGGTGCCCGGCCCAGCCACATATATATAGGTGTGTGCCA
  	        CTCCCGGCCCCGGTATTAGC
  	        1
 	        ; sequence of the region upstream from PUT4
                ; Locus PUT4
                ; ORF YOR348C  coord:   15 988773 986890
                ; upstream region size: 100
                ; upstream region coord:        15 988873 988774
                PUT4
                GGGTTTGTGTTCCTCTTCCTTTCCTTTTTTTTTCTCTCTT
                CCCTTCCAGTTTCTTTTATTCTTTGCTGTTTCGAAGAATC
                ACACCATCAATGAATAAATC
                1

EXAMPLES
	convert-seq -i myseq.multi -from multi -to FASTA -o myseq.FASTA

End_of_help
    close HELP;
    exit(0);
}

sub PrintOptions {
    ## Print the short help message (option -help), listing the options
    ## and the supported formats, then exit.
    ##
    ## The message is sent to the pager "more". If the pager cannot be
    ## started, the message is printed on the standard output rather than
    ## being silently lost.
    open(HELP, "| more") || open(HELP, ">&STDOUT");
    print HELP <<End_shorthelp;
convert-seq options
-------------------
-h			detailed help message
-help			short help message (shows the present message)
-v #			verbose level
-i			inputfile
-mask upper|lower	mask upper- or lowercases, respectively
-mask non-dna		mask non-dna characters
-dotmask		convert masked characters into dots. 
-dna			convert any non-acgt character into "n"
-mask_short min_seq_len	Mask sequences shorter than the specified length
-skip_short min_seq_len	Skip sequences shorter than the specified length
-skip_long max_seq_len	Skip sequences longer than the specified length
-noempty		Remove empty sequences from the set (same as -skip_short 1)
-o			outputfile
-from			input format (see below for accepted formats)
-id_col			column containing the ID in tab-delimited format
-seq_col		column containing the sequence in tab-delimited format
-comment_col		column containing comments (sequence description) in tab-delimited format
-to			output format (see below for accepted formats)
-lw #			line width. 
-addrc			add the reverse complement of each input sequence
-lc			convert all letters to lowercase
-uc			convert all letters to uppercase
-id			sequence identifier
-prefix			sequence prefix
-nocheckid		Prevent to check sequence IDs for conversion to the format filelist.
SUPPORTED FORMATS
    INPUT $accepted_input_string
	
    OUTPUT $accepted_output_string
End_shorthelp
    close HELP;
    exit(0);
}


################################################################
## Read arguments 
sub ReadArguments {
    ## Scan the command line (@ARGV) and set the corresponding global
    ## parameters.
    ##
    ## Every position of @ARGV is tested against the list of options.
    ## Options expecting a value take it from the next position. Elements
    ## that do not match any option are ignored.

    ## Validation shared by the options expecting a strictly positive
    ## natural number. The value is deliberately accessed through $_[0],
    ## so that the validation applies to the caller's variable itself.
    my $require_positive_natural = sub {
        unless (&IsNatural($_[0]) && ($_[0] > 0)) {
            &RSAT::error::FatalError(join("\t",
                                          $_[0],
                                          $_[1]." Must be a strictly positive natural number"));
        }
    };

    ## One handler per option. Each handler receives the command-line
    ## element following the option (undefined after the last element).
    my %option_handler =
        (
         ## Verbosity: the level is optional, and defaults to 1
         "-v" => sub { $verbose = &IsNatural($_[0]) ? $_[0] : 1; },

         ## Detailed help
         "-h" => sub { &PrintHelp(); },

         ## List of options
         "-help" => sub { &PrintOptions(); },

         ## Input and output files
         "-i" => sub { $inputfile = $_[0]; },
         "-o" => sub { $outputfile = $_[0]; },

         ## Mask upper or lower cases
         "-mask" => sub {
             $mask = $_[0];
             &CheckMask($mask);
         },

         ## Mask short sequences
         "-mask_short" => sub {
             $mask_short = $_[0];
             $require_positive_natural->($mask_short, "Invalid value for the minimal masking length.");
         },

         ## Skip short sequences
         "-skip_short" => sub {
             $skip_short = $_[0];
             $require_positive_natural->($skip_short, "Invalid value for the minimal skipping length.");
         },

         ## Remove empty sequences from the set
         "-noempty" => sub { $skip_short = 1; },

         ## Skip long sequences
         "-skip_long" => sub {
             $skip_long = $_[0];
             $require_positive_natural->($skip_long, "Invalid value for the maximal skipping length.");
         },

         ## Sequence identifier and prefix
         "-id" => sub { $seq_identifier = $_[0]; },
         "-prefix" => sub { $seq_prefix = $_[0]; },

         ## Input and output formats (case-insensitive)
         "-from" => sub { $in_format = lc($_[0]); },
         "-to" => sub { $out_format = lc($_[0]); },

         ## ID column for the tab format
         "-id_col" => sub {
             $args{'id_column'} = $_[0];
             $require_positive_natural->($args{'id_column'}, "Invalid column specification for sequence ID.");
         },

         ## Sequence column for the tab format
         "-seq_col" => sub {
             $args{'seq_column'} = $_[0];
             $require_positive_natural->($args{'seq_column'}, "Invalid column specification for sequence.");
         },

         ## Comment column for the tab format
         "-comment_col" => sub {
             $args{'comment_column'} = $_[0];
             $require_positive_natural->($args{'comment_column'}, "Invalid column specification for sequence COMMENT.");
         },

         ## Line width
         "-lw" => sub { $line_width = $_[0]; },

         ## Case conversion: each option cancels the other one
         "-lc" => sub { ($uppercases, $lowercases) = (0, 1); },
         "-uc" => sub { ($uppercases, $lowercases) = (1, 0); },

         ## Flags
         "-addrc" => sub { $add_RC = 1; },
         "-dna" => sub { $dna = 1; },
         "-dotmask" => sub { $dotmask = 1; },

         ## Prevent to check sequence IDs for conversion to file list
         "-nocheckid" => sub { $check_id = 0; },
        );

    foreach my $position (0..$#ARGV) {
        my $handler = $option_handler{$ARGV[$position]};
        next unless ($handler);
        $handler->($ARGV[$position + 1]);
    }
}
