################################################################
## Analysis of replication origins
##
## Data from Christelle Cayrou and Marcel Mechali
##
## This file implements targets that scan sequences in order to
## predict replication origins from positional profiles of residues
## (letters) or oligonucleotides (words).

## include ${RSAT}/makefiles/util.mk
include makefiles/02_motifs.mk
## Path of this makefile. It is not referenced in this file; presumably
## it is consumed by the included makefiles -- TODO confirm.
MAKEFILE = makefiles/05_scan_origins.mk

## Choose the dataset (peak set) to analyse.
## Alternative peak set, kept for reference:
#PEAKSET=SWEMBL_ES_indiff_C3_BN_vs_input_R0.002
PEAKSET = SWEMBL_s_7_NS3_vs_s_8_Genomique_1_R0.002

## Directory holding the files of the selected peak set.
PEAKSET_DIR = analysis/peaks/SWEMBL/${PEAKSET}

## Suffix appended to the peak set name to build the summit file prefix.
## Alternative suffix, kept for reference:
#PEAKSET_SUFFIX = summits_sorted_chr11_2kb
PEAKSET_SUFFIX = summits_sorted

## Path prefix (no extension) of the summit sequences; the scan targets
## read ${SWEMBL_SUMMITS_SORTED}.fasta.
SWEMBL_SUMMITS_SORTED = ${PEAKSET_DIR}/${PEAKSET}_${PEAKSET_SUFFIX}




## Print the main scanning parameters, one "NAME<TAB>value" pair per line.
## Declared phony: this target is a command, not a file to build, and must
## run even if a file called "list_param_scanning" exists.
.PHONY: list_param_scanning
list_param_scanning:
	@echo "SIZE_PSSM	${SIZE_PSSM}"
	@echo "PEAKSET	${PEAKSET}"
	@echo "PEAKSET_DIR	${PEAKSET_DIR}"
	@echo "SCAN_DIR	${SCAN_DIR}"
	@echo "PSSM_DIR	${PSSM_DIR}"
	@echo "PSSM_MAX_PEAKS	${PSSM_MAX_PEAKS}"
	@echo "OLIGO_SIZE	${OLIGO_SIZE}"
	@echo "PSSM_CI		${PSSM_CI}"
	@echo "PSSM_FILE	${PSSM_FILE}"


## Define the parameters for sequence scanning
SCAN_DIR=analysis/scanning

## Create the base directory for the scanning results (${SCAN_DIR}).
## Declared phony: without it, any existing file or directory named "dir"
## in the working directory would make this target look up to date and the
## recipe would be skipped.
.PHONY: dir
dir:
	@echo "Creating scaning directory"
	mkdir -p ${SCAN_DIR}
	@echo "	${SCAN_DIR}"

## Generate a conventional PSSM using position-analysis
##
## All variables below use recursive assignment (=), so they are expanded
## when a recipe uses them, not when they are defined.

## Directory created by the "pssm" target before position-analysis runs.
PSSM_DIR=${SCAN_DIR}/pssm/${PEAKSET}
#PSSM_PREFIX=${PEAKSET}_${PEAKSET_SUFFIX}_${OLIGO_SIZE}nt_ci${PSSM_CI}_nbrSeq-All_size-${SIZE_PSSM}
#PSSM_FILE=${PSSM_DIR}/${PSSM_PREFIX}

## Prefix handed to convert-matrix (-prefix) and printed by the scan targets.
PSSM_PREFIX=${PEAKSET}_${OLIGO_SIZE}nt_ci${PSSM_CI}_nbrSeq-All_size-${SIZE_PSSM}
#PSSM_FILE=${PSSM_DIR}/${PSSM_PREFIX}
#SWEMBL_SUMMITS_SORTED = analysis/peaks_fev2013/mes_hm/7kb/q99/${PEAKSET}

## Formerly used position-analysis option, kept for reference:
##   -last ${PSSM_MAX_PEAKS}
##
## SIZE_PSSM is NOT defined here, and never effectively was: the reference
## comment above used to end with a backslash, and in make a trailing
## backslash continues a comment onto the next line, which silently
## swallowed the SIZE_PSSM assignment. The assignment is now commented out
## explicitly so the trap is visible and behaviour is unchanged: the
## "size-" field of PSSM_PREFIX / PSSM_OUTPUT stays empty unless SIZE_PSSM
## is provided elsewhere (included makefile or command line), which is what
## the hard-coded PSSM_FILE path ("..._size-_from-100_to-1") relies on.
## NOTE(review): the disabled formula adds TO to itself; a window width
## would rather be TO - FROM + 1. Re-enabling it changes the output file
## names, so PSSM_FILE must be updated at the same time.
#SIZE_PSSM = $(shell echo ${TO}\+${TO} | bc)

## Position window passed to position-analysis as -minpos / -maxpos.
FROM=-100
TO=-1
## Verbosity level passed as -v to the RSAT commands.
V=2
## Oligonucleotide length (-l) and class interval (-ci) for position-analysis.
OLIGO_SIZE=1
PSSM_CI=1
## Only reported by list_param_scanning; no recipe in this file uses it.
PSSM_MAX_PEAKS=500


## Input prefix: the "pssm" target reads ${PSSM_INPUT}.fasta.
PSSM_INPUT = analysis/peaks_fev2013/mes_hm/7kb/q99/mes_swembl_002_100_7kb_ordered_cluster1_summit_ext1000
## Output prefix: "pssm" writes ${PSSM_OUTPUT}.tab, "convert" writes ${PSSM_OUTPUT}.tf.
PSSM_OUTPUT = analysis/scanning/pssm/mes_swembl_002_100_7kb_ordered_cluster1_summit_ext1000/mes_swembl_002_100_7kb_ordered_cluster1_summit_ext1000_${OLIGO_SIZE}nt_ci${PSSM_CI}_nbrSeq-All_size-${SIZE_PSSM}_from${FROM}_to${TO}



## Run position-analysis on ${PSSM_INPUT}.fasta (oligo length ${OLIGO_SIZE},
## class interval ${PSSM_CI}, positions ${FROM}..${TO} relative to the
## sequence center) and write the distributions to ${PSSM_OUTPUT}.tab, then
## call the "convert" target to produce the TRANSFAC matrix.
## Extra position-analysis options can be supplied through OPT.
.PHONY: pssm
pssm:
	@echo "${DATE}	Computing PSSM from sequences"
	@# PSSM_OUTPUT does not live in PSSM_DIR, so the directory that actually
	@# receives the output file has to be created as well.
	@mkdir -p ${PSSM_DIR} $(dir ${PSSM_OUTPUT})
	position-analysis -v ${V} \
		-i ${PSSM_INPUT}.fasta -seqtype dna -return distrib \
		-l ${OLIGO_SIZE} -1str -ci ${PSSM_CI} -origin center -ovlp \
		-minpos ${FROM} \
		-maxpos ${TO} \
		${OPT} \
		-o ${PSSM_OUTPUT}.tab
	@echo "	${PSSM_OUTPUT}.tab"
	@${MAKE} convert


## R script that receives the position-analysis table through the
## "file.pos" argument.
POS_CLUSTER_SCRIPT=${RSAT}/R-scripts/cluster_position_profiles.R

## Pipe ${POS_CLUSTER_SCRIPT} into R, passing ${PSSM_OUTPUT}.tab as file.pos.
## The quoted file name must not start with a blank: the previous form
## (file.pos=' <path>') handed R a path with a leading space.
## NOTE(review): how the script parses --args is not visible here; confirm
## that it evaluates "name='value'" assignments.
.PHONY: pssm_graphs
pssm_graphs:
	@cat ${POS_CLUSTER_SCRIPT} \
		| R --slave --no-save --no-restore --no-environ \
		--args "file.pos='${PSSM_OUTPUT}.tab'"



## Convert ${PSSM_OUTPUT}.tab into a TRANSFAC-formatted matrix
## (${PSSM_OUTPUT}.tf): comment lines (starting with '#') are dropped, then
## column 1 and columns 4-2000 are kept and piped to convert-matrix.
## The stray "-i" formerly passed to grep was removed: for grep it means
## "ignore case", which has no effect on the '^#' pattern.
.PHONY: convert
convert:
	@echo "Converting matrix in transfac format"
	grep -v '^#' ${PSSM_OUTPUT}.tab | cut -f 1,4-2000 |\
		convert-matrix -v ${V} -from tab -to transfac \
		-return counts,parameters \
		-prefix ${PSSM_PREFIX} \
		-o ${PSSM_OUTPUT}.tf
	@echo "	${PSSM_OUTPUT}.tf"


## ---- Parameters of the scan / scan2 / score_distrib / background targets ----

## Tunables used by matrix-scan:
##   NBR_SEQ  value given to -top_seq
##   SCORE    value given to -lth score (scan2 only)
##   QUICK    inserted verbatim on the matrix-scan command line and in the
##            name of the sites file (empty by default)
## Option snippets kept for reference:  -lth score ${SCORE}  /  -top_seq ${NBR_SEQ}
NBR_SEQ = 100
SCORE = 0
QUICK =

## Where the predicted sites are written (prefix without extension).
SITES_DIR = analysis/scanning/sites/${PEAKSET}
SITES_FILE = ${SITES_DIR}/${PEAKSET}_1nt_ci1_nbrSeq-All_size-100_sites${QUICK}_from-100_to-1

## Sequence set read by the "background" target (prefix without extension).
## Generic form, kept for reference:
#SWEMBL_SUMMITS_CHR11 = ${PEAKSET_DIR}/${PEAKSET}_summits_sorted_chr11_2kb
SWEMBL_SUMMITS_CHR11 = analysis/peaks/SWEMBL/SWEMBL_ES_indiff_C3_BN_vs_input_R0.002/SWEMBL_ES_indiff_C3_BN_vs_input_R0.002_summits_sorted_chr11_2kb

## Matrix prefix read by scan / scan2 as ${PSSM_FILE}.tf (hard-coded path).
PSSM_FILE = analysis/scanning/pssm/mes_swembl_002_100_7kb_ordered_cluster1_summit_ext1000/mes_swembl_002_100_7kb_ordered_cluster1_summit_ext1000_1nt_ci1_nbrSeq-All_size-_from-100_to-1

## Scan ${SWEMBL_SUMMITS_SORTED}.fasta with the TRANSFAC matrix
## ${PSSM_FILE}.tf (matrix-scan, -top_seq ${NBR_SEQ}) and write the sites to
## ${SITES_FILE}.tab.
## Declared phony: the target name is a command, not the file it produces.
## NOTE(review): the progress message prints PSSM_PREFIX whereas the matrix
## actually used is PSSM_FILE; the two are set independently.
.PHONY: scan
scan:
	@echo "Scanning sequences ${PEAKSET_SUFFIX} with PSSM ${PSSM_PREFIX}"
	@mkdir -p ${SITES_DIR}
	matrix-scan ${QUICK} -v ${V} -i ${SWEMBL_SUMMITS_SORTED}.fasta \
		-decimals 1 \
		-matrix_format transfac -m ${PSSM_FILE}.tf \
		-bginput -markov 0 -1str \
		-top_seq ${NBR_SEQ} \
		-return sites \
		-o ${SITES_FILE}.tab
	@echo ${SITES_FILE}.tab

## Same scan as the "scan" target, with an additional lower threshold on the
## score (-lth score ${SCORE}); sites are written to
## ${SITES_FILE}_weight_0.tab.
## Declared phony: the target name is a command, not the file it produces.
## NOTE(review): the output suffix "_weight_0" is fixed and does not follow
## the value of SCORE.
.PHONY: scan2
scan2:
	@echo "Scanning sequences ${PEAKSET_SUFFIX} with PSSM ${PSSM_PREFIX}"
	@mkdir -p ${SITES_DIR}
	matrix-scan ${QUICK} -v ${V} -i ${SWEMBL_SUMMITS_SORTED}.fasta \
		-decimals 1 \
		-matrix_format transfac -m ${PSSM_FILE}.tf \
		-bginput -markov 0 -1str \
		-lth score ${SCORE} \
		-return sites \
		-top_seq ${NBR_SEQ} \
		-o ${SITES_FILE}_weight_0.tab
	@echo ${SITES_FILE}_weight_0.tab

## Compute the distribution of the values found in column 8 of
## ${SITES_FILE}.tab (classfreq, class interval 0.1) and plot columns 7 and 9
## of the resulting table against column 1 (XYgraph, log-scaled Y axis).
## Outputs: ${SITES_FILE}_weight_distrib.tab and ${SITES_FILE}_weight_distrib.png
## Declared phony: the target name is a command, not the file it produces.
.PHONY: score_distrib
score_distrib:
	@echo
	@echo "Computing distribution of weight scores"
	classfreq -v 1 -i ${SITES_FILE}.tab -ci 0.1 -col 8 \
		-o ${SITES_FILE}_weight_distrib.tab
	@echo ${SITES_FILE}_weight_distrib.tab
	XYgraph -i ${SITES_FILE}_weight_distrib.tab \
		-xcol 1 -ycol 7,9 \
		-lines -xleg1 'weight score' -yleg1 'Frequency (sites per position)'  \
		-xsize 600 -ysize 400 \
		-ylog 10 -ymax 1 \
		-o ${SITES_FILE}_weight_distrib.png
	@echo ${SITES_FILE}_weight_distrib.png





## Build a background model from ${SWEMBL_SUMMITS_CHR11}.fasta:
##  1. count 3-nt oligos (occurrences and frequencies) with oligo-analysis;
##  2. convert the oligo table into a transition table with
##     convert-background-model.
## The overlap option was misspelled "-olvp"; it is now "-ovlp", which is
## also what the output file names announce.
## Declared phony: the target name is a command, not the file it produces.
.PHONY: background
background:
	@echo
	@echo "Background Model"
	@echo "Count oligo occurrences"
	oligo-analysis -v 1 -i ${SWEMBL_SUMMITS_CHR11}.fasta \
		-l 3 \
		-return occ,freq \
		-1str -ovlp \
		-o ${SWEMBL_SUMMITS_CHR11}_bg_model_3nt-1str-ovlp.tab
	@echo "oligo-anlysis OK, save in "${SWEMBL_SUMMITS_CHR11}_bg_model_3nt-1str-ovlp.tab
	@echo "Convert background model in transition table"
	convert-background-model -v 1 -i ${SWEMBL_SUMMITS_CHR11}_bg_model_3nt-1str-ovlp.tab \
		-from oligos -to transitions \
		-o ${SWEMBL_SUMMITS_CHR11}_bg_model_transitions_markov2-1str-ovlp.tab
	@echo "convert-background OK, save in "${SWEMBL_SUMMITS_CHR11}_bg_model_transitions_markov2-1str-ovlp.tab


# ################################################################
# ## Run position-analysis to detect k-mers showing a position bias.
# POS_SEQ_PREFIX=${MACS_PREFIX}_summits_sorted
# POS_SEQ_DIR=${MACS_PEAKS_DIR}
# POS_SEQ=${POS_SEQ_DIR}/${POS_SEQ_PREFIX}.fasta
# POS_OL=6
# POS_CI=25
# POS_STR=-1str
# POS_NOOV=-noov
# POS_PREFIX=${POS_SEQ_PREFIX}_${POS_OL}nt_ci${POS_CI}${POS_STR}${POS_NOOV}
# #POS_PREFIX=${POS_SEQ_PREFIX}_${POS_OL}nt_ci${POS_CI}${POS_STR}${POS_NOOV}_first${POS_FIRST}_seqnb${POS_SEQNB}
# POS_BASE_DIR=analysis/motifs/position_analysis
# POS_DIR=${POS_BASE_DIR}/${POS_PREFIX}
# POS_FILE=${POS_DIR}/${POS_PREFIX}.tab
# #POS_FIRST=1
# #POS_SEQNB=0
# POS_CMD=position-analysis -v ${V} \
# 		-i ${POS_SEQ} -seqtype dna -return chi,distrib,graph,rank -sort \
# 		-l ${POS_OL} ${POS_STR} -ci ${POS_CI} -origin center ${POS_NOOV} \
# 		-max_graphs 100 \
# 		-title ${POS_PREFIX} \
# 		-minpos -${SUMMIT_EXT} \
# 		-maxpos ${SUMMIT_EXT} ${OPT} \
# 		-o ${POS_FILE}; \
# 		${MAKE} positions_clusters
# #		-first ${POS_FIRST} \
# #		-seqnb ${POS_SEQNB}
# positions:
# 	@echo
# 	@echo "Running position-analysis	${POS_PREFIX}"
# 	@${MAKE} my_command MY_COMMAND="${POS_CMD}"  TEST=${TEST}
# 	@echo "	${POS_FILE}"
# 	@echo "	${POS_DIR}"

