################################################################
## Detect motifs (words departing from random expectation) in the ORI
## regions

include scripts/makefiles/01_peak_calling.mk

## Location of the motif-analysis makefile (assumed to be this very
## file -- confirm against the callers that read MAKEFILE).
MAKEFILE = scripts/makefiles/02_motifs.mk

################################################################
## List parameters for motif analysis
PARAM_GROUP=motifs

## Print the current value of the motif-analysis parameters
## (position-analysis settings, peak-motifs and oligo-analysis
## commands), then invoke list_param_ogre in a sub-make to append the
## OGRE scanning parameters.
##
## Declared phony: the target produces no file, and a file or directory
## carrying the same name must never make it look up to date.
.PHONY: list_param_motifs
list_param_motifs:
	@echo
	@echo "position-analysis parameters"
	@echo "	POS_SEQ_DIR		${POS_SEQ_DIR}"
	@echo "	POS_SEQ_PREFIX		${POS_SEQ_PREFIX}"
	@echo "	POS_SEQ			${POS_SEQ}"
	@echo "	POS_OL			${POS_OL}"
	@echo "	POS_CI			${POS_CI}"
	@echo "	POS_STR			${POS_STR}"
	@echo "	POS_NOOV		${POS_NOOV}"
	@echo "	POS_PREFIX		${POS_PREFIX}"
	@echo "	POS_DIR			${POS_DIR}"
	@echo "	POS_FILE		${POS_FILE}"
	@echo "	POS_FIRST		${POS_FIRST}"
	@echo "	POS_SEQNB		${POS_SEQNB}"
	@echo "	POS_CMD			${POS_CMD}"
#	@echo "	POS_CLUST_NB_UNIF	${POS_CLUST_NB_UNIF}"
#	@echo "	POS_CLUST_NB_MKV	${POS_CLUST_NB_MKV}"
	@echo "	POS_MIN_CLUST_NB	${POS_MIN_CLUST_NB}"
	@echo "	POS_MAX_CLUST_NB	${POS_MAX_CLUST_NB}"
	@echo "	POS_CLUST_METHOD	${POS_CLUST_METHOD}"
	@echo
	@echo "peak-motifs parameters"
	@echo "	PEAKMO_CMD	${PEAKMO_CMD}"
	@echo
	@echo "oligo-analysis parameters"
	@echo "	OLIGO_CMD	${OLIGO_CMD}"
	${MAKE} list_param_ogre

## Print the parameters used to scan sequences with the OGRE motif.
## Declared phony because the target only prints and creates no file.
.PHONY: list_param_ogre
list_param_ogre:
	@echo
	@echo "Parameters to scan OGRE motifs"
	@echo "	OGRE_DIR		${OGRE_DIR}"
	@echo "	OGRE_SEQ		${OGRE_SEQ}"
	@echo "	OGRE_MOTIF_PREFIX	${OGRE_MOTIF_PREFIX}"
	@echo "	OGRE_PSSM		${OGRE_PSSM}"
	@echo "	OGRE_SITES		${OGRE_SITES}"

################################################################
## Run peak-motifs on the ORI peaks
## Settings handed over to peak-motifs. All variables are recursive, so
## any of them can be overridden on the make command line.

## Values passed to -minol / -maxol.
MINOL = 6
MAXOL = 7

## Values passed to -min_markov / -max_markov.
MIN_MKV = auto
MAX_MKV = auto

## Strand and overlap options, inserted as-is in the command line.
STR = -1str
NOOV = -noov

## Values passed to -disco and -task.
DISCO = oligos,dyads,positions
PM_TASK = purge,seqlen,composition,disco,merge_words,collect_motifs,motifs_vs_motifs,timelog,motifs_vs_db,scan,synthesis

## Value passed to -top_peaks; it also enters the output prefix.
TOP_PEAKS = 0

## Output prefix and directory, derived from the selected peak set.
MOTIF_PREFIX = $(PEAK_PREFIX)$(SUMMITS)_top$(TOP_PEAKS)
DIR_PEAKMO = $(DIR_RESULTS)/motifs/peak_motifs/$(MOTIF_PREFIX)

## Complete peak-motifs command line. OPT lets the caller append extra
## options.
PEAKMO_CMD = peak-motifs  -v $(V) -title "$(MOTIF_PREFIX)" \
		-i $(PEAK_SEQ) \
		-minol $(MINOL) -maxol $(MAXOL) -nmotifs 10 \
		-min_markov $(MIN_MKV) -max_markov $(MAX_MKV) \
		$(STR) $(NOOV) \
		-disco $(DISCO) \
		-motif_db jaspar_core_vertebrates tf $(RSAT)/public_html/data/motif_databases/JASPAR/jaspar_core_vertebrates_2009_10.tf \
		-motif_db jaspar_pbm_mouse tf $(RSAT)/public_html/data/motif_databases/JASPAR/jaspar_pbm_mouse_2009_10.tf \
		-source galaxy \
		-task $(PM_TASK) -prefix $(MOTIF_PREFIX) \
		-img_format png  \
		-top_peaks $(TOP_PEAKS) $(OPT) \
		-outdir $(DIR_PEAKMO)
## Run peak-motifs: create the output directory, then delegate the
## execution of PEAKMO_CMD to the my_command target (not defined in this
## file; presumably provided by an included makefile -- confirm).
## NOTE(review): PEAKMO_CMD itself contains double quotes around the
## title, nested inside the double-quoted MY_COMMAND argument; this
## relies on MOTIF_PREFIX containing no whitespace.
## Declared phony so that a file or directory named "peakmo" cannot
## prevent the analysis from running.
.PHONY: peakmo
peakmo:
	@echo
	@echo "Running peak motifs	${MOTIF_PREFIX}"
	@mkdir -p ${DIR_PEAKMO}
	@${MAKE} my_command MY_COMMAND="${PEAKMO_CMD}"  TEST=${TEST}
	@echo "	${DIR_PEAKMO}"

## Run peak-motifs with MACS peaks (sub-make of peakmo with verbosity 2
## and the MACS prefix / sorted peaks).
## Declared phony: command target, no file is produced under this name.
.PHONY: peakmo_macs_peaks
peakmo_macs_peaks:
	@${MAKE} V=2 peakmo  PEAK_PREFIX='${MACS_PREFIX}' PEAKS='${MACS_PEAKS_SORTED}'

## Run peak-motifs with MACS summits (SUMMITS=_summits is appended to
## the motif prefix).
## Declared phony: command target, no file is produced under this name.
.PHONY: peakmo_macs_summits
peakmo_macs_summits:
	@${MAKE} V=2 peakmo  PEAK_PREFIX='${MACS_PREFIX}' SUMMITS=_summits PEAKS='${MACS_SUMMITS_SORTED}'

## Run peak-motifs with SWEMBL summits (SUMMITS=_summits is appended to
## the motif prefix).
## Declared phony: command target, no file is produced under this name.
.PHONY: peakmo_swembl_summits
peakmo_swembl_summits:
	@${MAKE} V=2 peakmo  PEAK_PREFIX='${SWEMBL_PREFIX}' SUMMITS=_summits PEAKS='${SWEMBL_SUMMITS_SORTED}'

################################################################
## Run peak-quality to estimate the "depth" of relevant peaks,
## i.e. how far down the list of sorted peaks we can still discover
## significant motifs.
## Input of peak-quality: a bed file and a fasta file sharing the same
## directory and prefix (MACS sorted peaks unless overridden).
QUALITY_SEQ_PREFIX = $(MACS_PREFIX)_peaks_sorted
QUALITY_SEQ_DIR = $(MACS_PEAKS_DIR)
QUALITY_BED = $(QUALITY_SEQ_DIR)/$(QUALITY_SEQ_PREFIX).bed
QUALITY_SEQ = $(QUALITY_SEQ_DIR)/$(QUALITY_SEQ_PREFIX).fasta

## Values passed to -slices and -seq_per_slice; both are also encoded
## in the output prefix.
QUALITY_SLICES = 5
QUALITY_SEQ_PER_SLICE = 0

## Output prefix and directory.
QUALITY_PREFIX = $(QUALITY_SEQ_PREFIX)_$(QUALITY_SLICES)slices_$(QUALITY_SEQ_PER_SLICE)sps
QUALITY_DIR = analysis/peak_quality/$(QUALITY_PREFIX)

## Value passed to -task.
QUALITY_TASK = slices,seq_len,bg_model,peakmo

## Complete peak-quality command line.
QUALITY_CMD = peak-quality -v $(V) \
	-bed $(QUALITY_BED) \
	-seq $(QUALITY_SEQ) \
	-prefix '$(QUALITY_PREFIX)' \
	-1str \
	-slices $(QUALITY_SLICES) \
	-seq_per_slice $(QUALITY_SEQ_PER_SLICE) \
	-task $(QUALITY_TASK) \
	-o $(QUALITY_DIR)
## Run peak-quality by delegating QUALITY_CMD to the my_command target
## (not defined in this file; presumably provided by an included
## makefile -- confirm), then print the output directory.
## Declared phony so that a file or directory named "peak_quality"
## cannot prevent the analysis from running.
.PHONY: peak_quality
peak_quality:
	@echo ""
	@echo "Running peak-quality	${QUALITY_PREFIX}"
	@${MAKE} my_command MY_COMMAND='${QUALITY_CMD}'
	@echo "	${QUALITY_DIR}"

## Run peak-quality on the sorted MACS peaks.
## Declared phony: command target, no file is produced under this name.
.PHONY: peak_quality_macs_peaks
peak_quality_macs_peaks:
	@${MAKE} peak_quality QUALITY_SEQ_PREFIX='${MACS_PREFIX}_peaks_sorted' QUALITY_SEQ_DIR='${MACS_PEAKS_DIR}'

## Run peak-quality on the sorted MACS summits.
## Declared phony: command target, no file is produced under this name.
.PHONY: peak_quality_macs_summits
peak_quality_macs_summits:
	@${MAKE} peak_quality QUALITY_SEQ_PREFIX='${MACS_PREFIX}_summits_sorted' QUALITY_SEQ_DIR='${MACS_PEAKS_DIR}'

## Run peak-quality on the sorted SWEMBL peaks.
## Declared phony: command target, no file is produced under this name.
.PHONY: peak_quality_swembl_peaks
peak_quality_swembl_peaks:
	@${MAKE} peak_quality QUALITY_SEQ_PREFIX='${SWEMBL_PREFIX}_peaks_sorted' QUALITY_SEQ_DIR='${SWEMBL_PEAKS_DIR}'

################################################################
## Run oligo-analysis to detect over-represented k-mers
## Input sequences for oligo-analysis (sorted MACS summits unless
## overridden).
OLIGO_SEQ_PREFIX = $(MACS_PREFIX)_summits_sorted
OLIGO_SEQ_DIR = $(MACS_PEAKS_DIR)
OLIGO_SEQ = $(OLIGO_SEQ_DIR)/$(OLIGO_SEQ_PREFIX).fasta

## Analysis options: value of -l, value of -markov, and three options
## inserted as-is in the command line. All of them enter the output
## prefix.
OL = 6
OLIGO_MKV = -2
OLIGO_STR = -1str
OLIGO_NOOV = -noov
OLIGO_TAILS = -two_tails

## Output location; OLIGO_FILE is the extension-less path shared by the
## .tab and .html results.
OLIGO_PREFIX = $(OLIGO_SEQ_PREFIX)_$(OL)nt_mkv$(OLIGO_MKV)$(OLIGO_STR)$(OLIGO_NOOV)$(OLIGO_TAILS)
OLIGO_DIR = analysis/motifs/oligo_analysis/$(OLIGO_PREFIX)
OLIGO_FILE = $(OLIGO_DIR)/$(OLIGO_PREFIX)

## Two commands chained with ';': oligo-analysis writes the .tab file,
## text-to-html then converts it to .html.
OLIGO_CMD = oligo-analysis -v $(V) -quick \
		-i $(OLIGO_SEQ) -seqtype dna -return occ,freq,proba,ratio,zscore,rank -sort \
		-l $(OL) $(OLIGO_STR) -markov $(OLIGO_MKV) $(OLIGO_NOOV) $(OLIGO_TAILS) \
		$(OPT) \
		-o $(OLIGO_FILE).tab ; text-to-html -i $(OLIGO_FILE).tab -o $(OLIGO_FILE).html -chunk 100000
## Run oligo-analysis: create the output directory, delegate OLIGO_CMD
## to the my_command target (not defined in this file; presumably
## provided by an included makefile -- confirm) and print the result
## files.
## Declared phony so that a file or directory named "oligos" cannot
## prevent the analysis from running.
.PHONY: oligos
oligos:
	@echo
	@echo "Running oligo-analysis	${OLIGO_PREFIX}"
	@mkdir -p ${OLIGO_DIR}
	@echo "	${OLIGO_DIR}"
	@${MAKE} my_command MY_COMMAND="${OLIGO_CMD}"  TEST=${TEST}
	@echo "	${OLIGO_FILE}.tab"
	@echo "	${OLIGO_FILE}.html"

## Run oligo-analysis on MACS summits
## Declared phony: command target, no file is produced under this name.
.PHONY: oligos_macs_peak_summits
oligos_macs_peak_summits:
	${MAKE} V=2 oligos OLIGO_SEQ_DIR=${MACS_PEAKS_DIR} OLIGO_SEQ_PREFIX=${MACS_PREFIX}_summits_sorted

## Run oligo-analysis on summits of MACS sub-peaks (obtained from PeakSplitter)
## Declared phony: command target, no file is produced under this name.
.PHONY: oligos_macs_subpeak_summits
oligos_macs_subpeak_summits:
	${MAKE} V=2 oligos OLIGO_SEQ_DIR=${MACS_PEAKS_DIR} OLIGO_SEQ_PREFIX=${MACS_PREFIX}_peaks.subpeaks_summits_sorted

## Run oligo-analysis on SWEMBL summits
## Declared phony: command target, no file is produced under this name.
.PHONY: oligos_swembl_summits
oligos_swembl_summits:
	${MAKE} V=2 oligos OLIGO_SEQ_DIR=${SWEMBL_PEAKS_DIR} OLIGO_SEQ_PREFIX=${SWEMBL_PREFIX}_summits_sorted

## Run oligo-analysis on summits obtained with different peak-calling approaches
## (the three prerequisites are independent of each other).
## Declared phony: aggregate target, no file is produced under this name.
.PHONY: oligos_all_summits
oligos_all_summits: oligos_macs_peak_summits oligos_macs_subpeak_summits oligos_swembl_summits

## Iterate oligo-analysis for oligonucleotide lengths from 2 to 8. We
## cannot analyze smaller oligonucleotides (we cannot define a Markov
## model m <= k-2 for single nucleotides).
## Lengths to iterate over and task to run for each of them.
OLIGO_LENGTHS=6 2 3 4 5 7 8
OLIGO_TASK=oligos_all_summits

## Delegate the iteration to the iterate_oligo_lengths target.
## NOTE(review): iterate_oligo_lengths is not defined in this file; it
## is presumably provided by an included makefile and presumably reads
## OLIGO_LENGTHS and OLIGO_TASK -- confirm.
## Declared phony: command target, no file is produced under this name.
.PHONY: oligos_all_lengths
oligos_all_lengths:
	${MAKE} iterate_oligo_lengths

################################################################
## Run position-analysis to detect k-mers showing a position bias.
## Input sequences for position-analysis (sorted SWEMBL summits unless
## overridden).
POS_SEQ_PREFIX = $(SWEMBL_PREFIX)_summits_sorted
POS_SEQ_DIR = $(SWEMBL_PEAKS_DIR)
POS_SEQ = $(POS_SEQ_DIR)/$(POS_SEQ_PREFIX).fasta

## Values passed to -l, -ci, -minpos and -maxpos.
POS_OL = 8
POS_CI = 50
POS_MINPOS = -999
POS_MAXPOS = 1000
#POS_DRAWING_OFFSET=-25.5
#POS_CI=25
#POS_DRAWING_OFFSET=-13

## Strand and overlap options, inserted as-is in the command line.
POS_STR = -1str
POS_NOOV = -noov

## Optional background model option and matching prefix suffix; both are
## empty by default and are set by positions_markov.
POS_BG_OPT =
POS_BG_SUFFIX =

## Output location; POS_FILE is the extension-less path shared by the
## result files.
POS_PREFIX = $(POS_SEQ_PREFIX)_$(POS_OL)nt_ci$(POS_CI)$(POS_STR)$(POS_NOOV)$(POS_BG_SUFFIX)
#POS_PREFIX=${POS_SEQ_PREFIX}_${POS_OL}nt_ci${POS_CI}${POS_STR}${POS_NOOV}_first${POS_FIRST}_seqnb${POS_SEQNB}
POS_BASE_DIR = analysis/motifs/position_analysis
POS_DIR = $(POS_BASE_DIR)/$(POS_PREFIX)
POS_FILE = $(POS_DIR)/$(POS_PREFIX)

## Value passed to -max_graphs.
POS_MAX_GRAPHS = 64
#POS_SKIP=0
#POS_LAST=0
#OPT=-skip ${POS_SKIP} -last ${POS_LAST}
#POS_CLUST_NB_MKV=9
#POS_CLUST_NB_UNIF=6
#POS_CLUST_NB=${POS_CLUST_NB_UNIF}

## Values passed to -min_clust_nb, -max_clust_nb, -clust_method and
## -top_seq_for_matrices.
POS_MIN_CLUST_NB = 6
POS_MAX_CLUST_NB = 12
POS_CLUST_METHOD = complete
TOP_SEQ_FOR_MATRICES = 10000

## Values passed to -return and -task.
POS_RETURN = chi,distrib,occ,exp_occ,freq_per_window,freq_per_word,coverage,graphs,clusters,matrices,rank,index
POS_TASK = all

## Complete position-analysis command line. OPT lets the caller append
## extra options.
POS_CMD = position-analysis -v $(V) \
		-i $(POS_SEQ) -seqtype dna -return $(POS_RETURN) -sort \
		-l $(POS_OL) $(POS_STR) -ci $(POS_CI) -origin center $(POS_NOOV) $(POS_BG_OPT) \
		-max_graphs $(POS_MAX_GRAPHS) \
		-title $(POS_PREFIX) \
		-minpos $(POS_MINPOS) \
		-maxpos $(POS_MAXPOS) \
		-header min \
		-min_clust_nb $(POS_MIN_CLUST_NB) \
		-max_clust_nb $(POS_MAX_CLUST_NB) \
		-top_seq_for_matrices $(TOP_SEQ_FOR_MATRICES) \
		-clust_method $(POS_CLUST_METHOD) \
		-task $(POS_TASK) \
		$(OPT) \
		-o $(POS_FILE).tab

#		-clust_suffix clusters_${POS_CLUST_METHOD}_k${POS_CLUST_NB} \
#\
#		${MAKE} positions_clusters
## Run position-analysis: create the output directory, print the
## command, delegate POS_CMD to the my_command target (not defined in
## this file; presumably provided by an included makefile -- confirm)
## and print the main result paths.
## Declared phony so that a file or directory named "positions" cannot
## prevent the analysis from running.
.PHONY: positions
positions:
	@echo
	@echo "Running position-analysis	${POS_PREFIX}"
	@mkdir -p ${POS_DIR}
	@echo "POS_DIR	${POS_DIR}"
	@echo "Command:	${POS_CMD}"
	@${MAKE} my_command MY_COMMAND="${POS_CMD}"
	@echo "	${POS_DIR}"
	@echo "	${POS_FILE}.tab"
	@echo "	${POS_FILE}_index.html"

################################################################
## Detect positionally biased oligos using a Markov model to estimate
## window-specific expected frequencies.
## Value passed to -markov, and the label derived from it for the
## output prefix.
POS_MKV=-2
POS_BG=mkv${POS_MKV}

## Re-run the positions target with the -markov option added to the
## command and the suffix _bg_mkv<POS_MKV> appended to the output prefix.
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_markov
positions_markov:
	@echo
	@echo "Running position-analysis with Markov chain background model	${POS_MKV}"
	@${MAKE} positions POS_BG_OPT='-markov ${POS_MKV}' POS_BG_SUFFIX=_bg_${POS_BG}
## POS_CLUST_NB=${POS_CLUST_NB_MKV}
#	@${MAKE} positions_pssm POS_BG_OPT='-markov ${POS_MKV}' POS_BG_SUFFIX=_bg_${POS_BG}

################################################################
## Convert the list of k-mers discovered by position-analysis into PSSMs
## and generate logos.
## matrix-from-patterns reads the clustered patterns
## (${POS_FILE}_clusters.tab) and the sequences (${POS_SEQ}), and writes
## its results under the prefix ${POS_FILE}_pssm.
POS_PSSM_CMD=matrix-from-patterns -v ${V} -format fasta  \
		-seq ${POS_SEQ} \
		-pl ${POS_FILE}_clusters.tab \
		-toppat 50 -max_asmb_nb 5 -sc 4 -cc 2 -subt 1 -maxfl 1 -1str \
		-collect_method matrix-scan-quick -flanks 3 \
		-min_weight 7 -logo \
		-o ${POS_FILE}_pssm

## Print POS_PSSM_CMD, then delegate its execution to the my_command
## target (not defined in this file; presumably provided by an included
## makefile -- confirm).
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_pssm
positions_pssm:
	@echo
	@echo "Converting k-ers to PSSM and logos"
	@echo ${POS_PSSM_CMD}
	@${MAKE} my_command MY_COMMAND="${POS_PSSM_CMD}"
	@echo "	${POS_FILE}_pssm"

################################################################
## Draw a sequence logo with the matrix returned by position-analysis
## -l 1. This logo will indicate the compositional biases as a
## function of the distance from peak summits
##
## We proceed in two steps to circumvent a memory problem
## apparently due to the large size of the fake sequences to be
## generated: we first convert the counts into a frequency matrix, and
## then compute the logos from this matrix (frequencies are
## arbitrarily multiplied by 100 to obtain counts).
## Entry point: run _pos_profile_logo with POS_OL=1, so that every path
## derived from POS_PREFIX points to the single-nucleotide results.
## The sub-make is invoked through ${MAKE} rather than a literal "make",
## so that the same make program is used and that options such as -n or
## -j are passed on to it.
## Declared phony: command target, no file is produced under this name.
.PHONY: pos_profile_logo
pos_profile_logo:
	@echo ""
	@echo "Drawing sequence logo of 1nt profiles"
	${MAKE} _pos_profile_logo POS_OL=1

## Intermediate frequency matrix and final logo table.
POS_FREQ=${POS_DIR}/${POS_PREFIX}_freq_matrix.tab
POS_LOGO=${POS_DIR}/${POS_PREFIX}_logo.tab

## Build the logo in two passes:
##  1. drop the lines starting with ';' from the position-analysis
##     table, keep columns 1 and 12-90, and convert them into a
##     frequency matrix (POS_FREQ);
##  2. convert that matrix back into counts (-multiply 100) and export
##     the logo in pdf and png formats (POS_LOGO).
## NOTE(review): columns 12-90 are assumed to hold the per-window
## values of the position-analysis output -- confirm.
## Declared phony: helper target, no file is produced under this name.
.PHONY: _pos_profile_logo
_pos_profile_logo:
	@echo ${POS_FILE}.tab
	@grep -v '^;' ${POS_FILE}.tab | cut -f 1,12-90 | \
		convert-matrix -v ${V} -from tab -to tab \
		-return frequencies \
		-decimals 3 -pseudo 0 -bg_pseudo 0 \
		-o ${POS_FREQ}
	@echo ${POS_FREQ}
	@convert-matrix -v ${V} -i ${POS_FREQ} \
		-from tab -to tab \
		-return counts,logo \
		-pseudo 0 -bg_pseudo 0 -logo_format pdf,png -multiply 100 \
		-o ${POS_LOGO}
	@echo ${POS_LOGO}

################################################################
## Motif clustering on the basis of their position profiles.
## This is done in R, by passing arguments to R.
#POS_CLUSTER_SCRIPT=scripts/R-scripts/cluster_position_profiles.R
## R script for clustering position profiles, taken from the RSAT tree.
POS_CLUSTER_SCRIPT = $(RSAT)/R-scripts/cluster_position_profiles.R
# positions_clusters:
# 	@cat ${POS_CLUSTER_SCRIPT} \
# 		| R --slave --no-save --no-restore --no-environ \
# 		--args "file.pos='${POS_FILE}.tab';pos.offset=${POS_DRAWING_OFFSET}"

################################################################
## Extract the PSSM and logo for one k-mer cluster
## Directory and table designated for the k-mer clusters, both derived
## from the position-analysis output location.
POS_CLUSTER_DIR = $(POS_DIR)/position_clusters
POS_CLUSTER_FILE = $(POS_CLUSTER_DIR)/clusters_$(POS_PREFIX).tab
# positions_clusters_to_logos:
# 	@echo	"POS_FILE	${POS_FILE}"
# 	@echo	"POS_DIR		${POS_DIR}"
# 	@echo	"POS_CLUSTER_DIR	${POS_CLUSTER_DIR}"
# 	@echo	"POS_CLUSTER_FILE	${POS_CLUSTER_FILE}"

## Run position-analysis on MACS summits
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_macs_peak_summits
positions_macs_peak_summits:
	${MAKE} V=2 positions POS_SEQ_DIR=${MACS_PEAKS_DIR} POS_SEQ_PREFIX=${MACS_PREFIX}_summits_sorted


# ## Cluster position-analysis profiles on MACS summits
# positions_clusters_macs_peak_summits:
# 	${MAKE} V=2 positions_clusters POS_SEQ_DIR=${MACS_PEAKS_DIR} POS_SEQ_PREFIX=${MACS_PREFIX}_summits_sorted

## Run position-analysis on summits of MACS sub-peaks (obtained from PeakSplitter)
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_macs_subpeak_summits
positions_macs_subpeak_summits:
	${MAKE} V=2 positions POS_SEQ_DIR=${MACS_PEAKS_DIR} POS_SEQ_PREFIX=${MACS_PREFIX}_peaks.subpeaks_summits_sorted

# ## Cluster position-analysis profiles on summits of MACS sub-peaks
# positions_clusters_macs_subpeak_summits:
# 	${MAKE} V=2 positions_clusters POS_SEQ_DIR=${MACS_PEAKS_DIR} POS_SEQ_PREFIX=${MACS_PREFIX}_peaks.subpeaks_summits_sorted

## Run position-analysis on SWEMBL summits
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_swembl_summits
positions_swembl_summits:
	@${MAKE} V=2 positions POS_SEQ_DIR=${SWEMBL_PEAKS_DIR} POS_SEQ_PREFIX=${SWEMBL_PREFIX}_summits_sorted

## Run position-analysis with a Markov background model on SWEMBL summits
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_markov_swembl_summits
positions_markov_swembl_summits:
	@${MAKE} V=2 positions_markov POS_SEQ_DIR=${SWEMBL_PEAKS_DIR} POS_SEQ_PREFIX=${SWEMBL_PREFIX}_summits_sorted


## Run position-analysis on ORI summits (SWEMBL overlapping at least one SICER)
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_ori_summits
positions_ori_summits:
	@${MAKE} V=2 positions POS_SEQ_DIR=${ORI_DIR} POS_SEQ_PREFIX=${ORI_PREFIX}_summits

## Run position-analysis on ORI summits with a Markov background model
## of order 0 (POS_MKV=0 is handed down to positions_markov_ori_summits).
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_bernoulli_ori_summits
positions_bernoulli_ori_summits:
	@${MAKE} V=2 positions_markov_ori_summits POS_MKV=0

## Run position-analysis with a Markov background model on ORI summits
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_markov_ori_summits
positions_markov_ori_summits:
	@${MAKE} V=2 positions_markov POS_SEQ_DIR=${ORI_DIR} POS_SEQ_PREFIX=${ORI_PREFIX}_summits

## Run position-analysis on random peaks, once for each repetition
## listed in RAND_REPEATS (defined outside this file).
## The loop stops at the first failing sub-make and returns its exit
## status; otherwise only the status of the last repetition would be
## reported and earlier failures would go unnoticed.
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_rand_peaks
positions_rand_peaks:
	@for rep in ${RAND_REPEATS}; do \
		${MAKE} positions_rand_peaks_one_rep RAND_REPEAT=$${rep} || exit $$? ; \
	done

## Run position-analysis on one set of random peaks, located by
## RAND_DIR and RAND_PREFIX (defined outside this file; presumably
## derived from RAND_REPEAT -- confirm).
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_rand_peaks_one_rep
positions_rand_peaks_one_rep:
	@${MAKE} V=2 positions POS_SEQ_DIR=${RAND_DIR} POS_SEQ_PREFIX=${RAND_PREFIX}

## Run position-analysis with a Markov background model on random
## peaks, once for each repetition listed in RAND_REPEATS (defined
## outside this file).
## The loop stops at the first failing sub-make and returns its exit
## status; otherwise only the status of the last repetition would be
## reported and earlier failures would go unnoticed.
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_markov_rand_peaks
positions_markov_rand_peaks:
	@for rep in ${RAND_REPEATS}; do \
		${MAKE} positions_markov_rand_peaks_one_rep RAND_REPEAT=$${rep} || exit $$? ; \
	done

## Run position-analysis with a Markov background model on one set of
## random peaks, located by RAND_DIR and RAND_PREFIX (defined outside
## this file; presumably derived from RAND_REPEAT -- confirm).
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_markov_rand_peaks_one_rep
positions_markov_rand_peaks_one_rep:
	@${MAKE} V=2 positions_markov POS_SEQ_DIR=${RAND_DIR} POS_SEQ_PREFIX=${RAND_PREFIX}


# ## Cluster position-analysis profiles on SWEMBL summits
# positions_clusters_swembl_summits:
# 	${MAKE} V=2 positions_clusters POS_SEQ_DIR=${SWEMBL_PEAKS_DIR} POS_SEQ_PREFIX=${SWEMBL_PREFIX}_summits_sorted

## Run position-analysis on random genome fragments calibrated on MACS summits
## Obsolete
#positions_rand_macs_peak_summits:
#	${MAKE} V=2 positions POS_SEQ_DIR=analysis/peaks/RAND/RAND_genome_fragments_${MACS_PREFIX}_summits_sorted POS_SEQ_PREFIX=RAND_genome_fragments_${MACS_PREFIX}_summits_sorted

## Run position-analysis on random genome fragments calibrated on SWEMBL summits
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_rand_swembl_peak_summits
positions_rand_swembl_peak_summits:
	${MAKE} V=2 positions POS_SEQ_DIR=analysis/peaks/random_fragments/rand_fragments_${SWEMBL_PREFIX}_summits_sorted POS_SEQ_PREFIX=rand_fragments_${SWEMBL_PREFIX}_summits_sorted

## Run position-analysis with Markov background model on random genome fragments calibrated on SWEMBL summits
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_markov_rand_swembl_peak_summits
positions_markov_rand_swembl_peak_summits:
	${MAKE} V=2 positions_markov POS_SEQ_DIR=analysis/peaks/random_fragments/rand_fragments_${SWEMBL_PREFIX}_summits_sorted POS_SEQ_PREFIX=rand_fragments_${SWEMBL_PREFIX}_summits_sorted

## Run position-analysis on the sequence set located by RANDSEQ_DIR and
## RANDSEQ_PREFIX (both defined outside this file).
## NOTE(review): the previous comment was identical to the one of the
## obsolete MACS random-fragment target; these are presumably random
## sequences rather than genome fragments -- confirm.
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_randseq
positions_randseq:
	${MAKE} V=2 positions POS_SEQ_DIR=${RANDSEQ_DIR} POS_SEQ_PREFIX=${RANDSEQ_PREFIX}


## Run position-analysis on summits obtained with different peak-calling approaches
##
## The random-fragment control is positions_rand_swembl_peak_summits:
## the former prerequisite positions_rand_macs_peak_summits only exists
## as commented-out, obsolete code in this file, so make could not
## build it.
## Declared phony: aggregate target, no file is produced under this name.
.PHONY: positions_all_summits
positions_all_summits: positions_macs_peak_summits positions_macs_subpeak_summits positions_swembl_summits positions_rand_swembl_peak_summits positions_randseq

## Iterate position-analysis for oligonucleotide lengths from 1 to 8
## LEN_TASK is the target run for each length; POS_OL is set to the
## current length on the sub-make command line.
#LEN_TASK=positions_markov_swembl_summits
LEN_TASK=positions_bernoulli_ori_summits
POS_OLIGO_LENGTHS=1 2 3 4 5 6 7 8

## The loop stops at the first failing sub-make and returns its exit
## status; otherwise only the status of the last length would be
## reported and earlier failures would go unnoticed.
## Declared phony: command target, no file is produced under this name.
.PHONY: positions_all_lengths
positions_all_lengths:
	@for ol in ${POS_OLIGO_LENGTHS}; do \
		${MAKE} POS_OL=$${ol} ${LEN_TASK} || exit $$? ; \
	done

################################################################
## Run all the position analyses on the origin summits: every length of
## POS_OLIGO_LENGTHS with a Markov model of order 0, without background
## option, and with a Markov model of order -2.
## Declared phony: command target, no file is produced under this name.
.PHONY: origins_all_positions
origins_all_positions:
	@${MAKE} positions_all_lengths  LEN_TASK=positions_bernoulli_ori_summits
	@${MAKE} positions_all_lengths  LEN_TASK=positions_ori_summits 
	@${MAKE} positions_all_lengths  LEN_TASK=positions_markov_ori_summits POS_MKV=-2

################################################################
## Run all the analyses for random peaks
## Run rand_peaks_all_positions_one_rep once for each repetition listed
## in RAND_REPEATS (defined outside this file).
## The loop stops at the first failing sub-make and returns its exit
## status; otherwise only the status of the last repetition would be
## reported and earlier failures would go unnoticed.
## Declared phony: command target, no file is produced under this name.
.PHONY: rand_peaks_all_positions
rand_peaks_all_positions:
	@for rep in ${RAND_REPEATS}; do \
		${MAKE} rand_peaks_all_positions_one_rep RAND_REPEAT=$${rep} || exit $$? ; \
	done

## For one repetition of random peaks, run position-analysis over all
## lengths of POS_OLIGO_LENGTHS: without background option, then with
## Markov models of order 0 and -2.
## Declared phony: command target, no file is produced under this name.
.PHONY: rand_peaks_all_positions_one_rep
rand_peaks_all_positions_one_rep:
	@${MAKE} positions_all_lengths  LEN_TASK=positions_rand_peaks_one_rep 
	@${MAKE} positions_all_lengths  LEN_TASK=positions_markov_rand_peaks_one_rep POS_MKV=0
	@${MAKE} positions_all_lengths  LEN_TASK=positions_markov_rand_peaks_one_rep POS_MKV=-2

################################################################
## Scan peaks to detect OGRE sites, using the motif provided by
## Christelle Cayrou.

## Full OGRE workflow: scan, position distributions, then removal of
## the feature file.
## The three steps are run as successive sub-makes instead of
## prerequisites, because their order matters: ogre_pos_distrib reads
## the .ft file written by ogre_scan and deleted by ogre_clean_features.
## As prerequisites they could run concurrently under "make -j".
## Declared phony: command target, no file is produced under this name.
.PHONY: ogre_analysis
ogre_analysis:
	@${MAKE} ogre_scan
	@${MAKE} ogre_pos_distrib
	@${MAKE} ogre_clean_features

#OGRE_PSSM=data/motifs/ogre-like.tf
#OGRE_PREFIX=ogre-like
#OGRE_SEQ_PREFIX=${SWEMBL_PREFIX}_summits_sorted
#OGRE_SEQ_DIR=${SWEMBL_PEAKS_DIR}
## Sequences to scan (ORI summits unless overridden). OGRE_SEQ carries
## no extension; ogre_scan appends .fasta.
OGRE_SEQ_PREFIX = $(ORI_PREFIX)_summits
OGRE_SEQ_DIR = $(ORI_DIR)
OGRE_SEQ = $(OGRE_SEQ_DIR)/$(OGRE_SEQ_PREFIX)

## Motif: matrix file and logo, both named after OGRE_MOTIF_PREFIX.
#OGRE_MOTIF_PREFIX=ogre_cayrou
#OGRE_PSSM=data/motifs/ogre_cayrou.tf
OGRE_MOTIF_PREFIX = OGRE_mES_Cayrou_GR_2011
OGRE_PSSM = data/motifs/$(OGRE_MOTIF_PREFIX).tf
OGRE_LOGO = data/motifs/$(OGRE_MOTIF_PREFIX)_logo.pdf

## Output directory of the scan.
OGRE_DIR = analysis/motifs/OGRE/$(OGRE_MOTIF_PREFIX)_$(OGRE_SEQ_PREFIX)_sites

## Value passed to "-uth pval" and strand option of matrix-scan; both
## are encoded in the name of the sites file.
UTH_PVAL = 1e-4
SCAN_STR = -1str

## Extension-less path of the predicted sites.
OGRE_SITES = $(OGRE_DIR)/$(OGRE_SEQ_PREFIX)_$(OGRE_MOTIF_PREFIX)_sites_pval$(UTH_PVAL)$(SCAN_STR)
## Scan the sequences ${OGRE_SEQ}.fasta with the OGRE matrix and write
## the predicted sites to ${OGRE_SITES}.ft. The background model is
## estimated from the input sequences (-bginput -markov 1) and sites
## are reported up to the p-value threshold UTH_PVAL. OPT lets the
## caller append extra matrix-scan options.
## Declared phony so that a file or directory named "ogre_scan" cannot
## prevent the scan from running.
.PHONY: ogre_scan
ogre_scan:
	@echo
	@echo "Scanning OGRE in peaks ${OGRE_SEQ_PREFIX}"
	@mkdir -p ${OGRE_DIR}
	matrix-scan -v ${V} -quick \
		-i ${OGRE_SEQ}.fasta ${SCAN_STR} \
		-origin center \
		-matrix_format tf -m ${OGRE_PSSM} \
		-bginput -markov 1 -uth pval ${UTH_PVAL} \
		${OPT} \
		-o ${OGRE_SITES}.ft
	@echo "OGRE sites (features)"
	@echo "	${OGRE_SITES}.ft"
#	@${MAKE} ogre_pos_distrib
#	@${MAKE} ogre_clean_features


## Scan sorted regions around the peaks with
## PSSMs (OGRE + discovered motifs) in order to estimate the coverage.
##
## The order of the regions around the origins (500bp or 300bp around
## summits) was determined by clustering according to the read density
## profiles.
## Size of the regions (encoded in the bed file name), bed file of the
## sorted regions, and matrix file (the OGRE matrix by default).
SORTED_REGION_SIZE=500
SORTED_REGIONS=analysis/f4_rnase/cpg/mes_sw_k20_ordered_summit${SORTED_REGION_SIZE}bp.bed
PSSM=${OGRE_PSSM}

## NOTE(review): this recipe only prints the matrix and region files;
## it issues no scanning command, so the target looks unfinished.
## Declared phony: command target, no file is produced under this name.
.PHONY: matrix_scan_sorted_peaks
matrix_scan_sorted_peaks:
	@echo
	@echo "Scanning sorted regions with position-specific scoring matrix"
	@echo "	${PSSM}"
	@echo "	${SORTED_REGIONS}"


################################################################
## Compute position distribution of OGRE sites in peaks.
## We scan on both strands, but we then generate a separate
## distribution profile for each strand.
## Image format of the distribution graphs, and extension-less path of
## the distribution computed over all the sites.
DISTRIB_IMG_FORMAT=pdf
#		-ymin 0 -ymax ${OGRE_YMAX} \
#			-ymin 0 -ymax ${OGRE_YMAX} \
#OGRE_YMAX=5000
OGRE_DISTRIB=${OGRE_SITES}_distrib

## Compute the class frequencies of column 5 of the sites file, with
## class interval POS_CI, then draw the overall graph and one
## table + graph per strand (D and R) through sub-makes.
## Requires ${OGRE_SITES}.ft, i.e. ogre_scan must have been run.
## Declared phony: command target, no file is produced under this name.
.PHONY: ogre_pos_distrib
ogre_pos_distrib:
	classfreq -v 1 -i ${OGRE_SITES}.ft -col 5 -ci ${POS_CI} -o ${OGRE_DISTRIB}.tab
	@echo "	${OGRE_DISTRIB}.tab"
	@${MAKE} _ogre_pos_distrib_graph OGRE_SITES=${OGRE_SITES} OGRE_SEQ_PREFIX=${OGRE_SEQ_PREFIX}
	@${MAKE} _ogre_pos_distrib_one_strand DISTRIB_STRAND=D OGRE_SITES=${OGRE_SITES} OGRE_SEQ_PREFIX=${OGRE_SEQ_PREFIX}
	@${MAKE} _ogre_pos_distrib_one_strand DISTRIB_STRAND=R OGRE_SITES=${OGRE_SITES} OGRE_SEQ_PREFIX=${OGRE_SEQ_PREFIX}

## Draw the distribution of all OGRE sites (columns 3 and 4 of
## ${OGRE_DISTRIB}.tab) between positions -1000 and 1000.
## Helper of ogre_pos_distrib.
## Declared phony: helper target, no file is produced under this name.
.PHONY: _ogre_pos_distrib_graph
_ogre_pos_distrib_graph:
	@XYgraph -i ${OGRE_DISTRIB}.tab \
		-format ${DISTRIB_IMG_FORMAT} \
		-xcol 3 -ycol 4 -lines -xsize 800 -ysize 300 \
		-title1 'OGRE sites; ${OGRE_SEQ_PREFIX}' \
		-xmin -1000 -xmax 1000 \
		-yleg1 'Number of sites' \
		-xleg1 'position' \
		-pointsize 0 \
		-o ${OGRE_DISTRIB}.${DISTRIB_IMG_FORMAT}
	@echo "	${OGRE_DISTRIB}.${DISTRIB_IMG_FORMAT}"

## Position distribution of the OGRE sites found on one strand.
## Keeps the lines of the sites file whose 4th field equals
## DISTRIB_STRAND (ogre_pos_distrib passes D or R), computes the class
## frequencies of column 5 and draws the corresponding graph.
## Declared phony: helper target, no file is produced under this name.
.PHONY: _ogre_pos_distrib_one_strand
_ogre_pos_distrib_one_strand:
	@awk '$$4 == "${DISTRIB_STRAND}"' ${OGRE_SITES}.ft \
		| classfreq -v 1 -col 5 -ci ${POS_CI} -o ${OGRE_SITES}_${DISTRIB_STRAND}_distrib.tab
	@echo "	${OGRE_SITES}_${DISTRIB_STRAND}_distrib.tab"
	@XYgraph -i  ${OGRE_SITES}_${DISTRIB_STRAND}_distrib.tab \
			-format ${DISTRIB_IMG_FORMAT} \
			-xcol 3 -ycol 4 -lines -xsize 800 -ysize 400 \
			-title1 'OGRE sites; ${OGRE_SEQ_PREFIX}' \
			-title2 'Strand ${DISTRIB_STRAND}' \
			-xmin -1000 -xmax 1000 \
			-yleg1 'Number of sites' \
			-xleg1 'position' \
			-pointsize 0 \
			-o ${OGRE_SITES}_${DISTRIB_STRAND}_distrib.${DISTRIB_IMG_FORMAT}
	@echo "	${OGRE_SITES}_${DISTRIB_STRAND}_distrib.${DISTRIB_IMG_FORMAT}"

## Remove the feature file, which occupies too much space.
## Must only run once the distributions have been computed, since
## ogre_pos_distrib reads this file.
## Declared phony: command target, no file is produced under this name.
.PHONY: ogre_clean_features
ogre_clean_features:
	@echo "Cleaning feature file ${OGRE_SITES}.ft"
	rm -f ${OGRE_SITES}.ft


## Scan random genome fragments with OGRE motif
## OGRE_TASK lists the targets run by the sub-make on the random
## fragments calibrated on SWEMBL summits.
OGRE_TASK=ogre_scan ogre_pos_distrib

## Declared phony: command target, no file is produced under this name.
.PHONY: ogre_rand_fragments
ogre_rand_fragments:
	${MAKE} OGRE_SEQ_DIR=analysis/peaks/random_fragments/rand_fragments_${SWEMBL_PREFIX}_summits_sorted \
		OGRE_SEQ_PREFIX=rand_fragments_${SWEMBL_PREFIX}_summits_sorted ${OGRE_TASK}


################################################################
## Estimate the coverage of a motif for different regions around the
## summit
## Print the parameters used for the analysis of motif coverage.
## Declared phony because the target only prints and creates no file.
.PHONY: motif_profiles_param
motif_profiles_param:
	@echo
	@echo "Parameters for the analysis of motif coverage"
	@echo "	PROF_DIR	${PROF_DIR}"
	@echo "	PROF_PREFIX	${PROF_PREFIX}"
	@echo "	PROF_SEQ	${PROF_SEQ}"
#	@echo "	PROF_SUBSEQ	${PROF_SUBSEQ}"
	@echo "	PROF_STR	${PROF_STR}"
	@echo "	PROF_MATRIX_DIR	${PROF_MATRIX_DIR}"
	@echo "	PROF_MATRICES	${PROF_MATRICES}"
	@echo "	PROF_SITES	${PROF_SITES}"
	@echo "	PROF_PROFILES	${PROF_PROFILES}"


# ## Extract subset of the sequences (positonal window)
# #PROF_FROM=-700
# #PROF_TO=699
# #PROF_SUBSEQ=${PROF_DIR}/${ORI_PREFIX}_subseq_from${PROF_FROM}_to${PROF_TO}.fasta
# motif_profiles_subseq:
# 	@echo 
# 	@echo "Extracting subset from ${PROF_FROM} to ${PROF_TO}"
# 	@mkdir -p ${PROF_DIR}
# 	sub-sequence -i ${PROF_SEQ} -from ${PROF_FROM} -to ${PROF_TO} -origin center -o ${PROF_SUBSEQ}
# 	@echo "	${PROF_SUBSEQ}"

## Sequences scanned to estimate motif coverage (ORI summits).
PROF_SEQ = $(ORI_DIR)/$(ORI_PREFIX)_summits.fasta
PROF_PREFIX = $(ORI_PREFIX)

## Matrices to scan with: the count matrices found in the
## position-analysis result directory whose name ends with
## -noov_bg_mkv0.
PROF_MATRIX_PREFIX = $(PROF_PREFIX)_summits_$(POS_OL)nt_ci$(POS_CI)$(POS_STR)-noov_bg_mkv0
PROF_MATRIX_DIR = $(POS_BASE_DIR)/$(PROF_MATRIX_PREFIX)
PROF_MATRICES = $(PROF_MATRIX_DIR)/$(PROF_MATRIX_PREFIX)_pssm_count_matrices.tf

## Output directory and sites file. PROF_NOOV is not assigned in the
## visible part of this file, so it expands to nothing unless it is set
## elsewhere.
PROF_DIR = $(PROF_MATRIX_DIR)/coverage
PROF_SITES = $(PROF_DIR)/$(ORI_PREFIX)$(PROF_STR)$(PROF_NOOV)_sites.ft

## Strand option of matrix-scan, also encoded in the output names.
PROF_STR = -2str

## Complete matrix-scan command line. OPT lets the caller append extra
## options.
PROF_CMD = matrix-scan -v $(V) -quick \
		-i $(PROF_SEQ) \
		-matrix_format tf -m $(PROF_MATRICES) \
		$(PROF_STR) -pseudo 1 -origin center \
		-bginput -markov 1 -uth pval $(UTH_PVAL) \
		$(OPT) \
		-o $(PROF_SITES)
## Scan the sequences with the discovered matrices: create the output
## directory, print PROF_CMD and run it directly (not through
## my_command), then print the path of the sites file.
## Declared phony: command target, no file is produced under this name.
.PHONY: motif_profiles_scan
motif_profiles_scan:
	@echo
	@echo "Calculating motif coverage in peaks ${PROF_SEQ}"
	@mkdir -p ${PROF_DIR}
	@echo ${PROF_CMD}
	@${PROF_CMD}
	@echo "Sites (features)"
	@echo "	${PROF_SITES}"

################################################################
## Compute positional distribution for a set of motifs
## Value passed to -return (motif_profiles_mseq / motif_profiles_occ
## override it with mseq / occ).
PROFILE_TYPE = mseq

## Output table; its name encodes the strand option, the class interval,
## the position range and the profile type.
PROF_PROFILES = $(PROF_DIR)/$(ORI_PREFIX)$(PROF_STR)_profiles_ci$(PROF_CI)_from$(PROF_MIN_POS)_to$(PROF_MAX_POS)_$(PROFILE_TYPE).tab

## Values passed to -ci, -min_pos and -max_pos.
PROF_CI = 300
PROF_MIN_POS = -750
PROF_MAX_POS = 749

## Complete feature-profiles command line; it reads the sites file
## written by motif_profiles_scan.
DISTRIB_CMD = feature-profiles -v $(V) \
	-i $(PROF_SITES) \
	-ci $(PROF_CI) -sep_strands -return $(PROFILE_TYPE) \
	-min_pos $(PROF_MIN_POS) -max_pos $(PROF_MAX_POS) \
	-o $(PROF_PROFILES)
## Compute the positional profiles of the scanned motifs: print
## DISTRIB_CMD and run it directly, then print the path of the result
## table. Requires the sites file ${PROF_SITES}.
## Declared phony: command target, no file is produced under this name.
.PHONY: motif_profiles
motif_profiles:
	@echo
	@echo "Calculating motif distribution"
	@echo ${DISTRIB_CMD}
	@${DISTRIB_CMD}
	@echo "	${PROF_PROFILES}"

## Compute the motif profiles with the return type "mseq".
## Declared phony: command target, no file is produced under this name.
.PHONY: motif_profiles_mseq
motif_profiles_mseq:
	${MAKE} motif_profiles PROFILE_TYPE=mseq

## Compute the motif profiles with the return type "occ".
## Declared phony: command target, no file is produced under this name.
.PHONY: motif_profiles_occ
motif_profiles_occ:
	${MAKE} motif_profiles PROFILE_TYPE=occ
