################################################################
## Run various peak calling algorithms on Marcel Mechali's data on replication
## origins.

include ${RSAT}/makefiles/util.mk
include scripts/makefiles/00_parameters.mk
MAKEFILE=scripts/makefiles/01_peak_calling.mk


################################################################
## List parameters for peak-calling
## Name of the parameter group to display. list_param prints a short
## header and then delegates to the target list_param_<PARAM_GROUP>.
PARAM_GROUP=peak_calling
list_param:
	@echo
	@echo 'Parameters'
	@echo '----------'
	@echo "PARAM_GROUP	$(PARAM_GROUP)"
	@$(MAKE) list_param_$(PARAM_GROUP)

## Print the current value of the variables used by the peak-calling
## tasks, grouped by tool (SWEMBL, ORI, random fragments, MACS, SICER)
## and ending with the currently selected peak set.  Each variable is
## reported once per section.
list_param_peak_calling:
	@echo ""
	@echo "ANALYSIS			${ANALYSIS}"
	@echo "	ORGANISM		${ORGANISM}"
	@echo "	GENOME			${GENOME}"
	@echo "	TEST			${TEST}"
	@echo "	TEST_READS		${TEST_READS}"
	@echo "	CTRL			${CTRL}"
	@echo "	CTRL_READS		${CTRL_READS}"
	@echo ""
	@echo "ANALYSES"
	@echo "	${ANALYSES}" | perl -pe 's| |\n\t|g'
	@echo ""
	@echo "READS"
	@echo "	READ_FILES		${READ_FILES}"
	@echo "	READ_NAMES		${READ_NAMES}"
	@echo "	ANALYSES		${ANALYSES}"
	@echo "	TEST_READ_FILES		${TEST_READ_FILES}"
	@echo "	TEST_READ_NAMES		${TEST_READ_NAMES}"
	@echo ""
	@echo "SWEMBL parameters"
	@echo "	SWEMBL_R		${SWEMBL_R}"
	@echo "	TEST_READS		${TEST_READS}"
	@echo "	CTRL_READS		${CTRL_READS}"
	@echo "	SWEMBL_PREFIX		${SWEMBL_PREFIX}"
	@echo "	SWEMBL_PEAKS_DIR	${SWEMBL_PEAKS_DIR}"
	@echo "	SWEMBL_PEAKS		${SWEMBL_PEAKS}"
	@echo "	SWEMBL_PEAKS_SORTED	${SWEMBL_PEAKS_SORTED}"
	@echo "	SWEMBL_SUMMITS_SORTED	${SWEMBL_SUMMITS_SORTED}"
	@echo "	SWEMBL_CMD		${SWEMBL_CMD}"
	@echo ""
	@echo "ORI peaks (SWEMBL overlapping SICER)"
	@echo "	ORI_DIR			${ORI_DIR}"
	@echo "	ORI_PREFIX		${ORI_PREFIX}"
	@echo "	ORI_PEAKS		${ORI_PEAKS}"
	@echo "	ORI_SUMMITS		${ORI_SUMMITS}"
	@echo ""
	@echo "ORI regions (SICER overlapping SWEMBL)"
	@echo "	ORI_REGIONS		${ORI_REGIONS}"
	@echo ""
	@echo "Random genome fragments"
	@echo "	TEMPLATE_DIR		${TEMPLATE_DIR}"
	@echo "	TEMPLATE_PREFIX		${TEMPLATE_PREFIX}"
	@echo "	TEMPLATE_PEAKS		${TEMPLATE_PEAKS}"
	@echo "	RAND_PREFIX		${RAND_PREFIX}"
	@echo "	RAND_DIR		${RAND_DIR}"
	@echo "	RAND_REPEATS		${RAND_REPEATS}"
	@echo "	RAND_PEAKS		${RAND_PEAKS}"
#	@echo "	RAND_FRAGMENTS_CMD	${RAND_FRAGMENTS_CMD}"
	@echo ""
	@echo "MACS parameters"
	@echo "	MACS_PEAKS_DIR		${MACS_PEAKS_DIR}"
	@echo "	MACS_PEAKS		${MACS_PEAKS}"
	@echo "	MACS_SUBPEAKS		${MACS_SUBPEAKS}"
	@echo "	MACS_SUBPEAKS_SORTED	${MACS_SUBPEAKS_SORTED}"
#	@echo "	MACS_CMD		${MACS_CMD}"
	@echo ""
	@echo "SICER parameters"
	@echo "	SICER_PGM		${SICER_PGM}"
	@echo "	SICER_PREFIX		${SICER_PREFIX}"
	@echo "	SICER_REGIONS_DIR	${SICER_REGIONS_DIR}"
	@echo "	GENOME			${GENOME}"
	@echo "	SICER_REDUND		${SICER_REDUND}"
	@echo "	SICER_WIN_SIZE		${SICER_WIN_SIZE}"
	@echo "	FRAGMENT_SIZE		${FRAGMENT_SIZE}"
	@echo "	SICER_GENOME_FR		${SICER_GENOME_FR}"
	@echo "	SICER_GAP_SIZE		${SICER_GAP_SIZE}"
	@echo "	SICER_CMD		${SICER_CMD}"
	@echo "	SICER_SUMMARY		${SICER_SUMMARY}"
	@echo "	SICER_ISLANDS		${SICER_ISLANDS}"
	@echo "	SICER_ISLANDS_SORTED	${SICER_ISLANDS_SORTED}"
	@echo "	SICER_FDR		${SICER_FDR}"
	@echo ""
	@echo "Current peak set"
	@echo "	PEAKS			${PEAKS}"
	@echo "	PEAK_EXT		${PEAK_EXT}"
	@echo "	PEAK_FILE		${PEAK_FILE}"

################################################################
## Task iterations

## Iteration by analysis
## Print the analysis identifiers, one per line (perl replaces each
## space of the list by a newline followed by a tab).
list_analyses:
	@echo 'ANALYSES'
	@echo "	$(ANALYSES)" | perl -pe 's| |\n\t|g'

## Target(s) run for each analysis; override on the command line with
## TASK=<target>.
TASK=list_param

## Run $(TASK) once per element of ANALYSES, passing the element as
## ANALYSIS to the sub-make.
iterate_analyses:
	@echo
	@echo "Iterating task $(TASK) on all analyses $(ANALYSES)"
	@for analysis in $(ANALYSES); do \
		$(MAKE) $(TASK) ANALYSIS=$${analysis}; \
	done

## Iterations by read files (test + control)
## Print the names of all read sets (READ_NAMES).
list_read_names:
	@echo $(READ_NAMES)

## Print the paths of all read files (READ_FILES).
list_read_files:
	@echo $(READ_FILES)

## Target(s) run for each read set; override with READ_TASK=<targets>.
READ_TASK=reads_per_chrom sort_reads

## Run $(READ_TASK) once per element of READ_NAMES.  The read name is
## handed to the sub-make through the TEST variable.
iterate_reads:
	@echo
	@echo "Iterating task $(READ_TASK) on all read files $(READ_NAMES)"
	@for read_name in $(READ_NAMES) ; do \
		$(MAKE) $(READ_TASK) TEST=$${read_name} ; \
	done

## Iterations by "test" samples
## Print the names of the "test" read sets (TEST_READ_NAMES).
list_test_names:
	@echo $(TEST_READ_NAMES)

## Print the paths of the "test" read files (TEST_READ_FILES).
list_test_files:
	@echo $(TEST_READ_FILES)

## Target(s) run for each test sample; override with TEST_TASK=<target>.
TEST_TASK=list_param

## Run $(TEST_TASK) once per element of TEST_READ_NAMES, passing the
## element as TEST to the sub-make.
iterate_tests:
	@echo
	@echo "Iterating task $(TEST_TASK) on all test files $(TEST_READ_NAMES)"
	@for test_name in $(TEST_READ_NAMES); do \
		$(MAKE) $(TEST_TASK) TEST=$${test_name}; \
	done

################################################################
## Index reads with igvtools in order to load them efficiently in IGV
## viewer
## Only the control reads (CTRL_READS) are indexed here; the iteration
## over the test read files is currently disabled (commented line).
igv_index_reads:
	@echo
#	@${MAKE} iterate_tests TEST_TASK=index_one_read_file
	@$(MAKE) igv_index_one_read_file READS=$(CTRL_READS)

## Run "igvtools index" on the read file given in READS, reporting the
## value of DATE before and after.
igv_index_one_read_file:
	@echo 
	@echo "$(DATE)	Indexing reads	$(READS)"
	igvtools index $(READS)
	@echo "$(DATE)	Indexed reads	$(READS)"

################################################################
## Count the number of reads per chromosomes
## Output file: one line per chromosome, as produced by "uniq -c"
## (count followed by the chromosome name).
READS_PER_CHROM=${DIR_READS}/${TEST}_reads_per_chrom.txt

## The chromosome is taken from the first column of the read file.
READS_PER_CHROM_CMD=cut -f 1 ${READS} | sort | uniq -c > ${READS_PER_CHROM}

## Count the reads per chromosome in ${READS} (the command is run via
## the my_command target) and report the path of the result file.
reads_per_chrom:
	@echo
	@echo "Counting reads per chromosome"
	@${MAKE} my_command MY_COMMAND="${READS_PER_CHROM_CMD}"
	@echo "Reads per chromosome	${READS_PER_CHROM}"

################################################################
## Sort the reads if they are provided in non-sorted format.  This is
## required for SWEMBL. To avoid duplicating the disk space, the user
## should then manually replace the original read file by the sorted
## one and delete the unsorted copy (the corresponding commands are
## printed at the end of the task).
##
## The sorted file is built by appending the reads of one chromosome
## at a time (see _sort_reads_one_chrom), so any previous version is
## removed before the loop; otherwise running the task twice would
## duplicate every read.
READS_SORTED=${DIR_READS}/${TEST}_sorted.bed
READS_UNSORTED=${DIR_READS}/${TEST}_unsorted.bed
sort_reads:
	@echo 
	@echo "Sorting reads	${READS}"
	@echo "CHROMOSOMES	${CHROMOSOMES}"
	@echo "${DATE}	Copying reads	READS_UNSORTED	${READS_UNSORTED}"
	@cp -f ${READS} ${READS_UNSORTED}
	@rm -f ${READS_SORTED}
	@for chrom in ${CHROMOSOMES}; do \
		${MAKE} _sort_reads_one_chrom CHROM=$${chrom} ; \
	done
	@echo "${DATE}	Sorted reads"
	@echo "	Original       	${READS}"
	@echo "	Unsorted (copy)	${READS_UNSORTED}"
	@echo "	Sorted         	${READS_SORTED}"
	@echo "After having checked file content, run the following commands"
	@echo "mv -f ${READS_SORTED} ${READS}"
	@echo "rm ${READS_UNSORTED}"

## Append to ${READS_SORTED} the reads of chromosome ${CHROM} (first
## column of ${READS_UNSORTED}), sorted numerically from column 2.
## Internal target, called once per chromosome by sort_reads.
_sort_reads_one_chrom:
	@echo "	${DATE}	Sorting reads for chromosome ${CHROM}"
	@awk '$$1 =="${CHROM}"' ${READS_UNSORTED} | sort -n -k 2 >> ${READS_SORTED}

################################################################
## Sort one bed file by positions.
##
## This function is called by the makefile
## 06_marks_vs_oris_and_others.mk but can be used in other cases for
## peak sorting.
##
## The variable below lists the distinct chromosome names found in
## the first column of the file to sort ("cut -f 1"; the former
## "sort -f 1" treated "1" as a file name).
##
## NOTE(review): the variable name is misspelled (CHROMOMOSOMES), so
## it is never read: the recipes iterate over ${CHROMOSOMES}, which is
## not assigned in this file.  The name is left as is because
## assigning CHROMOSOMES here would also change the chromosome list
## used by sort_reads -- confirm the intended behavior before renaming.
CHROMOMOSOMES=`cut -f 1 ${TO_SORT} | sort -u `

## Input (TO_SORT) and output (SORTED) of the sorting task, without
## the .bed extension.
TO_SORT=${SWEMBL_SUMMITS_FLANKS}.bed
SORTED=${SWEMBL_SUMMITS_FLANKS_BYPOS}.bed

## Rebuild ${SORTED} from scratch, one chromosome at a time.
sort_bed_file:
	@echo "	${TO_SORT}"
	@echo "CHROMOSOMES ${CHROMOSOMES}"
	@rm -f ${SORTED}
	@for chrom in ${CHROMOSOMES}; do \
		${MAKE}  _sort_one_chrom CHROM=$${chrom} ; \
	done
	@echo "Sorted bed"
	@echo "	${SORTED}"

## Append to ${SORTED} the rows of ${TO_SORT} whose first column equals
## ${CHROM}, sorted numerically from column 2.
_sort_one_chrom:
	@echo "	chromosome ${CHROM}"
	awk -F'\t' '$$1=="${CHROM}"' ${TO_SORT} | sort -n -k 2 >> ${SORTED}

################################################################
## Peak calling with MACS
##
## BEWARE: a single run with MACS costs 100 minutes on my Mac.
##
## To use peaksplitter, MACS must return the "wig" file, which allows
## this program to find the shape of the peaks and to split them.
##
## (Morgane) The options to use are --wig --single-profile.
## http://liulab.dfci.harvard.edu/MACS/README.html
##
## Here are the MACS commands; they must of course be run after the
## mapping (after having checked the quality of the reads, cleaned
## them if needed, and identified the FASTQ type used, in order to
## adapt the mapping parameters accordingly):
##
##   MACS_MFOLD=5,30
##   MACS_MFOLD=10,30 if the dataset is of good quality (less permissive threshold)
##   MACS_PVAL=1e-5
##   ## bw: should be the average size of fragment before sonication default is 300
##   BW=300
## This value should be set to the sonication fragment size expected from wet experiment
##
## by default, the reads are normalized to the bigger dataset. if the
## control is larger than the experiment, add --to-small so that the
## treatment reads are not artificially changed
##
## ${MACS} -t ${READS_TREAT}.ENS.bed -c ${READS_CTL}.ENS.bed --format BED  --gsize ${GSIZE} --name "macs"  --mfold ${MACS_MFOLD} --pvalue ${MACS_PVAL} --bw ${BW} --wig --single-profile --verbose 2 --diag &> ${READS_TREAT}vs${READS_CTL}/MACS.out
##
## peak-splitter has to be installed separately
## it can be run within MACS with the option --call-subpeaks
##
## I still use it separately, so I can use filtered peaks (with additional control treatment and/or filtered by FDR), instead of the default MACS output
## BED is this peak file
## @${PEAK_SPLITTER} -f -p ${BED} -w ${WIG} -o ${OUTDIR} > ${OUTDIR}/peak_splitter.log
## Name of the MACS executable
MACS=macs14
## Genome size, passed to --gsize
GENOME_SIZE=3000000000
## Tag size, passed to --tsize
TAG_SIZE=72
## Fragment size; used as MACS band width (below) and also passed to SICER
FRAGMENT_SIZE=500
## Values passed to --mfold and --pvalue
MACS_MFOLD=10,30
MACS_PVAL=1e-5
## Band width, passed to --bw
BAND_WIDTH=${FRAGMENT_SIZE}
## Former, more detailed prefix (disabled)
#MACS_PREFIX=${TEST}_vs_${CTRL}_macs14_pval${MACS_PVAL}_BW${BAND_WIDTH}_tsize${TAG_SIZE}_wigsp${MACS_WIG_SPACE}${MACS_NOMODEL}${MACS_NOLAMBDA}
## Prefix of all MACS result files. The values of MACS_NOMODEL and
## MACS_NOLAMBDA (empty by default) are appended verbatim.
MACS_PREFIX=MACS_${TEST}_vs_${CTRL}${MACS_NOMODEL}${MACS_NOLAMBDA}
## Result directory for one test/control pair
MACS_PEAKS_DIR=${DIR_PEAKS}/MACS/${MACS_PREFIX}

## Optional MACS flags, passed verbatim on the command line (disabled
## by default; the commented lines show the values that enable them).
MACS_NOMODEL=
MACS_NOLAMBDA=
#MACS_NOMODEL=--nomodel
#MACS_NOLAMBDA=--nolambda
## Path prefixes (without the .bed extension) of the MACS peaks and
## summits, and of their counterparts sorted by decreasing score
## (see sort_peaks_macs).
MACS_PEAKS=${MACS_PEAKS_DIR}/${MACS_PREFIX}_peaks
MACS_PEAKS_SORTED=${MACS_PEAKS}_sorted
MACS_SUMMITS=${MACS_PEAKS_DIR}/${MACS_PREFIX}_summits
MACS_SUMMITS_SORTED=${MACS_SUMMITS}_sorted
## Value passed to --space together with --wig
MACS_WIG_SPACE=50
## Shell command for one MACS run.  MACS is started in a sub-shell
## after changing to ${MACS_PEAKS_DIR}; its standard output is
## redirected to ${MACS_PREFIX}.out in that directory.  The read files
## are located relative to ${MACS_ROOT_DIR}, which is not assigned in
## this file (presumably defined in an included makefile -- TODO
## confirm).  After the run, the names of the output file and of the
## result directory are echoed.
MACS_CMD=(cd ${MACS_PEAKS_DIR}; ${NICE}  ${MACS} \
		--treatment ${MACS_ROOT_DIR}/${TEST_READS} \
		--control ${MACS_ROOT_DIR}/${CTRL_READS} \
		--format BED  \
		--gsize ${GENOME_SIZE} \
		--mfold ${MACS_MFOLD} \
		--pvalue ${MACS_PVAL} \
		--bw ${BAND_WIDTH} \
		--wig --space=${MACS_WIG_SPACE} \
		--single-profile \
		--verbose 2 \
		--diag \
		--tsize ${TAG_SIZE} \
		${MACS_NOMODEL} ${MACS_NOLAMBDA} \
		--name ${MACS_PREFIX} > ${MACS_PREFIX}.out; \
	); \
	echo '	${MACS_PREFIX}.out' ; \
	echo '	${MACS_PEAKS_DIR}'
## Create the result directory, then run $(MACS_CMD) through the
## my_command target for the current TEST sample.
macs:
	@echo
	@echo 'Peak calling with MACS'
	@echo 
	@mkdir -p $(MACS_PEAKS_DIR)
	@echo 'MACS directory	$(MACS_PEAKS_DIR)'
	@$(MAKE) my_command MY_COMMAND="$(MACS_CMD)" TEST=$(TEST)
#	@echo "Log file	${MACS_PREFIX}_log.txt"

## Run R on the <prefix>_model.r script found in the MACS result
## directory (distribution of the strand-wise peak shift), then echo
## the path of the expected PDF file.
MACS_MODEL_CMD=(cd $(MACS_PEAKS_DIR); R --vanilla --slave --file=$(MACS_PREFIX)_model.r) ; \
	echo $(MACS_PEAKS_DIR)/$(MACS_PREFIX)_model.pdf
macs_model:
	@echo "Computing MACS model"
	@$(MAKE) my_command MY_COMMAND="$(MACS_MODEL_CMD)"  TEST=$(TEST)

## Post-treatment of the MACS results, as a sequence of sub-makes:
## peak length statistics, sorting, sequence fetching, peak splitting.
MACS_POST_CMD=$(MAKE) peak_stats_macs TEST=$(TEST) ; \
	$(MAKE) sort_peaks_macs TEST=$(TEST) ; \
	$(MAKE) fetch_macs_peaks TEST=$(TEST) ; \
	$(MAKE) split_peaks TEST=$(TEST)
macs_post_treatment:
	@echo
	@echo "MACS post-treatment"
	@$(MAKE) my_command MY_COMMAND="$(MACS_POST_CMD)" TEST=$(TEST)

## Length statistics of the MACS peaks: calls peak_stats with the MACS
## peak file and MACS-specific class interval / graph settings.
peak_stats_macs:
	@$(MAKE) peak_stats PEAKS='$(MACS_PEAKS)' PEAK_EXT='.bed' PEAK_PREFIX='$(MACS_PREFIX)' PEAK_CI='200' PEAK_GSTEP1=1000 PEAKLEN_XMAX=20000


## Fetch the sequences of the sorted MACS peaks (no extension), then
## those of the sorted MACS summits extended by $(SUMMIT_EXT).
fetch_macs_peaks:
	@$(MAKE) V=2 PEAKS=$(MACS_PEAKS_SORTED) EXTEND=0 fetch_one_peak_set
	@$(MAKE) V=2 PEAKS=$(MACS_SUMMITS_SORTED) EXTEND=$(SUMMIT_EXT) fetch_one_peak_set

## Same for the sub-peaks produced by PeakSplitter: sub-peak sequences
## first, then the sub-peak summits.
fetch_macs_subpeaks:
	@$(MAKE) V=2 PEAKS=$(MACS_SUBPEAKS_SORTED) EXTEND=0 fetch_one_peak_set
	@$(MAKE) _fetch_macs_subpeak_summits

## Build a bed file with the summits of the MACS sub-peaks, then fetch
## the sequences around these summits (extension ${SUMMIT_EXT}).
##
## Beware ! In UCSC the bed format uses zero-based coordinates, and
## the end coordinate represents the first position *after* the
## feature. The summit (column 5 of the sub-peak file) is thus
## exported as the interval [summit-1, summit], with column 4 of the
## sub-peak file as 4th column.
##
## The sorted sub-peak file starts with the PeakSplitter header line
## (beginning with "Chromosome"), which is skipped so that it is not
## converted into a meaningless bed record.
_fetch_macs_subpeak_summits:
	awk -F'\t' '!/^Chromosome/ {print $$1"\t"$$5-1"\t"$$5"\t"$$4}' ${MACS_SUBPEAKS_SORTED}.bed > ${MACS_SUBPEAKS_SUMMITS_SORTED}.bed
	@echo '	${MACS_SUBPEAKS_SUMMITS_SORTED}.bed'
	@${MAKE} V=2 PEAKS=${MACS_SUBPEAKS_SUMMITS_SORTED} EXTEND=${SUMMIT_EXT} fetch_one_peak_set

## Complete MACS workflow for one test set: peak calling, model
## plotting and post-treatment, chained in a single command string.
MACS_ALL_CMD=$(MACS_CMD); $(MACS_MODEL_CMD); $(MACS_POST_CMD)
macs_all:
	@$(MAKE) my_command MY_COMMAND="$(MACS_ALL_CMD)"  TEST=$(TEST)

## Sort the MACS peaks and summits by decreasing value of column 5
## (reverse numeric sort).
SORT_MACS_CMD=sort -nr -k 5 $(MACS_PEAKS).bed > $(MACS_PEAKS_SORTED).bed ; \
	sort -nr -k 5 $(MACS_SUMMITS).bed > $(MACS_SUMMITS_SORTED).bed
sort_peaks_macs:
	@echo
	@echo 'Sorting MACS peaks and summits by decreasing score values'
	@$(MAKE) my_command MY_COMMAND="$(SORT_MACS_CMD)"  TEST=$(TEST)
	@echo '	$(MACS_PEAKS_SORTED).bed'
	@echo '	$(MACS_SUMMITS_SORTED).bed'

################################################################
## Run peak splitter to split the peaks (more precisely regions)
## produced by MACS.
## Wiggle file given to PeakSplitter (option -w), located in the MACS
## result directory.
WIG=${MACS_PEAKS_DIR}/${MACS_PREFIX}_MACS_wiggle/treat/${MACS_PREFIX}_treat_afterfiting_all.wig.gz
## Path prefixes (without .bed) of the sub-peaks, of the sub-peaks
## sorted by decreasing value of column 4, and of their summits.
MACS_SUBPEAKS=${MACS_PEAKS}.subpeaks
MACS_SUBPEAKS_SORTED=${MACS_PEAKS}.subpeaks_sorted
MACS_SUBPEAKS_SUMMITS_SORTED=${MACS_PEAKS}.subpeaks_summits_sorted
## Shell command for the splitting task:
## 1. run PeakSplitter on the MACS peaks (log in peak_splitter.log);
## 2. build the sorted sub-peak file: first line of the sub-peak file
##    (header), followed by the other lines (those not starting with
##    "Chromosome") sorted by decreasing value of column 4;
## 3. export the summits (column 5) as 1-bp bed intervals; note that
##    this awk command also processes the header line copied in step 2;
## 4. run ${POST_SPLIT_CMD}.
SPLIT_PEAKS_CMD=PeakSplitter -f -p ${MACS_PEAKS}.bed -w ${WIG} -o ${MACS_PEAKS_DIR} > ${MACS_PEAKS_DIR}/peak_splitter.log ; \
	echo '	${MACS_PEAKS_DIR}/peak_splitter.log' ; \
	echo '	${MACS_SUBPEAKS}.bed' ; \
	head -1 ${MACS_SUBPEAKS}.bed > ${MACS_SUBPEAKS_SORTED}.bed ; \
	grep -v '^Chromosome' ${MACS_SUBPEAKS}.bed | sort -rn -k 4  >> ${MACS_SUBPEAKS_SORTED}.bed ; \
	echo '	${MACS_SUBPEAKS_SORTED}.bed' ; \
	awk -F'\t' '{print $$1"\t"$$5-1"\t"$$5"\t"$$4}' ${MACS_SUBPEAKS_SORTED}.bed > ${MACS_SUBPEAKS_SUMMITS_SORTED}.bed ; \
	echo '	${MACS_SUBPEAKS_SUMMITS_SORTED}.bed' ; \
	echo 'PeakSplitter done'; \
	${POST_SPLIT_CMD}

## Command appended at the end of the splitting command: fetch the
## sequences of the sub-peaks and of their summits.
POST_SPLIT_CMD=$(MAKE) fetch_macs_subpeaks

## Run the splitting command through the my_command target.
split_peaks:
	@echo
	@echo 'Running PeakSplitter $(MACS_PREFIX)'
	@$(MAKE) my_command MY_COMMAND="$(SPLIT_PEAKS_CMD)"  TEST=$(TEST)

################################################################
## Peak calling with SWEMBL
##
## SWEMBL_R is the value passed to the -R option; it is also part of
## the result prefix, so that runs with different values are stored in
## separate directories.
#SWEMBL_R=0.003
SWEMBL_R=0.002
SWEMBL_PREFIX=SWEMBL_$(TEST)_vs_$(CTRL)_R$(SWEMBL_R)
SWEMBL_PEAKS_DIR=$(DIR_PEAKS)/SWEMBL/$(SWEMBL_PREFIX)
SWEMBL_PEAKS=$(SWEMBL_PEAKS_DIR)/$(SWEMBL_PREFIX)_peaks

## Peak calling, immediately followed by the post-treatment tasks.
SWEMBL_CMD=SWEMBL -i $(TEST_READS) -r $(CTRL_READS) -R $(SWEMBL_R) -B -o $(SWEMBL_PEAKS).bed ; \
	echo '	$(SWEMBL_PEAKS).bed'; $(MAKE) swembl_post_treatment TEST=$(TEST) ANALYSIS=$(ANALYSIS)
swembl:
	@echo
	@echo "Peak calling with SWEMBL"
	@echo "Log file	$(SWEMBL_PEAKS)_log.txt"
	@mkdir -p $(SWEMBL_PEAKS_DIR)
	@$(MAKE) my_command MY_COMMAND="$(SWEMBL_CMD)"  TEST=$(TEST)

## Post-treatment of the SWEMBL peaks, run in a single sub-make:
## statistics, conversion + sorting, summit extraction, sequence
## fetching.
swembl_post_treatment:
	$(MAKE)  TEST=$(TEST) peak_stats_swembl sort_swembl_peaks swembl_summits fetch_swembl_seq

## Convert the SWEMBL peaks to the standard bed format (name in column
## 4, score in column 5) with convert-features, discarding the lines
## starting with "chrM".  The converted file is then stripped of the
## lines containing ';' and of the lines starting with "track name",
## "browser" or '#', and sorted by decreasing value of column 5.
SWEMBL_PEAKS_SORTED=$(SWEMBL_PEAKS)_sorted
sort_swembl_peaks:
	@echo
	@echo "Converting SWEMBL peaks"
	@convert-features -v 0 -from swembl -to bed -i  $(SWEMBL_PEAKS).bed | grep -v '^chrM' > $(SWEMBL_PEAKS)_converted.bed
	@echo "	$(SWEMBL_PEAKS)_converted.bed"
	@echo "Sorting SWEMBL peaks"
	@grep -v -e ';' -e '^track name' -e '^browser' -e '^#' $(SWEMBL_PEAKS)_converted.bed \
		| sort -rn -k 5 \
		> $(SWEMBL_PEAKS_SORTED).bed
#	@grep -v '^\#' ${SWEMBL_PEAKS}.bed \
#		| grep -v '^Region' \
#		| sort -nr -k 7 \
#		| awk -F'\t' '{print $$1"\t"$$2"\t"$$3"\t"$$1"_"$$2"_"$$3"_+\t"$$7}' \
#		> ${SWEMBL_PEAKS_SORTED}.bed
	@echo "	$(SWEMBL_PEAKS_SORTED).bed"

## Identify SWEMBL peak summits (column 10 of the raw SWEMBL output).
## Beware ! In bed format, coordinates are zero-based, and the end
## coordinate indicates the first base *after* the feature: each
## summit is thus exported as the interval [summit-1, summit].
##
## The summits are extracted from the raw SWEMBL file
## (${SWEMBL_PEAKS}.bed), after discarding the comment lines and the
## header line (starting with "Region"), and are sorted by decreasing
## value of column 7, which is exported as score.
SWEMBL_SUMMITS_SORTED=${SWEMBL_PEAKS_DIR}/${SWEMBL_PREFIX}_summits_sorted
swembl_summits:
	@echo
	@echo "Extracting summits for SWEMBL file	${SWEMBL_PEAKS}.bed"
	@grep -v '^#' ${SWEMBL_PEAKS}.bed \
		| grep -v '^Region' \
		| sort -nr -k 7 \
		| awk -F'\t' '{print $$1"\t"int($$10-1)"\t"int($$10)"\t"$$1"_"$$10"_"$$10"_+\t"$$7}' \
		> ${SWEMBL_SUMMITS_SORTED}.bed
	@echo "	${SWEMBL_SUMMITS_SORTED}.bed"

## Length statistics of the SWEMBL peaks: calls peak_stats with the
## SWEMBL peak file and SWEMBL-specific class interval / graph settings.
peak_stats_swembl:
	@$(MAKE) peak_stats PEAKS="$(SWEMBL_PEAKS)" PEAK_EXT='.bed' PEAK_PREFIX="$(SWEMBL_PREFIX)" PEAK_CI="20" PEAK_GSTEP1=200 TEST="$(TEST)" PEAKLEN_XMAX=1000

## Fetch the sequences of both the SWEMBL peaks and the SWEMBL summits.
fetch_swembl_seq: fetch_swembl_peaks fetch_swembl_summits

## Sorted SWEMBL peaks, without extension of the coordinates.
fetch_swembl_peaks:
	@$(MAKE) V=2 PEAKS=$(SWEMBL_PEAKS_SORTED) EXTEND=0 fetch_one_peak_set

## Sorted SWEMBL summits, extended by $(SUMMIT_EXT).
fetch_swembl_summits:
	@$(MAKE) V=2 PEAKS=$(SWEMBL_SUMMITS_SORTED) EXTEND=$(SUMMIT_EXT) fetch_one_peak_set

## Run one task ($(SWEMBL_TASK), by default the peak calling itself)
## for each value of the SWEMBL parameter R listed in SWEMBL_R_VALUES.
#SWEMBL_R_VALUES=0.2 0.17 0.15 0.12 0.1 0.05 0.02 0.01 0.005 0.003 0.002 0.001
SWEMBL_R_VALUES=0.01 0.005 0.003 0.002 0.001
#SWEMBL_TASK=peak_len_distrib
SWEMBL_TASK=swembl
swembl_series:
	for r in $(SWEMBL_R_VALUES) ; do \
		$(MAKE) SWEMBL_R=$${r} $(SWEMBL_TASK); \
	done


################################################################
## Run SICER to identify large enriched regions (zones)
################################################################
## Location of the SICER distribution and of its main script
SICER_BASE_DIR=${SRC_DIR}/sicer
SICER_VERSION=1.1
SICER_DISTRIB_DIR=${SICER_BASE_DIR}/SICER_V${SICER_VERSION}
SICER_PGM=${SICER_DISTRIB_DIR}/SICER/SICER.sh
## Prefix of the SICER log file and name of the result directory
SICER_PREFIX=${TEST}_vs_${CTRL}_SICER
## FDR value passed as last argument to SICER; also part of the name
## of the island file (SICER_ISLANDS)
SICER_FDR=0.01
SICER_REGIONS_DIR=${DIR_PEAKS}/SICER/${SICER_PREFIX}
## Positional arguments of SICER.sh (see SICER_CMD for their order).
## NOTE(review): presumably redundancy threshold, effective genome
## fraction, window size and gap size -- confirm against the SICER
## documentation.
SICER_REDUND=1
SICER_GENOME_FR=0.74
SICER_WIN_SIZE=200
SICER_GAP_SIZE=600
## Result files expected in the SICER result directory: island
## summary, islands selected at the chosen FDR, and path prefix of
## the sorted islands (the .bed extension is added by the tasks).
SICER_SUMMARY=${SICER_REGIONS_DIR}/${TEST}-W${SICER_WIN_SIZE}-G${SICER_GAP_SIZE}-islands-summary
SICER_ISLANDS=${SICER_SUMMARY}-FDR${SICER_FDR}
SICER_ISLANDS_SORTED=${SICER_ISLANDS}_sorted
## Shell command for one SICER run:
## 1. write a start message and the date to the log file;
## 2. run SICER.sh in a sub-shell, from the result directory, with its
##    positional arguments (read directory, test and control bed
##    files, output directory ".", genome, redundancy, window size,
##    fragment size, genome fraction, gap size, FDR); the output of
##    the program is appended to the log file;
## 3. export the islands sorted by increasing value of column 8
##    (FDR), preceded by a header line.  The sort uses the
##    general-numeric mode (-g) because FDR values may be written in
##    scientific notation (e.g. 1.2e-50), which "sort -n" does not
##    interpret;
## 4. write an end message and the date to the log file, and make the
##    result directory readable by all users.
SICER_CMD=echo 'Starting SICER' >  ${SICER_REGIONS_DIR}/${SICER_PREFIX}.log ; \
	date >>  ${SICER_REGIONS_DIR}/${SICER_PREFIX}.log ; \
	(cd ${SICER_REGIONS_DIR}; sh ${SICER_PGM} ${PWD}/${DIR_READS} \
		${TEST}.bed ${CTRL}.bed \
		. ${GENOME} \
		${SICER_REDUND} \
		${SICER_WIN_SIZE} \
		${FRAGMENT_SIZE} \
		${SICER_GENOME_FR} \
		${SICER_GAP_SIZE} \
		${SICER_FDR} >> ./${SICER_PREFIX}.log ); \
	echo '\#chrom	start	end	ChIP_island_read_count	CONTROL_island_read_count	p_value	fold_change	FDR_threshold' \
		> ${SICER_ISLANDS_SORTED}.bed ; \
	sort -g -k 8 ${SICER_ISLANDS} >> ${SICER_ISLANDS_SORTED}.bed ; \
	echo 'Done SICER' >>  ${SICER_REGIONS_DIR}/${SICER_PREFIX}.log ; \
	date >>  ${SICER_REGIONS_DIR}/${SICER_PREFIX}.log ; \
	chmod -R a+r ${SICER_REGIONS_DIR}
## Left empty; not used in this file.
SORT_SICER_CMD=
## Create the result directory, run $(SICER_CMD) through the
## my_command target, then report the log and result files.
sicer:
	@echo
	@echo "Running SICER	$(SICER_PGM)"
	@mkdir -p $(SICER_REGIONS_DIR)
	@echo "SICER regions directory	 $(SICER_REGIONS_DIR)"
	@$(MAKE) my_command MY_COMMAND="$(SICER_CMD)" TEST=$(TEST)
	@echo "	SICER log"
	@echo "	$(SICER_REGIONS_DIR)/$(SICER_PREFIX).log"
	@echo "SICER_ISLANDS	$(SICER_ISLANDS)"
	@echo "SICER_ISLANDS_SORTED 	$(SICER_ISLANDS_SORTED).bed"

## Export the SICER islands sorted by increasing value of column 8
## (FDR), preceded by a header line.  The sort uses the
## general-numeric mode (-g) because FDR values may be written in
## scientific notation (e.g. 1.2e-50), which "sort -n" does not
## interpret.
sort_sicer:
#	bedtools sort -i ${SICER_ISLANDS} > ${SICER_ISLANDS_SORTED}.bed
	@echo '#chrom	start	end	ChIP_island_read_count	CONTROL_island_read_count	p_value	fold_change	FDR_threshold' \
		> ${SICER_ISLANDS_SORTED}.bed
	@sort -g -k 8 ${SICER_ISLANDS} >> ${SICER_ISLANDS_SORTED}.bed
	@echo "	${SICER_ISLANDS_SORTED}.bed"

################################################################
## Select SICER regions with a series of FDR thresholds.
##
## ATTENTION: the engineering notation is not interpreted correctly by
## awk, so the thresholds are written in plain decimal notation.
## SICER_THRESHOLD_VALUES=1e-001 1e-002 1e-005 1e-010 1e-020 1e-030 1e-040 1e-050 1e-070 1e-100 1e-150 1e-200
SICER_THRESHOLD_VALUES=0.01 0.001 0.0001 0.00001 0.0000000001
sicer_threshold_series:
	@echo
	@echo "Selecting SICER regions with decreasing FDR values"
	@echo "Number of lines in original SICER_SUMMARY	$(SICER_SUMMARY)"
	@wc -l $(SICER_SUMMARY)
	@echo
	@echo "SICER_THRESHOLD_VALUES	$(SICER_THRESHOLD_VALUES)"
	@for fdr in $(SICER_THRESHOLD_VALUES); do \
		$(MAKE) _sicer_threshold SICER_THRESHOLD=$${fdr} CTRL=$(CTRL); \
	done


## Select the SICER islands whose FDR (column 8 of the island summary)
## does not exceed $(SICER_THRESHOLD), sort them with _sicer_sort and
## delete the intermediate (unsorted) file.
## Note: the threshold must be decreased by several orders of
## magnitude to observe a sensible effect on the peak number.
SICER_THRESHOLD=1e-050
SICER_THRESHOLDED=$(SICER_SUMMARY)_FDR$(SICER_THRESHOLD)
_sicer_threshold:
	awk -F '\t' '$$8 <= $(SICER_THRESHOLD)' $(SICER_SUMMARY) > $(SICER_THRESHOLDED).bed
	@echo "	$(SICER_THRESHOLDED).bed"
	@wc -l $(SICER_THRESHOLDED).bed
	@$(MAKE) _sicer_sort SICER_TO_SORT=$(SICER_THRESHOLDED)
	@rm $(SICER_THRESHOLDED).bed

## Sort one set of SICER regions.
##
## Input: ${SICER_TO_SORT}.bed; output: ${SICER_SORTED}.bed, made of
## - a header line (with an additional column, -log10_FDR);
## - the regions with FDR (column 8) <= 1e-300, which receive the
##   fixed value 320 in the additional column and are sorted by
##   decreasing value of column 7 (fold change);
## - the other regions, which receive -log10(FDR) in the additional
##   column and are sorted by decreasing value of this column.
SICER_TO_SORT=${SICER_THRESHOLDED}
SICER_SORTED=${SICER_TO_SORT}_sorted
SICER_SORT_CMD=echo '\#chrom	start	end	ChIP_island_read_count	CONTROL_island_read_count	p_value	fold_change	FDR_threshold	-log10_FDR' > ${SICER_SORTED}.bed; \
	awk -F'\t' '$$8 <= 1e-300 {print $$0"\t"320}' ${SICER_TO_SORT}.bed | sort -nr -k 7  >> ${SICER_SORTED}.bed; \
	awk -F'\t' '$$8 >  1e-300 {print $$0"\t"(-log($$8)/log(10))}' ${SICER_TO_SORT}.bed | sort -rn -k 9  >> ${SICER_SORTED}.bed
## The command is run directly by the recipe (not through the
## my_command target), then the number of lines of the result is
## reported.
_sicer_sort:
#	@echo 
#	@echo "Sorting SICER regions by increasing FDR"
#	@make my_command MY_COMMAND=`${SICER_SORT_CMD}`
	@${SICER_SORT_CMD}
	@wc -l ${SICER_SORTED}.bed
#	@echo "	SICER_SORTED	${SICER_SORTED}.bed"

## Delete the VERY heavy temporary files left by SICER in the result
## directory (lists of ignored reads, full list of reads under the
## islands, scored islands, ...).  The disk usage of the directory is
## reported before and after the cleaning.
sicer_clean:
	@echo
	@echo "Cleaning SICER directory	$(SICER_REGIONS_DIR)"
	@echo "Before cleaning"
	@du -sk $(SICER_REGIONS_DIR)
	@du -sk $(SICER_REGIONS_DIR)/* | sort -n
	rm -f $(SICER_REGIONS_DIR)/*-removed.bed.gz
	rm -f $(SICER_REGIONS_DIR)/*-islandfiltered.bed.gz
	rm -f $(SICER_REGIONS_DIR)/*-removed.bed
	rm -f $(SICER_REGIONS_DIR)/*-islandfiltered.bed
	rm -f $(SICER_REGIONS_DIR)/*.graph
	@echo "After cleaning"
	@du -sk $(SICER_REGIONS_DIR)
	@du -sk $(SICER_REGIONS_DIR)/* | sort -n

## Length statistics of the sorted SICER islands: calls peak_stats
## with the SICER window size as class interval.
peak_stats_sicer:
	@$(MAKE) peak_stats PEAKS="$(SICER_ISLANDS)_sorted" PEAK_EXT='.bed' PEAK_PREFIX="$(SICER_PREFIX)" PEAK_CI=$(SICER_WIN_SIZE)  PEAK_GSTEP1=2000 TEST="$(TEST)" PEAKLEN_XMAX=10000

################################################################
## Intersection between SWEMBL peaks and SICER regions
##
## - "ori regions": SICER regions overlapping at least one SWEMBL peak
## - "ori peaks": SWEMBL peaks overlapping at least one SICER region
##
## The origins target chains the sorting of both peak sets, the
## intersection and the fetching of the sequences around ori summits.
origins: sort_sicer sort_swembl_peaks swembl_vs_sicer fetch_ori_summits

## Result directory and path prefixes (without .bed) of the ori peaks
## and of the ori regions.
ORI_DIR=$(DIR_PEAKS)/ORIGINS/$(SWEMBL_PREFIX)_SICER_match
ORI_PREFIX=$(SWEMBL_PREFIX)_SICERmatch
ORI_PEAKS=$(ORI_DIR)/$(ORI_PREFIX)
ORI_REGIONS_PREFIX=$(SICER_PREFIX)_SWEMBLmatch
ORI_REGIONS=$(ORI_DIR)/$(ORI_REGIONS_PREFIX)

## Report number and length distribution of the input peak sets, then
## compute both intersections with bedtools (options -u -wa: each
## feature of the -a file is reported once if it overlaps the -b file)
## and report the same statistics on the results.
swembl_vs_sicer:
	@mkdir -p $(ORI_DIR)
	@echo
	@echo "Computing intersection between SICER and SWEMBL"
	@echo
	@echo "Input peaks"
	@echo "	SWEMBL peaks"
	@$(MAKE) PEAKS=$(SWEMBL_PEAKS)_converted.bed _report_peak_nb  _peak_len_distrib
	@echo "	SICER regions"
	@$(MAKE) PEAKS=$(SICER_ISLANDS_SORTED).bed  _report_peak_nb  _peak_len_distrib
	@echo
	@echo "Output peaks"
	bedtools intersect -bed -a $(SWEMBL_PEAKS)_converted.bed -b $(SICER_ISLANDS) -u -wa > $(ORI_PEAKS).bed
	@echo "	ORI_PEAKS (SWEMBL peaks with SICER overlap)"
	@$(MAKE) PEAKS=$(ORI_PEAKS).bed _report_peak_nb  _peak_len_distrib
	$(MAKE) _ori_summits
	bedtools intersect -bed -b $(SWEMBL_PEAKS)_converted.bed -a $(SICER_ISLANDS) -u -wa > $(ORI_REGIONS).bed
	@echo "	ORI_REGIONS (SICER islands containing SWEMBL peaks)"
	@$(MAKE) PEAKS=$(ORI_REGIONS).bed _report_peak_nb  _peak_len_distrib
	@echo
	@echo "	ORI_DIR	$(ORI_DIR)"

## Export the summits of the ori peaks as 1-bp bed intervals: the
## summit position is read from column 7 of the ori peak file, and
## columns 5 and 6 are copied after the generated name.
ORI_SUMMITS=$(ORI_PEAKS)_summits
_ori_summits:
	awk -F'\t' '{print $$1"\t"int($$7-1)"\t"int($$7)"\t"$$1"_"int($$7-1)"_"int($$7)"+\t"$$5"\t"$$6}' $(ORI_PEAKS).bed > $(ORI_SUMMITS).bed
	@echo "	ORI_SUMMITS	$(ORI_SUMMITS).bed"

## Fetch the sequences around the ori summits. We specify a larger
## extension ($(SUMMIT_EXT)) in order to have the same number of words.
fetch_ori_summits:
	@$(MAKE) V=2 PEAKS=$(ORI_SUMMITS) EXTEND=$(SUMMIT_EXT) fetch_one_peak_set

## Fetch the sequences of the ori peaks, without extension.
fetch_ori_peaks:
	@$(MAKE) V=2 PEAKS=$(ORI_PEAKS) EXTEND=0 fetch_one_peak_set

## Print the number of lines of the file given in PEAKS (here PEAKS is
## a complete file name, extension included).
_report_peak_nb:
	@echo "	`wc -l $(PEAKS)`"

## Compute the distribution of peak lengths of the bed file given in
## PEAKS (class interval: 100) and draw it with XYgraph.
_peak_len_distrib:
	sequence-lengths -in_format bed -i $(PEAKS) | classfreq -col 2 -v 1 -ci 100 -o  $(PEAKS)_len_distrib.tab
	@echo "	$(PEAKS)_len_distrib.tab"
	XYgraph -i $(PEAKS)_len_distrib.tab -format $(IMG_FORMAT) -pointsize 0 -xcol 3 -ycol 4,5,6 -o $(PEAKS)_len_distrib.$(IMG_FORMAT)
	@echo "	$(PEAKS)_len_distrib.$(IMG_FORMAT)"

################################################################
## Distribution of peak sizes
##
## Default settings (overridden by the tool-specific peak_stats_*
## targets): class interval, step of the main grid on the X axis,
## upper limit of the X axis, image format.
PEAK_STATS_CMD=
PEAK_CI=200
PEAK_GSTEP1=1000
PEAKLEN_XMAX=10000
IMG_FORMAT=png

## Compute the size (end - start + 1) of each region of $(PEAK_FILE),
## tabulate the size distribution with classfreq, and draw it twice
## with XYgraph (linear scale and log scale).
peak_stats:
	@echo
	@echo "Computing peak length distribution	$(PEAK_FILE)"
	grep -v "^\#" $(PEAK_FILE) \
		| awk '{print $$3-$$2+1}'  \
		| classfreq  -v -ci $(PEAK_CI) \
		-o $(PEAKS)_size_distrib.tab ; \
	XYgraph -i  $(PEAKS)_size_distrib.tab \
		-size 1200 -ysize 400 -xcol 3 -ycol 4,5,6 -lines \
		-xgstep1 $(PEAK_GSTEP1) -xgstep2 $(PEAK_CI) -xmax $(PEAKLEN_XMAX) \
		-title1 "$(PEAK_PREFIX)" \
		-xleg1 "region size" -yleg1 "number of regions"  \
		-pointsize 0 -legend \
		-format $(IMG_FORMAT) \
		-o $(PEAKS)_size_distrib.$(IMG_FORMAT) ; \
	XYgraph -i  $(PEAKS)_size_distrib.tab \
		-size 1200 -ysize 400 -xcol 3 -ycol 4,5,6 -lines \
		-title1 "$(PEAK_PREFIX)" \
		-xleg1 "region size" -yleg1 "number of regions" -log 2  \
		-pointsize 0 -legend \
		-format $(IMG_FORMAT) \
		-o $(PEAKS)_size_distrib_log.$(IMG_FORMAT) ; \
	echo "	$(PEAKS)_size_distrib.tab" ; \
	echo "	$(PEAKS)_size_distrib_log.$(IMG_FORMAT)" ; \
	echo "	$(PEAKS)_size_distrib.$(IMG_FORMAT)" 

################################################################
## Plot read counts as a function of peak length
##
## For each region of ${PEAK_FILE} (comment lines excluded), the
## length (end - start + 1) and the value of column 5 are sent to
## XYgraph, which draws the plot with logarithmic axes.  The axis
## legends are set with -xleg1 / -yleg1, as in peak_stats, and the
## reported file name is the one actually written by XYgraph.
peak_len_vs_score:
	@echo
	@echo "Plotting read counts as a function of peak lengths"
	@echo "	${PEAK_FILE}"
	@grep -v '^#' ${PEAK_FILE} \
		| awk -F'\t' '{print $$3-$$2+1"\t"$$5}' \
		| XYgraph -xcol 1 -ycol 2 \
		-xleg1 'Peak length' -yleg1 'Read counts' \
		-format ${IMG_FORMAT} -log \
		-o ${PEAKS}_len_vs_counts_log.${IMG_FORMAT}
	@echo "	${PEAKS}_len_vs_counts_log.${IMG_FORMAT}"

################################################################
## Comparison of the statistics of two peak sets (placeholder: the
## task only prints a message).
peak_stats_compa:
	@echo 'NOT YET IMPLEMENTED'

################################################################
## Fetch peak sequences from UCSC
##
## Currently selected peak set.  PEAKS is a path prefix without
## extension; the bed file (PEAK_FILE) and the sequence file
## (PEAK_SEQ) are derived from it.  By default, the sorted MACS peaks
## are selected.
PEAK_PREFIX=$(MACS_PREFIX)
PEAKS=$(MACS_PEAKS_SORTED)
PEAK_EXT=.bed
PEAK_FILE=$(PEAKS)$(PEAK_EXT)
PEAK_SEQ=$(PEAKS).fasta

## Options of fetch-sequences: extension of the coordinates (EXTEND)
## and chunk size (FETCH_CHUNK); additional options can be passed
## with OPT.
EXTEND=0
FETCH_CHUNK=1000
FETCH_CMD=fetch-sequences -v $(V) -i $(PEAK_FILE) \
	-header_format galaxy -extend $(EXTEND) -chunk $(FETCH_CHUNK) -genome $(GENOME) \
	$(OPT) \
	-o $(PEAK_SEQ)

## Run the fetching command through the my_command target and print
## the path of the sequence file.
fetch_one_peak_set:
	@echo
	@echo "Fetching peaks	$(PEAK_FILE)"
	@$(MAKE) my_command MY_COMMAND="$(FETCH_CMD)"  TEST=$(TEST)
	@echo $(PEAK_SEQ)

################################################################
## Select random peaks (negative controls)
################################################################
## Organism name used by random-genome-fragments and random-seq.  It
## is computed by the shell each time the variable is used in a
## recipe: column 5 of the first row of ${ANALYSIS_TABLE} whose column
## 3 equals ${ANALYSIS}, with the suffix "_EnsEMBL" and spaces
## replaced by underscores.
#ORG=Homo_sapiens_EnsEMBL
#ORG=Mus_musculus_EnsEMBL
ORG=`awk -F'\t' '$$3 == "${ANALYSIS}" {print $$5"_EnsEMBL"}' ${ANALYSIS_TABLE} | head -1 | perl -pe 's| |_|g'`
## Template peak set (path prefix without extension).  By default,
## the summits of the ori peaks.
TEMPLATE_DIR=${ORI_DIR}
TEMPLATE_PREFIX=${ORI_PREFIX}_summits
TEMPLATE_PEAKS=${TEMPLATE_DIR}/${TEMPLATE_PREFIX}
## Identifier of the current repeat, and location of the random
## fragments (one file per template peak set and per repeat).
RAND_REPEAT=01
RAND_DIR=analysis/peaks/random_fragments/rand_fragments_${TEMPLATE_PREFIX}
RAND_PREFIX=rand_fragments_${TEMPLATE_PREFIX}_rep${RAND_REPEAT}
RAND_PEAKS=${RAND_DIR}/${RAND_PREFIX}


## Print the list of repeat identifiers (RAND_REPEATS) over which the
## random fragment tasks iterate.
list_rand_rep:
	@echo "RAND_REPEATS	$(RAND_REPEATS)"

## Select random genome fragments with the same lengths as the
## template peaks (one repeat).
##
## I read peak length from the bed files, in order to be able to treat
## the summits (size=1bp), which will then be extended by a fixed
## length when fetching the sequences (${SUMMIT_EXT}).
##
## Only the first 3 columns of the result (chromosome, start, end) are
## kept in the output bed file.
RAND_FRAGMENTS_CMD=random-genome-fragments -v ${V} \
		-org ${ORG} \
		-i ${TEMPLATE_PEAKS}.bed -template_format bed \
		-return coord -coord_format bed ${OPT} \
		| cut -f 1-3 > ${RAND_PEAKS}.bed

## The command is displayed once, inside double quotes: echoing it
## without quotes would let the shell execute its pipe and
## redirection, and thus overwrite the output file with the text of
## the command.  It is then run directly by the recipe (not through
## the my_command target).
random_fragments_one_rep:
	@echo
	@echo "Random genome fragments (repeat ${RAND_REPEAT})"
	@echo "	Template peaks	${TEMPLATE_PEAKS}.bed"
	@echo "	Analysis	${ANALYSIS}"
	@echo "	Organism	${ORG}"
	@echo "	${RAND_FRAGMENTS_CMD}"
	@mkdir -p ${RAND_DIR}
#	@${MAKE} my_command MY_COMMAND="${RAND_FRAGMENTS_CMD}"
	${RAND_FRAGMENTS_CMD}
	@echo "	${RAND_PEAKS}.bed"

## Generate one set of random genome fragments per repeat identifier
## listed in RAND_REPEATS.
random_fragments:
	@echo
	@echo "Generating random genome fragments	$(RAND_REPEATS)"
	@for repeat in $(RAND_REPEATS); do \
		$(MAKE) random_fragments_one_rep RAND_REPEAT=$${repeat} ; \
	done

## Fetch the sequences of the random fragments of the current repeat
## (RAND_REPEAT), by calling fetch_one_peak_set on the random peaks.
fetch_random_fragments_one_rep:
	@echo
	@echo "Fetching random genome fragments	$(RAND_PREFIX)"
	@$(MAKE) fetch_one_peak_set PEAK_PREFIX=$(RAND_PREFIX) PEAKS=$(RAND_PEAKS) V=2
	@echo "	$(RAND_PEAKS).fasta"

## Fetch the sequences of the random genome fragments for each repeat
## identifier listed in RAND_REPEATS.  The per-repeat task is
## fetch_random_fragments_one_rep (the former name,
## fetch_fragments_one_rep, does not correspond to any target of this
## file).
fetch_random_fragments:
	@echo
	@echo "Fetching sequences for random genome fragments	${RAND_REPEATS}"
	@for rep in ${RAND_REPEATS}; do \
		${MAKE} fetch_random_fragments_one_rep RAND_REPEAT=$${rep} ; \
	done


## Random genome fragments based on various template peak sets.  Each
## target calls random_fragments with the directory and the prefix of
## the template; for the summit-based templates, EXTEND is set to
## $(SUMMIT_EXT).

## Template: sorted MACS peaks
random_macs_peaks:
	@$(MAKE) random_fragments TEMPLATE_DIR='$(MACS_PEAKS_DIR)' TEMPLATE_PREFIX='$(MACS_PREFIX)_peaks_sorted'

## Template: sorted MACS summits
random_macs_peak_summits:
	@$(MAKE) random_fragments TEMPLATE_DIR=$(MACS_PEAKS_DIR) TEMPLATE_PREFIX=$(MACS_PREFIX)_summits_sorted EXTEND=$(SUMMIT_EXT)

## Template: sorted summits of the MACS sub-peaks (obtained from PeakSplitter)
random_macs_subpeak_summits:
	@$(MAKE) random_fragments TEMPLATE_DIR=$(MACS_PEAKS_DIR) TEMPLATE_PREFIX=$(MACS_PREFIX)_peaks.subpeaks_summits_sorted EXTEND=$(SUMMIT_EXT)

## Template: sorted SWEMBL peaks
random_swembl_peaks:
	@$(MAKE) random_fragments TEMPLATE_DIR=$(SWEMBL_PEAKS_DIR) TEMPLATE_PREFIX=$(SWEMBL_PREFIX)_peaks_sorted

## Template: sorted SWEMBL summits
random_swembl_summits:
	@$(MAKE) random_fragments TEMPLATE_DIR=$(SWEMBL_PEAKS_DIR) TEMPLATE_PREFIX=$(SWEMBL_PREFIX)_summits_sorted EXTEND=$(SUMMIT_EXT)

################################################################
## Random peaks corresponding to the "origin summits", where
## "origins" are SWEMBL peaks overlapping SICER regions. Same number
## of peaks, and same lengths (1kb on each side).
random_ori_summits:
	@$(MAKE) random_fragments TEMPLATE_DIR=$(ORI_DIR) TEMPLATE_PREFIX=$(ORI_PREFIX)_summits EXTEND=$(SUMMIT_EXT)


################################################################
## Random peaks corresponding to the "origin regions", where
## "origins" are SICER regions containing at least one SWEMBL
## peak. Same number of peaks, and same lengths as the origin regions.
random_ori_regions:
	@$(MAKE) random_fragments TEMPLATE_DIR=$(ORI_DIR) TEMPLATE_PREFIX=$(ORI_REGIONS_PREFIX) EXTEND=0


## Random genome fragments for the summits obtained with the different
## peak-calling approaches (MACS peaks, MACS sub-peaks, SWEMBL).
random_all_summits: random_macs_peak_summits random_macs_subpeak_summits random_swembl_summits


################################################################
## Generate random sequences (negative controls)
################################################################
## Markov order, passed to the -markov option of random-seq
MKV=0
#RANDSEQ_NB=`wc -l ${TEMPLATE_PEAKS}.bed | awk '{print $$1}'`
#RANDSEQ_LEN=2000
#RANDSEQ_PREFIX=RANDSEQ_N${RANDSEQ_NB}_L${RANDSEQ_LEN}_mkv${MKV}_${TEMPLATE_PREFIX}
## Prefix, directory and path prefix (without extension) of the
## random sequences generated for one template peak set
RANDSEQ_PREFIX=RANDSEQ_mkv${MKV}_${TEMPLATE_PREFIX}
RANDSEQ_DIR=analysis/peaks/RANDSEQ/${RANDSEQ_PREFIX}
RANDSEQ_FILE=${RANDSEQ_DIR}/${RANDSEQ_PREFIX}
## Shell command: compute the lengths of the template sequences
## (${TEMPLATE_PEAKS}.fasta), then call random-seq with the resulting
## length file (-lf).
## NOTE(review): the options "-return coord -coord_format bed" are
## also used with random-genome-fragments in this file; confirm that
## random-seq accepts them.
RANDSEQ_CMD=sequence-lengths -i ${TEMPLATE_PEAKS}.fasta -in_format fasta -o ${TEMPLATE_PEAKS}_seqlen.tab; \
		echo ${TEMPLATE_PEAKS}_seqlen.tab; \
		random-seq -v 0 -type DNA -lw 0\
		-org ${ORG} -markov ${MKV} \
		-lf ${TEMPLATE_PEAKS}_seqlen.tab \
		-return coord -coord_format bed \
		-o ${RANDSEQ_FILE}.fasta
#		-l ${RANDSEQ_LEN} -n ${RAND_SEQ_NB}
## Create the output directory and run the command through the
## my_command target.
## NOTE(review): RANDSEQ_LEN is only assigned in a commented line
## above, so the message below presumably ends with an empty value.
## NOTE(review): here MY_COMMAND is enclosed in single quotes, whereas
## ${ORG} itself contains single quotes -- check that the command
## reaches my_command intact.
randseq:
	@mkdir -p ${RANDSEQ_DIR}
	@echo "Generating  random sequences ${RANDSEQ_LEN}"
	@${MAKE} my_command MY_COMMAND='${RANDSEQ_CMD}'
	@echo  ${TEMPLATE_PEAKS}_seqlen.tab
	@echo  ${RANDSEQ_FILE}.fasta

## Random sequences based on various template peak sets.  Each target
## calls randseq with the directory and the prefix of the template.

## Template: sorted MACS peaks
randseq_macs_peaks:
	@$(MAKE) randseq TEMPLATE_DIR='$(MACS_PEAKS_DIR)' TEMPLATE_PREFIX='$(MACS_PREFIX)_peaks_sorted'

## Template: sorted MACS summits
randseq_macs_peak_summits:
	$(MAKE) randseq TEMPLATE_DIR='$(MACS_PEAKS_DIR)' TEMPLATE_PREFIX='$(MACS_PREFIX)_summits_sorted' EXTEND=$(SUMMIT_EXT)

## Template: sorted summits of the MACS sub-peaks (obtained from PeakSplitter)
randseq_macs_subpeak_summits:
	$(MAKE) randseq TEMPLATE_DIR='$(MACS_PEAKS_DIR)' TEMPLATE_PREFIX='$(MACS_PREFIX)_peaks.subpeaks_summits_sorted' EXTEND=$(SUMMIT_EXT)

## Template: sorted SWEMBL summits
randseq_swembl_summits:
	$(MAKE) randseq TEMPLATE_DIR='$(SWEMBL_PEAKS_DIR)' TEMPLATE_PREFIX='$(SWEMBL_PREFIX)_summits_sorted' EXTEND=$(SUMMIT_EXT)

## Template: ORI summits (SWEMBL inter SICER)
randseq_ori_summits:
	$(MAKE) randseq TEMPLATE_DIR='$(ORI_DIR)' TEMPLATE_PREFIX='$(ORI_PREFIX)_summits' EXTEND=$(SUMMIT_EXT)

## Random sequences for the summits obtained with the different
## peak-calling approaches (MACS peaks, MACS sub-peaks, SWEMBL).
randseq_all_summits: randseq_macs_peak_summits randseq_macs_subpeak_summits randseq_swembl_summits


################################################################
## Analyze distribution of inter-peak distances
##
## The R script receives the peak file as argument
## (file.peaks='${PEAK_FILE}').  Since PEAK_FILE is built as
## ${PEAKS}${PEAK_EXT}, the tool-specific targets pass PEAKS as a path
## prefix *without* extension, plus the extension in PEAK_EXT;
## passing PEAKS=<prefix>.bed would select the non-existing file
## <prefix>.bed.bed.
PEAK_DIST_SCRIPT=scripts/R-scripts/inter_peak_distances.R
#PEAKS=${MACS_PEAKS}
inter_peak_dist:
	@cat ${PEAK_DIST_SCRIPT} \
		| R --slave --no-save --no-restore --no-environ \
		--args "file.peaks='${PEAK_FILE}'"

## Inter-peak distances for the MACS peaks
inter_peak_dist_macs:
	@${MAKE} inter_peak_dist PEAKS=${MACS_PEAKS} PEAK_EXT=.bed

## Inter-peak distances for the SWEMBL peaks
inter_peak_dist_swembl:
	@${MAKE} inter_peak_dist PEAKS=${SWEMBL_PEAKS} PEAK_EXT=.bed


## Inter-peak distances for the SICER regions selected with the
## current threshold (SICER_THRESHOLD)
inter_peak_dist_SICER:
	@${MAKE} inter_peak_dist PEAKS=${SICER_THRESHOLDED} PEAK_EXT=.bed


################################################################
## Compare peaks between two peak calling results
## - compute intersection
## - compare the scores obtained by similar peaks in the different programs
##
## The two peak sets are sorted SWEMBL peak files (obtained with
## R=0.003).  The result is written to ${PEAK2_COMPA}.ft, whose name
## combines the names of the two tests.
TEST1=ES_indiff_vs_input
TEST1_PEAKS=analysis/peaks/SWEMBL/SWEMBL_${TEST1}_R0.003/SWEMBL_${TEST1}_R0.003_peaks_sorted.bed
TEST2=ES_indiff_vs_HF_RNAse
TEST2_PEAKS=analysis/peaks/SWEMBL/SWEMBL_${TEST2}_R0.003/SWEMBL_${TEST2}_R0.003_peaks_sorted.bed
PEAK_COMPA_DIR=analysis/peaks/peak_comparisons/
PEAK2_COMPA=${PEAK_COMPA_DIR}/${TEST1}__vs__${TEST2}
compare_2_peak_sets:
	@mkdir -p ${PEAK_COMPA_DIR}
	compare-features -v 2 -i ${TEST1_PEAKS} -i ${TEST2_PEAKS} -iformat bed -oformat ft -return stats,inter,diff -lth inter_len 1 -o ${PEAK2_COMPA}.ft
	@echo ${PEAK2_COMPA}.ft

## Compare a collection of peak sets (PEAK_SETS, by default the MACS
## peak sets) with compare-features, returning only the statistics.
## The result file is named after COMPA_PREFIX, which has to be
## provided by the caller (see compare_peaks_macs and
## compare_peaks_swembl).  The path printed at the end is the file
## actually written by compare-features (.tab).
PEAK_SETS=${MACS_PEAK_SETS_MM}
PEAK_COMPA=${PEAK_COMPA_DIR}/${COMPA_PREFIX}_peak_compa
compare_peaks:
	@echo ""
	@echo "Comparing peak sets	${COMPA_PREFIX}"
	@echo "	${PEAK_SETS}"
	@mkdir -p ${PEAK_COMPA_DIR}
	compare-features -v 2 -iformat bed -oformat ft -return stats -lth inter_len 1 -o ${PEAK_COMPA}.tab \
		-files ${PEAK_SETS}
	@echo ${PEAK_COMPA}.tab

## Sorted MACS peak files found under analysis/peaks/MACS, excluding
## the paths containing "Genomique".  The listing is performed by the
## shell when the variable is used in a recipe.
MACS_PEAK_SETS_MM=`ls -1 analysis/peaks/MACS/*/*_peaks_sorted.bed | grep -v Genomique`

## Compare all the MACS peak sets.
compare_peaks_macs:
	@$(MAKE) compare_peaks PEAK_SETS='$(MACS_PEAK_SETS_MM)' COMPA_PREFIX=MACS_Mus_musculus

## Sorted SWEMBL peak files obtained with the current value of
## SWEMBL_R, excluding the paths containing "Genomique".
SWEMBL_PEAK_SETS_MM=`ls -1 analysis/peaks/SWEMBL/*$(SWEMBL_R)/*_peaks_sorted.bed | grep -v Genomique`

## Compare all the SWEMBL peak sets obtained with the current SWEMBL_R.
compare_peaks_swembl:
	@$(MAKE) compare_peaks PEAK_SETS='$(SWEMBL_PEAK_SETS_MM)' COMPA_PREFIX=SWEMBL_Mus_musculus_R$(SWEMBL_R)
