main=NA, ## Main title for the plot
cex.score = "ARI", # Supported: "ARI" or "Acc"
cex.legend = 1,
min.cex=0.4, ## Minimal cex value, because ARI can have null or negative values
add=FALSE ## If TRUE, plot is added to previous plot
) {
## Plot the Sensitivity / PPV tradeoff per TF family
iteration.counter <- 0
plotted.series <- vector()
legend.pch <- vector()
legend.col <- vector()
current.tool <- "stamp" ## Init for quick test
for(current.tool in c(tools.to.plot)){
current.metric <- "cor" ## Init for quick test
for(current.metric in c(metrics.to.plot, "stamp")){
current.rule <- "average"
for(current.rule in c(rules.to.plot, "stamp")){
## Select the subset of the stats table corresponding to the current series
current.series <- subset(stats,
tool == current.tool &
linkage.rule == current.rule &
threshold.metric == current.metric)
# nrow(current.series)
## Sort current series by increasing Sn for the lines
current.series <- current.series[order(current.series$Sn, decreasing = TRUE),]
if (nrow(current.series) > 0) {
current.series.label <- paste(current.tool, current.metric, current.rule, sep = "_")
plotted.series <- append(plotted.series, current.series.label)
## Choose pch according to the tool + agglomeration rule
if (current.tool =="stamp") {
## Use different pch for stamp
#            current.pch <- pch.per.rule[current.rule]  + length(linkage.rules) + 10
current.pch <- pch.per.rule["stamp"]
} else {
current.pch <- pch.per.rule[current.rule]
}
legend.pch <- append(legend.pch, current.pch)
## Choose color according to the metric
current.col <- col.per.metrics[current.metric]
legend.col <- append(legend.col, current.col)
iteration.counter <- iteration.counter + 1
par(mar=c(4.6, 4.6, 2, 0.5))
if ((iteration.counter == 1) & (add == FALSE)) {
plot(1, type="n", main=main,
xlim=c(0,1), ylim=c(0,1),
las=1,
xlab="Clustering-wise PPV",
ylab="Clustering-wise Sn"
)
abline(a = 0, b = 1, col = "darkgray", lty="solid")
abline(a = 1, b = -1, col = "darkgray", lty="solid") ## random expectation for a ROC curve
abline(h=seq(0,1,0.1), col = "#DDDDDD", lty="solid")
abline(v=seq(0,1,0.1), col = "#DDDDDD", lty="solid")
}
## Define point sizes
if (cex.score == "ARI") {
cex <- 4*current.series$ARI
} else {
cex <- 2*current.series$Acc
}
cex <- pmax(cex, min.cex)
## Draw the series Sn / PPV values
lines(current.series[, c("PPV", "Sn")],
xlab="Clustering-wise PPV",
ylab="Clustering-wise Sn",
col= current.col,
cex=cex,
pch=current.pch,
type = "b"
)
}
legend("bottomleft",
bty="o", bg="white",
legend = plotted.series,
cex=cex.legend, col=legend.col,
pch=legend.pch)
}
# par(mfrow=c(3,3))
# thrash <- sapply(compa.metrics, function (x) {
#   message("Comparison metrics:", x)
#   plot.PPV.Sn(stats,
#               tools.to.plot=tools,
#               metrics.to.plot=x,
#               rules.to.plot=linkage.rules, main=paste("Metric:", x))
# })
#
# thrash <- sapply(linkage.rules, function (x) {
#   message("Linkage rule:", x)
#   plot.PPV.Sn(stats,
#               tools.to.plot=tools,
#               metrics.to.plot=compa.metrics,
#               rules.to.plot=x, main=paste("Linkage:", x))
# })
# par(mfrow=c(1,1))
par(mfrow=c(2,2))
counter <- 0
for(l in c("single", "average", "complete")){
counter <- counter + 1
plot.PPV.Sn(stats,
tools.to.plot="matrix-clustering",
metrics.to.plot=compa.metrics.matrix.clustering,
cex.score = "ARI", cex.legend = 0.85,
rules.to.plot=l, main=paste(letters[counter], ") All metrics - ", l, " linkage", sep = "")
)
}
par(mfrow=c(3,2))
for(m in compa.metrics.matrix.clustering){
sim <- unlist(strsplit(m, "-"))[1]
plot.PPV.Sn(stats,
tools.to.plot="matrix-clustering",
metrics.to.plot=m,
cex.score = "ARI", cex.legend = 1.2,
rules.to.plot=c("single", "average", "complete"), main = paste("Similarity metric: ", sim, " - Threshold:", m)
)
}
par(mfrow=c(1,1))
## Print stat table (matrix-clustering results only)
## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
## JAIME, I SUPPRESS THIS SINCE IT WAS SELECTING TOO MANY ROWS
# max.ACC.all <- NULL
# # max.Acc.files <- vector()
# for(l in linkage.rules){
#   stats.tmp <- stats[which(stats$linkage.rule == l),]
#   max.ACC <- ddply(stats.tmp, "threshold.metric", summarize, max.ACC = max(Acc, na.rm = TRUE))
#   max.ACC.all <- append(max.ACC.all, max.ACC$max.ACC)
#   # max.Acc.file <- as.vector(stats.tmp$file[which.max(stats.tmp$Acc)])
#   # max.Acc.files <-append(max.Acc.files, max.Acc.file)
#   rm(stats.tmp)
# }
# # max.Acc.lines <- stats$file %in% max.Acc.files
# max.ACC.all <- sort(max.ACC.all, decreasing = TRUE)
# max.Acc.lines <- unlist(sapply(max.ACC.all, function(x){
#   which(stats$Acc == x)
# }))
# stats.short <- stats[max.Acc.lines,]
# stats.short <- subset(stats.short, tool == "matrix-clustering")
# # View(stats.short)
#
# #stats.short <- stats.short[order(stats.short$threshold.metric, stats.short$linkage.rule, stats.short$Acc, decreasing = TRUE),]
# stats.short <- stats.short[order(stats.short$Acc, decreasing = TRUE),]
#
## Print a table with the maximal accuracy per combination of linkage + cutoff metric
# kable(stats.short[, stats.columns], row.names = FALSE, digits=c(0,0,0,2,0,3,3,3,3), col.names = stats.colnames, caption="**Old Table IV:** Best combinations of RSAT matrix-clustering parameters, sorted by descending Accuracy (Acc).")
## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
## Select parameters giving maximal accuracy (Acc) per algorithm and linkage rule
max.Acc <- ddply(.data = stats, .variables=c("threshold.metric", "linkage.rule"), .fun=summarize, max.Acc = max(Acc, na.rm=TRUE), file=file[which.max(Acc)])
stats.max.Acc <- subset(stats, file %in% max.Acc$file)
stats.max.Acc <- stats.max.Acc[order(stats.max.Acc$Acc, decreasing = TRUE),]
## Select parameters giving maximal Adjusted Rand Index (ARI) per algorithm and linkage rule
max.ARI <- ddply(.data = stats, .variables=c("threshold.metric", "linkage.rule"), .fun=summarize, max.ARI = max(ARI, na.rm=TRUE), file=file[which.max(ARI)])
stats.max.ARI <- subset(stats, file %in% max.ARI$file)
stats.max.ARI <- stats.max.ARI[order(stats.max.ARI$ARI, decreasing = TRUE),]
## Select colums to print out for the table with optimal parameters
stats.columns <- c("linkage.rule", "similarity.metric", "threshold.metric",  "cutoff", "nb.clusters", "Sn", "PPV", "Acc", "ARI")
stats.colnames <- sub(x = stats.columns, pattern = "linkage.rule", replacement = "Linkage")
stats.colnames <- sub(x = stats.colnames, pattern = "similarity.metric", replacement = "Sim. metric")
stats.colnames <- sub(x = stats.colnames, pattern = "threshold.metric", replacement = "Cutoff metric")
stats.colnames <- sub(x = stats.colnames, pattern = "cutoff", replacement = "Cutoff")
stats.colnames <- sub(x = stats.colnames, pattern = "nb.clusters", replacement = "Clusters")
## Print a table with the maximal accuracy per combination of linkage + cutoff metric
kable(stats.max.Acc[stats.max.Acc$tool == "matrix-clustering", stats.columns], row.names = FALSE, digits=c(0,0,0,2,0,3,3,3,3), col.names = stats.colnames, caption="**Table VIa: Impact of the parametric choices on the correspondence between motif clusters and transcription factor families.** For each linkage rule and metric (or pair of metrics) the table displays the correspondence statistics for the partitioning threshold giving the maximal Accuracy ($Acc$). ")
## Print a table with the maximal ARI per combination of linkage + cutoff metric
kable(stats.max.ARI[stats.max.ARI$tool == "matrix-clustering", stats.columns], row.names = FALSE, digits=c(0,0,0,2,0,3,3,3,3), col.names = stats.colnames, caption="**Table VIb:** Correspondence statistics for the partitioning threshold giving the maximal Adjusted Rand Index ($ARI$). ")
#caption = "**Impact of the parametric choices on the relevance of *matrix-clustering* **. Each row indicates the statistics for one parameter setting for the comparison between clusters obtained from the HOCOMOCO Human motifs and transcription factor families. **Sn:** average sensitivity, i.e. proportion of motifs from a protein family regrouped in a same cluster. **PPV:** average positive predictive value, i.e. proportion of motifs of a cluster that belong to the same protein family. **Acc:** average accuracy, computed as the geometric mean between Sn and PPV. "
plot.PPV.Sn(stats,
tools.to.plot=tools,
metrics.to.plot=c("Ncor-cor","cor", "Ncor", "PCC", "ALLR", "CS", "KL", "SSD"),
#metrics.to.plot=compa.metrics,
cex.score = "ARI", cex.legend = 1.2,
rules.to.plot="average", main=paste("STAMP and RSAT matrix-clustering"))
## Print stat table (matrix-clustering and STAMP)
## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
## JAIME, I SUPPRESS THIS SINCE IT WAS SELECTING TOO MANY ROWS
# stats.stamp.mc <- stats[max.Acc.lines,]
#
# #stats.short <- stats.short[order(stats.short$threshold.metric, stats.short$linkage.rule, stats.short$Acc, decreasing = TRUE),]
# stats.stamp.mc <- stats.stamp.mc[order(stats.stamp.mc$Acc, decreasing = TRUE),]
#
# stats.stamp.mc$tool <- gsub("matrix-clustering", "RSAT", as.vector(stats.stamp.mc$tool))
# stats.stamp.mc$tool <- gsub("stamp", "STAMP", as.vector(stats.stamp.mc$tool))
# # View(stats.stamp.mc)
# kable(stats.stamp.mc[, c("tool", stats.columns)], row.names = FALSE, digits=c(0,0,2,0,3,3,3,3,3,3), col.names = c("Tool", stats.colnames), caption = "**Old Table V: Best combinations of RSAT matrix-clustering and STAMP parameters, sorted by descending Accuracy (Acc).** RSAT *matrix-clustering* shown as RSAT. The RSAT matrix-clustering results are the same as in Table IV and indicated here to facilitate the comparaison with STAMP.")
#
# # View(stats.stamp.mc[, c("tool", stats.columns)])
#
## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
## Print a table with the maximal accuracy per combination of linkage + cutoff metric
kable(stats.max.Acc[, c("tool", stats.columns)], row.names = FALSE, digits=c(0,0,0,0,2,0,3,3,3,3), col.names = c("Tool", stats.colnames), caption="**Table VIIa: Best combinations of RSAT *matrix-clustering* and STAMP parameters, sorted by descending Accuracy (Acc).** RSAT *matrix-clustering* shown as RSAT. The RSAT *matrix-clustering* results are the same as in Table VI and indicated here to facilitate the comparaison with STAMP.")
## Print a table with the maximal ARI per combination of linkage + cutoff metric
kable(stats.max.ARI[, c("tool", stats.columns)], row.names = FALSE, digits=c(0,0,0,0,2,0,3,3,3,3), col.names = c("Tool", stats.colnames), caption="**Table VIIb: Best combinations of RSAT matrix-clustering and STAMP parameters, sorted by descending Adjusted Rand Index (ARI).** RSAT *matrix-clustering* shown as RSAT. The RSAT matrix-clustering results are the same as in Table VI and indicated here to facilitate the comparaison with STAMP.")
## Read file with IC information between nodes
IC.info.file <- "clusters_attributes_table.tab"
IC.info.tab <- read.delim(file.path(families.annotation.dir, IC.info.file), header = TRUE)
names(IC.info.tab)[1] <- "cluster"
## (IC - IC_min )/ (IC_max -IC_min)
IC.info.tab$IC.children.mean <- apply(IC.info.tab[, c("IC_node_1", "IC_node_2")], 1, mean)
## Compare IC of children and merged motifs
IC.info.tab$IC.child.min <- pmin(IC.info.tab$IC_node_1, IC.info.tab$IC_node_2)
IC.info.tab$IC.child.max <- pmax(IC.info.tab$IC_node_1, IC.info.tab$IC_node_2)
IC.info.tab$IC.child.diff <- IC.info.tab$IC.child.max - IC.info.tab$IC.child.min
IC.info.tab$IC.child.rel.diff <- (IC.info.tab$IC - IC.info.tab$IC.child.min) / IC.info.tab$IC.child.diff
## Assign specific status according to IC change
IC.info.tab$IC_status <- "interm"
IC.info.tab$IC_status[IC.info.tab$IC < IC.info.tab$IC.child.min] <- "loss"
IC.info.tab$IC_status[IC.info.tab$IC == IC.info.tab$IC.child.min] <- "lowest"
IC.info.tab$IC_status[IC.info.tab$IC == IC.info.tab$IC.child.max] <- "highest"
IC.info.tab$IC_status[IC.info.tab$IC > IC.info.tab$IC.child.max] <- "gain"
## Assign specific colors to denote gain and loss of information
IC.info.tab$color <- "grey"
IC.info.tab$color[IC.info.tab$IC_status=="loss"] <- "red"
IC.info.tab$color[IC.info.tab$IC_status=="lowest"] <- "orange"
IC.info.tab$color[IC.info.tab$IC_status=="highest"] <- "darkblue"
IC.info.tab$color[IC.info.tab$IC_status=="gain"] <- "darkgreen"
# View(IC.info.tab)
par(mfrow=c(2,2))
## IC1 vs IC2
IC.range <- range(IC.info.tab[,c("IC_node_1", "IC_node_2")])
plot(IC.info.tab$IC_node_1, IC.info.tab$IC_node_2, panel.first=grid(), col="grey",
xlab="IC of the first motif",
ylab="IC of the second motif", xlim=IC.range, ylim=IC.range)
abline(a=0, b=1)
## IC mean versus IC merged
IC.range <- range(IC.info.tab[,c("IC", "IC.children.mean")])
plot(IC.info.tab$IC.children.mean, IC.info.tab$IC, panel.first=grid(), col="grey",
xlab="Mean IC of the two children motifs",
ylab="IC of the merged motifs", xlim=IC.range, ylim=IC.range)
abline(a=0, b=1)
PCC <- round(digits=2, cor(IC.info.tab$IC.children.mean, IC.info.tab$IC))
legend("topleft", paste("PCC =", PCC), bty = "n")
## Difference between the two children motifs
IC.range <- range(IC.info.tab[,c("IC", "IC_node_1", "IC_node_2")])
plot(IC.info.tab$IC.child.min, IC.info.tab$IC,
xlab="Minimal child IC",
ylab="IC of merged matrix",
panel.first=grid(), xlim=IC.range, ylim=IC.range,
col=IC.info.tab$color)
abline(a=0, b=1)
legend("topleft", col="red", pch=1, paste("IC loss:", sum(IC.info.tab$IC.child.min > IC.info.tab$IC),"motifs"))
plot(IC.info.tab$IC.child.max, IC.info.tab$IC,
xlab="Maximal child IC",
ylab="IC of merged matrix",
panel.first=grid(), xlim=IC.range, ylim=IC.range,
col=IC.info.tab$color)
abline(a=0, b=1)
legend("topleft", col="darkgreen", pch=1, paste("IC gain:",sum(IC.info.tab$IC.child.max < IC.info.tab$IC),"motifs"))
par(mfrow=c(1,1))
# A trick to have an inclusive interval [0,1] for the histogram:
# we round all values to 1e-3 and define an interval from 0-1e-6 to 1.
rel.diff <- round(digits=3, IC.info.tab$IC.child.rel.diff) ## Round
rel.diff <- rel.diff[!is.infinite(rel.diff)]
hist.breaks <- seq(from=floor(min(rel.diff)),
to=ceiling(max(rel.diff)), by=1)
hist.breaks[hist.breaks==0] <- -1e-6
# library(ggplot2)
# qplot(IC.child.rel.diff, data=IC.info.tab,geom="histogram",
#      fill=IC_status,
#      breaks=hist.breaks)
hist(rel.diff,
main="Information content change during clustering",
xlab="Relative IC difference",
ylab = "Proportion of tree nodes",
col="grey", border = "grey",
breaks=hist.breaks)
abline(v=0, col="red", lty="dashed")
abline(v=1, col="darkgreen", lty="dashed")
arrows(x0=-5, y0=0.5, x1=0, y1=0.5, length = 0.1, angle=20, code = 2, col="darkblue", lwd=2)
text(-5, y=0.5, col="darkblue", pos=2,
label=paste(
sep="", "Intermediate IC (",
sum((IC.info.tab$IC.child.min <= IC.info.tab$IC) &
(IC.info.tab$IC.child.max >= IC.info.tab$IC))
, " nodes)"))
#nrow(IC.info.tab)
## IC loss
arrows(x0=min(rel.diff), y0=0.2, x1=0, y1=0.2, length = 0.1, angle=20, code = 3, col="red", lwd=2)
text(x=min(rel.diff)/2, y=0.2, col="red", pos=3,
label=paste(sep="", "IC loss (", sum(IC.info.tab$IC.child.min > IC.info.tab$IC), " nodes)"))
## IC gain
arrows(x0=max(rel.diff), y0=0.2, x1=1, y1=0.2, length = 0.1,
angle=20, code = 3, col="darkgreen", lwd=2)
text(x=(max(rel.diff))/2, y=0.2, col="darkgreen", pos=3,
label=paste(sep="", "IC gain (", sum(IC.info.tab$IC.child.max < IC.info.tab$IC), " nodes)"))
# Prepare a table with IC statistics per class
cluster.names <- as.vector(unique(IC.info.tab$cluster))
# cl <- "cluster_14"
calc.cluster.stats <- function(cl) {
selected.cl <- subset(IC.info.tab, cluster == cl)
result <- data.frame(
size = nrow(selected.cl),
nb.leaves = nrow(selected.cl) + 1,
IC.gain = sum(selected.cl$IC.child.rel.diff > 1),
IC.loss = sum(selected.cl$IC.child.rel.diff < 0)
)
result$IC.gain.freq <- result$IC.gain / result$size
result$IC.loss.freq <- result$IC.loss / result$size
return(unlist(result))
}
cluster.stats <- as.data.frame(t(sapply(cluster.names, calc.cluster.stats)))
# summary(cluster.stats)
# names(cluster.stats)
# View(cluster.stats)
cluster.stats <- cluster.stats[order(cluster.stats$size, decreasing = TRUE),]
pairs(cluster.stats[, c("size", "IC.gain", "IC.loss")],
upper.panel=NULL)
par(mfrow=c(2,2))
## IC1 vs IC2
IC.range <- range(IC.info.tab[,c("IC_node_1", "IC_node_2")])
plot(IC.info.tab$IC_node_1, IC.info.tab$IC_node_2, panel.first=grid(), col="grey",
xlab="IC of the first motif",
ylab="IC of the second motif", xlim=IC.range, ylim=IC.range)
abline(a=0, b=1)
## IC mean versus IC merged
IC.range <- range(IC.info.tab[,c("IC", "IC.children.mean")])
plot(IC.info.tab$IC.children.mean, IC.info.tab$IC, panel.first=grid(), col="grey",
xlab="Mean IC of the two children motifs",
ylab="IC of the merged motifs", xlim=IC.range, ylim=IC.range)
abline(a=0, b=1)
PCC <- round(digits=2, cor(IC.info.tab$IC.children.mean, IC.info.tab$IC))
legend("topleft", paste("PCC =", PCC), bty = "n")
## Difference between the two children motifs
IC.range <- range(IC.info.tab[,c("IC", "IC_node_1", "IC_node_2")])
plot(IC.info.tab$IC.child.min, IC.info.tab$IC,
xlab="Minimal child IC",
ylab="IC of merged matrix",
panel.first=grid(), xlim=IC.range, ylim=IC.range,
col=IC.info.tab$color)
abline(a=0, b=1)
legend("topleft", col="red", pch=1, paste("IC loss:", sum(IC.info.tab$IC.child.min > IC.info.tab$IC),"motifs"))
plot(IC.info.tab$IC.child.max, IC.info.tab$IC,
xlab="Maximal child IC",
ylab="IC of merged matrix",
panel.first=grid(), xlim=IC.range, ylim=IC.range,
col=IC.info.tab$color)
abline(a=0, b=1)
legend("topleft", col="darkgreen", pch=1, paste("IC gain:",sum(IC.info.tab$IC.child.max < IC.info.tab$IC),"motifs"))
par(mfrow=c(1,1))
par(mfrow=c(2,2))
## IC1 vs IC2
IC.range <- range(IC.info.tab[,c("IC_node_1", "IC_node_2")])
plot(IC.info.tab$IC_node_1, IC.info.tab$IC_node_2, panel.first=grid(), col="grey",
xlab="IC of the first motif",
ylab="IC of the second motif", xlim=IC.range, ylim=IC.range)
abline(a=0, b=1)
## IC mean versus IC merged
IC.range <- range(IC.info.tab[,c("IC", "IC.children.mean")])
plot(IC.info.tab$IC.children.mean, IC.info.tab$IC, panel.first=grid(), col="grey",
xlab="Mean IC of the two children motifs",
ylab="IC of the merged motifs", xlim=IC.range, ylim=IC.range)
abline(a=0, b=1)
PCC <- round(digits=2, cor(IC.info.tab$IC.children.mean, IC.info.tab$IC))
legend("topleft", paste("PCC =", PCC), bty = "n")
## Difference between the two children motifs
IC.range <- range(IC.info.tab[,c("IC", "IC_node_1", "IC_node_2")])
plot(IC.info.tab$IC.child.min, IC.info.tab$IC,
xlab="Minimal child IC",
ylab="IC of merged matrix",
panel.first=grid(), xlim=IC.range, ylim=IC.range,
col=IC.info.tab$color)
abline(a=0, b=1)
legend("topleft", col="red", pch=1, paste("IC loss:", sum(IC.info.tab$IC.child.min > IC.info.tab$IC),"motifs"))
plot(IC.info.tab$IC.child.max, IC.info.tab$IC,
xlab="Maximal child IC",
ylab="IC of merged matrix",
panel.first=grid(), xlim=IC.range, ylim=IC.range,
col=IC.info.tab$color)
abline(a=0, b=1)
legend("topleft", col="darkgreen", pch=1, paste("IC gain:",sum(IC.info.tab$IC.child.max < IC.info.tab$IC),"motifs"))
par(mfrow=c(1,1))
# A trick to have an inclusive interval [0,1] for the histogram:
# we round all values to 1e-3 and define an interval from 0-1e-6 to 1.
rel.diff <- round(digits=3, IC.info.tab$IC.child.rel.diff) ## Round
rel.diff <- rel.diff[!is.infinite(rel.diff)]
hist.breaks <- seq(from=floor(min(rel.diff)),
to=ceiling(max(rel.diff)), by=1)
hist.breaks[hist.breaks==0] <- -1e-6
# library(ggplot2)
# qplot(IC.child.rel.diff, data=IC.info.tab,geom="histogram",
#      fill=IC_status,
#      breaks=hist.breaks)
hist(rel.diff,
main="Information content change during clustering",
xlab="Relative IC difference",
ylab = "Proportion of tree nodes",
col="grey", border = "grey",
breaks=hist.breaks)
abline(v=0, col="red", lty="dashed")
abline(v=1, col="darkgreen", lty="dashed")
arrows(x0=-5, y0=0.5, x1=0, y1=0.5, length = 0.1, angle=20, code = 2, col="darkblue", lwd=2)
text(-5, y=0.5, col="darkblue", pos=2,
label=paste(
sep="", "Intermediate IC (",
sum((IC.info.tab$IC.child.min <= IC.info.tab$IC) &
(IC.info.tab$IC.child.max >= IC.info.tab$IC))
, " nodes)"))
#nrow(IC.info.tab)
## IC loss
arrows(x0=min(rel.diff), y0=0.2, x1=0, y1=0.2, length = 0.1, angle=20, code = 3, col="red", lwd=2)
text(x=min(rel.diff)/2, y=0.2, col="red", pos=3,
label=paste(sep="", "IC loss (", sum(IC.info.tab$IC.child.min > IC.info.tab$IC), " nodes)"))
## IC gain
arrows(x0=max(rel.diff), y0=0.2, x1=1, y1=0.2, length = 0.1,
angle=20, code = 3, col="darkgreen", lwd=2)
text(x=(max(rel.diff))/2, y=0.2, col="darkgreen", pos=3,
label=paste(sep="", "IC gain (", sum(IC.info.tab$IC.child.max < IC.info.tab$IC), " nodes)"))
# A trick to have an inclusive interval [0,1] for the histogram:
# we round all values to 1e-3 and define an interval from 0-1e-6 to 1.
rel.diff <- round(digits=3, IC.info.tab$IC.child.rel.diff) ## Round
rel.diff <- rel.diff[!is.infinite(rel.diff)]
hist.breaks <- seq(from=floor(min(rel.diff)),
to=ceiling(max(rel.diff)), by=1)
hist.breaks[hist.breaks==0] <- -1e-6
# library(ggplot2)
# qplot(IC.child.rel.diff, data=IC.info.tab,geom="histogram",
#      fill=IC_status,
#      breaks=hist.breaks)
hist(rel.diff,
main="Information content change during clustering",
xlab="Relative IC difference",
ylab = "Proportion of tree nodes",
col="grey", border = "grey", log="y",
breaks=hist.breaks)
abline(v=0, col="red", lty="dashed")
abline(v=1, col="darkgreen", lty="dashed")
arrows(x0=-5, y0=0.5, x1=0, y1=0.5, length = 0.1, angle=20, code = 2, col="darkblue", lwd=2)
text(-5, y=0.5, col="darkblue", pos=2,
label=paste(
sep="", "Intermediate IC (",
sum((IC.info.tab$IC.child.min <= IC.info.tab$IC) &
(IC.info.tab$IC.child.max >= IC.info.tab$IC))
, " nodes)"))
#nrow(IC.info.tab)
## IC loss
arrows(x0=min(rel.diff), y0=0.2, x1=0, y1=0.2, length = 0.1, angle=20, code = 3, col="red", lwd=2)
text(x=min(rel.diff)/2, y=0.2, col="red", pos=3,
label=paste(sep="", "IC loss (", sum(IC.info.tab$IC.child.min > IC.info.tab$IC), " nodes)"))
## IC gain
arrows(x0=max(rel.diff), y0=0.2, x1=1, y1=0.2, length = 0.1,
angle=20, code = 3, col="darkgreen", lwd=2)
text(x=(max(rel.diff))/2, y=0.2, col="darkgreen", pos=3,
label=paste(sep="", "IC gain (", sum(IC.info.tab$IC.child.max < IC.info.tab$IC), " nodes)"))
# A trick to have an inclusive interval [0,1] for the histogram:
# we round all values to 1e-3 and define an interval from 0-1e-6 to 1.
rel.diff <- round(digits=3, IC.info.tab$IC.child.rel.diff) ## Round
rel.diff <- rel.diff[!is.infinite(rel.diff)]
hist.breaks <- seq(from=floor(min(rel.diff)),
to=ceiling(max(rel.diff)), by=1)
hist.breaks[hist.breaks==0] <- -1e-6
# library(ggplot2)
# qplot(IC.child.rel.diff, data=IC.info.tab,geom="histogram",
#      fill=IC_status,
#      breaks=hist.breaks)
hist(rel.diff,
main="Information content change during clustering",
xlab="Relative IC difference",
ylab = "Proportion of tree nodes",
col="grey", border = "grey",
breaks=hist.breaks)
abline(v=0, col="red", lty="dashed")
abline(v=1, col="darkgreen", lty="dashed")
arrows(x0=-5, y0=0.5, x1=0, y1=0.5, length = 0.1, angle=20, code = 2, col="darkblue", lwd=2)
text(-5, y=0.5, col="darkblue", pos=2,
label=paste(
sep="", "Intermediate IC (",
sum((IC.info.tab$IC.child.min <= IC.info.tab$IC) &
(IC.info.tab$IC.child.max >= IC.info.tab$IC))
, " nodes)"))
#nrow(IC.info.tab)
## IC loss
arrows(x0=min(rel.diff), y0=0.2, x1=0, y1=0.2, length = 0.1, angle=20, code = 3, col="red", lwd=2)
text(x=min(rel.diff)/2, y=0.2, col="red", pos=3,
label=paste(sep="", "IC loss (", sum(IC.info.tab$IC.child.min > IC.info.tab$IC), " nodes)"))
## IC gain
arrows(x0=max(rel.diff), y0=0.2, x1=1, y1=0.2, length = 0.1,
angle=20, code = 3, col="darkgreen", lwd=2)
text(x=(max(rel.diff))/2, y=0.2, col="darkgreen", pos=3,
label=paste(sep="", "IC gain (", sum(IC.info.tab$IC.child.max < IC.info.tab$IC), " nodes)"))
# A trick to have an inclusive interval [0,1] for the histogram:
# we round all values to 1e-3 and define an interval from 0-1e-6 to 1.
rel.diff <- round(digits=3, IC.info.tab$IC.child.rel.diff) ## Round
rel.diff <- rel.diff[!is.infinite(rel.diff)]
hist.breaks <- seq(from=floor(min(rel.diff)),
to=ceiling(max(rel.diff)), by=1)
hist.breaks[hist.breaks==0] <- -1e-6
# library(ggplot2)
# qplot(IC.child.rel.diff, data=IC.info.tab,geom="histogram",
#      fill=IC_status,
#      breaks=hist.breaks)
hist(rel.diff,
main="Information content change during clustering",
xlab="Relative IC difference",
ylab = "Proportion of tree nodes",
col="grey", border = "grey",
breaks=hist.breaks)
abline(v=0, col="red", lty="dashed")
abline(v=1, col="darkgreen", lty="dashed")
arrows(x0=-5, y0=0.5, x1=0, y1=0.5, length = 0.1, angle=20, code = 2, col="darkblue", lwd=2)
text(-5, y=0.5, col="darkblue", pos=2,
label=paste(
sep="", "Intermediate IC (",
sum((IC.info.tab$IC.child.min <= IC.info.tab$IC) &
(IC.info.tab$IC.child.max >= IC.info.tab$IC))
, " nodes)"))
#nrow(IC.info.tab)
## IC loss
arrows(x0=min(rel.diff), y0=0.2, x1=0, y1=0.2, length = 0.1, angle=20, code = 3, col="red", lwd=2)
text(x=min(rel.diff)/2, y=0.2, col="red", pos=3,
label=paste(sep="", "IC loss (", sum(IC.info.tab$IC.child.min > IC.info.tab$IC), " nodes)"))
## IC gain
arrows(x0=max(rel.diff), y0=0.2, x1=1, y1=0.2, length = 0.1,
angle=20, code = 3, col="darkgreen", lwd=2)
text(x=(max(rel.diff))/2, y=0.2, col="darkgreen", pos=3,
label=paste(sep="", "IC gain (", sum(IC.info.tab$IC.child.max < IC.info.tab$IC), " nodes)"))
