This vignette cached_results DMR detection on simulated ONT-style methylation data with known ground truth. The dataset used is a simulated ONT dataset, based on an in-house dataset of 5 samples from neuroendocrine islets, augmented to 20 samples using CMEnt::augmentBSSeq, subset to chromosomes 20 and 21, and with 1000 DMRs spiked in using CMEnt::simulateDMRs, with max delta beta (delta_max) set to 0.3 and expected within-dmr correlation (expected_corr) set to 0.7.
Methods:
Primary goal:
try(devtools::load_all(), silent = TRUE)
suppressPackageStartupMessages({
try(library(CMEnt), silent = TRUE)
library(bsseq)
library(data.table)
library(ggplot2)
library(dplyr)
library(GenomicRanges)
library(SummarizedExperiment)
library(S4Vectors)
})
getBenchmarkFile <- function(filename) {
benchmark_file <- ""
folder <- "simulated_ont"
if (getwd() == "notebooks") {
benchmark_file <- file.path("cached_results", folder, filename)
} else if ("NAMESPACE" %in% dir()) {
benchmark_file <- file.path("notebooks", "cached_results", folder, filename)
} else {
benchmark_file <- file.path("cached_results", folder, filename)
}
dir.create(dirname(benchmark_file), showWarnings = FALSE, recursive = TRUE)
benchmark_file
}
min_seeds <- 2L
min_sites <- 50L
max_pval <- 0.05
testing_mode <- "parametric"
ext_site_delta_beta <- 0.1
max_bridge_seeds_gaps <- 1L
max_bridge_extension_gaps <- 2L
max_lookup_dist <- 1000L
method_cache_tag <- sprintf(
"sites%d_p%s_ext%s_lookup%d",
min_sites,
gsub("[^0-9]+", "", sprintf("%.3f", max_pval)),
gsub("[^0-9]+", "", sprintf("%.3f", ext_site_delta_beta)),
max_lookup_dist
)
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
if (!requireNamespace("DMRcate", quietly = TRUE)) {
BiocManager::install("DMRcate", update = FALSE, ask = FALSE)
}
if (!requireNamespace("dmrseq", quietly = TRUE)) {
BiocManager::install("dmrseq", update = FALSE, ask = FALSE)
}
if (!requireNamespace("hummingbird", quietly = TRUE)) {
BiocManager::install("hummingbird", update = FALSE, ask = FALSE)
}
if (!requireNamespace("tidyr", quietly = TRUE)) {
install.packages("tidyr")
}
cment_file <- getBenchmarkFile(sprintf("dmrs.CMEnt.%s.rds", method_cache_tag))
dmrcate_file <- getBenchmarkFile(sprintf("dmrs.DMRcate.%s.rds", method_cache_tag))
dmrseq_file <- getBenchmarkFile(sprintf("dmrs.dmrseq.%s.rds", method_cache_tag))
hummingbird_file <- getBenchmarkFile(sprintf("dmrs.hummingbird.%s.rds", method_cache_tag))
pheno_file <- getBenchmarkFile("ont_sim_samplesheet.tsv")
seeds_file <- getBenchmarkFile(sprintf("seeds.p%s.tsv.gz", gsub("[^0-9]+", "", sprintf("%.3f", max_pval))))
computation_needed <- !file.exists(cment_file) || !file.exists(dmrcate_file) || !file.exists(dmrseq_file) || !file.exists(hummingbird_file) || !file.exists(pheno_file) || !file.exists(seeds_file)
zenodo_base_url <- "https://zenodo.org/records/20592944/files/"
sim_truth_path <- getBenchmarkFile("ont_sim_dmrs.tsv")
if (computation_needed) {
dir.create("data", showWarnings = FALSE, recursive = TRUE)
ont_sim_bs_path <- getBenchmarkFile("ont_sim_bs.rds")
ont_sim_bs_hdf5_path <- getBenchmarkFile("ont_sim_bs.h5")
if (file.exists(ont_sim_bs_hdf5_path)) {
ont_sim_bs <- HDF5Array::loadHDF5SummarizedExperiment(ont_sim_bs_hdf5_path)
} else {
if (!file.exists(ont_sim_bs_path)) {
curl::curl_download(
url = paste0(zenodo_base_url, basename(ont_sim_bs_path), "?download=1"),
destfile = ont_sim_bs_path
)
}
ont_sim_bs <- readRDS(ont_sim_bs_path)
ont_sim_bs <- sort(ont_sim_bs)
ont_sim_bs <- HDF5Array::saveHDF5SummarizedExperiment(
ont_sim_bs,
dir = ont_sim_bs_hdf5_path,
replace = TRUE
)
}
stopifnot(inherits(ont_sim_bs, "BSseq"))
cat(sprintf("Loaded BSseq with %d sites and %d samples\n", nrow(ont_sim_bs), ncol(ont_sim_bs)))
}
Loaded BSseq with 2292633 sites and 20 samples
if (!file.exists(sim_truth_path)) {
curl::curl_download(
url = paste0(zenodo_base_url, basename(sim_truth_path), "?download=1"),
destfile = sim_truth_path
)
}
truth_df <- data.table::fread(sim_truth_path)
truth_dmrs <- makeGRangesFromDataFrame(
truth_df,
seqnames.field = "seqnames",
start.field = "start",
end.field = "end",
keep.extra.columns = TRUE,
seqinfo = Seqinfo(genome = "hg38")
)
cat(sprintf("Loaded %d ground-truth DMRs\n", length(truth_dmrs)))
Loaded 1000 ground-truth DMRs
if (!file.exists(pheno_file)) {
sample_ids <- colnames(ont_sim_bs)
sample_group <- ifelse(grepl("^Condition1", sample_ids), "Condition1", "Condition2")
pheno <- data.frame(
Sample_ID = sample_ids,
Sample_Group = sample_group,
casecontrol = sample_group == "Condition2",
stringsAsFactors = FALSE
)
rownames(pheno) <- pheno$Sample_ID
data.table::fwrite(pheno, pheno_file, sep = "\t")
} else {
pheno <- data.table::fread(pheno_file, data.table = FALSE)
}
if (computation_needed) {
colData(ont_sim_bs) <- S4Vectors::DataFrame(pheno, row.names = pheno$Sample_ID)
ont_sim_bs <- sort(ont_sim_bs)
}
knitr::kable(
table(pheno$Sample_Group),
caption = "Phenotype Frequency Table"
)
| Var1 | Freq |
|---|---|
| Condition1 | 10 |
| Condition2 | 10 |
knitr::kable(
data.frame(
Parameter = c(
"min_seeds", "min_sites", "max_pval", "testing_mode",
"ext_site_delta_beta", "max_bridge_seeds_gaps",
"max_bridge_extension_gaps", "max_lookup_dist"
),
Value = c(
min_seeds, min_sites, max_pval, testing_mode,
ext_site_delta_beta, max_bridge_seeds_gaps,
max_bridge_extension_gaps, max_lookup_dist
)
),
caption = "Benchmark detection parameters"
)
| Parameter | Value |
|---|---|
| min_seeds | 2 |
| min_sites | 50 |
| max_pval | 0.05 |
| testing_mode | parametric |
| ext_site_delta_beta | 0.1 |
| max_bridge_seeds_gaps | 1 |
| max_bridge_extension_gaps | 2 |
| max_lookup_dist | 1000 |
This step is crucial for CMEnt, which relies on seed sites to guide DMR assembly. We use a DSS approach to identify candidate seeds based on differential methylation evidence. We keep the FDR threshold relatively lenient, to account for the low sample size.
if (!file.exists(seeds_file)) {
seeds <- findDMPsBSSeq(
bsseq = ont_sim_bs,
samplesheet = pheno,
sample_group_col = "Sample_Group",
id_col = "Sample_ID",
case_group = "Condition2",
covariates = NULL,
output_file = seeds_file
)
seeds <- seeds[seeds$qval < max_pval, , drop = FALSE]
} else {
seeds <- data.table::fread(seeds_file)
rownames(seeds) <- seeds$site_id
}
seeds <- as.data.frame(seeds, stringsAsFactors = FALSE)
rownames(seeds) <- seeds$site_id
seeds$Row.names <- NULL
seed_dist <- unlist(lapply(split(seeds$start, seeds$chr), function(x) {
if (length(x) < 2) return(numeric(0))
diff(x)
}))
seeds_qs <- round(stats::quantile(seed_dist, probs = c(0, 0.25, 0.5, 0.75, 0.9, 0.99, 1)))
cat(sprintf("Number of seeds for CMEnt: %d\n", nrow(seeds)))
Number of seeds for CMEnt: 74216
knitr::kable(
seeds_qs,
digits = 2,
col.names = "Distance (bp)",
caption = sprintf("Seed distance quantiles")
)
| Distance (bp) | |
|---|---|
| 0% | 1 |
| 25% | 1 |
| 50% | 15 |
| 75% | 104 |
| 90% | 329 |
| 99% | 41460 |
| 100% | 412180 |
Metrics explanation:
Metrics values:
seed_df <- as.data.frame(seeds, stringsAsFactors = FALSE)
seed_df <- seed_df[!is.na(seed_df$start) & nzchar(seed_df$chr), , drop = FALSE]
seed_gr <- GenomicRanges::makeGRangesFromDataFrame(
seed_df,
seqnames.field = "chr",
start.field = "start",
end.field = "end",
keep.extra.columns = TRUE,
seqinfo = Seqinfo(genome = "hg38")
)
seed_truth_hits <- GenomicRanges::findOverlaps(seed_gr, truth_dmrs, ignore.strand = TRUE)
seed_overlaps_truth <- rep(FALSE, length(seed_gr))
truth_has_seed <- rep(FALSE, length(truth_dmrs))
truth_has_enough_seeds <- 0L
if (length(seed_truth_hits) > 0) {
seed_overlaps_truth[unique(S4Vectors::queryHits(seed_truth_hits))] <- TRUE
truth_has_seed[unique(S4Vectors::subjectHits(seed_truth_hits))] <- TRUE
}
seeds_per_truth <- if (length(truth_dmrs) > 0) {
tabulate(S4Vectors::subjectHits(seed_truth_hits), nbins = length(truth_dmrs))
} else {
integer(0)
}
seeds_per_truth_within_mindist <- if (length(seed_truth_hits) > 0) {
seed_positions <- GenomicRanges::start(seed_gr)[S4Vectors::queryHits(seed_truth_hits)]
seeds_within_mindist <- list()
for (i in seq_along(truth_dmrs)){
dmr_seeds_mask <- S4Vectors::subjectHits(seed_truth_hits) == i
if (sum(dmr_seeds_mask) == 0) {
seeds_within_mindist[[i]] <- 0
}
sorts <- order(seed_positions[dmr_seeds_mask])
dists <- diff(seed_positions[dmr_seeds_mask][sorts])
seeds_within_mindist[[i]] <- sum(dists <= max_lookup_dist)
}
unlist(seeds_within_mindist)
} else {
integer(0)
}
truth_has_enough_seeds <- sum(seeds_per_truth >= min_seeds)
safe_pct <- function(x) if (length(x) == 0) 0 else round(100 * mean(x), 1)
seed_truth_summary <- data.frame(
Seeds_Total = length(seed_gr),
Seeds_Overlapping_Truth = sum(seed_overlaps_truth),
Seeds_Overlap_Pct = safe_pct(seed_overlaps_truth),
Truth_DMRs_Total = length(truth_dmrs),
Truth_DMRs_With_Seed_Pct = safe_pct(truth_has_seed),
Truth_DMRs_With_Enough_Seeds_Pct = ifelse(length(truth_dmrs) == 0, 0, 100 * truth_has_enough_seeds / length(truth_dmrs)),
Truth_DMRs_With_Enough_Proximal_Seeds_Pct = ifelse(length(truth_dmrs) == 0, 0, 100 * sum(seeds_per_truth_within_mindist >= min_seeds) / length(truth_dmrs)),
Median_Seeds_Per_Truth_DMR = ifelse(length(seeds_per_truth) == 0, NA_real_, stats::median(seeds_per_truth)),
Median_Seeds_Per_Hit_Truth_DMR = ifelse(sum(truth_has_seed) == 0, 0, stats::median(seeds_per_truth[truth_has_seed])),
Max_Seeds_In_Truth_DMR = ifelse(length(seeds_per_truth) == 0, NA_integer_, max(seeds_per_truth))
)
seed_sign <- sign(as.numeric(seed_df$delta_beta))
truth_sign <- sign(as.numeric(SummarizedExperiment::mcols(truth_dmrs)$delta_beta))
pair_match <- seed_sign[S4Vectors::queryHits(seed_truth_hits)] ==
truth_sign[S4Vectors::subjectHits(seed_truth_hits)]
seed_pair_direction_match_pct <- if (all(is.na(pair_match))) NA_real_ else 100 * mean(pair_match, na.rm = TRUE)
per_seed_dir_match <- tapply(pair_match, S4Vectors::queryHits(seed_truth_hits), function(x) any(x, na.rm = TRUE))
seed_any_direction_match_pct <- if (length(per_seed_dir_match) == 0) NA_real_ else 100 * mean(per_seed_dir_match)
seed_truth_summary$Overlap_Pairs_Direction_Match_Pct <- round(seed_pair_direction_match_pct, 1)
seed_truth_summary$Overlapping_Seeds_Any_Direction_Match_Pct <- round(seed_any_direction_match_pct, 1)
for (i in seq_along(seed_truth_summary)) {
cat(sprintf("- **%s**: %s\n", names(seed_truth_summary)[i], seed_truth_summary[[i]]))
}
score_col <- "score"
ord <- order(seed_df[[score_col]], decreasing = TRUE, na.last = NA)
top_k <- unique(pmin(length(ord), c(100, 500, 1000, 5000, 10000, length(ord))))
top_seed_overlap <- do.call(rbind, lapply(top_k, function(k) {
idx <- ord[seq_len(k)]
n_hit <- sum(seed_overlaps_truth[idx])
data.frame(
`Top Seeds` = k,
`Overlap Pct` = 100 * n_hit / k
)
}))
knitr::kable(
top_seed_overlap,
digits = 2,
caption = sprintf("Ground-truth overlap enrichment among top-scoring seeds")
)
| Top.Seeds | Overlap.Pct |
|---|---|
| 100 | 100.00 |
| 500 | 100.00 |
| 1000 | 100.00 |
| 5000 | 100.00 |
| 10000 | 100.00 |
| 74216 | 95.82 |
if (!file.exists(cment_file)) {
default_cment_njobs <- max(1L, min(8L, parallel::detectCores(logical = TRUE) - 1L))
cment_njobs <- suppressWarnings(as.integer(
Sys.getenv("CMENT_BENCHMARK_NJOBS", as.character(default_cment_njobs))
))
if (!is.finite(cment_njobs) || is.na(cment_njobs) || cment_njobs < 1L) {
cment_njobs <- default_cment_njobs
}
cment_njobs <- max(1L, min(cment_njobs, parallel::detectCores(logical = TRUE)))
options("CMEnt.njobs" = cment_njobs)
options("CMEnt.verbose" = 3)
cat(sprintf("CMEnt benchmark jobs: %d\n", cment_njobs))
start_time <- Sys.time()
beta_handler <- CMEnt::getBetaHandler(
beta = ont_sim_bs
)
gc(verbose = FALSE)
dmrs_cment <- CMEnt::buildDMRs(
beta = beta_handler,
seeds = seeds,
pheno = pheno,
seeds_id_col = "site_id",
sample_group_col = "Sample_Group",
casecontrol_col = "casecontrol",
genome = "hg38",
array = "NULL",
entanglement = "weak",
testing_mode = testing_mode,
max_bridge_seeds_gaps = max_bridge_seeds_gaps,
max_bridge_extension_gaps = max_bridge_extension_gaps,
min_seeds = min_seeds,
min_sites = min_sites/2,
max_pval = max_pval,
ext_site_delta_beta = ext_site_delta_beta,
max_lookup_dist = max_lookup_dist, # Use a larger lookup distance (10k is the default)
annotate_with_genes = FALSE,
extract_motifs = FALSE,
njobs = cment_njobs
)
end_time <- Sys.time()
time_cment <- end_time - start_time
saveRDS(list(dmrs = dmrs_cment, time = time_cment), cment_file)
rm(beta_handler)
colData(ont_sim_bs) <- S4Vectors::DataFrame(pheno, row.names = pheno$Sample_ID)
gc(verbose = FALSE)
} else {
ret <- readRDS(cment_file)
dmrs_cment <- ret$dmrs
time_cment <- ret$time
}
run_dmrcate_bsseq <- function(
bs_obj,
design,
max_sites_per_chunk = 500000L,
chunk_overlap_sites = 2000L
) {
if (!requireNamespace("DMRcate", quietly = TRUE)) {
warning("DMRcate not installed; returning empty set.")
return(GenomicRanges::GRanges())
}
dmrcate_design <- as.matrix(design)
if (nrow(dmrcate_design) != ncol(bs_obj)) {
warning(sprintf(
"DMRcate design rows (%d) do not match BSseq samples (%d); returning empty set.",
nrow(dmrcate_design),
ncol(bs_obj)
))
return(GenomicRanges::GRanges())
}
if (ncol(dmrcate_design) < 2L) {
warning("DMRcate requires at least two groups in the design matrix; returning empty set.")
return(GenomicRanges::GRanges())
}
colnames(dmrcate_design) <- gsub("Sample_Group", "", colnames(dmrcate_design))
colnames(dmrcate_design)[1] <- "Intercept"
dmrcate_design <- edgeR::modelMatrixMeth(dmrcate_design)
coef_names <- colnames(dmrcate_design)
sample_cols <- grepl("^Sample[0-9]+$", coef_names)
intercept_cols <- coef_names %in% c("Intercept", "(Intercept)")
contrast_candidates <- coef_names[!(sample_cols | intercept_cols)]
if (length(contrast_candidates) == 0) {
warning(sprintf(
"DMRcate could not infer a contrast coefficient from methdesign columns (%s); returning empty set.",
paste(coef_names, collapse = ", ")
))
return(GenomicRanges::GRanges())
}
preferred_coef <- c("Condition2_vs_Condition1", "Condition2", "cancer_vs_normal")
selected_coef_name <- preferred_coef[preferred_coef %in% contrast_candidates][1]
if (is.na(selected_coef_name)) {
selected_coef_name <- contrast_candidates[length(contrast_candidates)]
}
selected_coef <- match(selected_coef_name, coef_names)
cat(sprintf(
"DMRcate contrast coefficient: %s (column %d of methdesign)\n",
selected_coef_name,
selected_coef
))
all_gr <- bsseq::granges(bs_obj)
all_seqnames <- as.character(GenomicRanges::seqnames(all_gr))
all_pos <- GenomicRanges::start(all_gr)
rm(all_gr)
gc(verbose = FALSE)
chr_vec <- unique(all_seqnames)
chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]
if (length(chr_vec) == 0) {
warning("No chromosomes found in BSseq object; returning empty set.")
return(GenomicRanges::GRanges())
}
dmrs_by_chr <- vector("list", length(chr_vec))
names(dmrs_by_chr) <- chr_vec
for (chr_name in chr_vec) {
chr_idx <- which(all_seqnames == chr_name)
if (length(chr_idx) == 0) {
next
}
n_chr <- length(chr_idx)
chunk_starts <- seq.int(1L, n_chr, by = as.integer(max_sites_per_chunk))
chr_chunk_dmrs <- vector("list", length(chunk_starts))
for (chunk_i in seq_along(chunk_starts)) {
core_from <- chunk_starts[chunk_i]
core_to <- min(core_from + as.integer(max_sites_per_chunk) - 1L, n_chr)
ext_from <- max(1L, core_from - as.integer(chunk_overlap_sites))
ext_to <- min(n_chr, core_to + as.integer(chunk_overlap_sites))
idx_ext <- chr_idx[ext_from:ext_to]
core_start_pos <- all_pos[chr_idx[core_from]]
core_end_pos <- all_pos[chr_idx[core_to]]
cat(sprintf(
"DMRcate: %s chunk %d/%d (%s sites core, %s sites with overlap)\n",
chr_name,
chunk_i,
length(chunk_starts),
format(core_to - core_from + 1L, big.mark = ","),
format(length(idx_ext), big.mark = ",")
))
chunk_bs <- bs_obj[idx_ext, , drop = FALSE]
if (nrow(chunk_bs) < 3L) {
chr_chunk_dmrs[[chunk_i]] <- GenomicRanges::GRanges()
rm(chunk_bs)
gc(verbose = FALSE)
next
}
chunk_out <- tryCatch(
{
ann <- DMRcate::sequencing.annotate(
obj = chunk_bs,
all.cov = TRUE,
methdesign = dmrcate_design,
coef = selected_coef,
fdr = max_pval
)
dmrcate_res <- DMRcate::dmrcate(
ann,
lambda = 1000,
C = 2,
min.cpgs = min_sites
)
ranges_out <- DMRcate::extractRanges(dmrcate_res, genome = "hg38")
rm(ann, dmrcate_res)
gc(verbose = FALSE)
if (length(ranges_out) > 0) {
mids <- floor((GenomicRanges::start(ranges_out) + GenomicRanges::end(ranges_out)) / 2)
keep <- mids >= core_start_pos & mids <= core_end_pos
ranges_out <- ranges_out[keep]
}
ranges_out
},
error = function(e) {
cat(sprintf(
"DMRcate chunk failed (%s chunk %d/%d, fdr=0.05): %s\n",
chr_name,
chunk_i,
length(chunk_starts),
e$message
))
GenomicRanges::GRanges()
}
)
cat(sprintf(
"DMRcate chunk %d/%d found %d DMRs\n",
chunk_i,
length(chunk_starts),
length(chunk_out)
))
chr_chunk_dmrs[[chunk_i]] <- chunk_out
rm(chunk_bs, chunk_out)
gc(verbose = FALSE)
}
chr_chunk_dmrs <- Filter(function(x) !is.null(x) && length(x) > 0, chr_chunk_dmrs)
if (length(chr_chunk_dmrs) == 0) {
dmrs_by_chr[[chr_name]] <- NULL
} else {
dmrs_by_chr[[chr_name]] <- GenomicRanges::sort(do.call(c, chr_chunk_dmrs))
}
rm(chr_chunk_dmrs)
gc(verbose = FALSE)
}
dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
if (length(dmrs_by_chr) == 0) {
return(GenomicRanges::GRanges())
}
out <- do.call(c, unname(dmrs_by_chr))
out <- GenomicRanges::sort(out)
out
}
if (!file.exists(dmrcate_file)) {
start_time <- Sys.time()
design <- model.matrix(~ Sample_Group, data = colData(ont_sim_bs))
dmrs_dmrcate <- run_dmrcate_bsseq(ont_sim_bs, design)
end_time <- Sys.time()
time_dmrcate <- end_time - start_time
saveRDS(list(dmrs = dmrs_dmrcate, time = time_dmrcate), dmrcate_file)
} else {
ret <- readRDS(dmrcate_file)
dmrs_dmrcate <- ret$dmrs
time_dmrcate <- ret$time
}
run_dmrseq_bsseq <- function(
bs_obj,
chunk = TRUE,
max_sites_per_chunk = 500000L,
chunk_overlap_sites = 2000L
) {
if (!requireNamespace("dmrseq", quietly = TRUE)) {
warning("dmrseq not installed; returning empty set.")
return(GenomicRanges::GRanges())
}
if (!"Sample_Group" %in% colnames(as.data.frame(colData(bs_obj)))) {
warning("Sample_Group not present in BSseq colData; returning empty set.")
return(GenomicRanges::GRanges())
}
all_gr <- bsseq::granges(bs_obj)
all_seqnames <- as.character(GenomicRanges::seqnames(all_gr))
all_pos <- GenomicRanges::start(all_gr)
rm(all_gr)
gc(verbose = FALSE)
chr_vec <- unique(all_seqnames)
chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]
if (length(chr_vec) == 0) {
warning("No chromosomes found in BSseq object; returning empty set.")
return(GenomicRanges::GRanges())
}
dmrs_by_chr <- vector("list", length(chr_vec))
names(dmrs_by_chr) <- chr_vec
bpparam <- CMEnt:::.makeBiocParallelParam(max(1L, min(8L, parallel::detectCores(logical = TRUE) - 1L)), progressbar = TRUE)
.runDMRSeq <- function(chunk_bs, chr_name, chunk_i, core_start_pos = NA_integer_, core_end_pos = NA_integer_) {
chunk_out <- tryCatch(
{
fit <- dmrseq::dmrseq(
bs = chunk_bs,
testCovariate = "Sample_Group",
cutoff = 0.1,
minNumRegion = min_sites,
verbose = FALSE,
BPPARAM = bpparam
)
qvals <- mcols(fit)$qval
if (length(fit) > 0 && any(is.finite(qvals))) {
fit <- fit[is.finite(qvals) & qvals < 0.05]
} else if (length(fit) > 0) {
cat(sprintf(
"dmrseq chunk %s %d/%d returned observed candidates but no finite q-values; keeping candidates without permutation significance.\n",
chr_name,
chunk_i,
length(chunk_starts)
))
}
if (length(fit) > 0 && !is.na(core_start_pos) && !is.na(core_end_pos)) {
mids <- floor((GenomicRanges::start(fit) + GenomicRanges::end(fit)) / 2)
keep <- mids >= core_start_pos & mids <= core_end_pos
fit <- fit[keep]
}
gc(verbose = FALSE)
fit
},
error = function(e) {
cat(sprintf(
"dmrseq chunk failed (%s chunk %d/%d): %s\n",
chr_name,
chunk_i,
length(chunk_starts),
e$message
))
NULL
}
)
chunk_out
}
if (!chunk) {
out <- .runDMRSeq(bs_obj, "", 1, NA_integer_, NA_integer_)
} else {
for (chr_name in chr_vec) {
chr_idx <- which(all_seqnames == chr_name)
if (length(chr_idx) == 0) {
next
}
n_chr <- length(chr_idx)
chunk_starts <- seq.int(1L, n_chr, by = as.integer(max_sites_per_chunk))
chr_chunk_dmrs <- vector("list", length(chunk_starts))
for (chunk_i in seq_along(chunk_starts)) {
core_from <- chunk_starts[chunk_i]
core_to <- min(core_from + as.integer(max_sites_per_chunk) - 1L, n_chr)
ext_from <- max(1L, core_from - as.integer(chunk_overlap_sites))
ext_to <- min(n_chr, core_to + as.integer(chunk_overlap_sites))
idx_ext <- chr_idx[ext_from:ext_to]
core_start_pos <- all_pos[chr_idx[core_from]]
core_end_pos <- all_pos[chr_idx[core_to]]
cat(sprintf(
"dmrseq: %s chunk %d/%d (%s sites core, %s sites with overlap)\n",
chr_name,
chunk_i,
length(chunk_starts),
format(core_to - core_from + 1L, big.mark = ","),
format(length(idx_ext), big.mark = ",")
))
chunk_bs <- bs_obj[idx_ext, , drop = FALSE]
chr_chunk_dmrs[[chunk_i]] <- .runDMRSeq(chunk_bs, chr_name, chunk_i, core_start_pos, core_end_pos)
rm(chunk_bs)
gc(verbose = FALSE)
}
chr_chunk_dmrs <- Filter(function(x) !is.null(x) && length(x) > 0, chr_chunk_dmrs)
if (length(chr_chunk_dmrs) == 0) {
dmrs_by_chr[[chr_name]] <- NULL
} else {
dmrs_by_chr[[chr_name]] <- GenomicRanges::sort(do.call(c, chr_chunk_dmrs))
}
rm(chr_chunk_dmrs)
gc(verbose = FALSE)
}
dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
if (length(dmrs_by_chr) == 0) {
return(GenomicRanges::GRanges())
}
out <- do.call(c, unname(dmrs_by_chr))
}
out <- GenomicRanges::sort(out)
out
}
if (!file.exists(dmrseq_file)) {
start_time <- Sys.time()
dmrs_dmrseq <- run_dmrseq_bsseq(ont_sim_bs)
end_time <- Sys.time()
time_dmrseq <- end_time - start_time
saveRDS(list(dmrs = dmrs_dmrseq, time = time_dmrseq), dmrseq_file)
} else {
ret <- readRDS(dmrseq_file)
dmrs_dmrseq <- ret$dmrs
time_dmrseq <- ret$time
}
run_hummingbird_bsseq <- function(
bs_obj,
max_gap = 300L, # default
min_sites = 3L, # default is 10
min_length = 50L # default is 500
) {
sample_df <- as.data.frame(SummarizedExperiment::colData(bs_obj))
if (!"Sample_Group" %in% colnames(sample_df)) {
warning("Sample_Group not present in BSseq colData; returning empty set.")
return(GenomicRanges::GPos())
}
ctrl_idx <- which(sample_df$Sample_Group == "Condition1")
case_idx <- which(sample_df$Sample_Group == "Condition2")
if (length(ctrl_idx) == 0 || length(case_idx) == 0) {
warning("Need both Condition1 and Condition2 samples for hummingbird.")
return(GenomicRanges::GPos())
}
row_gr <- bsseq::granges(bs_obj)
# granges to gpos for hummingbird
row_gr <- GPos(
seqnames = GenomicRanges::seqnames(row_gr),
pos = GenomicRanges::start(row_gr)
)
to_count_matrix <- function(x) {
x <- as.matrix(x)
x[!is.finite(x)] <- 0
x[x < 0] <- 0
storage.mode(x) <- "integer"
x
}
all_seqnames <- as.character(GenomicRanges::seqnames(row_gr))
chr_vec <- unique(all_seqnames)
chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]
if (length(chr_vec) == 0) {
warning("No chromosomes found in BSseq object; returning empty set.")
return(GenomicRanges::GPos())
}
dmrs_by_chr <- vector("list", length(chr_vec))
names(dmrs_by_chr) <- chr_vec
for (chr_name in chr_vec) {
chr_idx <- which(all_seqnames == chr_name)
if (length(chr_idx) < as.integer(min_sites)) {
next
}
chr_gr <- row_gr[chr_idx]
chr_bs <- bs_obj[chr_idx, , drop = FALSE]
meth_mat <- bsseq::getCoverage(chr_bs, type = "M")
cov_mat <- bsseq::getCoverage(chr_bs, type = "Cov")
unmeth_mat <- cov_mat - meth_mat
control_se <- SummarizedExperiment::SummarizedExperiment(
rowRanges = chr_gr,
assays = list(
normM = to_count_matrix(meth_mat[, ctrl_idx, drop = FALSE]),
normUM = to_count_matrix(unmeth_mat[, ctrl_idx, drop = FALSE])
)
)
case_se <- SummarizedExperiment::SummarizedExperiment(
rowRanges = chr_gr,
assays = list(
abnormM = to_count_matrix(meth_mat[, case_idx, drop = FALSE]),
abnormUM = to_count_matrix(unmeth_mat[, case_idx, drop = FALSE])
)
)
chr_out <- tryCatch(
{
hb_fit <- hummingbird::hummingbirdEM(experimentInfoControl = control_se, experimentInfoCase = case_se, binSize = 40)
hb_fit <- hummingbird::hummingbirdPostAdjustment(
emInfo = hb_fit,
experimentInfoControl = control_se,
experimentInfoCase = case_se,
maxGap = as.integer(max_gap),
minCpGs = as.integer(min_sites),
minLength = as.integer(min_length)
)
hb_fit$DMRs
},
error = function(e) {
cat(sprintf("hummingbird failed (%s): %s\n", chr_name, e$message))
NULL
}
)
dmrs_by_chr[[chr_name]] <- chr_out
rm(chr_bs, chr_gr, meth_mat, cov_mat, unmeth_mat, control_se, case_se, chr_out)
gc(verbose = FALSE)
}
dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
if (length(dmrs_by_chr) == 0) {
return(GenomicRanges::GPos())
}
out <- suppressWarnings(do.call(c, unname(dmrs_by_chr)))
GenomicRanges::sort(out)
}
if (!file.exists(hummingbird_file)) {
start_time <- Sys.time()
dmrs_hummingbird <- run_hummingbird_bsseq(ont_sim_bs, min_sites = min_sites, max_gap = 300L)
end_time <- Sys.time()
time_hummingbird <- end_time - start_time
saveRDS(list(dmrs = dmrs_hummingbird, time = time_hummingbird), hummingbird_file)
} else {
ret <- readRDS(hummingbird_file)
dmrs_hummingbird <- ret$dmrs
time_hummingbird <- ret$time
}
cat(sprintf("**CMEnt**: Found %d DMRs in %d seconds\n", length(dmrs_cment), round(as.numeric(time_cment, units = "secs"))))
CMEnt: Found 1051 DMRs in 606 seconds
cat(sprintf("**DMRcate**: Found %d DMRs in %d seconds\n", length(dmrs_dmrcate), round(as.numeric(time_dmrcate, units = "secs"))))
DMRcate: Found 445 DMRs in 243 seconds
cat(sprintf("**dmrseq**: Found %d DMRs in %d seconds\n", length(dmrs_dmrseq), round(as.numeric(time_dmrseq, units = "secs"))))
dmrseq: Found 800 DMRs in 3636 seconds
cat(sprintf("**hummingbird**: Found %d DMRs in %d seconds\n", length(dmrs_hummingbird), round(as.numeric(time_hummingbird, units = "secs"))))
hummingbird: Found 549 DMRs in 45 seconds
methods_list <- list(
CMEnt = dmrs_cment,
DMRcate = dmrs_dmrcate,
dmrseq = dmrs_dmrseq,
hummingbird = dmrs_hummingbird
)
method_order <- names(methods_list)
get_dmr_stats <- function(dmrs, method) {
if (length(dmrs) == 0) {
return(data.frame(
Method = method,
`Number of DMRs` = 0,
`Mean DMR Width` = NA_real_,
`Median DMR Width` = NA_real_,
`Total Coverage bp` = 0,
`Min Width` = NA_real_,
`Max Width` = NA_real_
))
}
data.frame(
Method = method,
`Number of DMRs` = length(dmrs),
`Mean DMR Width` = mean(width(dmrs)),
`Median DMR Width` = median(width(dmrs)),
`Total Coverage bp` = sum(width(dmrs)),
`Min Width` = min(width(dmrs)),
`Max Width` = max(width(dmrs))
)
}
comparison_stats <- do.call(rbind, lapply(names(methods_list), function(m) {
get_dmr_stats(methods_list[[m]], m)
}))
truth_stats <- get_dmr_stats(truth_dmrs, "Ground Truth")
comparison_stats <- rbind(comparison_stats, truth_stats)
comparison_stats$Method <- factor(
comparison_stats$Method,
levels = c(method_order, "Ground Truth")
)
comparison_stats <- comparison_stats[order(comparison_stats$Method), , drop = FALSE]
comparison_stats$Method <- as.character(comparison_stats$Method)
knitr::kable(comparison_stats, digits = 2, caption = "Basic DMR Statistics")
| Method | Number.of.DMRs | Mean.DMR.Width | Median.DMR.Width | Total.Coverage.bp | Min.Width | Max.Width |
|---|---|---|---|---|---|---|
| CMEnt | 1051 | 5448.85 | 4610.0 | 5726738 | 227 | 21553 |
| DMRcate | 445 | 4575.96 | 3835.0 | 2036302 | 783 | 14431 |
| dmrseq | 800 | 4754.24 | 3923.0 | 3803389 | 694 | 20771 |
| hummingbird | 549 | 2901.64 | 2520.0 | 1593000 | 400 | 13680 |
| Ground Truth | 1000 | 5335.16 | 4305.5 | 5335161 | 694 | 22892 |
evaluate_against_truth <- function(pred, truth) {
n_pred <- length(pred)
n_truth <- length(truth)
pred_overlap_truth <- sum(overlapsAny(pred, truth))
truth_overlap_pred <- sum(overlapsAny(truth, pred))
fp <- n_pred - pred_overlap_truth
fn <- n_truth - truth_overlap_pred
precision <- ifelse(n_pred == 0, 0, pred_overlap_truth / n_pred)
recall <- ifelse(n_truth == 0, 0, truth_overlap_pred / n_truth)
f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
pred_red <- reduce(pred)
truth_red <- reduce(truth)
inter <- GenomicRanges::intersect(pred_red, truth_red)
pred_bp <- sum(width(pred_red))
truth_bp <- sum(width(truth_red))
inter_bp <- ifelse(length(inter) == 0, 0, sum(width(inter)))
union_bp <- pred_bp + truth_bp - inter_bp
precision_bp <- ifelse(pred_bp == 0, 0, inter_bp / pred_bp)
recall_bp <- ifelse(truth_bp == 0, 0, inter_bp / truth_bp)
f1_bp <- ifelse(precision_bp + recall_bp == 0, 0, 2 * precision_bp * recall_bp / (precision_bp + recall_bp))
iou_bp <- ifelse(union_bp == 0, 0, inter_bp / union_bp)
data.frame(
TP_regions = truth_overlap_pred,
FP_regions = fp,
FN_regions = fn,
Precision_regions = precision,
Recall_regions = recall,
F1_regions = f1,
Predicted_bp = pred_bp,
Truth_bp = truth_bp,
Intersect_bp = inter_bp,
Precision_bp = precision_bp,
Recall_bp = recall_bp,
F1_bp = f1_bp,
IoU_bp = iou_bp
)
}
truth_eval <- do.call(rbind, lapply(names(methods_list), function(method) {
out <- evaluate_against_truth(methods_list[[method]], truth_dmrs)
out$Method <- method
out
}))
truth_eval$Method <- factor(truth_eval$Method, levels = method_order)
truth_eval <- truth_eval[order(truth_eval$Method), , drop = FALSE]
truth_eval$Method <- as.character(truth_eval$Method)
truth_eval <- truth_eval |>
dplyr::select(
Method,
TP_regions,
FP_regions, FN_regions,
Precision_regions, Recall_regions, F1_regions,
Precision_bp, Recall_bp, F1_bp, IoU_bp
)
regions_truth_eval <- truth_eval |>
dplyr::select(Method, TP_regions, FP_regions, FN_regions, Precision_regions, Recall_regions, F1_regions) |>
dplyr::rename_with(~ sub("_regions$", "", .), ends_with("_regions"))
bp_truth_eval <- truth_eval |>
dplyr::select(Method, Precision_bp, Recall_bp, F1_bp, IoU_bp) |>
dplyr::rename_with(~ sub("_bp$", "", .), ends_with("_bp"))
knitr::kable(regions_truth_eval, digits = 4, caption = "Macro (Region-level) Performance Against Ground Truth")
| Method | TP | FP | FN | Precision | Recall | F1 |
|---|---|---|---|---|---|---|
| CMEnt | 994 | 20 | 6 | 0.981 | 0.994 | 0.9874 |
| DMRcate | 439 | 0 | 561 | 1.000 | 0.439 | 0.6101 |
| dmrseq | 800 | 0 | 200 | 1.000 | 0.800 | 0.8889 |
| hummingbird | 408 | 0 | 592 | 1.000 | 0.408 | 0.5795 |
knitr::kable(bp_truth_eval, digits = 4, caption = "Micro (Base-pair-level) Performance Against Ground Truth")
| Method | Precision | Recall | F1 | IoU |
|---|---|---|---|---|
| CMEnt | 0.8938 | 0.9594 | 0.9254 | 0.8611 |
| DMRcate | 0.9972 | 0.3806 | 0.5509 | 0.3802 |
| dmrseq | 0.9938 | 0.7085 | 0.8272 | 0.7053 |
| hummingbird | 0.9980 | 0.2980 | 0.4590 | 0.2978 |
all_sets <- c(methods_list, list(`Ground Truth` = truth_dmrs))
calculate_overlap <- function(gr1, gr2) {
if (length(gr1) == 0 || length(gr2) == 0) {
return(list(count = 0, percentage = 0))
}
overlap_count <- sum(overlapsAny(gr1, gr2))
pct <- overlap_count / length(gr1) * 100
list(count = overlap_count, percentage = pct)
}
overlap_count_mat <- matrix(0, nrow = length(all_sets), ncol = length(all_sets))
overlap_pct_mat <- matrix(0, nrow = length(all_sets), ncol = length(all_sets))
rownames(overlap_count_mat) <- colnames(overlap_count_mat) <- names(all_sets)
rownames(overlap_pct_mat) <- colnames(overlap_pct_mat) <- names(all_sets)
for (i in seq_along(all_sets)) {
for (j in seq_along(all_sets)) {
if (i == j) {
overlap_count_mat[i, j] <- length(all_sets[[i]])
overlap_pct_mat[i, j] <- 100
} else {
ov <- calculate_overlap(all_sets[[i]], all_sets[[j]])
overlap_count_mat[i, j] <- ov$count
overlap_pct_mat[i, j] <- ov$percentage
}
}
}
knitr::kable(overlap_count_mat, digits = 0, caption = "Overlap Counts (query=row, reference=column)")
| CMEnt | DMRcate | dmrseq | hummingbird | Ground Truth | |
|---|---|---|---|---|---|
| CMEnt | 1051 | 446 | 829 | 441 | 1031 |
| DMRcate | 445 | 445 | 443 | 322 | 445 |
| dmrseq | 800 | 442 | 800 | 415 | 800 |
| hummingbird | 549 | 432 | 544 | 549 | 549 |
| Ground Truth | 994 | 439 | 800 | 408 | 1000 |
knitr::kable(overlap_pct_mat, digits = 2, caption = "Overlap Percentages (query=row, reference=column)")
| CMEnt | DMRcate | dmrseq | hummingbird | Ground Truth | |
|---|---|---|---|---|---|
| CMEnt | 100.0 | 42.44 | 78.88 | 41.96 | 98.1 |
| DMRcate | 100.0 | 100.00 | 99.55 | 72.36 | 100.0 |
| dmrseq | 100.0 | 55.25 | 100.00 | 51.88 | 100.0 |
| hummingbird | 100.0 | 78.69 | 99.09 | 100.00 | 100.0 |
| Ground Truth | 99.4 | 43.90 | 80.00 | 40.80 | 100.0 |
Assessing seed-space recoverability and seed capture efficiency.
Metrics explanation:
Metrics values:
seed_gr <- sort(seed_gr)
mcols(seed_gr)$index <- seq_along(seed_gr)
seeds_in_truth <- subsetByOverlaps(seed_gr, truth_dmrs, ignore.strand = TRUE)
if (length(dmrs_cment) > 0) {
seeds_in_dmrs <- subsetByOverlaps(seed_gr, dmrs_cment, ignore.strand = TRUE)
seeds_in_truth_and_dmrs <- subsetByOverlaps(seeds_in_truth, dmrs_cment, ignore.strand = TRUE)
seeds_in_truth_not_dmrs <- subsetByOverlaps(seeds_in_truth, dmrs_cment, ignore.strand = TRUE, invert = TRUE)
} else {
seeds_in_dmrs <- seed_gr[0]
seeds_in_truth_and_dmrs <- seed_gr[0]
seeds_in_truth_not_dmrs <- seeds_in_truth
}
seed_truth_ov <- findOverlaps(seeds_in_truth, truth_dmrs, ignore.strand = TRUE)
truth_seed_total <- tabulate(subjectHits(seed_truth_ov), nbins = length(truth_dmrs))
seed_truth_hit_ov <- findOverlaps(seeds_in_truth_and_dmrs, truth_dmrs, ignore.strand = TRUE)
truth_seed_captured <- tabulate(subjectHits(seed_truth_hit_ov), nbins = length(truth_dmrs))
truth_seed_capture_pct <- ifelse(truth_seed_total > 0, round(100 * truth_seed_captured / truth_seed_total, 1), NA_real_)
recoverable_truth <- truth_seed_total >= min_seeds
recoverable_with_captured <- recoverable_truth & truth_seed_captured >= min_seeds
recoverable_fully_captured <- recoverable_truth & truth_seed_captured == truth_seed_total
stage1_quality <- data.frame(
Truth_DMRs_Total = length(truth_dmrs),
Truth_DMRs_With_Seeds = sum(truth_seed_total >= 1L),
Truth_DMRs_Stage1Recoverable_MinSeeds = sum(recoverable_truth),
Truth_DMRs_Stage1Recoverable_WithMinCapturedSeeds = sum(recoverable_with_captured),
Truth_DMRs_Stage1Recoverable_FullyCapturedSeeds = sum(recoverable_fully_captured),
Seed_Capture_Rate_Pct_in_Truth = ifelse(length(seeds_in_truth) == 0, NA_real_, round(100 * length(seeds_in_truth_and_dmrs) / length(seeds_in_truth), 1)),
Median_Seed_Capture_Pct_Per_Truth = ifelse(sum(truth_seed_total > 0) == 0, NA_real_, stats::median(truth_seed_capture_pct[truth_seed_total > 0], na.rm = TRUE))
)
for (col in colnames(stage1_quality)) {
cat(sprintf("- **%s**: %s\n", col, stage1_quality[[col]]))
}
# Consecutive missed-seed runs are computed per truth DMR using original seed index.
ov_truth <- findOverlaps(seeds_in_truth_not_dmrs, truth_dmrs, ignore.strand = TRUE)
qh_by_truth <- split(queryHits(ov_truth), subjectHits(ov_truth))
run_df <- do.call(rbind, lapply(names(qh_by_truth), function(truth_id_chr) {
truth_id <- as.integer(truth_id_chr)
qh <- qh_by_truth[[truth_id_chr]]
idx <- mcols(seeds_in_truth_not_dmrs)$index[qh]
db <- as.numeric(mcols(seeds_in_truth_not_dmrs)$delta_beta[qh])
ord <- order(idx)
idx <- idx[ord]
db <- db[ord]
run_id <- cumsum(c(TRUE, diff(idx) != 1L))
data.frame(
Truth_DMR_Index = truth_id,
run_length = as.integer(tabulate(run_id)),
mean_delta_beta = as.numeric(tapply(db, run_id, mean, na.rm = TRUE))
)
}))
run_count <- as.data.frame(table(run_df$run_length))
colnames(run_count) <- c("Consecutive_Seed_Length", "Count")
run_delta <- aggregate(mean_delta_beta ~ run_length, data = run_df, FUN = mean, na.rm = TRUE)
colnames(run_delta) <- c("Consecutive_Seed_Length", "Mean_Delta_Beta")
run_summary <- merge(run_count, run_delta, by = "Consecutive_Seed_Length", all.x = TRUE)
max_run_per_truth <- aggregate(run_length ~ Truth_DMR_Index, data = run_df, FUN = max)
run_overview <- data.frame(
Truth_DMRs_With_Missed_Seeds = nrow(max_run_per_truth),
Median_Max_Missed_Run_Per_Truth = stats::median(max_run_per_truth$run_length),
Max_Missed_Run_Observed = max(max_run_per_truth$run_length)
)
for (col in colnames(run_overview)) {
cat(sprintf("- **%s**: %s\n", col, run_overview[[col]]))
}
Assessing truth-side recovery and prediction-side fragmentation/purity after extension and merging. Metrics explanation:
Metrics values:
truth_to_pred_hits <- findOverlaps(truth_dmrs, dmrs_cment, ignore.strand = TRUE)
preds_per_truth <- tabulate(queryHits(truth_to_pred_hits), nbins = length(truth_dmrs))
truth_overlap_bp <- numeric(length(truth_dmrs))
if (length(truth_to_pred_hits) > 0) {
truth_intersections <- pintersect(
truth_dmrs[queryHits(truth_to_pred_hits)],
dmrs_cment[subjectHits(truth_to_pred_hits)]
)
truth_intersections_by_truth <- split(truth_intersections, queryHits(truth_to_pred_hits))
truth_overlap_bp[as.integer(names(truth_intersections_by_truth))] <- vapply(
truth_intersections_by_truth,
function(gr) sum(width(reduce(gr))),
numeric(1)
)
}
truth_bp_recall <- ifelse(width(truth_dmrs) > 0, truth_overlap_bp / width(truth_dmrs), 0)
pred_to_truth_hits <- findOverlaps(dmrs_cment, truth_dmrs, ignore.strand = TRUE)
pred_overlap_bp <- numeric(length(dmrs_cment))
if (length(pred_to_truth_hits) > 0) {
pred_intersections <- pintersect(
dmrs_cment[queryHits(pred_to_truth_hits)],
truth_dmrs[subjectHits(pred_to_truth_hits)]
)
pred_intersections_by_pred <- split(pred_intersections, queryHits(pred_to_truth_hits))
pred_overlap_bp[as.integer(names(pred_intersections_by_pred))] <- vapply(
pred_intersections_by_pred,
function(gr) sum(width(reduce(gr))),
numeric(1)
)
}
pred_bp_precision <- ifelse(width(dmrs_cment) > 0, pred_overlap_bp / width(dmrs_cment), 0)
stage24_quality <- data.frame(
Pred_DMRs_Total = length(dmrs_cment),
Pred_DMRs_Overlapping_Truth = sum(overlapsAny(dmrs_cment, truth_dmrs)),
Pred_DMRs_BPPrecision_ge50pct = sum(pred_bp_precision >= 0.5),
Pred_DMRs_BPPrecision_ge80pct = sum(pred_bp_precision >= 0.8),
Truth_DMRs_Total = length(truth_dmrs),
Truth_DMRs_Hit = sum(preds_per_truth > 0),
Truth_DMRs_BPRecall_ge50pct = sum(truth_bp_recall >= 0.5),
Truth_DMRs_BPRecall_ge80pct = sum(truth_bp_recall >= 0.8),
Median_Truth_BPRecall_Pct_Hit = ifelse(sum(preds_per_truth > 0) == 0, 0, 100 * stats::median(truth_bp_recall[preds_per_truth > 0])),
Median_Pred_BPPrecision_Pct_Overlap = ifelse(sum(pred_bp_precision > 0) == 0, 0, 100 * stats::median(pred_bp_precision[pred_bp_precision > 0])),
Truth_DMRs_With_Multiple_Pred = sum(preds_per_truth >= 2),
Median_Preds_Per_Hit_Truth = ifelse(sum(preds_per_truth > 0) == 0, 0, stats::median(preds_per_truth[preds_per_truth > 0])),
Max_Preds_On_Truth = ifelse(length(preds_per_truth) == 0, 0, max(preds_per_truth))
)
for (col in names(stage24_quality)) {
cat(sprintf("- **%s**: %s\n", col, stage24_quality[[col]]))
}
plot_df <- truth_eval |>
dplyr::select(Method, F1_regions, F1_bp, IoU_bp) |>
tidyr::pivot_longer(
cols = c(F1_regions, F1_bp, IoU_bp),
names_to = "Metric",
values_to = "Value"
)
plot_df$Metric <- factor(
plot_df$Metric,
levels = c("F1_regions", "F1_bp", "IoU_bp"),
labels = c("F1 (Region)", "F1 (Base-pair)", "IoU (Base-pair)")
)
plot_df$Method <- factor(plot_df$Method, levels = method_order)
ggplot(plot_df, aes(x = Method, y = Value, fill = Method)) +
geom_col() +
facet_wrap(~Metric, nrow = 1) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
) +
ylim(0, 1) +
labs(
title = "Method Performance vs Ground Truth",
x = "",
y = "Score"
)
width_data <- do.call(rbind, lapply(names(all_sets), function(method) {
x <- all_sets[[method]]
if (length(x) == 0) {
return(NULL)
}
data.frame(Method = method, Width = width(x))
}))
width_data$Method <- factor(width_data$Method, levels = names(all_sets))
ggplot(width_data, aes(x = Method, y = Width, fill = Method)) +
geom_violin(alpha = 0.7) +
geom_boxplot(width = 0.2, alpha = 0.5, outlier.size = 0.5) +
scale_y_log10(labels = scales::comma) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
) +
labs(
title = "DMR Width Distributions",
x = "",
y = "Width (bp, log scale)"
)
cat("\n## Key Findings\n\n")
if (nrow(truth_eval) > 0) {
best_f1 <- truth_eval[which.max(truth_eval$F1_regions), ]
best_iou <- truth_eval[which.max(truth_eval$IoU_bp), ]
cat(sprintf(
"- Best region-level F1: **%s** (%.3f)\n",
best_f1$Method, best_f1$F1_regions
))
cat(sprintf(
"- Best base-pair IoU: **%s** (%.3f)\n",
best_iou$Method, best_iou$IoU_bp
))
if ("CMEnt" %in% truth_eval$Method) {
seg <- truth_eval[truth_eval$Method == "CMEnt", ]
cat(sprintf(
"- **CMEnt** region metrics: Precision = %.3f, Recall = %.3f, F1 = %.3f\n",
seg$Precision_regions, seg$Recall_regions, seg$F1_regions
))
cat(sprintf(
"- **CMEnt** base-pair metrics: Precision = %.3f, Recall = %.3f, F1 = %.3f, IoU = %.3f\n",
seg$Precision_bp, seg$Recall_bp, seg$F1_bp, seg$IoU_bp
))
}
}
sessionInfo()
R version 4.6.0 (2026-04-24) Platform: x86_64-pc-linux-gnu Running under: Ubuntu 24.04.4 LTS
Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Brussels tzcode source: system (glibc)
attached base packages:
[1] parallel stats4 stats graphics grDevices datasets utils
[8] methods base
other attached packages:
[1] bsseq_1.48.0 CMEnt_0.99.0
[3] dplyr_1.2.1 ggplot2_4.0.3
[5] data.table_1.18.4 minfi_1.58.0
[7] bumphunter_1.54.0 locfit_1.5-9.12
[9] iterators_1.0.14 foreach_1.5.2
[11] Biostrings_2.80.1 XVector_0.52.0
[13] SummarizedExperiment_1.42.0 Biobase_2.72.0
[15] MatrixGenerics_1.24.0 matrixStats_1.5.0
[17] GenomicRanges_1.64.0 Seqinfo_1.2.0
[19] IRanges_2.46.0 S4Vectors_0.50.1
[21] BiocGenerics_0.58.1 generics_0.1.4
[23] testthat_3.3.2 BiocStyle_2.40.0
loaded via a namespace (and not attached):
[1] igraph_2.3.2
[2] plotly_4.12.0
[3] ChAMPdata_2.44.0
[4] Formula_1.2-5
[5] devtools_2.5.2
[6] tidyselect_1.2.1
[7] bit_4.6.0
[8] doParallel_1.0.17
[9] clue_0.3-68
[10] lattice_0.22-9
[11] rjson_0.2.23
[12] nor1mix_1.3-3
[13] blob_1.3.0
[14] stringr_1.6.0
[15] rngtools_1.5.2
[16] S4Arrays_1.12.0
[17] base64_2.0.2
[18] dichromat_2.0-0.1
[19] scrime_1.3.7
[20] seqLogo_1.78.0
[21] png_0.1-9
[22] tinytex_0.59
[23] cli_3.6.6
[24] marray_1.90.0
[25] bedr_1.1.5
[26] outliers_0.15
[27] ProtGenerics_1.44.0
[28] askpass_1.2.1
[29] multtest_2.68.0
[30] openssl_2.4.2
[31] JADE_2.0-4
[32] nleqslv_3.3.7
[33] pkgdown_2.2.0
[34] BiocIO_1.22.0
[35] purrr_1.2.2
[36] dendextend_1.19.1
[37] curl_7.1.0
[38] mime_0.13
[39] evaluate_1.0.5
[40] wateRmelon_2.18.0
[41] ComplexHeatmap_2.28.0
[42] stringi_1.8.7
[43] backports_1.5.1
[44] desc_1.4.3
[45] XML_3.99-0.23
[46] httpuv_1.6.17
[47] AnnotationDbi_1.74.0
[48] magrittr_2.0.5
[49] rappdirs_0.3.4
[50] splines_4.6.0
[51] mclust_6.1.2
[52] DelayedDataFrame_1.28.0
[53] BiasedUrn_2.0.12
[54] jpeg_0.1-11
[55] doRNG_1.8.6.3
[56] rentrez_1.2.4
[57] org.Hs.eg.db_3.23.1
[58] IlluminaHumanMethylation450kmanifest_0.4.0
[59] DT_0.34.0
[60] sessioninfo_1.2.4
[61] DBI_1.3.0
[62] HDF5Array_1.40.0
[63] jquerylib_0.1.4
[64] genefilter_1.94.0
[65] withr_3.0.2
[66] class_7.3-23
[67] rprojroot_2.1.1
[68] brio_1.1.5
[69] sva_3.60.0
[70] isva_1.10
[71] formatR_1.14
[72] rtracklayer_1.72.0
[73] BiocManager_1.30.27
[74] Illumina450ProbeVariants.db_1.48.0
[75] htmlwidgets_1.6.4
[76] fs_2.1.0
[77] biomaRt_2.68.0
[78] missMethyl_1.46.0
[79] labeling_0.4.3
[80] SparseArray_1.12.2
[81] shinycssloaders_1.1.0
[82] h5mread_1.4.0
[83] annotate_1.90.0
[84] VariantAnnotation_1.58.0
[85] knitr_1.51
[86] TFBSTools_1.50.0
[87] UCSC.utils_1.8.0
[88] beanplot_1.3.1
[89] TFMPvalue_1.0.0
[90] ChAMP_2.42.0
[91] annotatr_1.38.0
[92] caTools_1.18.3
[93] grid_4.6.0
[94] rhdf5_2.56.0
[95] pwalign_1.8.0
[96] R.oo_1.27.1
[97] regioneR_1.44.0
[98] ellipsis_0.3.3
[99] lazyeval_0.2.3
[100] yaml_2.3.12
[101] survival_3.8-6
[102] RPMM_1.25
[103] BiocVersion_3.23.1
[104] crayon_1.5.3
[105] RColorBrewer_1.1-3
[106] tidyr_1.3.2
[107] later_1.4.8
[108] codetools_0.2-20
[109] base64enc_0.1-6
[110] GlobalOptions_0.1.4
[111] KEGGREST_1.52.0
[112] shape_1.4.6.1
[113] fastICA_1.2-7
[114] limma_3.68.4
[115] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
[116] Rsamtools_2.28.0
[117] filelock_1.0.3
[118] showtext_0.9-8
[119] foreign_0.8-90
[120] pkgconfig_2.0.3
[121] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1
[122] xml2_1.5.2
[123] GenomicAlignments_1.48.0
[124] hummingbird_1.22.0
[125] BSgenome_1.80.0
[126] viridisLite_0.4.3
[127] biovizBase_1.60.0
[128] gridBase_0.4-7
[129] xtable_1.8-8
[130] interp_1.1-6
[131] lumi_2.64.0
[132] plyr_1.8.9
[133] httr_1.4.8
[134] tools_4.6.0
[135] pkgbuild_1.4.8
[136] htmlTable_2.5.0
[137] checkmate_2.3.4
[138] nlme_3.1-168
[139] affy_1.90.0
[140] futile.logger_1.4.9
[141] lambda.r_1.2.4
[142] dbplyr_2.5.2
[143] ExperimentHub_3.2.0
[144] IlluminaHumanMethylationEPICmanifest_0.3.0
[145] digest_0.6.39
[146] permute_0.9-10
[147] bookdown_0.46
[148] Matrix_1.7-5
[149] farver_2.1.2
[150] tzdb_0.5.0
[151] AnnotationFilter_1.36.0
[152] reshape2_1.4.5
[153] viridis_0.6.5
[154] DirichletMultinomial_1.54.0
[155] rpart_4.1.27
[156] glue_1.8.1
[157] cachem_1.1.0
[158] bspm_0.5.8
[159] VennDiagram_1.8.2
[160] BiocFileCache_3.2.0
[161] methylumi_2.58.0
[162] Hmisc_5.2-5
[163] txdbmaker_1.8.0
[164] sysfonts_0.8.9
[165] pkgload_1.5.2
[166] statmod_1.5.2
[167] impute_1.86.0
[168] GEOquery_2.80.0
[169] httr2_1.2.2
[170] showtextdb_3.0
[171] dmrseq_1.32.0
[172] siggenes_1.86.0
[173] gtools_3.9.5
[174] preprocessCore_1.74.0
[175] affyio_1.82.0
[176] gridExtra_2.3
[177] shiny_1.13.0
[178] R.utils_2.13.0
[179] rhdf5filters_1.24.0
[180] RCurl_1.98-1.19
[181] memoise_2.0.1
[182] rmarkdown_2.31
[183] scales_1.4.0
[184] R.methodsS3_1.8.2
[185] reshape_0.8.10
[186] illuminaio_0.54.0
[187] rstudioapi_0.18.0
[188] cluster_2.1.8.2
[189] ROC_1.88.0
[190] hms_1.1.4
[191] globaltest_5.66.0
[192] DMRcate_3.8.0
[193] colorspace_2.1-2
[194] FNN_1.1.4.1
[195] rlang_1.2.0
[196] quadprog_1.5-8
[197] GenomeInfoDb_1.48.0
[198] DelayedMatrixStats_1.34.0
[199] sparseMatrixStats_1.24.0
[200] shinythemes_1.2.0
[201] circlize_0.4.18
[202] mgcv_1.9-4
[203] xfun_0.58
[204] ggseqlogo_0.2.2
[205] e1071_1.7-17
[206] abind_1.4-8
[207] tibble_3.3.1
[208] Rhdf5lib_2.0.0
[209] readr_2.2.0
[210] futile.options_1.0.1
[211] bitops_1.0-9
[212] ps_1.9.3
[213] promises_1.5.0
[214] inline_0.3.21
[215] RSQLite_3.53.1
[216] qvalue_2.44.0
[217] DelayedArray_0.38.2
[218] proxy_0.4-29
[219] GO.db_3.23.1
[220] compiler_4.6.0
[221] prettyunits_1.2.0
[222] beachmat_2.28.0
[223] Rcpp_1.1.1-1.1
[224] DNAcopy_1.86.0
[225] edgeR_4.10.1
[226] AnnotationHub_4.2.0
[227] usethis_3.2.1
[228] MASS_7.3-65
[229] progress_1.2.3
[230] BiocParallel_1.46.0
[231] R6_2.6.1
[232] fastmap_1.2.0
[233] ensembldb_2.36.1
[234] nnet_7.3-20
[235] gtable_0.3.6
[236] KernSmooth_2.23-26
[237] strex_2.0.1
[238] latticeExtra_0.6-31
[239] deldir_2.0-4
[240] htmltools_0.5.9
[241] bit64_4.8.2
[242] lifecycle_1.0.5
[243] S7_0.2.2
[244] processx_3.9.0
[245] callr_3.8.0
[246] Gviz_1.56.0
[247] restfulr_0.0.16
[248] sass_0.4.10
[249] vctrs_0.7.3
[250] goseq_1.64.0
[251] bslib_0.11.0
[252] pillar_1.11.1
[253] GenomicFeatures_1.64.0
[254] magick_2.9.1
[255] otel_0.2.0
[256] combinat_0.0-8
[257] geneLenDataBase_1.48.0
[258] jsonlite_2.0.0
[259] cigarillo_1.2.0
[260] GetoptLong_1.1.1