Contents

0.1 Introduction

This vignette cached_results DMR detection on simulated ONT-style methylation data with known ground truth. The dataset used is a simulated ONT dataset, based on an in-house dataset of 5 samples from neuroendocrine islets, augmented to 20 samples using CMEnt::augmentBSSeq, subset to chromosomes 20 and 21, and with 1000 DMRs spiked in using CMEnt::simulateDMRs, with max delta beta (delta_max) set to 0.3 and expected within-dmr correlation (expected_corr) set to 0.7.

Methods:

Primary goal:

0.2 Setup and Dependencies

try(devtools::load_all(), silent = TRUE)
suppressPackageStartupMessages({
    try(library(CMEnt), silent = TRUE)
    library(bsseq)
    library(data.table)
    library(ggplot2)
    library(dplyr)
    library(GenomicRanges)
    library(SummarizedExperiment)
    library(S4Vectors)
})

getBenchmarkFile <- function(filename) {
    benchmark_file <- ""
    folder <- "simulated_ont"
    if (getwd() == "notebooks") {
        benchmark_file <- file.path("cached_results", folder, filename)
    } else if ("NAMESPACE" %in% dir()) {
        benchmark_file <- file.path("notebooks", "cached_results", folder, filename)
    } else {
        benchmark_file <- file.path("cached_results", folder, filename)
    }
    dir.create(dirname(benchmark_file), showWarnings = FALSE, recursive = TRUE)
    benchmark_file
}

min_seeds <- 2L
min_sites <- 50L
max_pval <- 0.05
testing_mode  <- "parametric"
ext_site_delta_beta  <- 0.1
max_bridge_seeds_gaps <- 1L
max_bridge_extension_gaps <- 2L
max_lookup_dist <- 1000L
method_cache_tag <- sprintf(
    "sites%d_p%s_ext%s_lookup%d",
    min_sites,
    gsub("[^0-9]+", "", sprintf("%.3f", max_pval)),
    gsub("[^0-9]+", "", sprintf("%.3f", ext_site_delta_beta)),
    max_lookup_dist
)

0.2.1 Install External Dependencies

if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

if (!requireNamespace("DMRcate", quietly = TRUE)) {
    BiocManager::install("DMRcate", update = FALSE, ask = FALSE)
}

if (!requireNamespace("dmrseq", quietly = TRUE)) {
    BiocManager::install("dmrseq", update = FALSE, ask = FALSE)
}

if (!requireNamespace("hummingbird", quietly = TRUE)) {
    BiocManager::install("hummingbird", update = FALSE, ask = FALSE)
}

if (!requireNamespace("tidyr", quietly = TRUE)) {
    install.packages("tidyr")
}

0.3 Load Simulated Data and Ground Truth

cment_file <- getBenchmarkFile(sprintf("dmrs.CMEnt.%s.rds", method_cache_tag))
dmrcate_file <- getBenchmarkFile(sprintf("dmrs.DMRcate.%s.rds", method_cache_tag))
dmrseq_file <- getBenchmarkFile(sprintf("dmrs.dmrseq.%s.rds", method_cache_tag))
hummingbird_file <- getBenchmarkFile(sprintf("dmrs.hummingbird.%s.rds", method_cache_tag))
pheno_file <- getBenchmarkFile("ont_sim_samplesheet.tsv")
seeds_file <- getBenchmarkFile(sprintf("seeds.p%s.tsv.gz", gsub("[^0-9]+", "", sprintf("%.3f", max_pval))))

computation_needed <- !file.exists(cment_file) || !file.exists(dmrcate_file) || !file.exists(dmrseq_file) || !file.exists(hummingbird_file) || !file.exists(pheno_file) || !file.exists(seeds_file)
zenodo_base_url <- "https://zenodo.org/records/20592944/files/"

sim_truth_path <- getBenchmarkFile("ont_sim_dmrs.tsv")
if (computation_needed) {
    dir.create("data", showWarnings = FALSE, recursive = TRUE)
    ont_sim_bs_path <- getBenchmarkFile("ont_sim_bs.rds")
    ont_sim_bs_hdf5_path <- getBenchmarkFile("ont_sim_bs.h5")
    if (file.exists(ont_sim_bs_hdf5_path)) {
        ont_sim_bs <- HDF5Array::loadHDF5SummarizedExperiment(ont_sim_bs_hdf5_path)
    } else {

        if (!file.exists(ont_sim_bs_path)) {
            curl::curl_download(
                url = paste0(zenodo_base_url, basename(ont_sim_bs_path), "?download=1"),
                destfile = ont_sim_bs_path
            )
        }

        ont_sim_bs <- readRDS(ont_sim_bs_path)
        ont_sim_bs <- sort(ont_sim_bs)

        ont_sim_bs <- HDF5Array::saveHDF5SummarizedExperiment(
            ont_sim_bs,
            dir = ont_sim_bs_hdf5_path,
            replace = TRUE
        )
    }

    stopifnot(inherits(ont_sim_bs, "BSseq"))
    cat(sprintf("Loaded BSseq with %d sites and %d samples\n", nrow(ont_sim_bs), ncol(ont_sim_bs)))

}

Loaded BSseq with 2292633 sites and 20 samples

if (!file.exists(sim_truth_path)) {
    curl::curl_download(
        url = paste0(zenodo_base_url, basename(sim_truth_path), "?download=1"),
        destfile = sim_truth_path
    )
}
truth_df <- data.table::fread(sim_truth_path)
truth_dmrs <- makeGRangesFromDataFrame(
    truth_df,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end",
    keep.extra.columns = TRUE,
    seqinfo = Seqinfo(genome = "hg38")
)

cat(sprintf("Loaded %d ground-truth DMRs\n", length(truth_dmrs)))

Loaded 1000 ground-truth DMRs

0.4 Build Phenotype Table from Sample Names

if (!file.exists(pheno_file)) {
    sample_ids <- colnames(ont_sim_bs)
    sample_group <- ifelse(grepl("^Condition1", sample_ids), "Condition1", "Condition2")

    pheno <- data.frame(
        Sample_ID = sample_ids,
        Sample_Group = sample_group,
        casecontrol = sample_group == "Condition2",
        stringsAsFactors = FALSE
    )
    rownames(pheno) <- pheno$Sample_ID
    data.table::fwrite(pheno, pheno_file, sep = "\t")
} else {
    pheno <- data.table::fread(pheno_file, data.table = FALSE)
}
if (computation_needed) {
    colData(ont_sim_bs) <- S4Vectors::DataFrame(pheno, row.names = pheno$Sample_ID)
    ont_sim_bs  <- sort(ont_sim_bs)
}
knitr::kable(
    table(pheno$Sample_Group),
    caption = "Phenotype Frequency Table"
)

Table 1: Phenotype Frequency Table
Var1 Freq
Condition1 10
Condition2 10

0.5 Benchmarking DMR Detection Methods

0.5.1 CMEnt

0.5.1.1 Adjusting parameters for NGS simulation

knitr::kable(
    data.frame(
        Parameter = c(
            "min_seeds", "min_sites", "max_pval", "testing_mode",
            "ext_site_delta_beta", "max_bridge_seeds_gaps",
            "max_bridge_extension_gaps", "max_lookup_dist"
        ),
        Value = c(
            min_seeds, min_sites, max_pval, testing_mode,
            ext_site_delta_beta, max_bridge_seeds_gaps,
            max_bridge_extension_gaps, max_lookup_dist
        )
    ),
    caption = "Benchmark detection parameters"
)

Table 2: Benchmark detection parameters
Parameter Value
min_seeds 2
min_sites 50
max_pval 0.05
testing_mode parametric
ext_site_delta_beta 0.1
max_bridge_seeds_gaps 1
max_bridge_extension_gaps 2
max_lookup_dist 1000

0.5.1.2 Extract Methylation Matrix and Generate Seeds

This step is crucial for CMEnt, which relies on seed sites to guide DMR assembly. We use a DSS approach to identify candidate seeds based on differential methylation evidence. We keep the FDR threshold relatively lenient, to account for the low sample size.

if (!file.exists(seeds_file)) {
    seeds <- findDMPsBSSeq(
        bsseq = ont_sim_bs,
        samplesheet = pheno,
        sample_group_col = "Sample_Group",
        id_col = "Sample_ID",
        case_group = "Condition2",
        covariates = NULL,
        output_file = seeds_file
    )
    seeds <- seeds[seeds$qval < max_pval, , drop = FALSE]
} else {
    seeds <- data.table::fread(seeds_file)
    rownames(seeds) <- seeds$site_id
}
seeds <- as.data.frame(seeds, stringsAsFactors = FALSE)
rownames(seeds) <- seeds$site_id
seeds$Row.names <- NULL
seed_dist <- unlist(lapply(split(seeds$start, seeds$chr), function(x) {
    if (length(x) < 2) return(numeric(0))
    diff(x)
}))
seeds_qs <- round(stats::quantile(seed_dist, probs = c(0, 0.25, 0.5, 0.75, 0.9, 0.99, 1)))
cat(sprintf("Number of seeds for CMEnt: %d\n", nrow(seeds)))

Number of seeds for CMEnt: 74216

knitr::kable(
    seeds_qs,
    digits = 2,
    col.names = "Distance (bp)",
    caption = sprintf("Seed distance quantiles")
)
(#tab:seeds_calculation)Seed distance quantiles
Distance (bp)
0% 1
25% 1
50% 15
75% 104
90% 329
99% 41460
100% 412180

0.5.1.3 Seed Overlap with Ground Truth

Metrics explanation:

  • Seeds_Total: Total number of seed sites identified.
  • Seeds_Overlapping_Truth: Number of seed sites that overlap any ground-truth DMR.
  • Seeds_Overlap_Pct: Percentage of seed sites that overlap ground-truth DMRs.
  • Truth_DMRs_Total: Total number of ground-truth DMRs.
  • Truth_DMRs_With_Seed_Pct: Percentage of ground-truth DMRs that have at least one overlapping seed site.
  • Truth_DMRs_With_Enough_Seeds_Pct: Percentage of ground-truth DMRs that have enough overlapping seed sites to be captured by CMEnt.
  • Truth_DMRs_With_Enough_Proximal_Seeds_Pct: Percentage of ground-truth DMRs that have enough overlapping seed sites within a specified proximity (max_lookup_dist) to be captured by CMEnt.
  • Median_Seeds_Per_Truth_DMR: Median number of overlapping seeds per ground-truth DMR (including those with zero seeds).
  • Median_Seeds_Per_Hit_Truth_DMR: Median number of overlapping seeds among ground-truth DMRs that have at least one seed.
  • Max_Seeds_In_Truth_DMR: Maximum number of overlapping seeds in any single ground-truth DMR.
  • Overlap_Pairs_Direction_Match_Pct: Among seed-truth overlap pairs, the percentage where the direction of methylation change (hyper/hypo) matches between the seed and the truth DMR.
  • Overlapping_Seeds_Any_Direction_Match_Pct: Percentage of seeds that have at least one overlapping truth DMR with a matching direction of change.

Metrics values:

seed_df <- as.data.frame(seeds, stringsAsFactors = FALSE)
seed_df <- seed_df[!is.na(seed_df$start) & nzchar(seed_df$chr), , drop = FALSE]

seed_gr <- GenomicRanges::makeGRangesFromDataFrame(
    seed_df,
    seqnames.field = "chr",
    start.field = "start",
    end.field = "end",
    keep.extra.columns = TRUE,
    seqinfo = Seqinfo(genome = "hg38")
)

seed_truth_hits <- GenomicRanges::findOverlaps(seed_gr, truth_dmrs, ignore.strand = TRUE)
seed_overlaps_truth <- rep(FALSE, length(seed_gr))
truth_has_seed <- rep(FALSE, length(truth_dmrs))
truth_has_enough_seeds <- 0L

if (length(seed_truth_hits) > 0) {
    seed_overlaps_truth[unique(S4Vectors::queryHits(seed_truth_hits))] <- TRUE
    truth_has_seed[unique(S4Vectors::subjectHits(seed_truth_hits))] <- TRUE
}

seeds_per_truth <- if (length(truth_dmrs) > 0) {
    tabulate(S4Vectors::subjectHits(seed_truth_hits), nbins = length(truth_dmrs))
} else {
    integer(0)
}
seeds_per_truth_within_mindist <- if (length(seed_truth_hits) > 0) {
    seed_positions <- GenomicRanges::start(seed_gr)[S4Vectors::queryHits(seed_truth_hits)]
    seeds_within_mindist <- list()
    for (i in seq_along(truth_dmrs)){
        dmr_seeds_mask <- S4Vectors::subjectHits(seed_truth_hits) == i
        if (sum(dmr_seeds_mask) == 0) {
            seeds_within_mindist[[i]] <- 0
        }
        sorts <- order(seed_positions[dmr_seeds_mask])
        dists <- diff(seed_positions[dmr_seeds_mask][sorts])
        seeds_within_mindist[[i]] <- sum(dists <= max_lookup_dist)
    }
    unlist(seeds_within_mindist)
} else {
    integer(0)
}
truth_has_enough_seeds <- sum(seeds_per_truth >= min_seeds)
safe_pct <- function(x) if (length(x) == 0) 0 else round(100 * mean(x), 1)

seed_truth_summary <- data.frame(
    Seeds_Total = length(seed_gr),
    Seeds_Overlapping_Truth = sum(seed_overlaps_truth),
    Seeds_Overlap_Pct = safe_pct(seed_overlaps_truth),
    Truth_DMRs_Total = length(truth_dmrs),
    Truth_DMRs_With_Seed_Pct = safe_pct(truth_has_seed),
    Truth_DMRs_With_Enough_Seeds_Pct = ifelse(length(truth_dmrs) == 0, 0, 100 * truth_has_enough_seeds / length(truth_dmrs)),
    Truth_DMRs_With_Enough_Proximal_Seeds_Pct = ifelse(length(truth_dmrs) == 0, 0, 100 * sum(seeds_per_truth_within_mindist >= min_seeds) / length(truth_dmrs)),
    Median_Seeds_Per_Truth_DMR = ifelse(length(seeds_per_truth) == 0, NA_real_, stats::median(seeds_per_truth)),
    Median_Seeds_Per_Hit_Truth_DMR = ifelse(sum(truth_has_seed) == 0, 0, stats::median(seeds_per_truth[truth_has_seed])),
    Max_Seeds_In_Truth_DMR = ifelse(length(seeds_per_truth) == 0, NA_integer_, max(seeds_per_truth))
)
seed_sign <- sign(as.numeric(seed_df$delta_beta))
truth_sign <- sign(as.numeric(SummarizedExperiment::mcols(truth_dmrs)$delta_beta))
pair_match <- seed_sign[S4Vectors::queryHits(seed_truth_hits)] ==
    truth_sign[S4Vectors::subjectHits(seed_truth_hits)]

seed_pair_direction_match_pct <- if (all(is.na(pair_match))) NA_real_ else 100 * mean(pair_match, na.rm = TRUE)
per_seed_dir_match <- tapply(pair_match, S4Vectors::queryHits(seed_truth_hits), function(x) any(x, na.rm = TRUE))
seed_any_direction_match_pct <- if (length(per_seed_dir_match) == 0) NA_real_ else 100 * mean(per_seed_dir_match)

seed_truth_summary$Overlap_Pairs_Direction_Match_Pct <- round(seed_pair_direction_match_pct, 1)
seed_truth_summary$Overlapping_Seeds_Any_Direction_Match_Pct <- round(seed_any_direction_match_pct, 1)

for (i in seq_along(seed_truth_summary)) {
    cat(sprintf("- **%s**: %s\n", names(seed_truth_summary)[i], seed_truth_summary[[i]]))
}
  • Seeds_Total: 74216
  • Seeds_Overlapping_Truth: 71112
  • Seeds_Overlap_Pct: 95.8
  • Truth_DMRs_Total: 1000
  • Truth_DMRs_With_Seed_Pct: 100
  • Truth_DMRs_With_Enough_Seeds_Pct: 100
  • Truth_DMRs_With_Enough_Proximal_Seeds_Pct: 100
  • Median_Seeds_Per_Truth_DMR: 48
  • Median_Seeds_Per_Hit_Truth_DMR: 48
  • Max_Seeds_In_Truth_DMR: 423
  • Overlap_Pairs_Direction_Match_Pct: 100
  • Overlapping_Seeds_Any_Direction_Match_Pct: 100
score_col <- "score"

ord <- order(seed_df[[score_col]], decreasing = TRUE, na.last = NA)
top_k <- unique(pmin(length(ord), c(100, 500, 1000, 5000, 10000, length(ord))))
top_seed_overlap <- do.call(rbind, lapply(top_k, function(k) {
    idx <- ord[seq_len(k)]
    n_hit <- sum(seed_overlaps_truth[idx])
    data.frame(
        `Top Seeds` = k,
        `Overlap Pct` = 100 * n_hit / k
    )
}))

knitr::kable(
    top_seed_overlap,
    digits = 2,
    caption = sprintf("Ground-truth overlap enrichment among top-scoring seeds")
)

Table 3: Ground-truth overlap enrichment among top-scoring seeds
Top.Seeds Overlap.Pct
100 100.00
500 100.00
1000 100.00
5000 100.00
10000 100.00
74216 95.82
if (!file.exists(cment_file)) {
    default_cment_njobs <- max(1L, min(8L, parallel::detectCores(logical = TRUE) - 1L))
    cment_njobs <- suppressWarnings(as.integer(
        Sys.getenv("CMENT_BENCHMARK_NJOBS", as.character(default_cment_njobs))
    ))
    if (!is.finite(cment_njobs) || is.na(cment_njobs) || cment_njobs < 1L) {
        cment_njobs <- default_cment_njobs
    }
    cment_njobs <- max(1L, min(cment_njobs, parallel::detectCores(logical = TRUE)))
    options("CMEnt.njobs" = cment_njobs)
    options("CMEnt.verbose" = 3)
    cat(sprintf("CMEnt benchmark jobs: %d\n", cment_njobs))
    start_time <- Sys.time()

    beta_handler <- CMEnt::getBetaHandler(
        beta = ont_sim_bs
    )
    gc(verbose = FALSE)
    dmrs_cment <- CMEnt::buildDMRs(
        beta = beta_handler,
        seeds = seeds,
        pheno = pheno,
        seeds_id_col = "site_id",
        sample_group_col = "Sample_Group",
        casecontrol_col = "casecontrol",
        genome = "hg38",
        array = "NULL",
        entanglement = "weak",
        testing_mode = testing_mode,
        max_bridge_seeds_gaps = max_bridge_seeds_gaps,
        max_bridge_extension_gaps = max_bridge_extension_gaps,
        min_seeds = min_seeds,
        min_sites = min_sites/2,
        max_pval = max_pval,
        ext_site_delta_beta = ext_site_delta_beta,
        max_lookup_dist = max_lookup_dist, # Use a larger lookup distance (10k is the default)
        annotate_with_genes = FALSE,
        extract_motifs = FALSE,
        njobs = cment_njobs
    )
    end_time <- Sys.time()
    time_cment <- end_time - start_time
    saveRDS(list(dmrs = dmrs_cment, time = time_cment), cment_file)
    rm(beta_handler)
    colData(ont_sim_bs) <- S4Vectors::DataFrame(pheno, row.names = pheno$Sample_ID)
    gc(verbose = FALSE)
} else {
    ret <- readRDS(cment_file)
    dmrs_cment <- ret$dmrs
    time_cment <- ret$time
}

0.5.2 DMRcate

run_dmrcate_bsseq <- function(
    bs_obj,
    design,
    max_sites_per_chunk = 500000L,
    chunk_overlap_sites = 2000L
) {
    if (!requireNamespace("DMRcate", quietly = TRUE)) {
        warning("DMRcate not installed; returning empty set.")
        return(GenomicRanges::GRanges())
    }
    dmrcate_design <- as.matrix(design)
    if (nrow(dmrcate_design) != ncol(bs_obj)) {
        warning(sprintf(
            "DMRcate design rows (%d) do not match BSseq samples (%d); returning empty set.",
            nrow(dmrcate_design),
            ncol(bs_obj)
        ))
        return(GenomicRanges::GRanges())
    }
    if (ncol(dmrcate_design) < 2L) {
        warning("DMRcate requires at least two groups in the design matrix; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    colnames(dmrcate_design) <- gsub("Sample_Group", "", colnames(dmrcate_design))
    colnames(dmrcate_design)[1] <- "Intercept"
    dmrcate_design <- edgeR::modelMatrixMeth(dmrcate_design)

    coef_names <- colnames(dmrcate_design)
    sample_cols <- grepl("^Sample[0-9]+$", coef_names)
    intercept_cols <- coef_names %in% c("Intercept", "(Intercept)")
    contrast_candidates <- coef_names[!(sample_cols | intercept_cols)]

    if (length(contrast_candidates) == 0) {
        warning(sprintf(
            "DMRcate could not infer a contrast coefficient from methdesign columns (%s); returning empty set.",
            paste(coef_names, collapse = ", ")
        ))
        return(GenomicRanges::GRanges())
    }

    preferred_coef <- c("Condition2_vs_Condition1", "Condition2", "cancer_vs_normal")
    selected_coef_name <- preferred_coef[preferred_coef %in% contrast_candidates][1]
    if (is.na(selected_coef_name)) {
        selected_coef_name <- contrast_candidates[length(contrast_candidates)]
    }
    selected_coef <- match(selected_coef_name, coef_names)
    cat(sprintf(
        "DMRcate contrast coefficient: %s (column %d of methdesign)\n",
        selected_coef_name,
        selected_coef
    ))

    all_gr <- bsseq::granges(bs_obj)
    all_seqnames <- as.character(GenomicRanges::seqnames(all_gr))
    all_pos <- GenomicRanges::start(all_gr)
    rm(all_gr)
    gc(verbose = FALSE)

    chr_vec <- unique(all_seqnames)
    chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]

    if (length(chr_vec) == 0) {
        warning("No chromosomes found in BSseq object; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    dmrs_by_chr <- vector("list", length(chr_vec))
    names(dmrs_by_chr) <- chr_vec

    for (chr_name in chr_vec) {

        chr_idx <- which(all_seqnames == chr_name)
        if (length(chr_idx) == 0) {
            next
        }

        n_chr <- length(chr_idx)
        chunk_starts <- seq.int(1L, n_chr, by = as.integer(max_sites_per_chunk))
        chr_chunk_dmrs <- vector("list", length(chunk_starts))

        for (chunk_i in seq_along(chunk_starts)) {
            core_from <- chunk_starts[chunk_i]
            core_to <- min(core_from + as.integer(max_sites_per_chunk) - 1L, n_chr)

            ext_from <- max(1L, core_from - as.integer(chunk_overlap_sites))
            ext_to <- min(n_chr, core_to + as.integer(chunk_overlap_sites))

            idx_ext <- chr_idx[ext_from:ext_to]
            core_start_pos <- all_pos[chr_idx[core_from]]
            core_end_pos <- all_pos[chr_idx[core_to]]

            cat(sprintf(
                "DMRcate: %s chunk %d/%d (%s sites core, %s sites with overlap)\n",
                chr_name,
                chunk_i,
                length(chunk_starts),
                format(core_to - core_from + 1L, big.mark = ","),
                format(length(idx_ext), big.mark = ",")
            ))

            chunk_bs <- bs_obj[idx_ext, , drop = FALSE]
            if (nrow(chunk_bs) < 3L) {
                chr_chunk_dmrs[[chunk_i]] <- GenomicRanges::GRanges()
                rm(chunk_bs)
                gc(verbose = FALSE)
                next
            }

            chunk_out <- tryCatch(
                {
                    ann <- DMRcate::sequencing.annotate(
                        obj = chunk_bs,
                        all.cov = TRUE,
                        methdesign = dmrcate_design,
                        coef = selected_coef,
                        fdr = max_pval
                    )
                    dmrcate_res <- DMRcate::dmrcate(
                        ann,
                        lambda = 1000,
                        C = 2,
                        min.cpgs = min_sites
                    )
                    ranges_out <- DMRcate::extractRanges(dmrcate_res, genome = "hg38")
                    rm(ann, dmrcate_res)
                    gc(verbose = FALSE)

                    if (length(ranges_out) > 0) {
                        mids <- floor((GenomicRanges::start(ranges_out) + GenomicRanges::end(ranges_out)) / 2)
                        keep <- mids >= core_start_pos & mids <= core_end_pos
                        ranges_out <- ranges_out[keep]
                    }

                    ranges_out
                },
                error = function(e) {
                    cat(sprintf(
                        "DMRcate chunk failed (%s chunk %d/%d, fdr=0.05): %s\n",
                        chr_name,
                        chunk_i,
                        length(chunk_starts),
                        e$message
                    ))
                    GenomicRanges::GRanges()
                }
            )
            cat(sprintf(
                "DMRcate chunk %d/%d found %d DMRs\n",
                chunk_i,
                length(chunk_starts),
                length(chunk_out)
            ))
            chr_chunk_dmrs[[chunk_i]] <- chunk_out
            rm(chunk_bs, chunk_out)
            gc(verbose = FALSE)
        }

        chr_chunk_dmrs <- Filter(function(x) !is.null(x) && length(x) > 0, chr_chunk_dmrs)
        if (length(chr_chunk_dmrs) == 0) {
            dmrs_by_chr[[chr_name]] <- NULL
        } else {
            dmrs_by_chr[[chr_name]] <- GenomicRanges::sort(do.call(c, chr_chunk_dmrs))
        }

        rm(chr_chunk_dmrs)
        gc(verbose = FALSE)
    }

    dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
    if (length(dmrs_by_chr) == 0) {
        return(GenomicRanges::GRanges())
    }

    out <- do.call(c, unname(dmrs_by_chr))
    out <- GenomicRanges::sort(out)
    out
}

if (!file.exists(dmrcate_file)) {
    start_time <- Sys.time()
    design <- model.matrix(~ Sample_Group, data = colData(ont_sim_bs))
    dmrs_dmrcate <- run_dmrcate_bsseq(ont_sim_bs, design)
    end_time <- Sys.time()
    time_dmrcate <- end_time - start_time
    saveRDS(list(dmrs = dmrs_dmrcate, time = time_dmrcate), dmrcate_file)
} else {
    ret <- readRDS(dmrcate_file)
    dmrs_dmrcate <- ret$dmrs
    time_dmrcate <- ret$time
}

0.5.3 dmrseq

run_dmrseq_bsseq <- function(
    bs_obj,
    chunk = TRUE,
    max_sites_per_chunk = 500000L,
    chunk_overlap_sites = 2000L
) {
    if (!requireNamespace("dmrseq", quietly = TRUE)) {
        warning("dmrseq not installed; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    if (!"Sample_Group" %in% colnames(as.data.frame(colData(bs_obj)))) {
        warning("Sample_Group not present in BSseq colData; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    all_gr <- bsseq::granges(bs_obj)
    all_seqnames <- as.character(GenomicRanges::seqnames(all_gr))
    all_pos <- GenomicRanges::start(all_gr)
    rm(all_gr)
    gc(verbose = FALSE)

    chr_vec <- unique(all_seqnames)
    chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]

    if (length(chr_vec) == 0) {
        warning("No chromosomes found in BSseq object; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    dmrs_by_chr <- vector("list", length(chr_vec))
    names(dmrs_by_chr) <- chr_vec
    bpparam <- CMEnt:::.makeBiocParallelParam(max(1L, min(8L, parallel::detectCores(logical = TRUE) - 1L)), progressbar = TRUE)
    .runDMRSeq <- function(chunk_bs, chr_name, chunk_i, core_start_pos = NA_integer_, core_end_pos = NA_integer_) {
        chunk_out <- tryCatch(
            {
                fit <- dmrseq::dmrseq(
                    bs = chunk_bs,
                    testCovariate = "Sample_Group",
                    cutoff = 0.1,
                    minNumRegion = min_sites,
                    verbose = FALSE,
                    BPPARAM = bpparam
                )
                qvals <- mcols(fit)$qval
                if (length(fit) > 0 && any(is.finite(qvals))) {
                    fit <- fit[is.finite(qvals) & qvals < 0.05]
                } else if (length(fit) > 0) {
                    cat(sprintf(
                        "dmrseq chunk %s %d/%d returned observed candidates but no finite q-values; keeping candidates without permutation significance.\n",
                        chr_name,
                        chunk_i,
                        length(chunk_starts)
                    ))
                }

                if (length(fit) > 0 && !is.na(core_start_pos) && !is.na(core_end_pos)) {
                    mids <- floor((GenomicRanges::start(fit) + GenomicRanges::end(fit)) / 2)
                    keep <- mids >= core_start_pos & mids <= core_end_pos
                    fit <- fit[keep]
                }

                gc(verbose = FALSE)
                fit
            },
            error = function(e) {
                cat(sprintf(
                    "dmrseq chunk failed (%s chunk %d/%d): %s\n",
                    chr_name,
                    chunk_i,
                    length(chunk_starts),
                    e$message
                ))
                NULL
            }
        )
        chunk_out
    }
    if (!chunk) {
        out <- .runDMRSeq(bs_obj, "", 1, NA_integer_, NA_integer_)
    } else {
        for (chr_name in chr_vec) {
            chr_idx <- which(all_seqnames == chr_name)
            if (length(chr_idx) == 0) {
                next
            }

            n_chr <- length(chr_idx)
            chunk_starts <- seq.int(1L, n_chr, by = as.integer(max_sites_per_chunk))
            chr_chunk_dmrs <- vector("list", length(chunk_starts))

            for (chunk_i in seq_along(chunk_starts)) {
                core_from <- chunk_starts[chunk_i]
                core_to <- min(core_from + as.integer(max_sites_per_chunk) - 1L, n_chr)

                ext_from <- max(1L, core_from - as.integer(chunk_overlap_sites))
                ext_to <- min(n_chr, core_to + as.integer(chunk_overlap_sites))

                idx_ext <- chr_idx[ext_from:ext_to]
                core_start_pos <- all_pos[chr_idx[core_from]]
                core_end_pos <- all_pos[chr_idx[core_to]]

                cat(sprintf(
                    "dmrseq: %s chunk %d/%d (%s sites core, %s sites with overlap)\n",
                    chr_name,
                    chunk_i,
                    length(chunk_starts),
                    format(core_to - core_from + 1L, big.mark = ","),
                    format(length(idx_ext), big.mark = ",")
                ))

                chunk_bs <- bs_obj[idx_ext, , drop = FALSE]
                chr_chunk_dmrs[[chunk_i]] <- .runDMRSeq(chunk_bs, chr_name, chunk_i, core_start_pos, core_end_pos)
                rm(chunk_bs)
                gc(verbose = FALSE)
            }

            chr_chunk_dmrs <- Filter(function(x) !is.null(x) && length(x) > 0, chr_chunk_dmrs)
            if (length(chr_chunk_dmrs) == 0) {
                dmrs_by_chr[[chr_name]] <- NULL
            } else {
                dmrs_by_chr[[chr_name]] <- GenomicRanges::sort(do.call(c, chr_chunk_dmrs))
            }

            rm(chr_chunk_dmrs)
            gc(verbose = FALSE)
        }

        dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
        if (length(dmrs_by_chr) == 0) {
            return(GenomicRanges::GRanges())
        }

        out <- do.call(c, unname(dmrs_by_chr))
    }
    out <- GenomicRanges::sort(out)
    out
}

if (!file.exists(dmrseq_file)) {
    start_time <- Sys.time()
    dmrs_dmrseq <- run_dmrseq_bsseq(ont_sim_bs)
    end_time <- Sys.time()
    time_dmrseq <- end_time - start_time
    saveRDS(list(dmrs = dmrs_dmrseq, time = time_dmrseq), dmrseq_file)
} else {
    ret <- readRDS(dmrseq_file)
    dmrs_dmrseq <- ret$dmrs
    time_dmrseq <- ret$time
}

0.5.4 hummingbird

run_hummingbird_bsseq <- function(
    bs_obj,
    max_gap = 300L, # default
    min_sites = 3L, # default is 10
    min_length = 50L # default is 500
) {
    sample_df <- as.data.frame(SummarizedExperiment::colData(bs_obj))
    if (!"Sample_Group" %in% colnames(sample_df)) {
        warning("Sample_Group not present in BSseq colData; returning empty set.")
        return(GenomicRanges::GPos())
    }

    ctrl_idx <- which(sample_df$Sample_Group == "Condition1")
    case_idx <- which(sample_df$Sample_Group == "Condition2")
    if (length(ctrl_idx) == 0 || length(case_idx) == 0) {
        warning("Need both Condition1 and Condition2 samples for hummingbird.")
        return(GenomicRanges::GPos())
    }

    row_gr <- bsseq::granges(bs_obj)
    # granges to gpos for hummingbird
    row_gr <- GPos(
        seqnames = GenomicRanges::seqnames(row_gr),
        pos = GenomicRanges::start(row_gr)
    )

    to_count_matrix <- function(x) {
        x <- as.matrix(x)
        x[!is.finite(x)] <- 0
        x[x < 0] <- 0
        storage.mode(x) <- "integer"
        x
    }

    all_seqnames <- as.character(GenomicRanges::seqnames(row_gr))
    chr_vec <- unique(all_seqnames)
    chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]
    if (length(chr_vec) == 0) {
        warning("No chromosomes found in BSseq object; returning empty set.")
        return(GenomicRanges::GPos())
    }

    dmrs_by_chr <- vector("list", length(chr_vec))
    names(dmrs_by_chr) <- chr_vec

    for (chr_name in chr_vec) {
        chr_idx <- which(all_seqnames == chr_name)
        if (length(chr_idx) < as.integer(min_sites)) {
            next
        }

        chr_gr <- row_gr[chr_idx]
        chr_bs <- bs_obj[chr_idx, , drop = FALSE]
        meth_mat <- bsseq::getCoverage(chr_bs, type = "M")
        cov_mat <- bsseq::getCoverage(chr_bs, type = "Cov")
        unmeth_mat <- cov_mat - meth_mat
        control_se <- SummarizedExperiment::SummarizedExperiment(
            rowRanges = chr_gr,
            assays = list(
                normM = to_count_matrix(meth_mat[, ctrl_idx, drop = FALSE]),
                normUM = to_count_matrix(unmeth_mat[, ctrl_idx, drop = FALSE])
            )
        )
        case_se <- SummarizedExperiment::SummarizedExperiment(
            rowRanges = chr_gr,
            assays = list(
                abnormM = to_count_matrix(meth_mat[, case_idx, drop = FALSE]),
                abnormUM = to_count_matrix(unmeth_mat[, case_idx, drop = FALSE])
            )
        )

        chr_out <- tryCatch(
            {
                hb_fit <- hummingbird::hummingbirdEM(experimentInfoControl = control_se, experimentInfoCase = case_se, binSize = 40)
                hb_fit <- hummingbird::hummingbirdPostAdjustment(
                    emInfo = hb_fit,
                    experimentInfoControl = control_se,
                    experimentInfoCase = case_se,
                    maxGap = as.integer(max_gap),
                    minCpGs = as.integer(min_sites),
                    minLength = as.integer(min_length)
                )

                hb_fit$DMRs
            },
            error = function(e) {
                cat(sprintf("hummingbird failed (%s): %s\n", chr_name, e$message))
                NULL
            }
        )

        dmrs_by_chr[[chr_name]] <- chr_out
        rm(chr_bs, chr_gr, meth_mat, cov_mat, unmeth_mat, control_se, case_se, chr_out)
        gc(verbose = FALSE)
    }

    dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
    if (length(dmrs_by_chr) == 0) {
        return(GenomicRanges::GPos())
    }

    out <- suppressWarnings(do.call(c, unname(dmrs_by_chr)))
    GenomicRanges::sort(out)
}

if (!file.exists(hummingbird_file)) {
    start_time <- Sys.time()
    dmrs_hummingbird <- run_hummingbird_bsseq(ont_sim_bs, min_sites = min_sites, max_gap = 300L)
    end_time <- Sys.time()
    time_hummingbird <- end_time - start_time
    saveRDS(list(dmrs = dmrs_hummingbird, time = time_hummingbird), hummingbird_file)
} else {
    ret <- readRDS(hummingbird_file)
    dmrs_hummingbird <- ret$dmrs
    time_hummingbird <- ret$time
}
cat(sprintf("**CMEnt**: Found %d DMRs in %d seconds\n", length(dmrs_cment), round(as.numeric(time_cment, units = "secs"))))

CMEnt: Found 1051 DMRs in 606 seconds

cat(sprintf("**DMRcate**: Found %d DMRs in %d seconds\n", length(dmrs_dmrcate), round(as.numeric(time_dmrcate, units = "secs"))))

DMRcate: Found 445 DMRs in 243 seconds

cat(sprintf("**dmrseq**: Found %d DMRs in %d seconds\n", length(dmrs_dmrseq), round(as.numeric(time_dmrseq, units = "secs"))))

dmrseq: Found 800 DMRs in 3636 seconds

cat(sprintf("**hummingbird**: Found %d DMRs in %d seconds\n", length(dmrs_hummingbird), round(as.numeric(time_hummingbird, units = "secs"))))

hummingbird: Found 549 DMRs in 45 seconds

0.6 Comparative Analysis

methods_list <- list(
    CMEnt = dmrs_cment,
    DMRcate = dmrs_dmrcate,
    dmrseq = dmrs_dmrseq,
    hummingbird = dmrs_hummingbird
)
method_order <- names(methods_list)

get_dmr_stats <- function(dmrs, method) {
    if (length(dmrs) == 0) {
        return(data.frame(
            Method = method,
            `Number of DMRs` = 0,
            `Mean DMR Width` = NA_real_,
            `Median DMR Width` = NA_real_,
            `Total Coverage bp` = 0,
            `Min Width` = NA_real_,
            `Max Width` = NA_real_
        ))
    }

    data.frame(
        Method = method,
        `Number of DMRs` = length(dmrs),
        `Mean DMR Width` = mean(width(dmrs)),
        `Median DMR Width` = median(width(dmrs)),
        `Total Coverage bp` = sum(width(dmrs)),
        `Min Width` = min(width(dmrs)),
        `Max Width` = max(width(dmrs))
    )
}

comparison_stats <- do.call(rbind, lapply(names(methods_list), function(m) {
    get_dmr_stats(methods_list[[m]], m)
}))

truth_stats <- get_dmr_stats(truth_dmrs, "Ground Truth")
comparison_stats <- rbind(comparison_stats, truth_stats)
comparison_stats$Method <- factor(
    comparison_stats$Method,
    levels = c(method_order, "Ground Truth")
)
comparison_stats <- comparison_stats[order(comparison_stats$Method), , drop = FALSE]
comparison_stats$Method <- as.character(comparison_stats$Method)

knitr::kable(comparison_stats, digits = 2, caption = "Basic DMR Statistics")

Table 4: Basic DMR Statistics
Method Number.of.DMRs Mean.DMR.Width Median.DMR.Width Total.Coverage.bp Min.Width Max.Width
CMEnt 1051 5448.85 4610.0 5726738 227 21553
DMRcate 445 4575.96 3835.0 2036302 783 14431
dmrseq 800 4754.24 3923.0 3803389 694 20771
hummingbird 549 2901.64 2520.0 1593000 400 13680
Ground Truth 1000 5335.16 4305.5 5335161 694 22892

0.7 Ground-Truth Evaluation

evaluate_against_truth <- function(pred, truth) {
    n_pred <- length(pred)
    n_truth <- length(truth)
    pred_overlap_truth <- sum(overlapsAny(pred, truth))
    truth_overlap_pred <- sum(overlapsAny(truth, pred))
    fp <- n_pred - pred_overlap_truth
    fn <- n_truth - truth_overlap_pred

    precision <- ifelse(n_pred == 0, 0, pred_overlap_truth / n_pred)
    recall <- ifelse(n_truth == 0, 0, truth_overlap_pred / n_truth)
    f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))

    pred_red <- reduce(pred)
    truth_red <- reduce(truth)
    inter <- GenomicRanges::intersect(pred_red, truth_red)

    pred_bp <- sum(width(pred_red))
    truth_bp <- sum(width(truth_red))
    inter_bp <- ifelse(length(inter) == 0, 0, sum(width(inter)))
    union_bp <- pred_bp + truth_bp - inter_bp

    precision_bp <- ifelse(pred_bp == 0, 0, inter_bp / pred_bp)
    recall_bp <- ifelse(truth_bp == 0, 0, inter_bp / truth_bp)
    f1_bp <- ifelse(precision_bp + recall_bp == 0, 0, 2 * precision_bp * recall_bp / (precision_bp + recall_bp))
    iou_bp <- ifelse(union_bp == 0, 0, inter_bp / union_bp)

    data.frame(
        TP_regions = truth_overlap_pred,
        FP_regions = fp,
        FN_regions = fn,
        Precision_regions = precision,
        Recall_regions = recall,
        F1_regions = f1,
        Predicted_bp = pred_bp,
        Truth_bp = truth_bp,
        Intersect_bp = inter_bp,
        Precision_bp = precision_bp,
        Recall_bp = recall_bp,
        F1_bp = f1_bp,
        IoU_bp = iou_bp
    )
}

truth_eval <- do.call(rbind, lapply(names(methods_list), function(method) {
    out <- evaluate_against_truth(methods_list[[method]], truth_dmrs)
    out$Method <- method
    out
}))
truth_eval$Method <- factor(truth_eval$Method, levels = method_order)
truth_eval <- truth_eval[order(truth_eval$Method), , drop = FALSE]
truth_eval$Method <- as.character(truth_eval$Method)


truth_eval <- truth_eval |>
    dplyr::select(
        Method,
        TP_regions,
        FP_regions, FN_regions,
        Precision_regions, Recall_regions, F1_regions,
        Precision_bp, Recall_bp, F1_bp, IoU_bp
    )

regions_truth_eval <- truth_eval |>
    dplyr::select(Method, TP_regions, FP_regions, FN_regions, Precision_regions, Recall_regions, F1_regions) |>
    dplyr::rename_with(~ sub("_regions$", "", .), ends_with("_regions"))

bp_truth_eval <- truth_eval |>
    dplyr::select(Method, Precision_bp, Recall_bp, F1_bp, IoU_bp) |>
    dplyr::rename_with(~ sub("_bp$", "", .), ends_with("_bp"))
knitr::kable(regions_truth_eval, digits = 4, caption = "Macro (Region-level) Performance Against Ground Truth")

Table 5: Macro (Region-level) Performance Against Ground Truth
Method TP FP FN Precision Recall F1
CMEnt 994 20 6 0.981 0.994 0.9874
DMRcate 439 0 561 1.000 0.439 0.6101
dmrseq 800 0 200 1.000 0.800 0.8889
hummingbird 408 0 592 1.000 0.408 0.5795
knitr::kable(bp_truth_eval, digits = 4, caption = "Micro (Base-pair-level) Performance Against Ground Truth")

Table 6: Micro (Base-pair-level) Performance Against Ground Truth
Method Precision Recall F1 IoU
CMEnt 0.8938 0.9594 0.9254 0.8611
DMRcate 0.9972 0.3806 0.5509 0.3802
dmrseq 0.9938 0.7085 0.8272 0.7053
hummingbird 0.9980 0.2980 0.4590 0.2978

0.7.1 Overlap Matrices (Methods + Truth)

all_sets <- c(methods_list, list(`Ground Truth` = truth_dmrs))

calculate_overlap <- function(gr1, gr2) {
    if (length(gr1) == 0 || length(gr2) == 0) {
        return(list(count = 0, percentage = 0))
    }
    overlap_count <- sum(overlapsAny(gr1, gr2))
    pct <- overlap_count / length(gr1) * 100
    list(count = overlap_count, percentage = pct)
}

overlap_count_mat <- matrix(0, nrow = length(all_sets), ncol = length(all_sets))
overlap_pct_mat <- matrix(0, nrow = length(all_sets), ncol = length(all_sets))
rownames(overlap_count_mat) <- colnames(overlap_count_mat) <- names(all_sets)
rownames(overlap_pct_mat) <- colnames(overlap_pct_mat) <- names(all_sets)

for (i in seq_along(all_sets)) {
    for (j in seq_along(all_sets)) {
        if (i == j) {
            overlap_count_mat[i, j] <- length(all_sets[[i]])
            overlap_pct_mat[i, j] <- 100
        } else {
            ov <- calculate_overlap(all_sets[[i]], all_sets[[j]])
            overlap_count_mat[i, j] <- ov$count
            overlap_pct_mat[i, j] <- ov$percentage
        }
    }
}

knitr::kable(overlap_count_mat, digits = 0, caption = "Overlap Counts (query=row, reference=column)")

Table 7: Overlap Counts (query=row, reference=column)
CMEnt DMRcate dmrseq hummingbird Ground Truth
CMEnt 1051 446 829 441 1031
DMRcate 445 445 443 322 445
dmrseq 800 442 800 415 800
hummingbird 549 432 544 549 549
Ground Truth 994 439 800 408 1000
knitr::kable(overlap_pct_mat, digits = 2, caption = "Overlap Percentages (query=row, reference=column)")

Table 8: Overlap Percentages (query=row, reference=column)
CMEnt DMRcate dmrseq hummingbird Ground Truth
CMEnt 100.0 42.44 78.88 41.96 98.1
DMRcate 100.0 100.00 99.55 72.36 100.0
dmrseq 100.0 55.25 100.00 51.88 100.0
hummingbird 100.0 78.69 99.09 100.00 100.0
Ground Truth 99.4 43.90 80.00 40.80 100.0

0.7.2 CMEnt Stage 1 Quality Assessment (Seed Linkage)

Assessing seed-space recoverability and seed capture efficiency.

Metrics explanation:

  • Truth_DMRs_Total: Total number of ground truth DMRs.
  • Truth_DMRs_With_Seeds: Number of truth DMRs that have at least one overlapping seed site.
  • Truth_DMRs_Stage1Recoverable_MinSeeds: Number of truth DMRs that have at least the configured minimum number of overlapping seed sites.
  • Truth_DMRs_Stage1Recoverable_WithMinCapturedSeeds: Number of truth DMRs that have at least the configured minimum number of overlapping seed sites that also overlap CMEnt-predicted DMRs, indicating they are potentially recoverable and captured by the method.
  • Truth_DMRs_Stage1Recoverable_FullyCapturedSeeds: Number of truth DMRs where all overlapping seed sites are captured by CMEnt-predicted DMRs, indicating ideal stage 1 capture.
  • Seed_Capture_Rate_Pct_in_Truth: Percentage of seed sites that overlap truth DMRs that are also captured by CMEnt-predicted DMRs.
  • Median_Seed_Capture_Pct_Per_Truth: For truth DMRs that have at least one overlapping seed site, the median percentage of those seed sites that are captured by CMEnt-predicted DMRs.
  • Truth_DMRs_With_Missed_Seeds: Number of truth DMRs that have at least one overlapping seed site that is not captured by CMEnt-predicted DMRs.
  • Median_Max_Missed_Run_Per_Truth: For truth DMRs with missed seeds, the median length of the longest consecutive run of missed seed sites.
  • Max_Missed_Run_Observed: The maximum length of consecutive missed seed sites observed across all truth DMRs.

Metrics values:

seed_gr <- sort(seed_gr)
mcols(seed_gr)$index <- seq_along(seed_gr)
seeds_in_truth <- subsetByOverlaps(seed_gr, truth_dmrs, ignore.strand = TRUE)

if (length(dmrs_cment) > 0) {
    seeds_in_dmrs <- subsetByOverlaps(seed_gr, dmrs_cment, ignore.strand = TRUE)
    seeds_in_truth_and_dmrs <- subsetByOverlaps(seeds_in_truth, dmrs_cment, ignore.strand = TRUE)
    seeds_in_truth_not_dmrs <- subsetByOverlaps(seeds_in_truth, dmrs_cment, ignore.strand = TRUE, invert = TRUE)
} else {
    seeds_in_dmrs <- seed_gr[0]
    seeds_in_truth_and_dmrs <- seed_gr[0]
    seeds_in_truth_not_dmrs <- seeds_in_truth
}

seed_truth_ov <- findOverlaps(seeds_in_truth, truth_dmrs, ignore.strand = TRUE)
truth_seed_total <- tabulate(subjectHits(seed_truth_ov), nbins = length(truth_dmrs))
seed_truth_hit_ov <- findOverlaps(seeds_in_truth_and_dmrs, truth_dmrs, ignore.strand = TRUE)
truth_seed_captured <- tabulate(subjectHits(seed_truth_hit_ov), nbins = length(truth_dmrs))
truth_seed_capture_pct <- ifelse(truth_seed_total > 0, round(100 * truth_seed_captured / truth_seed_total, 1), NA_real_)

recoverable_truth <- truth_seed_total >= min_seeds
recoverable_with_captured <- recoverable_truth & truth_seed_captured >= min_seeds
recoverable_fully_captured <- recoverable_truth & truth_seed_captured == truth_seed_total

stage1_quality <- data.frame(
    Truth_DMRs_Total = length(truth_dmrs),
    Truth_DMRs_With_Seeds = sum(truth_seed_total >= 1L),
    Truth_DMRs_Stage1Recoverable_MinSeeds = sum(recoverable_truth),
    Truth_DMRs_Stage1Recoverable_WithMinCapturedSeeds = sum(recoverable_with_captured),
    Truth_DMRs_Stage1Recoverable_FullyCapturedSeeds = sum(recoverable_fully_captured),
    Seed_Capture_Rate_Pct_in_Truth = ifelse(length(seeds_in_truth) == 0, NA_real_, round(100 * length(seeds_in_truth_and_dmrs) / length(seeds_in_truth), 1)),
    Median_Seed_Capture_Pct_Per_Truth = ifelse(sum(truth_seed_total > 0) == 0, NA_real_, stats::median(truth_seed_capture_pct[truth_seed_total > 0], na.rm = TRUE))
)

for (col in colnames(stage1_quality)) {
    cat(sprintf("- **%s**: %s\n", col, stage1_quality[[col]]))
}
  • Truth_DMRs_Total: 1000
  • Truth_DMRs_With_Seeds: 1000
  • Truth_DMRs_Stage1Recoverable_MinSeeds: 1000
  • Truth_DMRs_Stage1Recoverable_WithMinCapturedSeeds: 994
  • Truth_DMRs_Stage1Recoverable_FullyCapturedSeeds: 818
  • Seed_Capture_Rate_Pct_in_Truth: 99
  • Median_Seed_Capture_Pct_Per_Truth: 100
# Consecutive missed-seed runs are computed per truth DMR using original seed index.
ov_truth <- findOverlaps(seeds_in_truth_not_dmrs, truth_dmrs, ignore.strand = TRUE)
qh_by_truth <- split(queryHits(ov_truth), subjectHits(ov_truth))
run_df <- do.call(rbind, lapply(names(qh_by_truth), function(truth_id_chr) {
    truth_id <- as.integer(truth_id_chr)
    qh <- qh_by_truth[[truth_id_chr]]
    idx <- mcols(seeds_in_truth_not_dmrs)$index[qh]
    db <- as.numeric(mcols(seeds_in_truth_not_dmrs)$delta_beta[qh])

    ord <- order(idx)
    idx <- idx[ord]
    db <- db[ord]

    run_id <- cumsum(c(TRUE, diff(idx) != 1L))
    data.frame(
        Truth_DMR_Index = truth_id,
        run_length = as.integer(tabulate(run_id)),
        mean_delta_beta = as.numeric(tapply(db, run_id, mean, na.rm = TRUE))
    )
}))

run_count <- as.data.frame(table(run_df$run_length))
colnames(run_count) <- c("Consecutive_Seed_Length", "Count")
run_delta <- aggregate(mean_delta_beta ~ run_length, data = run_df, FUN = mean, na.rm = TRUE)
colnames(run_delta) <- c("Consecutive_Seed_Length", "Mean_Delta_Beta")
run_summary <- merge(run_count, run_delta, by = "Consecutive_Seed_Length", all.x = TRUE)

max_run_per_truth <- aggregate(run_length ~ Truth_DMR_Index, data = run_df, FUN = max)
run_overview <- data.frame(
    Truth_DMRs_With_Missed_Seeds = nrow(max_run_per_truth),
    Median_Max_Missed_Run_Per_Truth = stats::median(max_run_per_truth$run_length),
    Max_Missed_Run_Observed = max(max_run_per_truth$run_length)
)
for (col in colnames(run_overview)) {
    cat(sprintf("- **%s**: %s\n", col, run_overview[[col]]))
}
  • Truth_DMRs_With_Missed_Seeds: 182
  • Median_Max_Missed_Run_Per_Truth: 3
  • Max_Missed_Run_Observed: 26

0.7.3 CMEnt Stage 2-4 Quality Assessment (Extension and Merging)

Assessing truth-side recovery and prediction-side fragmentation/purity after extension and merging. Metrics explanation:

  • Pred_DMRs_Total: Total number of CMEnt DMRs.
  • Pred_DMRs_Overlapping_Truth: Number of CMEnt DMRs that overlap at least one truth DMR.
  • Pred_DMRs_BPPrecision_ge50pct: Number of CMEnt DMRs with base-pair precision >= 50% (i.e., at least 50% of the CMEnt DMR’s base pairs overlap with truth DMRs).
  • Pred_DMRs_BPPrecision_ge80pct: Number of CMEnt DMRs with base-pair precision >= 80%.
  • Truth_DMRs_Total: Total number of truth DMRs.
  • Truth_DMRs_Hit: Number of truth DMRs that have at least one overlapping CMEnt DMR.
  • Truth_DMRs_BPRecall_ge50pct: Number of truth DMRs with base-pair recall >= 50% (i.e., at least 50% of the truth DMR’s base pairs are covered by CMEnt DMRs).
  • Truth_DMRs_BPRecall_ge80pct: Number of truth DMRs with base-pair recall >= 80%.
  • Median_Truth_BPRecall_Pct_Hit: Median base-pair recall percentage among truth DMRs that have at least one overlapping CMEnt DMR.
  • Median_Pred_BPPrecision_Pct_Overlap: Median base-pair precision percentage among CMEnt DMRs that have at least some overlap with truth DMRs.
  • Truth_DMRs_With_Multiple_Pred: Number of truth DMRs that have 2 or more overlapping CMEnt DMRs, indicating potential fragmentation.
  • Median_Preds_Per_Hit_Truth: Median number of overlapping CMEnt DMRs among truth DMRs that have at least one overlapping CMEnt DMR.
  • Max_Preds_On_Truth: Maximum number of overlapping CMEnt DMRs observed on any single truth DMR.

Metrics values:

truth_to_pred_hits <- findOverlaps(truth_dmrs, dmrs_cment, ignore.strand = TRUE)
preds_per_truth <- tabulate(queryHits(truth_to_pred_hits), nbins = length(truth_dmrs))

truth_overlap_bp <- numeric(length(truth_dmrs))
if (length(truth_to_pred_hits) > 0) {
    truth_intersections <- pintersect(
        truth_dmrs[queryHits(truth_to_pred_hits)],
        dmrs_cment[subjectHits(truth_to_pred_hits)]
    )
    truth_intersections_by_truth <- split(truth_intersections, queryHits(truth_to_pred_hits))
    truth_overlap_bp[as.integer(names(truth_intersections_by_truth))] <- vapply(
        truth_intersections_by_truth,
        function(gr) sum(width(reduce(gr))),
        numeric(1)
    )
}
truth_bp_recall <- ifelse(width(truth_dmrs) > 0, truth_overlap_bp / width(truth_dmrs), 0)

pred_to_truth_hits <- findOverlaps(dmrs_cment, truth_dmrs, ignore.strand = TRUE)
pred_overlap_bp <- numeric(length(dmrs_cment))
if (length(pred_to_truth_hits) > 0) {
    pred_intersections <- pintersect(
        dmrs_cment[queryHits(pred_to_truth_hits)],
        truth_dmrs[subjectHits(pred_to_truth_hits)]
    )
    pred_intersections_by_pred <- split(pred_intersections, queryHits(pred_to_truth_hits))
    pred_overlap_bp[as.integer(names(pred_intersections_by_pred))] <- vapply(
        pred_intersections_by_pred,
        function(gr) sum(width(reduce(gr))),
        numeric(1)
    )
}
pred_bp_precision <- ifelse(width(dmrs_cment) > 0, pred_overlap_bp / width(dmrs_cment), 0)

stage24_quality <- data.frame(
    Pred_DMRs_Total = length(dmrs_cment),
    Pred_DMRs_Overlapping_Truth = sum(overlapsAny(dmrs_cment, truth_dmrs)),
    Pred_DMRs_BPPrecision_ge50pct = sum(pred_bp_precision >= 0.5),
    Pred_DMRs_BPPrecision_ge80pct = sum(pred_bp_precision >= 0.8),
    Truth_DMRs_Total = length(truth_dmrs),
    Truth_DMRs_Hit = sum(preds_per_truth > 0),
    Truth_DMRs_BPRecall_ge50pct = sum(truth_bp_recall >= 0.5),
    Truth_DMRs_BPRecall_ge80pct = sum(truth_bp_recall >= 0.8),
    Median_Truth_BPRecall_Pct_Hit = ifelse(sum(preds_per_truth > 0) == 0, 0, 100 * stats::median(truth_bp_recall[preds_per_truth > 0])),
    Median_Pred_BPPrecision_Pct_Overlap = ifelse(sum(pred_bp_precision > 0) == 0, 0, 100 * stats::median(pred_bp_precision[pred_bp_precision > 0])),
    Truth_DMRs_With_Multiple_Pred = sum(preds_per_truth >= 2),
    Median_Preds_Per_Hit_Truth = ifelse(sum(preds_per_truth > 0) == 0, 0, stats::median(preds_per_truth[preds_per_truth > 0])),
    Max_Preds_On_Truth = ifelse(length(preds_per_truth) == 0, 0, max(preds_per_truth))
)
for (col in names(stage24_quality)) {
    cat(sprintf("- **%s**: %s\n", col, stage24_quality[[col]]))
}
  • Pred_DMRs_Total: 1051
  • Pred_DMRs_Overlapping_Truth: 1031
  • Pred_DMRs_BPPrecision_ge50pct: 1019
  • Pred_DMRs_BPPrecision_ge80pct: 776
  • Truth_DMRs_Total: 1000
  • Truth_DMRs_Hit: 994
  • Truth_DMRs_BPRecall_ge50pct: 991
  • Truth_DMRs_BPRecall_ge80pct: 957
  • Median_Truth_BPRecall_Pct_Hit: 100
  • Median_Pred_BPPrecision_Pct_Overlap: 90.1453750160813
  • Truth_DMRs_With_Multiple_Pred: 67
  • Median_Preds_Per_Hit_Truth: 1
  • Max_Preds_On_Truth: 4

0.8 Visualizations

0.8.1 Ground-Truth Metrics

plot_df <- truth_eval |>
    dplyr::select(Method, F1_regions, F1_bp, IoU_bp) |>
    tidyr::pivot_longer(
        cols = c(F1_regions, F1_bp, IoU_bp),
        names_to = "Metric",
        values_to = "Value"
    )

plot_df$Metric <- factor(
    plot_df$Metric,
    levels = c("F1_regions", "F1_bp", "IoU_bp"),
    labels = c("F1 (Region)", "F1 (Base-pair)", "IoU (Base-pair)")
)
plot_df$Method <- factor(plot_df$Method, levels = method_order)

ggplot(plot_df, aes(x = Method, y = Value, fill = Method)) +
    geom_col() +
    facet_wrap(~Metric, nrow = 1) +
    theme_minimal() +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none"
    ) +
    ylim(0, 1) +
    labs(
        title = "Method Performance vs Ground Truth",
        x = "",
        y = "Score"
    )

0.8.2 Width Distributions

width_data <- do.call(rbind, lapply(names(all_sets), function(method) {
    x <- all_sets[[method]]
    if (length(x) == 0) {
        return(NULL)
    }
    data.frame(Method = method, Width = width(x))
}))
width_data$Method <- factor(width_data$Method, levels = names(all_sets))

ggplot(width_data, aes(x = Method, y = Width, fill = Method)) +
    geom_violin(alpha = 0.7) +
    geom_boxplot(width = 0.2, alpha = 0.5, outlier.size = 0.5) +
    scale_y_log10(labels = scales::comma) +
    theme_minimal() +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none"
    ) +
    labs(
        title = "DMR Width Distributions",
        x = "",
        y = "Width (bp, log scale)"
    )

0.9 Summary

cat("\n## Key Findings\n\n")

0.10 Key Findings

if (nrow(truth_eval) > 0) {
    best_f1 <- truth_eval[which.max(truth_eval$F1_regions), ]
    best_iou <- truth_eval[which.max(truth_eval$IoU_bp), ]

    cat(sprintf(
        "- Best region-level F1: **%s** (%.3f)\n",
        best_f1$Method, best_f1$F1_regions
    ))
    cat(sprintf(
        "- Best base-pair IoU: **%s** (%.3f)\n",
        best_iou$Method, best_iou$IoU_bp
    ))

    if ("CMEnt" %in% truth_eval$Method) {
        seg <- truth_eval[truth_eval$Method == "CMEnt", ]
        cat(sprintf(
            "- **CMEnt** region metrics: Precision = %.3f, Recall = %.3f, F1 = %.3f\n",
            seg$Precision_regions, seg$Recall_regions, seg$F1_regions
        ))
        cat(sprintf(
            "- **CMEnt** base-pair metrics: Precision = %.3f, Recall = %.3f, F1 = %.3f, IoU = %.3f\n",
            seg$Precision_bp, seg$Recall_bp, seg$F1_bp, seg$IoU_bp
        ))
    }
}

0.11 Session Info

sessionInfo()

R version 4.6.0 (2026-04-24) Platform: x86_64-pc-linux-gnu Running under: Ubuntu 24.04.4 LTS

Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

time zone: Europe/Brussels tzcode source: system (glibc)

attached base packages: [1] parallel stats4 stats graphics grDevices datasets utils
[8] methods base

other attached packages: [1] bsseq_1.48.0 CMEnt_0.99.0
[3] dplyr_1.2.1 ggplot2_4.0.3
[5] data.table_1.18.4 minfi_1.58.0
[7] bumphunter_1.54.0 locfit_1.5-9.12
[9] iterators_1.0.14 foreach_1.5.2
[11] Biostrings_2.80.1 XVector_0.52.0
[13] SummarizedExperiment_1.42.0 Biobase_2.72.0
[15] MatrixGenerics_1.24.0 matrixStats_1.5.0
[17] GenomicRanges_1.64.0 Seqinfo_1.2.0
[19] IRanges_2.46.0 S4Vectors_0.50.1
[21] BiocGenerics_0.58.1 generics_0.1.4
[23] testthat_3.3.2 BiocStyle_2.40.0

loaded via a namespace (and not attached): [1] igraph_2.3.2
[2] plotly_4.12.0
[3] ChAMPdata_2.44.0
[4] Formula_1.2-5
[5] devtools_2.5.2
[6] tidyselect_1.2.1
[7] bit_4.6.0
[8] doParallel_1.0.17
[9] clue_0.3-68
[10] lattice_0.22-9
[11] rjson_0.2.23
[12] nor1mix_1.3-3
[13] blob_1.3.0
[14] stringr_1.6.0
[15] rngtools_1.5.2
[16] S4Arrays_1.12.0
[17] base64_2.0.2
[18] dichromat_2.0-0.1
[19] scrime_1.3.7
[20] seqLogo_1.78.0
[21] png_0.1-9
[22] tinytex_0.59
[23] cli_3.6.6
[24] marray_1.90.0
[25] bedr_1.1.5
[26] outliers_0.15
[27] ProtGenerics_1.44.0
[28] askpass_1.2.1
[29] multtest_2.68.0
[30] openssl_2.4.2
[31] JADE_2.0-4
[32] nleqslv_3.3.7
[33] pkgdown_2.2.0
[34] BiocIO_1.22.0
[35] purrr_1.2.2
[36] dendextend_1.19.1
[37] curl_7.1.0
[38] mime_0.13
[39] evaluate_1.0.5
[40] wateRmelon_2.18.0
[41] ComplexHeatmap_2.28.0
[42] stringi_1.8.7
[43] backports_1.5.1
[44] desc_1.4.3
[45] XML_3.99-0.23
[46] httpuv_1.6.17
[47] AnnotationDbi_1.74.0
[48] magrittr_2.0.5
[49] rappdirs_0.3.4
[50] splines_4.6.0
[51] mclust_6.1.2
[52] DelayedDataFrame_1.28.0
[53] BiasedUrn_2.0.12
[54] jpeg_0.1-11
[55] doRNG_1.8.6.3
[56] rentrez_1.2.4
[57] org.Hs.eg.db_3.23.1
[58] IlluminaHumanMethylation450kmanifest_0.4.0
[59] DT_0.34.0
[60] sessioninfo_1.2.4
[61] DBI_1.3.0
[62] HDF5Array_1.40.0
[63] jquerylib_0.1.4
[64] genefilter_1.94.0
[65] withr_3.0.2
[66] class_7.3-23
[67] rprojroot_2.1.1
[68] brio_1.1.5
[69] sva_3.60.0
[70] isva_1.10
[71] formatR_1.14
[72] rtracklayer_1.72.0
[73] BiocManager_1.30.27
[74] Illumina450ProbeVariants.db_1.48.0
[75] htmlwidgets_1.6.4
[76] fs_2.1.0
[77] biomaRt_2.68.0
[78] missMethyl_1.46.0
[79] labeling_0.4.3
[80] SparseArray_1.12.2
[81] shinycssloaders_1.1.0
[82] h5mread_1.4.0
[83] annotate_1.90.0
[84] VariantAnnotation_1.58.0
[85] knitr_1.51
[86] TFBSTools_1.50.0
[87] UCSC.utils_1.8.0
[88] beanplot_1.3.1
[89] TFMPvalue_1.0.0
[90] ChAMP_2.42.0
[91] annotatr_1.38.0
[92] caTools_1.18.3
[93] grid_4.6.0
[94] rhdf5_2.56.0
[95] pwalign_1.8.0
[96] R.oo_1.27.1
[97] regioneR_1.44.0
[98] ellipsis_0.3.3
[99] lazyeval_0.2.3
[100] yaml_2.3.12
[101] survival_3.8-6
[102] RPMM_1.25
[103] BiocVersion_3.23.1
[104] crayon_1.5.3
[105] RColorBrewer_1.1-3
[106] tidyr_1.3.2
[107] later_1.4.8
[108] codetools_0.2-20
[109] base64enc_0.1-6
[110] GlobalOptions_0.1.4
[111] KEGGREST_1.52.0
[112] shape_1.4.6.1
[113] fastICA_1.2-7
[114] limma_3.68.4
[115] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0 [116] Rsamtools_2.28.0
[117] filelock_1.0.3
[118] showtext_0.9-8
[119] foreign_0.8-90
[120] pkgconfig_2.0.3
[121] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1 [122] xml2_1.5.2
[123] GenomicAlignments_1.48.0
[124] hummingbird_1.22.0
[125] BSgenome_1.80.0
[126] viridisLite_0.4.3
[127] biovizBase_1.60.0
[128] gridBase_0.4-7
[129] xtable_1.8-8
[130] interp_1.1-6
[131] lumi_2.64.0
[132] plyr_1.8.9
[133] httr_1.4.8
[134] tools_4.6.0
[135] pkgbuild_1.4.8
[136] htmlTable_2.5.0
[137] checkmate_2.3.4
[138] nlme_3.1-168
[139] affy_1.90.0
[140] futile.logger_1.4.9
[141] lambda.r_1.2.4
[142] dbplyr_2.5.2
[143] ExperimentHub_3.2.0
[144] IlluminaHumanMethylationEPICmanifest_0.3.0
[145] digest_0.6.39
[146] permute_0.9-10
[147] bookdown_0.46
[148] Matrix_1.7-5
[149] farver_2.1.2
[150] tzdb_0.5.0
[151] AnnotationFilter_1.36.0
[152] reshape2_1.4.5
[153] viridis_0.6.5
[154] DirichletMultinomial_1.54.0
[155] rpart_4.1.27
[156] glue_1.8.1
[157] cachem_1.1.0
[158] bspm_0.5.8
[159] VennDiagram_1.8.2
[160] BiocFileCache_3.2.0
[161] methylumi_2.58.0
[162] Hmisc_5.2-5
[163] txdbmaker_1.8.0
[164] sysfonts_0.8.9
[165] pkgload_1.5.2
[166] statmod_1.5.2
[167] impute_1.86.0
[168] GEOquery_2.80.0
[169] httr2_1.2.2
[170] showtextdb_3.0
[171] dmrseq_1.32.0
[172] siggenes_1.86.0
[173] gtools_3.9.5
[174] preprocessCore_1.74.0
[175] affyio_1.82.0
[176] gridExtra_2.3
[177] shiny_1.13.0
[178] R.utils_2.13.0
[179] rhdf5filters_1.24.0
[180] RCurl_1.98-1.19
[181] memoise_2.0.1
[182] rmarkdown_2.31
[183] scales_1.4.0
[184] R.methodsS3_1.8.2
[185] reshape_0.8.10
[186] illuminaio_0.54.0
[187] rstudioapi_0.18.0
[188] cluster_2.1.8.2
[189] ROC_1.88.0
[190] hms_1.1.4
[191] globaltest_5.66.0
[192] DMRcate_3.8.0
[193] colorspace_2.1-2
[194] FNN_1.1.4.1
[195] rlang_1.2.0
[196] quadprog_1.5-8
[197] GenomeInfoDb_1.48.0
[198] DelayedMatrixStats_1.34.0
[199] sparseMatrixStats_1.24.0
[200] shinythemes_1.2.0
[201] circlize_0.4.18
[202] mgcv_1.9-4
[203] xfun_0.58
[204] ggseqlogo_0.2.2
[205] e1071_1.7-17
[206] abind_1.4-8
[207] tibble_3.3.1
[208] Rhdf5lib_2.0.0
[209] readr_2.2.0
[210] futile.options_1.0.1
[211] bitops_1.0-9
[212] ps_1.9.3
[213] promises_1.5.0
[214] inline_0.3.21
[215] RSQLite_3.53.1
[216] qvalue_2.44.0
[217] DelayedArray_0.38.2
[218] proxy_0.4-29
[219] GO.db_3.23.1
[220] compiler_4.6.0
[221] prettyunits_1.2.0
[222] beachmat_2.28.0
[223] Rcpp_1.1.1-1.1
[224] DNAcopy_1.86.0
[225] edgeR_4.10.1
[226] AnnotationHub_4.2.0
[227] usethis_3.2.1
[228] MASS_7.3-65
[229] progress_1.2.3
[230] BiocParallel_1.46.0
[231] R6_2.6.1
[232] fastmap_1.2.0
[233] ensembldb_2.36.1
[234] nnet_7.3-20
[235] gtable_0.3.6
[236] KernSmooth_2.23-26
[237] strex_2.0.1
[238] latticeExtra_0.6-31
[239] deldir_2.0-4
[240] htmltools_0.5.9
[241] bit64_4.8.2
[242] lifecycle_1.0.5
[243] S7_0.2.2
[244] processx_3.9.0
[245] callr_3.8.0
[246] Gviz_1.56.0
[247] restfulr_0.0.16
[248] sass_0.4.10
[249] vctrs_0.7.3
[250] goseq_1.64.0
[251] bslib_0.11.0
[252] pillar_1.11.1
[253] GenomicFeatures_1.64.0
[254] magick_2.9.1
[255] otel_0.2.0
[256] combinat_0.0-8
[257] geneLenDataBase_1.48.0
[258] jsonlite_2.0.0
[259] cigarillo_1.2.0
[260] GetoptLong_1.1.1