Benchmarking CMEnt on Simulated ONT Data

0.1 Introduction

This vignette cached_results DMR detection on simulated ONT-style methylation data with known ground truth. The dataset used is a simulated ONT dataset, based on an in-house dataset of 5 samples from neuroendocrine islets, augmented to 20 samples using CMEnt::augmentBSSeq, subset to chromosomes 20 and 21, and with 1000 DMRs spiked in using CMEnt::simulateDMRs, with max delta beta (delta_max) set to 0.3 and expected within-dmr correlation (expected_corr) set to 0.7.

Methods:

CMEnt seed-guided, correlation-based DMR assembly
DMRcate Gaussian Kernel smoothing-based DMR detection
dmrseq BSseq-native region-level inference
hummingbird HMM-based DMR detection

Primary goal:

assess CMEnt performance against ground truth
compare with DMRcate, dmrseq, and hummingbird using the same dataset

0.2 Setup and Dependencies

try(devtools::load_all(), silent = TRUE)
suppressPackageStartupMessages({
    try(library(CMEnt), silent = TRUE)
    library(bsseq)
    library(data.table)
    library(ggplot2)
    library(dplyr)
    library(GenomicRanges)
    library(SummarizedExperiment)
    library(S4Vectors)
})

getBenchmarkFile <- function(filename) {
    benchmark_file <- ""
    folder <- "simulated_ont"
    if (getwd() == "notebooks") {
        benchmark_file <- file.path("cached_results", folder, filename)
    } else if ("NAMESPACE" %in% dir()) {
        benchmark_file <- file.path("notebooks", "cached_results", folder, filename)
    } else {
        benchmark_file <- file.path("cached_results", folder, filename)
    }
    dir.create(dirname(benchmark_file), showWarnings = FALSE, recursive = TRUE)
    benchmark_file
}

min_seeds <- 2L
min_sites <- 50L
max_pval <- 0.05
testing_mode  <- "parametric"
ext_site_delta_beta  <- 0.1
max_bridge_seeds_gaps <- 1L
max_bridge_extension_gaps <- 2L
max_lookup_dist <- 1000L
method_cache_tag <- sprintf(
    "sites%d_p%s_ext%s_lookup%d",
    min_sites,
    gsub("[^0-9]+", "", sprintf("%.3f", max_pval)),
    gsub("[^0-9]+", "", sprintf("%.3f", ext_site_delta_beta)),
    max_lookup_dist
)

0.2.1 Install External Dependencies

if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

if (!requireNamespace("DMRcate", quietly = TRUE)) {
    BiocManager::install("DMRcate", update = FALSE, ask = FALSE)
}

if (!requireNamespace("dmrseq", quietly = TRUE)) {
    BiocManager::install("dmrseq", update = FALSE, ask = FALSE)
}

if (!requireNamespace("hummingbird", quietly = TRUE)) {
    BiocManager::install("hummingbird", update = FALSE, ask = FALSE)
}

if (!requireNamespace("tidyr", quietly = TRUE)) {
    install.packages("tidyr")
}

0.3 Load Simulated Data and Ground Truth

cment_file <- getBenchmarkFile(sprintf("dmrs.CMEnt.%s.rds", method_cache_tag))
dmrcate_file <- getBenchmarkFile(sprintf("dmrs.DMRcate.%s.rds", method_cache_tag))
dmrseq_file <- getBenchmarkFile(sprintf("dmrs.dmrseq.%s.rds", method_cache_tag))
hummingbird_file <- getBenchmarkFile(sprintf("dmrs.hummingbird.%s.rds", method_cache_tag))
pheno_file <- getBenchmarkFile("ont_sim_samplesheet.tsv")
seeds_file <- getBenchmarkFile(sprintf("seeds.p%s.tsv.gz", gsub("[^0-9]+", "", sprintf("%.3f", max_pval))))

computation_needed <- !file.exists(cment_file) || !file.exists(dmrcate_file) || !file.exists(dmrseq_file) || !file.exists(hummingbird_file) || !file.exists(pheno_file) || !file.exists(seeds_file)
zenodo_base_url <- "https://zenodo.org/records/20592944/files/"

sim_truth_path <- getBenchmarkFile("ont_sim_dmrs.tsv")
if (computation_needed) {
    dir.create("data", showWarnings = FALSE, recursive = TRUE)
    ont_sim_bs_path <- getBenchmarkFile("ont_sim_bs.rds")
    ont_sim_bs_hdf5_path <- getBenchmarkFile("ont_sim_bs.h5")
    if (file.exists(ont_sim_bs_hdf5_path)) {
        ont_sim_bs <- HDF5Array::loadHDF5SummarizedExperiment(ont_sim_bs_hdf5_path)
    } else {

        if (!file.exists(ont_sim_bs_path)) {
            curl::curl_download(
                url = paste0(zenodo_base_url, basename(ont_sim_bs_path), "?download=1"),
                destfile = ont_sim_bs_path
            )
        }

        ont_sim_bs <- readRDS(ont_sim_bs_path)
        ont_sim_bs <- sort(ont_sim_bs)

        ont_sim_bs <- HDF5Array::saveHDF5SummarizedExperiment(
            ont_sim_bs,
            dir = ont_sim_bs_hdf5_path,
            replace = TRUE
        )
    }

    stopifnot(inherits(ont_sim_bs, "BSseq"))
    cat(sprintf("Loaded BSseq with %d sites and %d samples\n", nrow(ont_sim_bs), ncol(ont_sim_bs)))

}

Loaded BSseq with 2292633 sites and 20 samples

if (!file.exists(sim_truth_path)) {
    curl::curl_download(
        url = paste0(zenodo_base_url, basename(sim_truth_path), "?download=1"),
        destfile = sim_truth_path
    )
}
truth_df <- data.table::fread(sim_truth_path)
truth_dmrs <- makeGRangesFromDataFrame(
    truth_df,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end",
    keep.extra.columns = TRUE,
    seqinfo = Seqinfo(genome = "hg38")
)

cat(sprintf("Loaded %d ground-truth DMRs\n", length(truth_dmrs)))

Loaded 1000 ground-truth DMRs

0.4 Build Phenotype Table from Sample Names

if (!file.exists(pheno_file)) {
    sample_ids <- colnames(ont_sim_bs)
    sample_group <- ifelse(grepl("^Condition1", sample_ids), "Condition1", "Condition2")

    pheno <- data.frame(
        Sample_ID = sample_ids,
        Sample_Group = sample_group,
        casecontrol = sample_group == "Condition2",
        stringsAsFactors = FALSE
    )
    rownames(pheno) <- pheno$Sample_ID
    data.table::fwrite(pheno, pheno_file, sep = "\t")
} else {
    pheno <- data.table::fread(pheno_file, data.table = FALSE)
}
if (computation_needed) {
    colData(ont_sim_bs) <- S4Vectors::DataFrame(pheno, row.names = pheno$Sample_ID)
    ont_sim_bs  <- sort(ont_sim_bs)
}
knitr::kable(
    table(pheno$Sample_Group),
    caption = "Phenotype Frequency Table"
)

Table 1: Phenotype Frequency Table
Var1	Freq
Condition1	10
Condition2	10

0.5 Benchmarking DMR Detection Methods

0.5.1 CMEnt

0.5.1.1 Adjusting parameters for NGS simulation

knitr::kable(
    data.frame(
        Parameter = c(
            "min_seeds", "min_sites", "max_pval", "testing_mode",
            "ext_site_delta_beta", "max_bridge_seeds_gaps",
            "max_bridge_extension_gaps", "max_lookup_dist"
        ),
        Value = c(
            min_seeds, min_sites, max_pval, testing_mode,
            ext_site_delta_beta, max_bridge_seeds_gaps,
            max_bridge_extension_gaps, max_lookup_dist
        )
    ),
    caption = "Benchmark detection parameters"
)

Table 2: Benchmark detection parameters
Parameter	Value
min_seeds	2
min_sites	50
max_pval	0.05
testing_mode	parametric
ext_site_delta_beta	0.1
max_bridge_seeds_gaps	1
max_bridge_extension_gaps	2
max_lookup_dist	1000

0.5.1.2 Extract Methylation Matrix and Generate Seeds

This step is crucial for CMEnt, which relies on seed sites to guide DMR assembly. We use a DSS approach to identify candidate seeds based on differential methylation evidence. We keep the FDR threshold relatively lenient, to account for the low sample size.

if (!file.exists(seeds_file)) {
    seeds <- findDMPsBSSeq(
        bsseq = ont_sim_bs,
        samplesheet = pheno,
        sample_group_col = "Sample_Group",
        id_col = "Sample_ID",
        case_group = "Condition2",
        covariates = NULL,
        output_file = seeds_file
    )
    seeds <- seeds[seeds$qval < max_pval, , drop = FALSE]
} else {
    seeds <- data.table::fread(seeds_file)
    rownames(seeds) <- seeds$site_id
}
seeds <- as.data.frame(seeds, stringsAsFactors = FALSE)
rownames(seeds) <- seeds$site_id
seeds$Row.names <- NULL
seed_dist <- unlist(lapply(split(seeds$start, seeds$chr), function(x) {
    if (length(x) < 2) return(numeric(0))
    diff(x)
}))
seeds_qs <- round(stats::quantile(seed_dist, probs = c(0, 0.25, 0.5, 0.75, 0.9, 0.99, 1)))
cat(sprintf("Number of seeds for CMEnt: %d\n", nrow(seeds)))

Number of seeds for CMEnt: 74216

knitr::kable(
    seeds_qs,
    digits = 2,
    col.names = "Distance (bp)",
    caption = sprintf("Seed distance quantiles")
)

(#tab:seeds_calculation)Seed distance quantiles
	Distance (bp)
0%	1
25%	1
50%	15
75%	104
90%	329
99%	41460
100%	412180

0.5.1.3 Seed Overlap with Ground Truth

Metrics explanation:

Seeds_Total: Total number of seed sites identified.
Seeds_Overlapping_Truth: Number of seed sites that overlap any ground-truth DMR.
Seeds_Overlap_Pct: Percentage of seed sites that overlap ground-truth DMRs.
Truth_DMRs_Total: Total number of ground-truth DMRs.
Truth_DMRs_With_Seed_Pct: Percentage of ground-truth DMRs that have at least one overlapping seed site.
Truth_DMRs_With_Enough_Seeds_Pct: Percentage of ground-truth DMRs that have enough overlapping seed sites to be captured by CMEnt.
Truth_DMRs_With_Enough_Proximal_Seeds_Pct: Percentage of ground-truth DMRs that have enough overlapping seed sites within a specified proximity (max_lookup_dist) to be captured by CMEnt.
Median_Seeds_Per_Truth_DMR: Median number of overlapping seeds per ground-truth DMR (including those with zero seeds).
Median_Seeds_Per_Hit_Truth_DMR: Median number of overlapping seeds among ground-truth DMRs that have at least one seed.
Max_Seeds_In_Truth_DMR: Maximum number of overlapping seeds in any single ground-truth DMR.
Overlap_Pairs_Direction_Match_Pct: Among seed-truth overlap pairs, the percentage where the direction of methylation change (hyper/hypo) matches between the seed and the truth DMR.
Overlapping_Seeds_Any_Direction_Match_Pct: Percentage of seeds that have at least one overlapping truth DMR with a matching direction of change.

Metrics values:

seed_df <- as.data.frame(seeds, stringsAsFactors = FALSE)
seed_df <- seed_df[!is.na(seed_df$start) & nzchar(seed_df$chr), , drop = FALSE]

seed_gr <- GenomicRanges::makeGRangesFromDataFrame(
    seed_df,
    seqnames.field = "chr",
    start.field = "start",
    end.field = "end",
    keep.extra.columns = TRUE,
    seqinfo = Seqinfo(genome = "hg38")
)

seed_truth_hits <- GenomicRanges::findOverlaps(seed_gr, truth_dmrs, ignore.strand = TRUE)
seed_overlaps_truth <- rep(FALSE, length(seed_gr))
truth_has_seed <- rep(FALSE, length(truth_dmrs))
truth_has_enough_seeds <- 0L

if (length(seed_truth_hits) > 0) {
    seed_overlaps_truth[unique(S4Vectors::queryHits(seed_truth_hits))] <- TRUE
    truth_has_seed[unique(S4Vectors::subjectHits(seed_truth_hits))] <- TRUE
}

seeds_per_truth <- if (length(truth_dmrs) > 0) {
    tabulate(S4Vectors::subjectHits(seed_truth_hits), nbins = length(truth_dmrs))
} else {
    integer(0)
}
seeds_per_truth_within_mindist <- if (length(seed_truth_hits) > 0) {
    seed_positions <- GenomicRanges::start(seed_gr)[S4Vectors::queryHits(seed_truth_hits)]
    seeds_within_mindist <- list()
    for (i in seq_along(truth_dmrs)){
        dmr_seeds_mask <- S4Vectors::subjectHits(seed_truth_hits) == i
        if (sum(dmr_seeds_mask) == 0) {
            seeds_within_mindist[[i]] <- 0
        }
        sorts <- order(seed_positions[dmr_seeds_mask])
        dists <- diff(seed_positions[dmr_seeds_mask][sorts])
        seeds_within_mindist[[i]] <- sum(dists <= max_lookup_dist)
    }
    unlist(seeds_within_mindist)
} else {
    integer(0)
}
truth_has_enough_seeds <- sum(seeds_per_truth >= min_seeds)
safe_pct <- function(x) if (length(x) == 0) 0 else round(100 * mean(x), 1)

seed_truth_summary <- data.frame(
    Seeds_Total = length(seed_gr),
    Seeds_Overlapping_Truth = sum(seed_overlaps_truth),
    Seeds_Overlap_Pct = safe_pct(seed_overlaps_truth),
    Truth_DMRs_Total = length(truth_dmrs),
    Truth_DMRs_With_Seed_Pct = safe_pct(truth_has_seed),
    Truth_DMRs_With_Enough_Seeds_Pct = ifelse(length(truth_dmrs) == 0, 0, 100 * truth_has_enough_seeds / length(truth_dmrs)),
    Truth_DMRs_With_Enough_Proximal_Seeds_Pct = ifelse(length(truth_dmrs) == 0, 0, 100 * sum(seeds_per_truth_within_mindist >= min_seeds) / length(truth_dmrs)),
    Median_Seeds_Per_Truth_DMR = ifelse(length(seeds_per_truth) == 0, NA_real_, stats::median(seeds_per_truth)),
    Median_Seeds_Per_Hit_Truth_DMR = ifelse(sum(truth_has_seed) == 0, 0, stats::median(seeds_per_truth[truth_has_seed])),
    Max_Seeds_In_Truth_DMR = ifelse(length(seeds_per_truth) == 0, NA_integer_, max(seeds_per_truth))
)
seed_sign <- sign(as.numeric(seed_df$delta_beta))
truth_sign <- sign(as.numeric(SummarizedExperiment::mcols(truth_dmrs)$delta_beta))
pair_match <- seed_sign[S4Vectors::queryHits(seed_truth_hits)] ==
    truth_sign[S4Vectors::subjectHits(seed_truth_hits)]

seed_pair_direction_match_pct <- if (all(is.na(pair_match))) NA_real_ else 100 * mean(pair_match, na.rm = TRUE)
per_seed_dir_match <- tapply(pair_match, S4Vectors::queryHits(seed_truth_hits), function(x) any(x, na.rm = TRUE))
seed_any_direction_match_pct <- if (length(per_seed_dir_match) == 0) NA_real_ else 100 * mean(per_seed_dir_match)

seed_truth_summary$Overlap_Pairs_Direction_Match_Pct <- round(seed_pair_direction_match_pct, 1)
seed_truth_summary$Overlapping_Seeds_Any_Direction_Match_Pct <- round(seed_any_direction_match_pct, 1)

for (i in seq_along(seed_truth_summary)) {
    cat(sprintf("- **%s**: %s\n", names(seed_truth_summary)[i], seed_truth_summary[[i]]))
}

Seeds_Total: 74216
Seeds_Overlapping_Truth: 71112
Seeds_Overlap_Pct: 95.8
Truth_DMRs_Total: 1000
Truth_DMRs_With_Seed_Pct: 100
Truth_DMRs_With_Enough_Seeds_Pct: 100
Truth_DMRs_With_Enough_Proximal_Seeds_Pct: 100
Median_Seeds_Per_Truth_DMR: 48
Median_Seeds_Per_Hit_Truth_DMR: 48
Max_Seeds_In_Truth_DMR: 423
Overlap_Pairs_Direction_Match_Pct: 100
Overlapping_Seeds_Any_Direction_Match_Pct: 100

score_col <- "score"

ord <- order(seed_df[[score_col]], decreasing = TRUE, na.last = NA)
top_k <- unique(pmin(length(ord), c(100, 500, 1000, 5000, 10000, length(ord))))
top_seed_overlap <- do.call(rbind, lapply(top_k, function(k) {
    idx <- ord[seq_len(k)]
    n_hit <- sum(seed_overlaps_truth[idx])
    data.frame(
        `Top Seeds` = k,
        `Overlap Pct` = 100 * n_hit / k
    )
}))

knitr::kable(
    top_seed_overlap,
    digits = 2,
    caption = sprintf("Ground-truth overlap enrichment among top-scoring seeds")
)

Table 3: Ground-truth overlap enrichment among top-scoring seeds
Top.Seeds	Overlap.Pct
100	100.00
500	100.00
1000	100.00
5000	100.00
10000	100.00
74216	95.82

if (!file.exists(cment_file)) {
    default_cment_njobs <- max(1L, min(8L, parallel::detectCores(logical = TRUE) - 1L))
    cment_njobs <- suppressWarnings(as.integer(
        Sys.getenv("CMENT_BENCHMARK_NJOBS", as.character(default_cment_njobs))
    ))
    if (!is.finite(cment_njobs) || is.na(cment_njobs) || cment_njobs < 1L) {
        cment_njobs <- default_cment_njobs
    }
    cment_njobs <- max(1L, min(cment_njobs, parallel::detectCores(logical = TRUE)))
    options("CMEnt.njobs" = cment_njobs)
    options("CMEnt.verbose" = 3)
    cat(sprintf("CMEnt benchmark jobs: %d\n", cment_njobs))
    start_time <- Sys.time()

    beta_handler <- CMEnt::getBetaHandler(
        beta = ont_sim_bs
    )
    gc(verbose = FALSE)
    dmrs_cment <- CMEnt::buildDMRs(
        beta = beta_handler,
        seeds = seeds,
        pheno = pheno,
        seeds_id_col = "site_id",
        sample_group_col = "Sample_Group",
        casecontrol_col = "casecontrol",
        genome = "hg38",
        array = "NULL",
        entanglement = "weak",
        testing_mode = testing_mode,
        max_bridge_seeds_gaps = max_bridge_seeds_gaps,
        max_bridge_extension_gaps = max_bridge_extension_gaps,
        min_seeds = min_seeds,
        min_sites = min_sites/2,
        max_pval = max_pval,
        ext_site_delta_beta = ext_site_delta_beta,
        max_lookup_dist = max_lookup_dist, # Use a larger lookup distance (10k is the default)
        annotate_with_genes = FALSE,
        extract_motifs = FALSE,
        njobs = cment_njobs
    )
    end_time <- Sys.time()
    time_cment <- end_time - start_time
    saveRDS(list(dmrs = dmrs_cment, time = time_cment), cment_file)
    rm(beta_handler)
    colData(ont_sim_bs) <- S4Vectors::DataFrame(pheno, row.names = pheno$Sample_ID)
    gc(verbose = FALSE)
} else {
    ret <- readRDS(cment_file)
    dmrs_cment <- ret$dmrs
    time_cment <- ret$time
}

0.5.2 DMRcate

run_dmrcate_bsseq <- function(
    bs_obj,
    design,
    max_sites_per_chunk = 500000L,
    chunk_overlap_sites = 2000L
) {
    if (!requireNamespace("DMRcate", quietly = TRUE)) {
        warning("DMRcate not installed; returning empty set.")
        return(GenomicRanges::GRanges())
    }
    dmrcate_design <- as.matrix(design)
    if (nrow(dmrcate_design) != ncol(bs_obj)) {
        warning(sprintf(
            "DMRcate design rows (%d) do not match BSseq samples (%d); returning empty set.",
            nrow(dmrcate_design),
            ncol(bs_obj)
        ))
        return(GenomicRanges::GRanges())
    }
    if (ncol(dmrcate_design) < 2L) {
        warning("DMRcate requires at least two groups in the design matrix; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    colnames(dmrcate_design) <- gsub("Sample_Group", "", colnames(dmrcate_design))
    colnames(dmrcate_design)[1] <- "Intercept"
    dmrcate_design <- edgeR::modelMatrixMeth(dmrcate_design)

    coef_names <- colnames(dmrcate_design)
    sample_cols <- grepl("^Sample[0-9]+$", coef_names)
    intercept_cols <- coef_names %in% c("Intercept", "(Intercept)")
    contrast_candidates <- coef_names[!(sample_cols | intercept_cols)]

    if (length(contrast_candidates) == 0) {
        warning(sprintf(
            "DMRcate could not infer a contrast coefficient from methdesign columns (%s); returning empty set.",
            paste(coef_names, collapse = ", ")
        ))
        return(GenomicRanges::GRanges())
    }

    preferred_coef <- c("Condition2_vs_Condition1", "Condition2", "cancer_vs_normal")
    selected_coef_name <- preferred_coef[preferred_coef %in% contrast_candidates][1]
    if (is.na(selected_coef_name)) {
        selected_coef_name <- contrast_candidates[length(contrast_candidates)]
    }
    selected_coef <- match(selected_coef_name, coef_names)
    cat(sprintf(
        "DMRcate contrast coefficient: %s (column %d of methdesign)\n",
        selected_coef_name,
        selected_coef
    ))

    all_gr <- bsseq::granges(bs_obj)
    all_seqnames <- as.character(GenomicRanges::seqnames(all_gr))
    all_pos <- GenomicRanges::start(all_gr)
    rm(all_gr)
    gc(verbose = FALSE)

    chr_vec <- unique(all_seqnames)
    chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]

    if (length(chr_vec) == 0) {
        warning("No chromosomes found in BSseq object; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    dmrs_by_chr <- vector("list", length(chr_vec))
    names(dmrs_by_chr) <- chr_vec

    for (chr_name in chr_vec) {

        chr_idx <- which(all_seqnames == chr_name)
        if (length(chr_idx) == 0) {
            next
        }

        n_chr <- length(chr_idx)
        chunk_starts <- seq.int(1L, n_chr, by = as.integer(max_sites_per_chunk))
        chr_chunk_dmrs <- vector("list", length(chunk_starts))

        for (chunk_i in seq_along(chunk_starts)) {
            core_from <- chunk_starts[chunk_i]
            core_to <- min(core_from + as.integer(max_sites_per_chunk) - 1L, n_chr)

            ext_from <- max(1L, core_from - as.integer(chunk_overlap_sites))
            ext_to <- min(n_chr, core_to + as.integer(chunk_overlap_sites))

            idx_ext <- chr_idx[ext_from:ext_to]
            core_start_pos <- all_pos[chr_idx[core_from]]
            core_end_pos <- all_pos[chr_idx[core_to]]

            cat(sprintf(
                "DMRcate: %s chunk %d/%d (%s sites core, %s sites with overlap)\n",
                chr_name,
                chunk_i,
                length(chunk_starts),
                format(core_to - core_from + 1L, big.mark = ","),
                format(length(idx_ext), big.mark = ",")
            ))

            chunk_bs <- bs_obj[idx_ext, , drop = FALSE]
            if (nrow(chunk_bs) < 3L) {
                chr_chunk_dmrs[[chunk_i]] <- GenomicRanges::GRanges()
                rm(chunk_bs)
                gc(verbose = FALSE)
                next
            }

            chunk_out <- tryCatch(
                {
                    ann <- DMRcate::sequencing.annotate(
                        obj = chunk_bs,
                        all.cov = TRUE,
                        methdesign = dmrcate_design,
                        coef = selected_coef,
                        fdr = max_pval
                    )
                    dmrcate_res <- DMRcate::dmrcate(
                        ann,
                        lambda = 1000,
                        C = 2,
                        min.cpgs = min_sites
                    )
                    ranges_out <- DMRcate::extractRanges(dmrcate_res, genome = "hg38")
                    rm(ann, dmrcate_res)
                    gc(verbose = FALSE)

                    if (length(ranges_out) > 0) {
                        mids <- floor((GenomicRanges::start(ranges_out) + GenomicRanges::end(ranges_out)) / 2)
                        keep <- mids >= core_start_pos & mids <= core_end_pos
                        ranges_out <- ranges_out[keep]
                    }

                    ranges_out
                },
                error = function(e) {
                    cat(sprintf(
                        "DMRcate chunk failed (%s chunk %d/%d, fdr=0.05): %s\n",
                        chr_name,
                        chunk_i,
                        length(chunk_starts),
                        e$message
                    ))
                    GenomicRanges::GRanges()
                }
            )
            cat(sprintf(
                "DMRcate chunk %d/%d found %d DMRs\n",
                chunk_i,
                length(chunk_starts),
                length(chunk_out)
            ))
            chr_chunk_dmrs[[chunk_i]] <- chunk_out
            rm(chunk_bs, chunk_out)
            gc(verbose = FALSE)
        }

        chr_chunk_dmrs <- Filter(function(x) !is.null(x) && length(x) > 0, chr_chunk_dmrs)
        if (length(chr_chunk_dmrs) == 0) {
            dmrs_by_chr[[chr_name]] <- NULL
        } else {
            dmrs_by_chr[[chr_name]] <- GenomicRanges::sort(do.call(c, chr_chunk_dmrs))
        }

        rm(chr_chunk_dmrs)
        gc(verbose = FALSE)
    }

    dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
    if (length(dmrs_by_chr) == 0) {
        return(GenomicRanges::GRanges())
    }

    out <- do.call(c, unname(dmrs_by_chr))
    out <- GenomicRanges::sort(out)
    out
}

if (!file.exists(dmrcate_file)) {
    start_time <- Sys.time()
    design <- model.matrix(~ Sample_Group, data = colData(ont_sim_bs))
    dmrs_dmrcate <- run_dmrcate_bsseq(ont_sim_bs, design)
    end_time <- Sys.time()
    time_dmrcate <- end_time - start_time
    saveRDS(list(dmrs = dmrs_dmrcate, time = time_dmrcate), dmrcate_file)
} else {
    ret <- readRDS(dmrcate_file)
    dmrs_dmrcate <- ret$dmrs
    time_dmrcate <- ret$time
}

0.5.3 dmrseq

run_dmrseq_bsseq <- function(
    bs_obj,
    chunk = TRUE,
    max_sites_per_chunk = 500000L,
    chunk_overlap_sites = 2000L
) {
    if (!requireNamespace("dmrseq", quietly = TRUE)) {
        warning("dmrseq not installed; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    if (!"Sample_Group" %in% colnames(as.data.frame(colData(bs_obj)))) {
        warning("Sample_Group not present in BSseq colData; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    all_gr <- bsseq::granges(bs_obj)
    all_seqnames <- as.character(GenomicRanges::seqnames(all_gr))
    all_pos <- GenomicRanges::start(all_gr)
    rm(all_gr)
    gc(verbose = FALSE)

    chr_vec <- unique(all_seqnames)
    chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]

    if (length(chr_vec) == 0) {
        warning("No chromosomes found in BSseq object; returning empty set.")
        return(GenomicRanges::GRanges())
    }

    dmrs_by_chr <- vector("list", length(chr_vec))
    names(dmrs_by_chr) <- chr_vec
    bpparam <- CMEnt:::.makeBiocParallelParam(max(1L, min(8L, parallel::detectCores(logical = TRUE) - 1L)), progressbar = TRUE)
    .runDMRSeq <- function(chunk_bs, chr_name, chunk_i, core_start_pos = NA_integer_, core_end_pos = NA_integer_) {
        chunk_out <- tryCatch(
            {
                fit <- dmrseq::dmrseq(
                    bs = chunk_bs,
                    testCovariate = "Sample_Group",
                    cutoff = 0.1,
                    minNumRegion = min_sites,
                    verbose = FALSE,
                    BPPARAM = bpparam
                )
                qvals <- mcols(fit)$qval
                if (length(fit) > 0 && any(is.finite(qvals))) {
                    fit <- fit[is.finite(qvals) & qvals < 0.05]
                } else if (length(fit) > 0) {
                    cat(sprintf(
                        "dmrseq chunk %s %d/%d returned observed candidates but no finite q-values; keeping candidates without permutation significance.\n",
                        chr_name,
                        chunk_i,
                        length(chunk_starts)
                    ))
                }

                if (length(fit) > 0 && !is.na(core_start_pos) && !is.na(core_end_pos)) {
                    mids <- floor((GenomicRanges::start(fit) + GenomicRanges::end(fit)) / 2)
                    keep <- mids >= core_start_pos & mids <= core_end_pos
                    fit <- fit[keep]
                }

                gc(verbose = FALSE)
                fit
            },
            error = function(e) {
                cat(sprintf(
                    "dmrseq chunk failed (%s chunk %d/%d): %s\n",
                    chr_name,
                    chunk_i,
                    length(chunk_starts),
                    e$message
                ))
                NULL
            }
        )
        chunk_out
    }
    if (!chunk) {
        out <- .runDMRSeq(bs_obj, "", 1, NA_integer_, NA_integer_)
    } else {
        for (chr_name in chr_vec) {
            chr_idx <- which(all_seqnames == chr_name)
            if (length(chr_idx) == 0) {
                next
            }

            n_chr <- length(chr_idx)
            chunk_starts <- seq.int(1L, n_chr, by = as.integer(max_sites_per_chunk))
            chr_chunk_dmrs <- vector("list", length(chunk_starts))

            for (chunk_i in seq_along(chunk_starts)) {
                core_from <- chunk_starts[chunk_i]
                core_to <- min(core_from + as.integer(max_sites_per_chunk) - 1L, n_chr)

                ext_from <- max(1L, core_from - as.integer(chunk_overlap_sites))
                ext_to <- min(n_chr, core_to + as.integer(chunk_overlap_sites))

                idx_ext <- chr_idx[ext_from:ext_to]
                core_start_pos <- all_pos[chr_idx[core_from]]
                core_end_pos <- all_pos[chr_idx[core_to]]

                cat(sprintf(
                    "dmrseq: %s chunk %d/%d (%s sites core, %s sites with overlap)\n",
                    chr_name,
                    chunk_i,
                    length(chunk_starts),
                    format(core_to - core_from + 1L, big.mark = ","),
                    format(length(idx_ext), big.mark = ",")
                ))

                chunk_bs <- bs_obj[idx_ext, , drop = FALSE]
                chr_chunk_dmrs[[chunk_i]] <- .runDMRSeq(chunk_bs, chr_name, chunk_i, core_start_pos, core_end_pos)
                rm(chunk_bs)
                gc(verbose = FALSE)
            }

            chr_chunk_dmrs <- Filter(function(x) !is.null(x) && length(x) > 0, chr_chunk_dmrs)
            if (length(chr_chunk_dmrs) == 0) {
                dmrs_by_chr[[chr_name]] <- NULL
            } else {
                dmrs_by_chr[[chr_name]] <- GenomicRanges::sort(do.call(c, chr_chunk_dmrs))
            }

            rm(chr_chunk_dmrs)
            gc(verbose = FALSE)
        }

        dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
        if (length(dmrs_by_chr) == 0) {
            return(GenomicRanges::GRanges())
        }

        out <- do.call(c, unname(dmrs_by_chr))
    }
    out <- GenomicRanges::sort(out)
    out
}

if (!file.exists(dmrseq_file)) {
    start_time <- Sys.time()
    dmrs_dmrseq <- run_dmrseq_bsseq(ont_sim_bs)
    end_time <- Sys.time()
    time_dmrseq <- end_time - start_time
    saveRDS(list(dmrs = dmrs_dmrseq, time = time_dmrseq), dmrseq_file)
} else {
    ret <- readRDS(dmrseq_file)
    dmrs_dmrseq <- ret$dmrs
    time_dmrseq <- ret$time
}

0.5.4 hummingbird

run_hummingbird_bsseq <- function(
    bs_obj,
    max_gap = 300L, # default
    min_sites = 3L, # default is 10
    min_length = 50L # default is 500
) {
    sample_df <- as.data.frame(SummarizedExperiment::colData(bs_obj))
    if (!"Sample_Group" %in% colnames(sample_df)) {
        warning("Sample_Group not present in BSseq colData; returning empty set.")
        return(GenomicRanges::GPos())
    }

    ctrl_idx <- which(sample_df$Sample_Group == "Condition1")
    case_idx <- which(sample_df$Sample_Group == "Condition2")
    if (length(ctrl_idx) == 0 || length(case_idx) == 0) {
        warning("Need both Condition1 and Condition2 samples for hummingbird.")
        return(GenomicRanges::GPos())
    }

    row_gr <- bsseq::granges(bs_obj)
    # granges to gpos for hummingbird
    row_gr <- GPos(
        seqnames = GenomicRanges::seqnames(row_gr),
        pos = GenomicRanges::start(row_gr)
    )

    to_count_matrix <- function(x) {
        x <- as.matrix(x)
        x[!is.finite(x)] <- 0
        x[x < 0] <- 0
        storage.mode(x) <- "integer"
        x
    }

    all_seqnames <- as.character(GenomicRanges::seqnames(row_gr))
    chr_vec <- unique(all_seqnames)
    chr_vec <- chr_vec[!is.na(chr_vec) & nzchar(chr_vec)]
    if (length(chr_vec) == 0) {
        warning("No chromosomes found in BSseq object; returning empty set.")
        return(GenomicRanges::GPos())
    }

    dmrs_by_chr <- vector("list", length(chr_vec))
    names(dmrs_by_chr) <- chr_vec

    for (chr_name in chr_vec) {
        chr_idx <- which(all_seqnames == chr_name)
        if (length(chr_idx) < as.integer(min_sites)) {
            next
        }

        chr_gr <- row_gr[chr_idx]
        chr_bs <- bs_obj[chr_idx, , drop = FALSE]
        meth_mat <- bsseq::getCoverage(chr_bs, type = "M")
        cov_mat <- bsseq::getCoverage(chr_bs, type = "Cov")
        unmeth_mat <- cov_mat - meth_mat
        control_se <- SummarizedExperiment::SummarizedExperiment(
            rowRanges = chr_gr,
            assays = list(
                normM = to_count_matrix(meth_mat[, ctrl_idx, drop = FALSE]),
                normUM = to_count_matrix(unmeth_mat[, ctrl_idx, drop = FALSE])
            )
        )
        case_se <- SummarizedExperiment::SummarizedExperiment(
            rowRanges = chr_gr,
            assays = list(
                abnormM = to_count_matrix(meth_mat[, case_idx, drop = FALSE]),
                abnormUM = to_count_matrix(unmeth_mat[, case_idx, drop = FALSE])
            )
        )

        chr_out <- tryCatch(
            {
                hb_fit <- hummingbird::hummingbirdEM(experimentInfoControl = control_se, experimentInfoCase = case_se, binSize = 40)
                hb_fit <- hummingbird::hummingbirdPostAdjustment(
                    emInfo = hb_fit,
                    experimentInfoControl = control_se,
                    experimentInfoCase = case_se,
                    maxGap = as.integer(max_gap),
                    minCpGs = as.integer(min_sites),
                    minLength = as.integer(min_length)
                )

                hb_fit$DMRs
            },
            error = function(e) {
                cat(sprintf("hummingbird failed (%s): %s\n", chr_name, e$message))
                NULL
            }
        )

        dmrs_by_chr[[chr_name]] <- chr_out
        rm(chr_bs, chr_gr, meth_mat, cov_mat, unmeth_mat, control_se, case_se, chr_out)
        gc(verbose = FALSE)
    }

    dmrs_by_chr <- Filter(function(x) !is.null(x) && length(x) > 0, dmrs_by_chr)
    if (length(dmrs_by_chr) == 0) {
        return(GenomicRanges::GPos())
    }

    out <- suppressWarnings(do.call(c, unname(dmrs_by_chr)))
    GenomicRanges::sort(out)
}

if (!file.exists(hummingbird_file)) {
    start_time <- Sys.time()
    dmrs_hummingbird <- run_hummingbird_bsseq(ont_sim_bs, min_sites = min_sites, max_gap = 300L)
    end_time <- Sys.time()
    time_hummingbird <- end_time - start_time
    saveRDS(list(dmrs = dmrs_hummingbird, time = time_hummingbird), hummingbird_file)
} else {
    ret <- readRDS(hummingbird_file)
    dmrs_hummingbird <- ret$dmrs
    time_hummingbird <- ret$time
}

cat(sprintf("**CMEnt**: Found %d DMRs in %d seconds\n", length(dmrs_cment), round(as.numeric(time_cment, units = "secs"))))

CMEnt: Found 1051 DMRs in 606 seconds

cat(sprintf("**DMRcate**: Found %d DMRs in %d seconds\n", length(dmrs_dmrcate), round(as.numeric(time_dmrcate, units = "secs"))))

DMRcate: Found 445 DMRs in 243 seconds

cat(sprintf("**dmrseq**: Found %d DMRs in %d seconds\n", length(dmrs_dmrseq), round(as.numeric(time_dmrseq, units = "secs"))))

dmrseq: Found 800 DMRs in 3636 seconds

cat(sprintf("**hummingbird**: Found %d DMRs in %d seconds\n", length(dmrs_hummingbird), round(as.numeric(time_hummingbird, units = "secs"))))

hummingbird: Found 549 DMRs in 45 seconds

0.6 Comparative Analysis

methods_list <- list(
    CMEnt = dmrs_cment,
    DMRcate = dmrs_dmrcate,
    dmrseq = dmrs_dmrseq,
    hummingbird = dmrs_hummingbird
)
method_order <- names(methods_list)

get_dmr_stats <- function(dmrs, method) {
    if (length(dmrs) == 0) {
        return(data.frame(
            Method = method,
            `Number of DMRs` = 0,
            `Mean DMR Width` = NA_real_,
            `Median DMR Width` = NA_real_,
            `Total Coverage bp` = 0,
            `Min Width` = NA_real_,
            `Max Width` = NA_real_
        ))
    }

    data.frame(
        Method = method,
        `Number of DMRs` = length(dmrs),
        `Mean DMR Width` = mean(width(dmrs)),
        `Median DMR Width` = median(width(dmrs)),
        `Total Coverage bp` = sum(width(dmrs)),
        `Min Width` = min(width(dmrs)),
        `Max Width` = max(width(dmrs))
    )
}

comparison_stats <- do.call(rbind, lapply(names(methods_list), function(m) {
    get_dmr_stats(methods_list[[m]], m)
}))

truth_stats <- get_dmr_stats(truth_dmrs, "Ground Truth")
comparison_stats <- rbind(comparison_stats, truth_stats)
comparison_stats$Method <- factor(
    comparison_stats$Method,
    levels = c(method_order, "Ground Truth")
)
comparison_stats <- comparison_stats[order(comparison_stats$Method), , drop = FALSE]
comparison_stats$Method <- as.character(comparison_stats$Method)

knitr::kable(comparison_stats, digits = 2, caption = "Basic DMR Statistics")

Table 4: Basic DMR Statistics
Method	Number.of.DMRs	Mean.DMR.Width	Median.DMR.Width	Total.Coverage.bp	Min.Width	Max.Width
CMEnt	1051	5448.85	4610.0	5726738	227	21553
DMRcate	445	4575.96	3835.0	2036302	783	14431
dmrseq	800	4754.24	3923.0	3803389	694	20771
hummingbird	549	2901.64	2520.0	1593000	400	13680
Ground Truth	1000	5335.16	4305.5	5335161	694	22892

0.7 Ground-Truth Evaluation

evaluate_against_truth <- function(pred, truth) {
    n_pred <- length(pred)
    n_truth <- length(truth)
    pred_overlap_truth <- sum(overlapsAny(pred, truth))
    truth_overlap_pred <- sum(overlapsAny(truth, pred))
    fp <- n_pred - pred_overlap_truth
    fn <- n_truth - truth_overlap_pred

    precision <- ifelse(n_pred == 0, 0, pred_overlap_truth / n_pred)
    recall <- ifelse(n_truth == 0, 0, truth_overlap_pred / n_truth)
    f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))

    pred_red <- reduce(pred)
    truth_red <- reduce(truth)
    inter <- GenomicRanges::intersect(pred_red, truth_red)

    pred_bp <- sum(width(pred_red))
    truth_bp <- sum(width(truth_red))
    inter_bp <- ifelse(length(inter) == 0, 0, sum(width(inter)))
    union_bp <- pred_bp + truth_bp - inter_bp

    precision_bp <- ifelse(pred_bp == 0, 0, inter_bp / pred_bp)
    recall_bp <- ifelse(truth_bp == 0, 0, inter_bp / truth_bp)
    f1_bp <- ifelse(precision_bp + recall_bp == 0, 0, 2 * precision_bp * recall_bp / (precision_bp + recall_bp))
    iou_bp <- ifelse(union_bp == 0, 0, inter_bp / union_bp)

    data.frame(
        TP_regions = truth_overlap_pred,
        FP_regions = fp,
        FN_regions = fn,
        Precision_regions = precision,
        Recall_regions = recall,
        F1_regions = f1,
        Predicted_bp = pred_bp,
        Truth_bp = truth_bp,
        Intersect_bp = inter_bp,
        Precision_bp = precision_bp,
        Recall_bp = recall_bp,
        F1_bp = f1_bp,
        IoU_bp = iou_bp
    )
}

truth_eval <- do.call(rbind, lapply(names(methods_list), function(method) {
    out <- evaluate_against_truth(methods_list[[method]], truth_dmrs)
    out$Method <- method
    out
}))
truth_eval$Method <- factor(truth_eval$Method, levels = method_order)
truth_eval <- truth_eval[order(truth_eval$Method), , drop = FALSE]
truth_eval$Method <- as.character(truth_eval$Method)


truth_eval <- truth_eval |>
    dplyr::select(
        Method,
        TP_regions,
        FP_regions, FN_regions,
        Precision_regions, Recall_regions, F1_regions,
        Precision_bp, Recall_bp, F1_bp, IoU_bp
    )

regions_truth_eval <- truth_eval |>
    dplyr::select(Method, TP_regions, FP_regions, FN_regions, Precision_regions, Recall_regions, F1_regions) |>
    dplyr::rename_with(~ sub("_regions$", "", .), ends_with("_regions"))

bp_truth_eval <- truth_eval |>
    dplyr::select(Method, Precision_bp, Recall_bp, F1_bp, IoU_bp) |>
    dplyr::rename_with(~ sub("_bp$", "", .), ends_with("_bp"))

knitr::kable(regions_truth_eval, digits = 4, caption = "Macro (Region-level) Performance Against Ground Truth")

Table 5: Macro (Region-level) Performance Against Ground Truth
Method	TP	FP	FN	Precision	Recall	F1
CMEnt	994	20	6	0.981	0.994	0.9874
DMRcate	439	0	561	1.000	0.439	0.6101
dmrseq	800	0	200	1.000	0.800	0.8889
hummingbird	408	0	592	1.000	0.408	0.5795

knitr::kable(bp_truth_eval, digits = 4, caption = "Micro (Base-pair-level) Performance Against Ground Truth")

Table 6: Micro (Base-pair-level) Performance Against Ground Truth
Method	Precision	Recall	F1	IoU
CMEnt	0.8938	0.9594	0.9254	0.8611
DMRcate	0.9972	0.3806	0.5509	0.3802
dmrseq	0.9938	0.7085	0.8272	0.7053
hummingbird	0.9980	0.2980	0.4590	0.2978

0.7.1 Overlap Matrices (Methods + Truth)

all_sets <- c(methods_list, list(`Ground Truth` = truth_dmrs))

calculate_overlap <- function(gr1, gr2) {
    if (length(gr1) == 0 || length(gr2) == 0) {
        return(list(count = 0, percentage = 0))
    }
    overlap_count <- sum(overlapsAny(gr1, gr2))
    pct <- overlap_count / length(gr1) * 100
    list(count = overlap_count, percentage = pct)
}

overlap_count_mat <- matrix(0, nrow = length(all_sets), ncol = length(all_sets))
overlap_pct_mat <- matrix(0, nrow = length(all_sets), ncol = length(all_sets))
rownames(overlap_count_mat) <- colnames(overlap_count_mat) <- names(all_sets)
rownames(overlap_pct_mat) <- colnames(overlap_pct_mat) <- names(all_sets)

for (i in seq_along(all_sets)) {
    for (j in seq_along(all_sets)) {
        if (i == j) {
            overlap_count_mat[i, j] <- length(all_sets[[i]])
            overlap_pct_mat[i, j] <- 100
        } else {
            ov <- calculate_overlap(all_sets[[i]], all_sets[[j]])
            overlap_count_mat[i, j] <- ov$count
            overlap_pct_mat[i, j] <- ov$percentage
        }
    }
}

knitr::kable(overlap_count_mat, digits = 0, caption = "Overlap Counts (query=row, reference=column)")

Table 7: Overlap Counts (query=row, reference=column)
	CMEnt	DMRcate	dmrseq	hummingbird	Ground Truth
CMEnt	1051	446	829	441	1031
DMRcate	445	445	443	322	445
dmrseq	800	442	800	415	800
hummingbird	549	432	544	549	549
Ground Truth	994	439	800	408	1000

knitr::kable(overlap_pct_mat, digits = 2, caption = "Overlap Percentages (query=row, reference=column)")

Table 8: Overlap Percentages (query=row, reference=column)
	CMEnt	DMRcate	dmrseq	hummingbird	Ground Truth
CMEnt	100.0	42.44	78.88	41.96	98.1
DMRcate	100.0	100.00	99.55	72.36	100.0
dmrseq	100.0	55.25	100.00	51.88	100.0
hummingbird	100.0	78.69	99.09	100.00	100.0
Ground Truth	99.4	43.90	80.00	40.80	100.0

0.7.2 CMEnt Stage 1 Quality Assessment (Seed Linkage)

Assessing seed-space recoverability and seed capture efficiency.

Metrics explanation:

Truth_DMRs_Total: Total number of ground truth DMRs.
Truth_DMRs_With_Seeds: Number of truth DMRs that have at least one overlapping seed site.
Truth_DMRs_Stage1Recoverable_MinSeeds: Number of truth DMRs that have at least the configured minimum number of overlapping seed sites.
Truth_DMRs_Stage1Recoverable_WithMinCapturedSeeds: Number of truth DMRs that have at least the configured minimum number of overlapping seed sites that also overlap CMEnt-predicted DMRs, indicating they are potentially recoverable and captured by the method.
Truth_DMRs_Stage1Recoverable_FullyCapturedSeeds: Number of truth DMRs where all overlapping seed sites are captured by CMEnt-predicted DMRs, indicating ideal stage 1 capture.
Seed_Capture_Rate_Pct_in_Truth: Percentage of seed sites that overlap truth DMRs that are also captured by CMEnt-predicted DMRs.
Median_Seed_Capture_Pct_Per_Truth: For truth DMRs that have at least one overlapping seed site, the median percentage of those seed sites that are captured by CMEnt-predicted DMRs.
Truth_DMRs_With_Missed_Seeds: Number of truth DMRs that have at least one overlapping seed site that is not captured by CMEnt-predicted DMRs.
Median_Max_Missed_Run_Per_Truth: For truth DMRs with missed seeds, the median length of the longest consecutive run of missed seed sites.
Max_Missed_Run_Observed: The maximum length of consecutive missed seed sites observed across all truth DMRs.

Metrics values:

seed_gr <- sort(seed_gr)
mcols(seed_gr)$index <- seq_along(seed_gr)
seeds_in_truth <- subsetByOverlaps(seed_gr, truth_dmrs, ignore.strand = TRUE)

if (length(dmrs_cment) > 0) {
    seeds_in_dmrs <- subsetByOverlaps(seed_gr, dmrs_cment, ignore.strand = TRUE)
    seeds_in_truth_and_dmrs <- subsetByOverlaps(seeds_in_truth, dmrs_cment, ignore.strand = TRUE)
    seeds_in_truth_not_dmrs <- subsetByOverlaps(seeds_in_truth, dmrs_cment, ignore.strand = TRUE, invert = TRUE)
} else {
    seeds_in_dmrs <- seed_gr[0]
    seeds_in_truth_and_dmrs <- seed_gr[0]
    seeds_in_truth_not_dmrs <- seeds_in_truth
}

seed_truth_ov <- findOverlaps(seeds_in_truth, truth_dmrs, ignore.strand = TRUE)
truth_seed_total <- tabulate(subjectHits(seed_truth_ov), nbins = length(truth_dmrs))
seed_truth_hit_ov <- findOverlaps(seeds_in_truth_and_dmrs, truth_dmrs, ignore.strand = TRUE)
truth_seed_captured <- tabulate(subjectHits(seed_truth_hit_ov), nbins = length(truth_dmrs))
truth_seed_capture_pct <- ifelse(truth_seed_total > 0, round(100 * truth_seed_captured / truth_seed_total, 1), NA_real_)

recoverable_truth <- truth_seed_total >= min_seeds
recoverable_with_captured <- recoverable_truth & truth_seed_captured >= min_seeds
recoverable_fully_captured <- recoverable_truth & truth_seed_captured == truth_seed_total

stage1_quality <- data.frame(
    Truth_DMRs_Total = length(truth_dmrs),
    Truth_DMRs_With_Seeds = sum(truth_seed_total >= 1L),
    Truth_DMRs_Stage1Recoverable_MinSeeds = sum(recoverable_truth),
    Truth_DMRs_Stage1Recoverable_WithMinCapturedSeeds = sum(recoverable_with_captured),
    Truth_DMRs_Stage1Recoverable_FullyCapturedSeeds = sum(recoverable_fully_captured),
    Seed_Capture_Rate_Pct_in_Truth = ifelse(length(seeds_in_truth) == 0, NA_real_, round(100 * length(seeds_in_truth_and_dmrs) / length(seeds_in_truth), 1)),
    Median_Seed_Capture_Pct_Per_Truth = ifelse(sum(truth_seed_total > 0) == 0, NA_real_, stats::median(truth_seed_capture_pct[truth_seed_total > 0], na.rm = TRUE))
)

for (col in colnames(stage1_quality)) {
    cat(sprintf("- **%s**: %s\n", col, stage1_quality[[col]]))
}

Truth_DMRs_Total: 1000
Truth_DMRs_With_Seeds: 1000
Truth_DMRs_Stage1Recoverable_MinSeeds: 1000
Truth_DMRs_Stage1Recoverable_WithMinCapturedSeeds: 994
Truth_DMRs_Stage1Recoverable_FullyCapturedSeeds: 818
Seed_Capture_Rate_Pct_in_Truth: 99
Median_Seed_Capture_Pct_Per_Truth: 100

# Consecutive missed-seed runs are computed per truth DMR using original seed index.
ov_truth <- findOverlaps(seeds_in_truth_not_dmrs, truth_dmrs, ignore.strand = TRUE)
qh_by_truth <- split(queryHits(ov_truth), subjectHits(ov_truth))
run_df <- do.call(rbind, lapply(names(qh_by_truth), function(truth_id_chr) {
    truth_id <- as.integer(truth_id_chr)
    qh <- qh_by_truth[[truth_id_chr]]
    idx <- mcols(seeds_in_truth_not_dmrs)$index[qh]
    db <- as.numeric(mcols(seeds_in_truth_not_dmrs)$delta_beta[qh])

    ord <- order(idx)
    idx <- idx[ord]
    db <- db[ord]

    run_id <- cumsum(c(TRUE, diff(idx) != 1L))
    data.frame(
        Truth_DMR_Index = truth_id,
        run_length = as.integer(tabulate(run_id)),
        mean_delta_beta = as.numeric(tapply(db, run_id, mean, na.rm = TRUE))
    )
}))

run_count <- as.data.frame(table(run_df$run_length))
colnames(run_count) <- c("Consecutive_Seed_Length", "Count")
run_delta <- aggregate(mean_delta_beta ~ run_length, data = run_df, FUN = mean, na.rm = TRUE)
colnames(run_delta) <- c("Consecutive_Seed_Length", "Mean_Delta_Beta")
run_summary <- merge(run_count, run_delta, by = "Consecutive_Seed_Length", all.x = TRUE)

max_run_per_truth <- aggregate(run_length ~ Truth_DMR_Index, data = run_df, FUN = max)
run_overview <- data.frame(
    Truth_DMRs_With_Missed_Seeds = nrow(max_run_per_truth),
    Median_Max_Missed_Run_Per_Truth = stats::median(max_run_per_truth$run_length),
    Max_Missed_Run_Observed = max(max_run_per_truth$run_length)
)
for (col in colnames(run_overview)) {
    cat(sprintf("- **%s**: %s\n", col, run_overview[[col]]))
}

Truth_DMRs_With_Missed_Seeds: 182
Median_Max_Missed_Run_Per_Truth: 3
Max_Missed_Run_Observed: 26

0.7.3 CMEnt Stage 2-4 Quality Assessment (Extension and Merging)

Assessing truth-side recovery and prediction-side fragmentation/purity after extension and merging. Metrics explanation:

Pred_DMRs_Total: Total number of CMEnt DMRs.
Pred_DMRs_Overlapping_Truth: Number of CMEnt DMRs that overlap at least one truth DMR.
Pred_DMRs_BPPrecision_ge50pct: Number of CMEnt DMRs with base-pair precision >= 50% (i.e., at least 50% of the CMEnt DMR’s base pairs overlap with truth DMRs).
Pred_DMRs_BPPrecision_ge80pct: Number of CMEnt DMRs with base-pair precision >= 80%.
Truth_DMRs_Total: Total number of truth DMRs.
Truth_DMRs_Hit: Number of truth DMRs that have at least one overlapping CMEnt DMR.
Truth_DMRs_BPRecall_ge50pct: Number of truth DMRs with base-pair recall >= 50% (i.e., at least 50% of the truth DMR’s base pairs are covered by CMEnt DMRs).
Truth_DMRs_BPRecall_ge80pct: Number of truth DMRs with base-pair recall >= 80%.
Median_Truth_BPRecall_Pct_Hit: Median base-pair recall percentage among truth DMRs that have at least one overlapping CMEnt DMR.
Median_Pred_BPPrecision_Pct_Overlap: Median base-pair precision percentage among CMEnt DMRs that have at least some overlap with truth DMRs.
Truth_DMRs_With_Multiple_Pred: Number of truth DMRs that have 2 or more overlapping CMEnt DMRs, indicating potential fragmentation.
Median_Preds_Per_Hit_Truth: Median number of overlapping CMEnt DMRs among truth DMRs that have at least one overlapping CMEnt DMR.
Max_Preds_On_Truth: Maximum number of overlapping CMEnt DMRs observed on any single truth DMR.

Metrics values:

truth_to_pred_hits <- findOverlaps(truth_dmrs, dmrs_cment, ignore.strand = TRUE)
preds_per_truth <- tabulate(queryHits(truth_to_pred_hits), nbins = length(truth_dmrs))

truth_overlap_bp <- numeric(length(truth_dmrs))
if (length(truth_to_pred_hits) > 0) {
    truth_intersections <- pintersect(
        truth_dmrs[queryHits(truth_to_pred_hits)],
        dmrs_cment[subjectHits(truth_to_pred_hits)]
    )
    truth_intersections_by_truth <- split(truth_intersections, queryHits(truth_to_pred_hits))
    truth_overlap_bp[as.integer(names(truth_intersections_by_truth))] <- vapply(
        truth_intersections_by_truth,
        function(gr) sum(width(reduce(gr))),
        numeric(1)
    )
}
truth_bp_recall <- ifelse(width(truth_dmrs) > 0, truth_overlap_bp / width(truth_dmrs), 0)

pred_to_truth_hits <- findOverlaps(dmrs_cment, truth_dmrs, ignore.strand = TRUE)
pred_overlap_bp <- numeric(length(dmrs_cment))
if (length(pred_to_truth_hits) > 0) {
    pred_intersections <- pintersect(
        dmrs_cment[queryHits(pred_to_truth_hits)],
        truth_dmrs[subjectHits(pred_to_truth_hits)]
    )
    pred_intersections_by_pred <- split(pred_intersections, queryHits(pred_to_truth_hits))
    pred_overlap_bp[as.integer(names(pred_intersections_by_pred))] <- vapply(
        pred_intersections_by_pred,
        function(gr) sum(width(reduce(gr))),
        numeric(1)
    )
}
pred_bp_precision <- ifelse(width(dmrs_cment) > 0, pred_overlap_bp / width(dmrs_cment), 0)

stage24_quality <- data.frame(
    Pred_DMRs_Total = length(dmrs_cment),
    Pred_DMRs_Overlapping_Truth = sum(overlapsAny(dmrs_cment, truth_dmrs)),
    Pred_DMRs_BPPrecision_ge50pct = sum(pred_bp_precision >= 0.5),
    Pred_DMRs_BPPrecision_ge80pct = sum(pred_bp_precision >= 0.8),
    Truth_DMRs_Total = length(truth_dmrs),
    Truth_DMRs_Hit = sum(preds_per_truth > 0),
    Truth_DMRs_BPRecall_ge50pct = sum(truth_bp_recall >= 0.5),
    Truth_DMRs_BPRecall_ge80pct = sum(truth_bp_recall >= 0.8),
    Median_Truth_BPRecall_Pct_Hit = ifelse(sum(preds_per_truth > 0) == 0, 0, 100 * stats::median(truth_bp_recall[preds_per_truth > 0])),
    Median_Pred_BPPrecision_Pct_Overlap = ifelse(sum(pred_bp_precision > 0) == 0, 0, 100 * stats::median(pred_bp_precision[pred_bp_precision > 0])),
    Truth_DMRs_With_Multiple_Pred = sum(preds_per_truth >= 2),
    Median_Preds_Per_Hit_Truth = ifelse(sum(preds_per_truth > 0) == 0, 0, stats::median(preds_per_truth[preds_per_truth > 0])),
    Max_Preds_On_Truth = ifelse(length(preds_per_truth) == 0, 0, max(preds_per_truth))
)
for (col in names(stage24_quality)) {
    cat(sprintf("- **%s**: %s\n", col, stage24_quality[[col]]))
}

Pred_DMRs_Total: 1051
Pred_DMRs_Overlapping_Truth: 1031
Pred_DMRs_BPPrecision_ge50pct: 1019
Pred_DMRs_BPPrecision_ge80pct: 776
Truth_DMRs_Total: 1000
Truth_DMRs_Hit: 994
Truth_DMRs_BPRecall_ge50pct: 991
Truth_DMRs_BPRecall_ge80pct: 957
Median_Truth_BPRecall_Pct_Hit: 100
Median_Pred_BPPrecision_Pct_Overlap: 90.1453750160813
Truth_DMRs_With_Multiple_Pred: 67
Median_Preds_Per_Hit_Truth: 1
Max_Preds_On_Truth: 4

0.8 Visualizations

0.8.1 Ground-Truth Metrics

plot_df <- truth_eval |>
    dplyr::select(Method, F1_regions, F1_bp, IoU_bp) |>
    tidyr::pivot_longer(
        cols = c(F1_regions, F1_bp, IoU_bp),
        names_to = "Metric",
        values_to = "Value"
    )

plot_df$Metric <- factor(
    plot_df$Metric,
    levels = c("F1_regions", "F1_bp", "IoU_bp"),
    labels = c("F1 (Region)", "F1 (Base-pair)", "IoU (Base-pair)")
)
plot_df$Method <- factor(plot_df$Method, levels = method_order)

ggplot(plot_df, aes(x = Method, y = Value, fill = Method)) +
    geom_col() +
    facet_wrap(~Metric, nrow = 1) +
    theme_minimal() +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none"
    ) +
    ylim(0, 1) +
    labs(
        title = "Method Performance vs Ground Truth",
        x = "",
        y = "Score"
    )

0.8.2 Width Distributions

width_data <- do.call(rbind, lapply(names(all_sets), function(method) {
    x <- all_sets[[method]]
    if (length(x) == 0) {
        return(NULL)
    }
    data.frame(Method = method, Width = width(x))
}))
width_data$Method <- factor(width_data$Method, levels = names(all_sets))

ggplot(width_data, aes(x = Method, y = Width, fill = Method)) +
    geom_violin(alpha = 0.7) +
    geom_boxplot(width = 0.2, alpha = 0.5, outlier.size = 0.5) +
    scale_y_log10(labels = scales::comma) +
    theme_minimal() +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none"
    ) +
    labs(
        title = "DMR Width Distributions",
        x = "",
        y = "Width (bp, log scale)"
    )

0.9 Summary

cat("\n## Key Findings\n\n")

0.10 Key Findings

if (nrow(truth_eval) > 0) {
    best_f1 <- truth_eval[which.max(truth_eval$F1_regions), ]
    best_iou <- truth_eval[which.max(truth_eval$IoU_bp), ]

    cat(sprintf(
        "- Best region-level F1: **%s** (%.3f)\n",
        best_f1$Method, best_f1$F1_regions
    ))
    cat(sprintf(
        "- Best base-pair IoU: **%s** (%.3f)\n",
        best_iou$Method, best_iou$IoU_bp
    ))

    if ("CMEnt" %in% truth_eval$Method) {
        seg <- truth_eval[truth_eval$Method == "CMEnt", ]
        cat(sprintf(
            "- **CMEnt** region metrics: Precision = %.3f, Recall = %.3f, F1 = %.3f\n",
            seg$Precision_regions, seg$Recall_regions, seg$F1_regions
        ))
        cat(sprintf(
            "- **CMEnt** base-pair metrics: Precision = %.3f, Recall = %.3f, F1 = %.3f, IoU = %.3f\n",
            seg$Precision_bp, seg$Recall_bp, seg$F1_bp, seg$IoU_bp
        ))
    }
}

Best region-level F1: CMEnt (0.987)
Best base-pair IoU: CMEnt (0.861)
CMEnt region metrics: Precision = 0.981, Recall = 0.994, F1 = 0.987
CMEnt base-pair metrics: Precision = 0.894, Recall = 0.959, F1 = 0.925, IoU = 0.861

0.11 Session Info

sessionInfo()

R version 4.6.0 (2026-04-24) Platform: x86_64-pc-linux-gnu Running under: Ubuntu 24.04.4 LTS

Matrix products: default BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

time zone: Europe/Brussels tzcode source: system (glibc)

attached base packages: [1] parallel stats4 stats graphics grDevices datasets utils
[8] methods base

other attached packages: [1] bsseq_1.48.0 CMEnt_0.99.0
[3] dplyr_1.2.1 ggplot2_4.0.3
[5] data.table_1.18.4 minfi_1.58.0
[7] bumphunter_1.54.0 locfit_1.5-9.12
[9] iterators_1.0.14 foreach_1.5.2
[11] Biostrings_2.80.1 XVector_0.52.0
[13] SummarizedExperiment_1.42.0 Biobase_2.72.0
[15] MatrixGenerics_1.24.0 matrixStats_1.5.0
[17] GenomicRanges_1.64.0 Seqinfo_1.2.0
[19] IRanges_2.46.0 S4Vectors_0.50.1
[21] BiocGenerics_0.58.1 generics_0.1.4
[23] testthat_3.3.2 BiocStyle_2.40.0

loaded via a namespace (and not attached): [1] igraph_2.3.2
[2] plotly_4.12.0
[3] ChAMPdata_2.44.0
[4] Formula_1.2-5
[5] devtools_2.5.2
[6] tidyselect_1.2.1
[7] bit_4.6.0
[8] doParallel_1.0.17
[9] clue_0.3-68
[10] lattice_0.22-9
[11] rjson_0.2.23
[12] nor1mix_1.3-3
[13] blob_1.3.0
[14] stringr_1.6.0
[15] rngtools_1.5.2
[16] S4Arrays_1.12.0
[17] base64_2.0.2
[18] dichromat_2.0-0.1
[19] scrime_1.3.7
[20] seqLogo_1.78.0
[21] png_0.1-9
[22] tinytex_0.59
[23] cli_3.6.6
[24] marray_1.90.0
[25] bedr_1.1.5
[26] outliers_0.15
[27] ProtGenerics_1.44.0
[28] askpass_1.2.1
[29] multtest_2.68.0
[30] openssl_2.4.2
[31] JADE_2.0-4
[32] nleqslv_3.3.7
[33] pkgdown_2.2.0
[34] BiocIO_1.22.0
[35] purrr_1.2.2
[36] dendextend_1.19.1
[37] curl_7.1.0
[38] mime_0.13
[39] evaluate_1.0.5
[40] wateRmelon_2.18.0
[41] ComplexHeatmap_2.28.0
[42] stringi_1.8.7
[43] backports_1.5.1
[44] desc_1.4.3
[45] XML_3.99-0.23
[46] httpuv_1.6.17
[47] AnnotationDbi_1.74.0
[48] magrittr_2.0.5
[49] rappdirs_0.3.4
[50] splines_4.6.0
[51] mclust_6.1.2
[52] DelayedDataFrame_1.28.0
[53] BiasedUrn_2.0.12
[54] jpeg_0.1-11
[55] doRNG_1.8.6.3
[56] rentrez_1.2.4
[57] org.Hs.eg.db_3.23.1
[58] IlluminaHumanMethylation450kmanifest_0.4.0
[59] DT_0.34.0
[60] sessioninfo_1.2.4
[61] DBI_1.3.0
[62] HDF5Array_1.40.0
[63] jquerylib_0.1.4
[64] genefilter_1.94.0
[65] withr_3.0.2
[66] class_7.3-23
[67] rprojroot_2.1.1
[68] brio_1.1.5
[69] sva_3.60.0
[70] isva_1.10
[71] formatR_1.14
[72] rtracklayer_1.72.0
[73] BiocManager_1.30.27
[74] Illumina450ProbeVariants.db_1.48.0
[75] htmlwidgets_1.6.4
[76] fs_2.1.0
[77] biomaRt_2.68.0
[78] missMethyl_1.46.0
[79] labeling_0.4.3
[80] SparseArray_1.12.2
[81] shinycssloaders_1.1.0
[82] h5mread_1.4.0
[83] annotate_1.90.0
[84] VariantAnnotation_1.58.0
[85] knitr_1.51
[86] TFBSTools_1.50.0
[87] UCSC.utils_1.8.0
[88] beanplot_1.3.1
[89] TFMPvalue_1.0.0
[90] ChAMP_2.42.0
[91] annotatr_1.38.0
[92] caTools_1.18.3
[93] grid_4.6.0
[94] rhdf5_2.56.0
[95] pwalign_1.8.0
[96] R.oo_1.27.1
[97] regioneR_1.44.0
[98] ellipsis_0.3.3
[99] lazyeval_0.2.3
[100] yaml_2.3.12
[101] survival_3.8-6
[102] RPMM_1.25
[103] BiocVersion_3.23.1
[104] crayon_1.5.3
[105] RColorBrewer_1.1-3
[106] tidyr_1.3.2
[107] later_1.4.8
[108] codetools_0.2-20
[109] base64enc_0.1-6
[110] GlobalOptions_0.1.4
[111] KEGGREST_1.52.0
[112] shape_1.4.6.1
[113] fastICA_1.2-7
[114] limma_3.68.4
[115] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0 [116] Rsamtools_2.28.0
[117] filelock_1.0.3
[118] showtext_0.9-8
[119] foreign_0.8-90
[120] pkgconfig_2.0.3
[121] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1 [122] xml2_1.5.2
[123] GenomicAlignments_1.48.0
[124] hummingbird_1.22.0
[125] BSgenome_1.80.0
[126] viridisLite_0.4.3
[127] biovizBase_1.60.0
[128] gridBase_0.4-7
[129] xtable_1.8-8
[130] interp_1.1-6
[131] lumi_2.64.0
[132] plyr_1.8.9
[133] httr_1.4.8
[134] tools_4.6.0
[135] pkgbuild_1.4.8
[136] htmlTable_2.5.0
[137] checkmate_2.3.4
[138] nlme_3.1-168
[139] affy_1.90.0
[140] futile.logger_1.4.9
[141] lambda.r_1.2.4
[142] dbplyr_2.5.2
[143] ExperimentHub_3.2.0
[144] IlluminaHumanMethylationEPICmanifest_0.3.0
[145] digest_0.6.39
[146] permute_0.9-10
[147] bookdown_0.46
[148] Matrix_1.7-5
[149] farver_2.1.2
[150] tzdb_0.5.0
[151] AnnotationFilter_1.36.0
[152] reshape2_1.4.5
[153] viridis_0.6.5
[154] DirichletMultinomial_1.54.0
[155] rpart_4.1.27
[156] glue_1.8.1
[157] cachem_1.1.0
[158] bspm_0.5.8
[159] VennDiagram_1.8.2
[160] BiocFileCache_3.2.0
[161] methylumi_2.58.0
[162] Hmisc_5.2-5
[163] txdbmaker_1.8.0
[164] sysfonts_0.8.9
[165] pkgload_1.5.2
[166] statmod_1.5.2
[167] impute_1.86.0
[168] GEOquery_2.80.0
[169] httr2_1.2.2
[170] showtextdb_3.0
[171] dmrseq_1.32.0
[172] siggenes_1.86.0
[173] gtools_3.9.5
[174] preprocessCore_1.74.0
[175] affyio_1.82.0
[176] gridExtra_2.3
[177] shiny_1.13.0
[178] R.utils_2.13.0
[179] rhdf5filters_1.24.0
[180] RCurl_1.98-1.19
[181] memoise_2.0.1
[182] rmarkdown_2.31
[183] scales_1.4.0
[184] R.methodsS3_1.8.2
[185] reshape_0.8.10
[186] illuminaio_0.54.0
[187] rstudioapi_0.18.0
[188] cluster_2.1.8.2
[189] ROC_1.88.0
[190] hms_1.1.4
[191] globaltest_5.66.0
[192] DMRcate_3.8.0
[193] colorspace_2.1-2
[194] FNN_1.1.4.1
[195] rlang_1.2.0
[196] quadprog_1.5-8
[197] GenomeInfoDb_1.48.0
[198] DelayedMatrixStats_1.34.0
[199] sparseMatrixStats_1.24.0
[200] shinythemes_1.2.0
[201] circlize_0.4.18
[202] mgcv_1.9-4
[203] xfun_0.58
[204] ggseqlogo_0.2.2
[205] e1071_1.7-17
[206] abind_1.4-8
[207] tibble_3.3.1
[208] Rhdf5lib_2.0.0
[209] readr_2.2.0
[210] futile.options_1.0.1
[211] bitops_1.0-9
[212] ps_1.9.3
[213] promises_1.5.0
[214] inline_0.3.21
[215] RSQLite_3.53.1
[216] qvalue_2.44.0
[217] DelayedArray_0.38.2
[218] proxy_0.4-29
[219] GO.db_3.23.1
[220] compiler_4.6.0
[221] prettyunits_1.2.0
[222] beachmat_2.28.0
[223] Rcpp_1.1.1-1.1
[224] DNAcopy_1.86.0
[225] edgeR_4.10.1
[226] AnnotationHub_4.2.0
[227] usethis_3.2.1
[228] MASS_7.3-65
[229] progress_1.2.3
[230] BiocParallel_1.46.0
[231] R6_2.6.1
[232] fastmap_1.2.0
[233] ensembldb_2.36.1
[234] nnet_7.3-20
[235] gtable_0.3.6
[236] KernSmooth_2.23-26
[237] strex_2.0.1
[238] latticeExtra_0.6-31
[239] deldir_2.0-4
[240] htmltools_0.5.9
[241] bit64_4.8.2
[242] lifecycle_1.0.5
[243] S7_0.2.2
[244] processx_3.9.0
[245] callr_3.8.0
[246] Gviz_1.56.0
[247] restfulr_0.0.16
[248] sass_0.4.10
[249] vctrs_0.7.3
[250] goseq_1.64.0
[251] bslib_0.11.0
[252] pillar_1.11.1
[253] GenomicFeatures_1.64.0
[254] magick_2.9.1
[255] otel_0.2.0
[256] combinat_0.0-8
[257] geneLenDataBase_1.48.0
[258] jsonlite_2.0.0
[259] cigarillo_1.2.0
[260] GetoptLong_1.1.1

Benchmarking CMEnt on Simulated ONT Data

CMEnt vs DMRcate vs dmrseq vs hummingbird

2026-06-11

Contents

0.1 Introduction

0.2 Setup and Dependencies

0.2.1 Install External Dependencies

0.3 Load Simulated Data and Ground Truth

0.4 Build Phenotype Table from Sample Names

0.5 Benchmarking DMR Detection Methods

0.5.1 CMEnt

0.5.1.1 Adjusting parameters for NGS simulation

0.5.1.2 Extract Methylation Matrix and Generate Seeds

0.5.1.3 Seed Overlap with Ground Truth

0.5.2 DMRcate

0.5.3 dmrseq

0.5.4 hummingbird

0.6 Comparative Analysis

0.7 Ground-Truth Evaluation

0.7.1 Overlap Matrices (Methods + Truth)

0.7.2 CMEnt Stage 1 Quality Assessment (Seed Linkage)

0.7.3 CMEnt Stage 2-4 Quality Assessment (Extension and Merging)

0.8 Visualizations

0.8.1 Ground-Truth Metrics

0.8.2 Width Distributions

0.9 Summary

0.10 Key Findings

0.11 Session Info