From 2ebc5f0d751cdd9f6fb089b17a361d606da1cfa5 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 1 Jun 2026 15:18:12 +0200 Subject: [PATCH] chore: add logging infrastructure to merge routine Adds comprehensive logging for source metadata, merge modes, and forced approximation detection. Introduces a `format_evidence` helper to format `IndexMode` variants, and reuses the existing `is_trivial` helper to identify single-genome presence indices. The core merge algorithm remains unmodified, with all changes focused on enhanced runtime observability. --- src/obikindex/src/merge.rs | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index 68d8db7..39b4395 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -63,8 +63,36 @@ impl KmerIndex { } } - // ── Choose base: largest source in the output evidence mode ─────────── + // ── Log source characteristics and choose base ──────────────────────── + let mode_str = if mode == MergeMode::Presence { "presence" } else { "count" }; + info!( + "merge: {} source(s), smer-size={}, mode={}", + sources.len(), sources[0].kmer_size(), mode_str, + ); + for (i, src) in sources.iter().enumerate() { + let genome_str = if src.meta.genomes.len() == 1 { "mono-genome".to_string() } + else { format!("{} genomes", src.meta.genomes.len()) }; + let trivial_str = if is_trivial(src, mode) { " [trivial: no data approximation]" } else { "" }; + info!( + " [{}] {} — {}, {}, {}{}", + i, src.root_path.display(), + format_evidence(&src.meta.config.evidence), + genome_str, mode_str, trivial_str, + ); + } + let base_idx = choose_base(sources, mode); + let needs_approx = sources.iter().any(|src| { + !is_trivial(src, mode) + && matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. 
}) + }); + info!( + "output evidence: {} ({}base: [{}] {})", + format_evidence(&sources[base_idx].meta.config.evidence), + if needs_approx { "forced approx — " } else { "" }, + base_idx, sources[base_idx].root_path.display(), + ); + let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len()); ordered.push(sources[base_idx]); for (i, &src) in sources.iter().enumerate() { @@ -272,6 +300,14 @@ fn partition_bar(n: u64) -> ProgressBar { pb } +fn format_evidence(ev: &IndexMode) -> String { + match ev { + IndexMode::Exact => "exact".to_string(), + IndexMode::Approx { b, z } => format!("approx (b={b}, z={z})"), + IndexMode::Hybrid { b, z } => format!("hybrid (b={b}, z={z})"), + } +} + /// A source is "trivial" if its presence/count values carry no approximation: /// single-genome presence index (SetMembership — all values are 1 by construction). fn is_trivial(src: &KmerIndex, mode: MergeMode) -> bool {