From 86b88acb95c481fba3684e0d91f0af8af2e7afb4 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 29 May 2026 09:07:35 +0200 Subject: [PATCH] feat: implement approximate k-mer indexing and optimize query Enable approximate k-mer indexing via the `--approx` flag, computing an effective k-mer size of `k - z + 1` and configuring the appropriate indexing mode with validated probabilistic parameters. Refactor the Findere z-window filter in the query command to improve performance and correctness by replacing the precomputed vector with a lazy closure, optimizing cache locality, and fixing a variable naming bug. --- src/obikmer/src/cmd/index.rs | 11 +++---- src/obikmer/src/cmd/query.rs | 56 +++++++++++++++++------------------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 1a796f9..0f5f770 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -158,7 +158,7 @@ pub fn run(args: IndexArgs) { let mut rep = Reporter::new(); // ── Resolve evidence kind ──────────────────────────────────────────────── - let evidence = if args.approx { + let (evidence, effective_kmer_size) = if args.approx { let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp); let k = args.common.kmer_size; if z as usize >= k { @@ -169,10 +169,11 @@ pub fn run(args: IndexArgs) { ); std::process::exit(1); } - info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}"); - IndexMode::Approx { b, z } + let s = k - z as usize + 1; + info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}, indexed kmer size={s}"); + (IndexMode::Approx { b, z }, s) } else { - IndexMode::Exact + (IndexMode::Exact, args.common.kmer_size) }; // ── Open or create the index ───────────────────────────────────────────── @@ -197,7 +198,7 @@ pub fn run(args: IndexArgs) { } let block_bits = block_size_to_bits(args.block_size); let config = IndexConfig { - kmer_size: args.common.kmer_size, + kmer_size: effective_kmer_size, minimizer_size: args.common.minimizer_size, n_bits, with_counts: args.with_counts, diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index beb303e..2af272e 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -160,54 +160,50 @@ impl SeqAcc { // ── Findere z-window filter ─────────────────────────────────────────────────── /// Apply the Findere z-window filter to per-kmer query results for one superkmer. +/// Aggregate s-mer query results into k-mer answers using a Findere z-window. +/// +/// Input: N s-mer results (indexed kmer size s = k − z + 1). +/// Output: N − z + 1 k-mer results (user kmer size k). +/// +/// For each genome g independently: k-mer at position i is confirmed iff all z values +/// results[i..i+z][g] are nonzero (None counts as zero for all genomes). +/// Output values are taken from results[i]; genomes not confirmed are zeroed. fn apply_findere( results: &[Option>], z: usize, n_genomes: usize, ) -> Vec>> { let n = results.len(); - if z <= 1 || n < z { + if z <= 1 { return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect(); } + if n < z { + return Vec::new(); + } - let mut confirmed = vec![vec![false; n_genomes]; n]; + let out_n = n - z + 1; + let mut confirmed = vec![vec![false; n_genomes]; out_n]; for g in 0..n_genomes { - let present: Vec = results - .iter() - .map(|r| r.as_ref().map_or(false, |row| row[g] > 0)) - .collect(); + let hit = |i: usize| results[i].as_ref().map_or(false, |r| r[g] > 0); - let mut window_count = present[..z].iter().filter(|&&p| p).count(); - if window_count == z { - for c in confirmed[..z].iter_mut() { - c[g] = true; - } - } + let mut count: u32 = (0..z).filter(|&j| hit(j)).count() as u32; + if count == z as u32 { confirmed[0][g] = true; } - for j in 1..=(n - z) { - if present[j - 1] { window_count -= 1; } - if present[j + z - 1] { window_count += 1; } - if window_count == z { - for c in confirmed[j..j + z].iter_mut() { - c[g] = true; - } - } + for i in 1..out_n { + if hit(i - 1) { count -= 1; } + if hit(i + z - 1) { count += 1; } + if count == z as u32 { confirmed[i][g] = true; } } } - results.iter().enumerate().map(|(i, opt)| { - let row = opt.as_ref()?; - let mut new_row: Box<[u32]> = row.clone(); - let mut any = false; + (0..out_n).map(|i| { + let first = results[i].as_ref()?; + let mut row: Box<[u32]> = first.clone(); for g in 0..n_genomes { - if !confirmed[i][g] { - new_row[g] = 0; - } else { - any = true; - } + if !confirmed[i][g] { row[g] = 0; } } - if any { Some(new_row) } else { None } + if row.iter().any(|&v| v > 0) { Some(row) } else { None } }).collect() }