feat: implement approximate k-mer indexing and optimize query

Enable approximate k-mer indexing via the `--approx` flag, computing an effective k-mer size of `k - z + 1` and configuring the appropriate indexing mode with validated probabilistic parameters. Refactor the Findere z-window filter in the query command to improve performance and correctness by replacing the precomputed vector with a lazy closure, optimizing cache locality, and fixing a variable naming bug.
This commit is contained in:
Eric Coissac
2026-05-29 09:07:35 +02:00
parent be0e8f1041
commit 86b88acb95
2 changed files with 32 additions and 35 deletions
+6 -5
View File
@@ -158,7 +158,7 @@ pub fn run(args: IndexArgs) {
let mut rep = Reporter::new(); let mut rep = Reporter::new();
// ── Resolve evidence kind ──────────────────────────────────────────────── // ── Resolve evidence kind ────────────────────────────────────────────────
let evidence = if args.approx { let (evidence, effective_kmer_size) = if args.approx {
let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp); let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
let k = args.common.kmer_size; let k = args.common.kmer_size;
if z as usize >= k { if z as usize >= k {
@@ -169,10 +169,11 @@ pub fn run(args: IndexArgs) {
); );
std::process::exit(1); std::process::exit(1);
} }
info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}"); let s = k - z as usize + 1;
IndexMode::Approx { b, z } info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}, indexed kmer size={s}");
(IndexMode::Approx { b, z }, s)
} else { } else {
IndexMode::Exact (IndexMode::Exact, args.common.kmer_size)
}; };
// ── Open or create the index ───────────────────────────────────────────── // ── Open or create the index ─────────────────────────────────────────────
@@ -197,7 +198,7 @@ pub fn run(args: IndexArgs) {
} }
let block_bits = block_size_to_bits(args.block_size); let block_bits = block_size_to_bits(args.block_size);
let config = IndexConfig { let config = IndexConfig {
kmer_size: args.common.kmer_size, kmer_size: effective_kmer_size,
minimizer_size: args.common.minimizer_size, minimizer_size: args.common.minimizer_size,
n_bits, n_bits,
with_counts: args.with_counts, with_counts: args.with_counts,
+26 -30
View File
@@ -160,54 +160,50 @@ impl SeqAcc {
// ── Findere z-window filter ─────────────────────────────────────────────────── // ── Findere z-window filter ───────────────────────────────────────────────────
/// Apply the Findere z-window filter to per-kmer query results for one superkmer. /// Apply the Findere z-window filter to per-kmer query results for one superkmer.
/// Aggregate s-mer query results into k-mer answers using a Findere z-window.
///
/// Input: N s-mer results (indexed kmer size s = k - z + 1).
/// Output: N - z + 1 k-mer results (user kmer size k).
///
/// For each genome g independently: k-mer at position i is confirmed iff all z values
/// results[i..i+z][g] are nonzero (None counts as zero for all genomes).
/// Output values are taken from results[i]; genomes not confirmed are zeroed.
fn apply_findere( fn apply_findere(
results: &[Option<Box<[u32]>>], results: &[Option<Box<[u32]>>],
z: usize, z: usize,
n_genomes: usize, n_genomes: usize,
) -> Vec<Option<Box<[u32]>>> { ) -> Vec<Option<Box<[u32]>>> {
let n = results.len(); let n = results.len();
if z <= 1 || n < z { if z <= 1 {
return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect(); return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect();
} }
if n < z {
return Vec::new();
}
let mut confirmed = vec![vec![false; n_genomes]; n]; let out_n = n - z + 1;
let mut confirmed = vec![vec![false; n_genomes]; out_n];
for g in 0..n_genomes { for g in 0..n_genomes {
let present: Vec<bool> = results let hit = |i: usize| results[i].as_ref().map_or(false, |r| r[g] > 0);
.iter()
.map(|r| r.as_ref().map_or(false, |row| row[g] > 0))
.collect();
let mut window_count = present[..z].iter().filter(|&&p| p).count(); let mut count: u32 = (0..z).filter(|&j| hit(j)).count() as u32;
if window_count == z { if count == z as u32 { confirmed[0][g] = true; }
for c in confirmed[..z].iter_mut() {
c[g] = true;
}
}
for j in 1..=(n - z) { for i in 1..out_n {
if present[j - 1] { window_count -= 1; } if hit(i - 1) { count -= 1; }
if present[j + z - 1] { window_count += 1; } if hit(i + z - 1) { count += 1; }
if window_count == z { if count == z as u32 { confirmed[i][g] = true; }
for c in confirmed[j..j + z].iter_mut() {
c[g] = true;
}
}
} }
} }
results.iter().enumerate().map(|(i, opt)| { (0..out_n).map(|i| {
let row = opt.as_ref()?; let first = results[i].as_ref()?;
let mut new_row: Box<[u32]> = row.clone(); let mut row: Box<[u32]> = first.clone();
let mut any = false;
for g in 0..n_genomes { for g in 0..n_genomes {
if !confirmed[i][g] { if !confirmed[i][g] { row[g] = 0; }
new_row[g] = 0;
} else {
any = true;
}
} }
if any { Some(new_row) } else { None } if row.iter().any(|&v| v > 0) { Some(row) } else { None }
}).collect() }).collect()
} }