diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index 10662da..f5172f7 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, VecDeque}; use std::io::{self, BufWriter, Write}; use std::path::PathBuf; use std::sync::Arc; @@ -277,10 +277,18 @@ fn process_chunk( }); } - // Findere z-window filter + accumulation — no intermediate allocations. - // One `confirmed` buffer reused across all sequences. + // Sliding window minimum — one reusable buffer and one deque per batch. + // + // win_min[pos * n_genomes + g] = min count across the z-window [pos, pos+z) + // for genome g, where "not in index" counts as 0. + // + // win_min > 0 ↔ all z consecutive kmers are in the index with count > 0 + // ↔ Findere confirmation (for z=1 this degenerates to the + // simple case with no overhead). + // + // Works uniformly for count matrices and presence/absence (0/1) matrices. let max_n_kmers = batch.n_kmers.iter().map(|&n| n as usize).max().unwrap_or(0); - let mut confirmed = vec![false; max_n_kmers * n_genomes]; + let mut win_min = vec![0u32; max_n_kmers * n_genomes]; let mut accs: Vec = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); @@ -289,114 +297,74 @@ fn process_chunk( .iter() .map(|&n| { let n = n as usize; - if n >= effective_z { - n - effective_z + 1 - } else { - 0 - } + if n >= effective_z { n - effective_z + 1 } else { 0 } }) .collect(); let mut cov: Vec>> = if detail { - n_kmers_out - .iter() - .map(|&n| vec![vec![0u32; n]; n_genomes]) - .collect() + n_kmers_out.iter().map(|&n| vec![vec![0u32; n]; n_genomes]).collect() } else { Vec::new() }; - let presence = force_presence || !with_counts; + let presence = force_presence || !with_counts; let threshold = presence_threshold; - let z = effective_z; + let z = effective_z; + + // Deque reused across all (seq, genome) pairs. + let mut dq: VecDeque<(usize, u32)> = VecDeque::with_capacity(z + 1); for seq_idx in 0..n_seqs { - let n = results.n_kmers_for(seq_idx); + let n = results.n_kmers_for(seq_idx); let out_n = n_kmers_out[seq_idx]; - if out_n == 0 { - continue; + if out_n == 0 { continue; } + + let mins = &mut win_min[..out_n * n_genomes]; + mins.fill(0); + + // ── Per-genome sliding window minimum ───────────────────────────────── + for g in 0..n_genomes { + dq.clear(); + for i in 0..n { + let v_i = if results.is_in_index(seq_idx, i) { + results.val(seq_idx, i, g) + } else { + 0 + }; + // Evict elements that have left the window. + while dq.front().map_or(false, |&(f, _)| f + z <= i) { + dq.pop_front(); + } + // Maintain monotone non-decreasing back→front for minimum at front. + while dq.back().map_or(false, |&(_, v)| v >= v_i) { + dq.pop_back(); + } + dq.push_back((i, v_i)); + // Window [pos, pos+z) is complete when i = pos + z - 1. + if i + 1 >= z { + let pos = i + 1 - z; + mins[pos * n_genomes + g] = dq.front().unwrap().1; + } + } } - if z <= 1 { - // No Findere — every indexed s-mer is confirmed. - let acc = &mut accs[seq_idx]; - for pos in 0..n { + // ── Accumulate ──────────────────────────────────────────────────────── + let acc = &mut accs[seq_idx]; + for pos in 0..out_n { + let any = (0..n_genomes).any(|g| mins[pos * n_genomes + g] > 0); + if !any { if !results.is_in_index(seq_idx, pos) { acc.kmer_missing += 1; - continue; - } - acc.kmer_count += 1; - for g in 0..n_genomes { - let v = results.val(seq_idx, pos, g); - if v == 0 { - continue; - } - let c = if presence { - u32::from(v >= threshold) - } else { - v - }; - acc.genome_totals[g] += c; - if detail { - cov[seq_idx][g][pos] += c; - } } + continue; } - } else { - // Build confirmed[pos * n_genomes + g] via sliding window. - let conf = &mut confirmed[..out_n * n_genomes]; - conf.fill(false); - + acc.kmer_count += 1; for g in 0..n_genomes { - let hit = - |i: usize| results.is_in_index(seq_idx, i) && results.val(seq_idx, i, g) > 0; - - let mut cnt: u32 = (0..z).filter(|&j| hit(j)).count() as u32; - if cnt == z as u32 { - conf[g] = true; - } - - for i in 1..out_n { - if hit(i - 1) { - cnt -= 1; - } - if hit(i + z - 1) { - cnt += 1; - } - if cnt == z as u32 { - conf[i * n_genomes + g] = true; - } - } - } - - let acc = &mut accs[seq_idx]; - for pos in 0..out_n { - let any = (0..n_genomes).any(|g| conf[pos * n_genomes + g]); - if !any { - if !results.is_in_index(seq_idx, pos) { - acc.kmer_missing += 1; - } - continue; - } - acc.kmer_count += 1; - for g in 0..n_genomes { - if !conf[pos * n_genomes + g] { - continue; - } - let v = results.val(seq_idx, pos, g); - if v == 0 { - continue; - } - let c = if presence { - u32::from(v >= threshold) - } else { - v - }; - acc.genome_totals[g] += c; - if detail { - cov[seq_idx][g][pos] += c; - } - } + let v = mins[pos * n_genomes + g]; + if v == 0 { continue; } + let c = if presence { u32::from(v >= threshold) } else { v }; + acc.genome_totals[g] += c; + if detail { cov[seq_idx][g][pos] += c; } } } }