feat: implement approximate k-mer indexing and optimize query
Enable approximate k-mer indexing via the `--approx` flag, computing an effective k-mer size of `k - z + 1` and configuring the appropriate indexing mode with validated probabilistic parameters. Refactor the Findere z-window filter in the query command to improve performance and correctness by replacing the precomputed vector with a lazy closure, optimizing cache locality, and fixing a variable naming bug.
This commit is contained in:
@@ -158,7 +158,7 @@ pub fn run(args: IndexArgs) {
|
||||
let mut rep = Reporter::new();
|
||||
|
||||
// ── Resolve evidence kind ────────────────────────────────────────────────
|
||||
let evidence = if args.approx {
|
||||
let (evidence, effective_kmer_size) = if args.approx {
|
||||
let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
|
||||
let k = args.common.kmer_size;
|
||||
if z as usize >= k {
|
||||
@@ -169,10 +169,11 @@ pub fn run(args: IndexArgs) {
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}");
|
||||
IndexMode::Approx { b, z }
|
||||
let s = k - z as usize + 1;
|
||||
info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}, indexed kmer size={s}");
|
||||
(IndexMode::Approx { b, z }, s)
|
||||
} else {
|
||||
IndexMode::Exact
|
||||
(IndexMode::Exact, args.common.kmer_size)
|
||||
};
|
||||
|
||||
// ── Open or create the index ─────────────────────────────────────────────
|
||||
@@ -197,7 +198,7 @@ pub fn run(args: IndexArgs) {
|
||||
}
|
||||
let block_bits = block_size_to_bits(args.block_size);
|
||||
let config = IndexConfig {
|
||||
kmer_size: args.common.kmer_size,
|
||||
kmer_size: effective_kmer_size,
|
||||
minimizer_size: args.common.minimizer_size,
|
||||
n_bits,
|
||||
with_counts: args.with_counts,
|
||||
|
||||
@@ -160,54 +160,50 @@ impl SeqAcc {
|
||||
// ── Findere z-window filter ───────────────────────────────────────────────────
|
||||
|
||||
/// Apply the Findere z-window filter to per-kmer query results for one superkmer.
|
||||
/// Aggregate s-mer query results into k-mer answers using a Findere z-window.
|
||||
///
|
||||
/// Input: N s-mer results (indexed kmer size s = k − z + 1).
|
||||
/// Output: N − z + 1 k-mer results (user kmer size k).
|
||||
///
|
||||
/// For each genome g independently: k-mer at position i is confirmed iff all z values
|
||||
/// results[i..i+z][g] are nonzero (None counts as zero for all genomes).
|
||||
/// Output values are taken from results[i]; genomes not confirmed are zeroed.
|
||||
fn apply_findere(
|
||||
results: &[Option<Box<[u32]>>],
|
||||
z: usize,
|
||||
n_genomes: usize,
|
||||
) -> Vec<Option<Box<[u32]>>> {
|
||||
let n = results.len();
|
||||
if z <= 1 || n < z {
|
||||
if z <= 1 {
|
||||
return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect();
|
||||
}
|
||||
if n < z {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut confirmed = vec![vec![false; n_genomes]; n];
|
||||
let out_n = n - z + 1;
|
||||
let mut confirmed = vec![vec![false; n_genomes]; out_n];
|
||||
|
||||
for g in 0..n_genomes {
|
||||
let present: Vec<bool> = results
|
||||
.iter()
|
||||
.map(|r| r.as_ref().map_or(false, |row| row[g] > 0))
|
||||
.collect();
|
||||
let hit = |i: usize| results[i].as_ref().map_or(false, |r| r[g] > 0);
|
||||
|
||||
let mut window_count = present[..z].iter().filter(|&&p| p).count();
|
||||
if window_count == z {
|
||||
for c in confirmed[..z].iter_mut() {
|
||||
c[g] = true;
|
||||
let mut count: u32 = (0..z).filter(|&j| hit(j)).count() as u32;
|
||||
if count == z as u32 { confirmed[0][g] = true; }
|
||||
|
||||
for i in 1..out_n {
|
||||
if hit(i - 1) { count -= 1; }
|
||||
if hit(i + z - 1) { count += 1; }
|
||||
if count == z as u32 { confirmed[i][g] = true; }
|
||||
}
|
||||
}
|
||||
|
||||
for j in 1..=(n - z) {
|
||||
if present[j - 1] { window_count -= 1; }
|
||||
if present[j + z - 1] { window_count += 1; }
|
||||
if window_count == z {
|
||||
for c in confirmed[j..j + z].iter_mut() {
|
||||
c[g] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results.iter().enumerate().map(|(i, opt)| {
|
||||
let row = opt.as_ref()?;
|
||||
let mut new_row: Box<[u32]> = row.clone();
|
||||
let mut any = false;
|
||||
(0..out_n).map(|i| {
|
||||
let first = results[i].as_ref()?;
|
||||
let mut row: Box<[u32]> = first.clone();
|
||||
for g in 0..n_genomes {
|
||||
if !confirmed[i][g] {
|
||||
new_row[g] = 0;
|
||||
} else {
|
||||
any = true;
|
||||
if !confirmed[i][g] { row[g] = 0; }
|
||||
}
|
||||
}
|
||||
if any { Some(new_row) } else { None }
|
||||
if row.iter().any(|&v| v > 0) { Some(row) } else { None }
|
||||
}).collect()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user