Push ruqusmkoyvwm #16

Merged
coissac merged 10 commits from push-ruqusmkoyvwm into main 2026-06-05 08:41:08 +00:00
Showing only changes of commit 9306ec1c56 - Show all commits
+61 -93
View File
@@ -1,4 +1,4 @@
use std::collections::HashMap;
use std::collections::{HashMap, VecDeque};
use std::io::{self, BufWriter, Write};
use std::path::PathBuf;
use std::sync::Arc;
@@ -277,10 +277,18 @@ fn process_chunk(
});
}
// Findere z-window filter + accumulation — no intermediate allocations.
// One `confirmed` buffer reused across all sequences.
// Sliding window minimum — one reusable buffer and one deque per batch.
//
// win_min[pos * n_genomes + g] = min count across the z-window [pos, pos+z)
// for genome g, where "not in index" counts as 0.
//
// win_min > 0 ↔ all z consecutive kmers are in the index with count > 0
// ↔ Findere confirmation (for z=1 this degenerates to the
// simple case with no overhead).
//
// Works uniformly for count matrices and presence/absence (0/1) matrices.
let max_n_kmers = batch.n_kmers.iter().map(|&n| n as usize).max().unwrap_or(0);
let mut confirmed = vec![false; max_n_kmers * n_genomes];
let mut win_min = vec![0u32; max_n_kmers * n_genomes];
let mut accs: Vec<SeqAcc> = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect();
@@ -289,114 +297,74 @@ fn process_chunk(
.iter()
.map(|&n| {
let n = n as usize;
if n >= effective_z {
n - effective_z + 1
} else {
0
}
if n >= effective_z { n - effective_z + 1 } else { 0 }
})
.collect();
let mut cov: Vec<Vec<Vec<u32>>> = if detail {
n_kmers_out
.iter()
.map(|&n| vec![vec![0u32; n]; n_genomes])
.collect()
n_kmers_out.iter().map(|&n| vec![vec![0u32; n]; n_genomes]).collect()
} else {
Vec::new()
};
let presence = force_presence || !with_counts;
let presence = force_presence || !with_counts;
let threshold = presence_threshold;
let z = effective_z;
let z = effective_z;
// Deque reused across all (seq, genome) pairs.
let mut dq: VecDeque<(usize, u32)> = VecDeque::with_capacity(z + 1);
for seq_idx in 0..n_seqs {
let n = results.n_kmers_for(seq_idx);
let n = results.n_kmers_for(seq_idx);
let out_n = n_kmers_out[seq_idx];
if out_n == 0 {
continue;
if out_n == 0 { continue; }
let mins = &mut win_min[..out_n * n_genomes];
mins.fill(0);
// ── Per-genome sliding window minimum ─────────────────────────────────
for g in 0..n_genomes {
dq.clear();
for i in 0..n {
let v_i = if results.is_in_index(seq_idx, i) {
results.val(seq_idx, i, g)
} else {
0
};
// Evict elements that have left the window.
while dq.front().map_or(false, |&(f, _)| f + z <= i) {
dq.pop_front();
}
// Maintain monotone non-decreasing back→front for minimum at front.
while dq.back().map_or(false, |&(_, v)| v >= v_i) {
dq.pop_back();
}
dq.push_back((i, v_i));
// Window [pos, pos+z) is complete when i = pos + z - 1.
if i + 1 >= z {
let pos = i + 1 - z;
mins[pos * n_genomes + g] = dq.front().unwrap().1;
}
}
}
if z <= 1 {
// No Findere — every indexed s-mer is confirmed.
let acc = &mut accs[seq_idx];
for pos in 0..n {
// ── Accumulate ────────────────────────────────────────────────────────
let acc = &mut accs[seq_idx];
for pos in 0..out_n {
let any = (0..n_genomes).any(|g| mins[pos * n_genomes + g] > 0);
if !any {
if !results.is_in_index(seq_idx, pos) {
acc.kmer_missing += 1;
continue;
}
acc.kmer_count += 1;
for g in 0..n_genomes {
let v = results.val(seq_idx, pos, g);
if v == 0 {
continue;
}
let c = if presence {
u32::from(v >= threshold)
} else {
v
};
acc.genome_totals[g] += c;
if detail {
cov[seq_idx][g][pos] += c;
}
}
continue;
}
} else {
// Build confirmed[pos * n_genomes + g] via sliding window.
let conf = &mut confirmed[..out_n * n_genomes];
conf.fill(false);
acc.kmer_count += 1;
for g in 0..n_genomes {
let hit =
|i: usize| results.is_in_index(seq_idx, i) && results.val(seq_idx, i, g) > 0;
let mut cnt: u32 = (0..z).filter(|&j| hit(j)).count() as u32;
if cnt == z as u32 {
conf[g] = true;
}
for i in 1..out_n {
if hit(i - 1) {
cnt -= 1;
}
if hit(i + z - 1) {
cnt += 1;
}
if cnt == z as u32 {
conf[i * n_genomes + g] = true;
}
}
}
let acc = &mut accs[seq_idx];
for pos in 0..out_n {
let any = (0..n_genomes).any(|g| conf[pos * n_genomes + g]);
if !any {
if !results.is_in_index(seq_idx, pos) {
acc.kmer_missing += 1;
}
continue;
}
acc.kmer_count += 1;
for g in 0..n_genomes {
if !conf[pos * n_genomes + g] {
continue;
}
let v = results.val(seq_idx, pos, g);
if v == 0 {
continue;
}
let c = if presence {
u32::from(v >= threshold)
} else {
v
};
acc.genome_totals[g] += c;
if detail {
cov[seq_idx][g][pos] += c;
}
}
let v = mins[pos * n_genomes + g];
if v == 0 { continue; }
let c = if presence { u32::from(v >= threshold) } else { v };
acc.genome_totals[g] += c;
if detail { cov[seq_idx][g][pos] += c; }
}
}
}