Push ruqusmkoyvwm #16
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::io::{self, BufWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
@@ -277,10 +277,18 @@ fn process_chunk(
|
||||
});
|
||||
}
|
||||
|
||||
// Findere z-window filter + accumulation — no intermediate allocations.
|
||||
// One `confirmed` buffer reused across all sequences.
|
||||
// Sliding window minimum — one reusable buffer and one deque per batch.
|
||||
//
|
||||
// win_min[pos * n_genomes + g] = min count across the z-window [pos, pos+z)
|
||||
// for genome g, where "not in index" counts as 0.
|
||||
//
|
||||
// win_min > 0 ↔ all z consecutive kmers are in the index with count > 0
|
||||
// ↔ Findere confirmation (for z=1 this degenerates to the
|
||||
// simple case with no overhead).
|
||||
//
|
||||
// Works uniformly for count matrices and presence/absence (0/1) matrices.
|
||||
let max_n_kmers = batch.n_kmers.iter().map(|&n| n as usize).max().unwrap_or(0);
|
||||
let mut confirmed = vec![false; max_n_kmers * n_genomes];
|
||||
let mut win_min = vec![0u32; max_n_kmers * n_genomes];
|
||||
|
||||
let mut accs: Vec<SeqAcc> = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect();
|
||||
|
||||
@@ -289,19 +297,12 @@ fn process_chunk(
|
||||
.iter()
|
||||
.map(|&n| {
|
||||
let n = n as usize;
|
||||
if n >= effective_z {
|
||||
n - effective_z + 1
|
||||
} else {
|
||||
0
|
||||
}
|
||||
if n >= effective_z { n - effective_z + 1 } else { 0 }
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut cov: Vec<Vec<Vec<u32>>> = if detail {
|
||||
n_kmers_out
|
||||
.iter()
|
||||
.map(|&n| vec![vec![0u32; n]; n_genomes])
|
||||
.collect()
|
||||
n_kmers_out.iter().map(|&n| vec![vec![0u32; n]; n_genomes]).collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
@@ -310,68 +311,47 @@ fn process_chunk(
|
||||
let threshold = presence_threshold;
|
||||
let z = effective_z;
|
||||
|
||||
// Deque reused across all (seq, genome) pairs.
|
||||
let mut dq: VecDeque<(usize, u32)> = VecDeque::with_capacity(z + 1);
|
||||
|
||||
for seq_idx in 0..n_seqs {
|
||||
let n = results.n_kmers_for(seq_idx);
|
||||
let out_n = n_kmers_out[seq_idx];
|
||||
if out_n == 0 {
|
||||
continue;
|
||||
}
|
||||
if out_n == 0 { continue; }
|
||||
|
||||
if z <= 1 {
|
||||
// No Findere — every indexed s-mer is confirmed.
|
||||
let acc = &mut accs[seq_idx];
|
||||
for pos in 0..n {
|
||||
if !results.is_in_index(seq_idx, pos) {
|
||||
acc.kmer_missing += 1;
|
||||
continue;
|
||||
}
|
||||
acc.kmer_count += 1;
|
||||
let mins = &mut win_min[..out_n * n_genomes];
|
||||
mins.fill(0);
|
||||
|
||||
// ── Per-genome sliding window minimum ─────────────────────────────────
|
||||
for g in 0..n_genomes {
|
||||
let v = results.val(seq_idx, pos, g);
|
||||
if v == 0 {
|
||||
continue;
|
||||
}
|
||||
let c = if presence {
|
||||
u32::from(v >= threshold)
|
||||
dq.clear();
|
||||
for i in 0..n {
|
||||
let v_i = if results.is_in_index(seq_idx, i) {
|
||||
results.val(seq_idx, i, g)
|
||||
} else {
|
||||
v
|
||||
0
|
||||
};
|
||||
acc.genome_totals[g] += c;
|
||||
if detail {
|
||||
cov[seq_idx][g][pos] += c;
|
||||
// Evict elements that have left the window.
|
||||
while dq.front().map_or(false, |&(f, _)| f + z <= i) {
|
||||
dq.pop_front();
|
||||
}
|
||||
// Maintain monotone non-decreasing back→front for minimum at front.
|
||||
while dq.back().map_or(false, |&(_, v)| v >= v_i) {
|
||||
dq.pop_back();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Build confirmed[pos * n_genomes + g] via sliding window.
|
||||
let conf = &mut confirmed[..out_n * n_genomes];
|
||||
conf.fill(false);
|
||||
|
||||
for g in 0..n_genomes {
|
||||
let hit =
|
||||
|i: usize| results.is_in_index(seq_idx, i) && results.val(seq_idx, i, g) > 0;
|
||||
|
||||
let mut cnt: u32 = (0..z).filter(|&j| hit(j)).count() as u32;
|
||||
if cnt == z as u32 {
|
||||
conf[g] = true;
|
||||
}
|
||||
|
||||
for i in 1..out_n {
|
||||
if hit(i - 1) {
|
||||
cnt -= 1;
|
||||
}
|
||||
if hit(i + z - 1) {
|
||||
cnt += 1;
|
||||
}
|
||||
if cnt == z as u32 {
|
||||
conf[i * n_genomes + g] = true;
|
||||
dq.push_back((i, v_i));
|
||||
// Window [pos, pos+z) is complete when i = pos + z - 1.
|
||||
if i + 1 >= z {
|
||||
let pos = i + 1 - z;
|
||||
mins[pos * n_genomes + g] = dq.front().unwrap().1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Accumulate ────────────────────────────────────────────────────────
|
||||
let acc = &mut accs[seq_idx];
|
||||
for pos in 0..out_n {
|
||||
let any = (0..n_genomes).any(|g| conf[pos * n_genomes + g]);
|
||||
let any = (0..n_genomes).any(|g| mins[pos * n_genomes + g] > 0);
|
||||
if !any {
|
||||
if !results.is_in_index(seq_idx, pos) {
|
||||
acc.kmer_missing += 1;
|
||||
@@ -380,23 +360,11 @@ fn process_chunk(
|
||||
}
|
||||
acc.kmer_count += 1;
|
||||
for g in 0..n_genomes {
|
||||
if !conf[pos * n_genomes + g] {
|
||||
continue;
|
||||
}
|
||||
let v = results.val(seq_idx, pos, g);
|
||||
if v == 0 {
|
||||
continue;
|
||||
}
|
||||
let c = if presence {
|
||||
u32::from(v >= threshold)
|
||||
} else {
|
||||
v
|
||||
};
|
||||
let v = mins[pos * n_genomes + g];
|
||||
if v == 0 { continue; }
|
||||
let c = if presence { u32::from(v >= threshold) } else { v };
|
||||
acc.genome_totals[g] += c;
|
||||
if detail {
|
||||
cov[seq_idx][g][pos] += c;
|
||||
}
|
||||
}
|
||||
if detail { cov[seq_idx][g][pos] += c; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user