fix: account for k-mer overlap in total_bases calculation

Introduces a `kmer_overlap` variable (`k - 1`) and modifies the `total_bases` accumulation to subtract this overlap from each sequence's length, using a saturating subtraction so that sequences shorter than `k` contribute zero instead of underflowing. This ensures the accumulated count reflects the number of valid k-mer starting positions rather than the raw sequence length.
This commit is contained in:
Eric Coissac
2026-06-03 15:10:15 +02:00
parent bfe0cb4b82
commit bba5147f0f
+4 -1
View File
@@ -95,10 +95,13 @@ pub fn scatter(
let mut ema_rate: f64 = 0.0;
let mut last_t = Instant::now();
let mut last_bases: u64 = 0;
let kmer_overlap = (k - 1) as u64;
const ALPHA: f64 = 0.15;
for batch in pipe.apply(throttled, n_workers, 1) {
total_bases += batch.iter().map(|sk| sk.seql() as u64).sum::<u64>();
total_bases += batch.iter()
.map(|sk| (sk.seql() as u64).saturating_sub(kmer_overlap))
.sum::<u64>();
let now = Instant::now();
let dt = now.duration_since(last_t).as_secs_f64();
if dt > 0.1 {