feat: add parallel k-mer counting and stats CLI

Introduces allocation-free `sum()` and `count_nonzero()` methods for compact integer vectors, extending the `ColumnWeights` trait with `partial_kmer_counts`. Adds parallel partition scanning to the k-mer index for computing per-genome distinct k-mer counts, and exposes a new `--stats` CLI flag to output these statistics as CSV.
This commit is contained in:
Eric Coissac
2026-06-12 08:49:11 +02:00
parent 94e0a370b3
commit f44fe042bc
5 changed files with 126 additions and 3 deletions
+21
View File
@@ -54,6 +54,14 @@ impl ColumnarCompactIntMatrix {
Array1::from_vec(sums)
}
/// Returns one entry per column, each being the value reported by that
/// column's own `count_nonzero()`.
///
/// Column indices `0..n_cols()` are visited through a parallel iterator and
/// the results are collected in column order.
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
    Array1::from_vec(
        (0..self.n_cols())
            .into_par_iter()
            .map(|col_idx| self.col(col_idx).count_nonzero())
            .collect(),
    )
}
/// Builds the pairwise matrix produced by `pairwise_u64`, where the value for
/// a column pair is `partial_bray_dist` of the first column against the
/// second.
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
    self.pairwise_u64(|first, second| {
        let lhs = self.col(first);
        let rhs = self.col(second);
        lhs.partial_bray_dist(rhs)
    })
}
@@ -234,6 +242,14 @@ impl PackedCompactIntMatrix {
)
}
/// Returns one entry per column: the number of row indices in `0..n_rows`
/// for which `get(column, row)` is greater than zero.
///
/// Columns are processed through a parallel iterator; each column is scanned
/// sequentially.
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
    let per_column: Vec<u64> = (0..self.n_cols)
        .into_par_iter()
        .map(|col| {
            let positive = (0..self.n_rows)
                .filter(|&row| self.get(col, row) > 0)
                .count();
            positive as u64
        })
        .collect();
    Array1::from_vec(per_column)
}
// ── Pair primitives ───────────────────────────────────────────────────────
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
@@ -421,6 +437,10 @@ impl PersistentCompactIntMatrix {
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
}
/// Returns the per-column result of `count_nonzero`, forwarding to whichever
/// storage variant (`Columnar` or `Packed`) this matrix holds.
pub fn count_nonzero(&self) -> Array1<u64> {
    match self {
        Self::Columnar(columnar) => columnar.count_nonzero(),
        Self::Packed(packed) => packed.count_nonzero(),
    }
}
/// Returns the matrix computed by `partial_bray_dist_matrix` on whichever
/// storage variant (`Columnar` or `Packed`) this matrix holds.
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
    match self {
        Self::Columnar(columnar) => columnar.partial_bray_dist_matrix(),
        Self::Packed(packed) => packed.partial_bray_dist_matrix(),
    }
}
@@ -451,6 +471,7 @@ use crate::traits::{ColumnWeights, CountPartials};
/// `ColumnWeights` for the persistent matrix: both methods forward to the
/// matrix's own per-column reductions.
impl ColumnWeights for PersistentCompactIntMatrix {
    /// Column weights are the values returned by `sum()`.
    fn col_weights(&self) -> Array1<u64> {
        self.sum()
    }

    /// Partial k-mer counts are the values returned by `count_nonzero()`.
    fn partial_kmer_counts(&self) -> Array1<u64> {
        self.count_nonzero()
    }
}
impl CountPartials for PersistentCompactIntMatrix {