feat: add parallel k-mer counting and stats CLI

Introduces allocation-free `sum()` and `count_nonzero()` methods for compact integer vectors, extending the `ColumnWeights` trait with `partial_kmer_counts`. Adds parallel partition scanning to the k-mer index for computing per-genome distinct k-mer counts, and exposes a new `--stats` CLI flag to output these statistics as CSV.
This commit is contained in:
Eric Coissac
2026-06-12 08:49:11 +02:00
parent 94e0a370b3
commit f44fe042bc
5 changed files with 126 additions and 3 deletions
+21
View File
@@ -54,6 +54,14 @@ impl ColumnarCompactIntMatrix {
Array1::from_vec(sums)
}
/// Counts the non-zero entries of every column.
///
/// Columns are visited through a parallel iterator; each column's own
/// `count_nonzero()` supplies its count, and the per-column results are
/// gathered into a one-dimensional array built from `0..n_cols()`.
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
    let n_columns = self.n_cols();
    Array1::from_vec(
        (0..n_columns)
            .into_par_iter()
            .map(|col_idx| self.col(col_idx).count_nonzero())
            .collect(),
    )
}
/// Builds the matrix of partial Bray distance terms between columns.
///
/// Delegates to `pairwise_u64`, handing it a closure that evaluates
/// `partial_bray_dist` between the two columns whose indices it receives.
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
    self.pairwise_u64(|a, b| {
        let lhs = self.col(a);
        lhs.partial_bray_dist(self.col(b))
    })
}
@@ -234,6 +242,14 @@ impl PackedCompactIntMatrix {
)
}
/// Counts, for every column, how many slots hold a value greater than zero.
///
/// Columns `0..n_cols` are processed through a parallel iterator; within a
/// column, slots `0..n_rows` are scanned sequentially via `get`.
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
    let per_column: Vec<u64> = (0..self.n_cols)
        .into_par_iter()
        .map(|col_idx| {
            let nonzero = (0..self.n_rows)
                .filter(|&slot| self.get(col_idx, slot) > 0)
                .count();
            nonzero as u64
        })
        .collect();
    Array1::from_vec(per_column)
}
// ── Pair primitives ───────────────────────────────────────────────────────
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
@@ -421,6 +437,10 @@ impl PersistentCompactIntMatrix {
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
}
/// Per-column count of non-zero entries.
///
/// Forwards to the `count_nonzero` of whichever storage variant
/// (`Columnar` or `Packed`) this matrix wraps.
pub fn count_nonzero(&self) -> Array1<u64> {
    match self {
        Self::Columnar(columnar) => columnar.count_nonzero(),
        Self::Packed(packed) => packed.count_nonzero(),
    }
}
/// Matrix of partial Bray distance terms between columns.
///
/// Forwards to the `partial_bray_dist_matrix` of whichever storage variant
/// (`Columnar` or `Packed`) this matrix wraps.
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
    match self {
        Self::Columnar(columnar) => columnar.partial_bray_dist_matrix(),
        Self::Packed(packed) => packed.partial_bray_dist_matrix(),
    }
}
@@ -451,6 +471,7 @@ use crate::traits::{ColumnWeights, CountPartials};
impl ColumnWeights for PersistentCompactIntMatrix {
    /// Column weights of a count matrix: the per-column result of `sum()`.
    fn col_weights(&self) -> Array1<u64> {
        self.sum()
    }

    /// Per-column k-mer counts: the per-column result of `count_nonzero()`,
    /// overriding the trait default that falls back to `col_weights`.
    fn partial_kmer_counts(&self) -> Array1<u64> {
        self.count_nonzero()
    }
}
impl CountPartials for PersistentCompactIntMatrix {
+5 -1
View File
@@ -133,11 +133,15 @@ impl PersistentCompactIntVec {
}
/// Returns the sum of all values in the compact int vector.
///
/// Every element yielded by `iter()` is widened to `u64` before being added.
#[inline]
pub fn sum(&self) -> u64 {
    let widened = self.iter().map(|value| value as u64);
    widened.sum()
}
/// Returns how many values in the compact int vector are greater than zero.
#[inline]
pub fn count_nonzero(&self) -> u64 {
    let nonzero = self.iter().filter(|&value| value > 0).count();
    nonzero as u64
}
#[inline]
/// Returns the Bray-Curtis distance between two compact int vectors.
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
+8
View File
@@ -2,8 +2,16 @@ use ndarray::{Array1, Array2};
/// Per-column weight statistic: either a total count or a presence count.
///
/// Weights are additive across layers and partitions and serve as the
/// denominator in normalised distances.
///
/// [`partial_kmer_counts`](ColumnWeights::partial_kmer_counts) reports the
/// number of **distinct k-mers** present in each column, i.e. entries equal
/// to 1 in a presence matrix or entries greater than 0 in a count matrix.
/// On presence matrices that is the same as `col_weights`; on count matrices
/// the two differ (non-zero count versus sum).
pub trait ColumnWeights: Send + Sync {
    /// Weight of every column.
    fn col_weights(&self) -> Array1<u64>;

    /// Number of distinct k-mers per column.
    ///
    /// The default simply returns `col_weights()`; implementors whose
    /// weights are not presence counts override it.
    fn partial_kmer_counts(&self) -> Array1<u64> {
        Self::col_weights(self)
    }
}
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).