feat: add parallel k-mer counting and stats CLI
Introduces allocation-free `sum()` and `count_nonzero()` methods for compact integer vectors, extending the `ColumnWeights` trait with `partial_kmer_counts`. Adds parallel partition scanning to the k-mer index for computing per-genome distinct k-mer counts, and exposes a new `--stats` CLI flag to output these statistics as CSV.
This commit is contained in:
@@ -54,6 +54,14 @@ impl ColumnarCompactIntMatrix {
|
||||
Array1::from_vec(sums)
|
||||
}
|
||||
|
||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||
let counts: Vec<u64> = (0..self.n_cols())
|
||||
.into_par_iter()
|
||||
.map(|c| self.col(c).count_nonzero())
|
||||
.collect();
|
||||
Array1::from_vec(counts)
|
||||
}
|
||||
|
||||
/// Builds a `u64` matrix by handing `pairwise_u64` a closure that evaluates
/// `partial_bray_dist` between column `i` and column `j`.
///
/// NOTE(review): which `(i, j)` pairs are evaluated and how the result is laid
/// out is decided by `pairwise_u64`, which is not visible here.
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
    self.pairwise_u64(|i, j| {
        let lhs = self.col(i);
        let rhs = self.col(j);
        lhs.partial_bray_dist(rhs)
    })
}
|
||||
@@ -234,6 +242,14 @@ impl PackedCompactIntMatrix {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
|
||||
// ── Pair primitives ───────────────────────────────────────────────────────
|
||||
|
||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||
@@ -421,6 +437,10 @@ impl PersistentCompactIntMatrix {
|
||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||
}
|
||||
|
||||
pub fn count_nonzero(&self) -> Array1<u64> {
|
||||
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
||||
}
|
||||
|
||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||
}
|
||||
@@ -451,6 +471,7 @@ use crate::traits::{ColumnWeights, CountPartials};
|
||||
|
||||
impl ColumnWeights for PersistentCompactIntMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.sum() }
|
||||
fn partial_kmer_counts(&self) -> Array1<u64> { self.count_nonzero() }
|
||||
}
|
||||
|
||||
impl CountPartials for PersistentCompactIntMatrix {
|
||||
|
||||
@@ -133,11 +133,15 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the sum of all values in the compact int vector.
|
||||
pub fn sum(&self) -> u64 {
|
||||
self.iter().map(|v| v as u64).sum()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn count_nonzero(&self) -> u64 {
|
||||
self.iter().filter(|&v| v > 0).count() as u64
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the Bray-Curtis distance between two compact int vectors.
|
||||
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
|
||||
@@ -2,8 +2,16 @@ use ndarray::{Array1, Array2};
|
||||
|
||||
/// Column-level weight statistic — total count or presence count per column.
|
||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||
///
|
||||
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
|
||||
/// column (presence = 1 entries; count > 0 entries). For presence matrices this
|
||||
/// equals `col_weights`; for count matrices it differs (count_nonzero vs sum).
|
||||
pub trait ColumnWeights: Send + Sync {
|
||||
fn col_weights(&self) -> Array1<u64>;
|
||||
|
||||
fn partial_kmer_counts(&self) -> Array1<u64> {
|
||||
self.col_weights()
|
||||
}
|
||||
}
|
||||
|
||||
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
|
||||
|
||||
Reference in New Issue
Block a user