feat: add parallel k-mer counting and stats CLI
Introduces allocation-free `sum()` and `count_nonzero()` methods for compact integer vectors, extending the `ColumnWeights` trait with `partial_kmer_counts`. Adds parallel partition scanning to the k-mer index for computing per-genome distinct k-mer counts, and exposes a new `--stats` CLI flag to output these statistics as CSV.
This commit is contained in:
@@ -54,6 +54,14 @@ impl ColumnarCompactIntMatrix {
|
||||
Array1::from_vec(sums)
|
||||
}
|
||||
|
||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||
let counts: Vec<u64> = (0..self.n_cols())
|
||||
.into_par_iter()
|
||||
.map(|c| self.col(c).count_nonzero())
|
||||
.collect();
|
||||
Array1::from_vec(counts)
|
||||
}
|
||||
|
||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||
}
|
||||
@@ -234,6 +242,14 @@ impl PackedCompactIntMatrix {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
|
||||
// ── Pair primitives ───────────────────────────────────────────────────────
|
||||
|
||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||
@@ -421,6 +437,10 @@ impl PersistentCompactIntMatrix {
|
||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||
}
|
||||
|
||||
pub fn count_nonzero(&self) -> Array1<u64> {
|
||||
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
||||
}
|
||||
|
||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||
}
|
||||
@@ -451,6 +471,7 @@ use crate::traits::{ColumnWeights, CountPartials};
|
||||
|
||||
impl ColumnWeights for PersistentCompactIntMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.sum() }
|
||||
fn partial_kmer_counts(&self) -> Array1<u64> { self.count_nonzero() }
|
||||
}
|
||||
|
||||
impl CountPartials for PersistentCompactIntMatrix {
|
||||
|
||||
@@ -133,11 +133,15 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the sum of all values in the compact int vector.
|
||||
pub fn sum(&self) -> u64 {
|
||||
self.iter().map(|v| v as u64).sum()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn count_nonzero(&self) -> u64 {
|
||||
self.iter().filter(|&v| v > 0).count() as u64
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the Bray-Curtis distance between two compact int vectors.
|
||||
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
|
||||
@@ -2,8 +2,16 @@ use ndarray::{Array1, Array2};
|
||||
|
||||
/// Column-level weight statistic — total count or presence count per column.
|
||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||
///
|
||||
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
|
||||
/// column (presence = 1 entries; count > 0 entries). For presence matrices this
|
||||
/// equals `col_weights`; for count matrices it differs (count_nonzero vs sum).
|
||||
pub trait ColumnWeights: Send + Sync {
|
||||
fn col_weights(&self) -> Array1<u64>;
|
||||
|
||||
fn partial_kmer_counts(&self) -> Array1<u64> {
|
||||
self.col_weights()
|
||||
}
|
||||
}
|
||||
|
||||
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use obicompactvec::LayerMeta;
|
||||
use obicompactvec::{LayerMeta, PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||
use obicompactvec::traits::ColumnWeights;
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
use rayon::prelude::*;
|
||||
|
||||
@@ -124,4 +125,68 @@ impl KmerIndex {
|
||||
total: bpk(mphf_b + evidence_b + matrix_b),
|
||||
})
|
||||
}
|
||||
|
||||
/// Return `(total_distinct_kmers, per_genome_kmer_counts)`.
|
||||
///
|
||||
/// For each genome, the count is the number of distinct k-mers for which
|
||||
/// that genome has a non-zero value (presence = 1, count > 0).
|
||||
/// Partitions are scanned in parallel; results are summed across partitions.
|
||||
pub fn genome_kmer_counts(&self) -> OKIResult<(usize, Vec<u64>)> {
|
||||
let n = self.n_partitions();
|
||||
let n_genomes = self.meta.genomes.len();
|
||||
|
||||
let partials: Vec<(usize, Vec<u64>)> = (0..n)
|
||||
.into_par_iter()
|
||||
.map(|i| {
|
||||
let mut counts = vec![0u64; n_genomes];
|
||||
let mut n_kmers = 0usize;
|
||||
|
||||
let index_dir = self.partition.part_dir(i).join("index");
|
||||
if !index_dir.exists() { return (0, counts); }
|
||||
|
||||
let n_layers = PartitionMeta::load(&index_dir)
|
||||
.map(|m| m.n_layers)
|
||||
.unwrap_or(0);
|
||||
|
||||
for l in 0..n_layers {
|
||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||
if !layer_dir.exists() { continue; }
|
||||
|
||||
n_kmers += LayerMeta::load(&layer_dir).map(|m| m.n).unwrap_or(0);
|
||||
|
||||
let mat: Box<dyn ColumnWeights> =
|
||||
if layer_dir.join("counts").exists()
|
||||
&& !layer_dir.join("presence").exists()
|
||||
{
|
||||
match PersistentCompactIntMatrix::open(&layer_dir) {
|
||||
Ok(m) => Box::new(m),
|
||||
Err(_) => continue,
|
||||
}
|
||||
} else {
|
||||
match PersistentBitMatrix::open(&layer_dir) {
|
||||
Ok(m) => Box::new(m),
|
||||
Err(_) => continue,
|
||||
}
|
||||
};
|
||||
let col_counts = mat.partial_kmer_counts();
|
||||
|
||||
for (c, &v) in col_counts.iter().enumerate() {
|
||||
if c < n_genomes { counts[c] += v; }
|
||||
}
|
||||
}
|
||||
|
||||
(n_kmers, counts)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let total_kmers: usize = partials.iter().map(|(n, _)| n).sum();
|
||||
let mut total_counts = vec![0u64; n_genomes];
|
||||
for (_, counts) in partials {
|
||||
for (i, v) in counts.into_iter().enumerate() {
|
||||
total_counts[i] += v;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((total_kmers, total_counts))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,10 @@ pub struct UtilsArgs {
|
||||
/// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
|
||||
#[arg(long)]
|
||||
pub bits_per_kmer: bool,
|
||||
|
||||
/// Print per-genome k-mer counts as CSV (one row per genome + total line)
|
||||
#[arg(long)]
|
||||
pub stats: bool,
|
||||
}
|
||||
|
||||
pub fn run(args: UtilsArgs) {
|
||||
@@ -40,12 +44,33 @@ pub fn run(args: UtilsArgs) {
|
||||
run_bits_per_kmer(&args.index);
|
||||
}
|
||||
|
||||
if args.stats {
|
||||
any = true;
|
||||
run_stats(&args.index);
|
||||
}
|
||||
|
||||
if !any {
|
||||
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer");
|
||||
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer, --stats");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
fn run_stats(index_path: &PathBuf) {
|
||||
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening index: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
let (total, per_genome) = idx.genome_kmer_counts().unwrap_or_else(|e| {
|
||||
eprintln!("error computing stats: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
println!("genome,n_kmers");
|
||||
for (g, &n) in idx.meta().genomes.iter().zip(per_genome.iter()) {
|
||||
println!("{},{}", g.label, n);
|
||||
}
|
||||
println!("total,{total}");
|
||||
}
|
||||
|
||||
fn run_bits_per_kmer(index_path: &PathBuf) {
|
||||
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening index: {e}");
|
||||
|
||||
Reference in New Issue
Block a user