feat: add parallel k-mer counting and stats CLI

Introduces allocation-free `sum()` and `count_nonzero()` methods for compact integer vectors, extending the `ColumnWeights` trait with `partial_kmer_counts`. Adds parallel partition scanning to the k-mer index for computing per-genome distinct k-mer counts, and exposes a new `--stats` CLI flag to output these statistics as CSV.
This commit is contained in:
Eric Coissac
2026-06-12 08:49:11 +02:00
parent 94e0a370b3
commit f44fe042bc
5 changed files with 126 additions and 3 deletions
+66 -1
View File
@@ -1,7 +1,8 @@
use std::fs;
use std::path::Path;
use obicompactvec::LayerMeta;
use obicompactvec::{LayerMeta, PersistentBitMatrix, PersistentCompactIntMatrix};
use obicompactvec::traits::ColumnWeights;
use obilayeredmap::meta::PartitionMeta;
use rayon::prelude::*;
@@ -124,4 +125,68 @@ impl KmerIndex {
total: bpk(mphf_b + evidence_b + matrix_b),
})
}
/// Compute the index-wide k-mer total together with one k-mer tally per
/// genome, returned as `(total_distinct_kmers, per_genome_kmer_counts)`.
///
/// Partitions are processed on the rayon pool and their partial results are
/// added together afterwards. Inside a partition, every existing
/// `layer_{l}` directory announced by the partition metadata contributes:
///
/// * its `LayerMeta` `n` field to the k-mer total;
/// * the values returned by `ColumnWeights::partial_kmer_counts()` on its
///   matrix to the per-genome tallies (column `c` feeds genome `c`; columns
///   beyond the number of genomes are ignored).
///
/// A layer is opened as a `PersistentCompactIntMatrix` when it contains a
/// `counts` entry and no `presence` entry, and as a `PersistentBitMatrix`
/// otherwise.
///
/// Missing directories, metadata that fails to load and matrices that fail
/// to open are skipped silently; as written, the only value this function
/// returns is `Ok(..)`.
pub fn genome_kmer_counts(&self) -> OKIResult<(usize, Vec<u64>)> {
    let n_genomes = self.meta.genomes.len();
    let partition_count = self.n_partitions();

    let per_partition: Vec<(usize, Vec<u64>)> = (0..partition_count)
        .into_par_iter()
        .map(|part| {
            let mut genome_tally = vec![0u64; n_genomes];
            let index_dir = self.partition.part_dir(part).join("index");
            if !index_dir.exists() {
                // Partition without an index: contributes nothing.
                return (0, genome_tally);
            }

            // Unreadable partition metadata is treated as "no layers".
            let layer_count = PartitionMeta::load(&index_dir).map_or(0, |meta| meta.n_layers);

            let mut kmer_tally = 0usize;
            for layer in 0..layer_count {
                let layer_dir = index_dir.join(format!("layer_{layer}"));
                if !layer_dir.exists() {
                    continue;
                }

                // NOTE(review): the layer size is added before the matrix is
                // opened, so a layer whose matrix fails to open still counts
                // toward the total but not toward any genome — confirm that
                // this is intended.
                kmer_tally += LayerMeta::load(&layer_dir).map_or(0, |meta| meta.n);

                let counts_only = layer_dir.join("counts").exists()
                    && !layer_dir.join("presence").exists();
                let opened: Option<Box<dyn ColumnWeights>> = if counts_only {
                    PersistentCompactIntMatrix::open(&layer_dir)
                        .ok()
                        .map(|m| Box::new(m) as Box<dyn ColumnWeights>)
                } else {
                    PersistentBitMatrix::open(&layer_dir)
                        .ok()
                        .map(|m| Box::new(m) as Box<dyn ColumnWeights>)
                };
                let matrix = match opened {
                    Some(m) => m,
                    None => continue,
                };

                // `zip` stops at the shorter side, which drops any column
                // index that is not below `n_genomes`.
                let column_counts = matrix.partial_kmer_counts();
                for (slot, &hits) in genome_tally.iter_mut().zip(column_counts.iter()) {
                    *slot += hits;
                }
            }
            (kmer_tally, genome_tally)
        })
        .collect();

    // Merge the per-partition partial results.
    let mut total_kmers = 0usize;
    let mut total_counts = vec![0u64; n_genomes];
    for (kmers, tally) in per_partition {
        total_kmers += kmers;
        for (acc, hits) in total_counts.iter_mut().zip(tally) {
            *acc += hits;
        }
    }
    Ok((total_kmers, total_counts))
}
}