feat: add parallel k-mer counting and stats CLI
Introduces allocation-free `sum()` and `count_nonzero()` methods for compact integer vectors, extending the `ColumnWeights` trait with `partial_kmer_counts`. Adds parallel partition scanning to the k-mer index for computing per-genome distinct k-mer counts, and exposes a new `--stats` CLI flag to output these statistics as CSV.
This commit is contained in:
@@ -54,6 +54,14 @@ impl ColumnarCompactIntMatrix {
|
|||||||
Array1::from_vec(sums)
|
Array1::from_vec(sums)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||||
|
let counts: Vec<u64> = (0..self.n_cols())
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|c| self.col(c).count_nonzero())
|
||||||
|
.collect();
|
||||||
|
Array1::from_vec(counts)
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
|
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||||
}
|
}
|
||||||
@@ -234,6 +242,14 @@ impl PackedCompactIntMatrix {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||||
|
Array1::from_vec(
|
||||||
|
(0..self.n_cols).into_par_iter()
|
||||||
|
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
|
||||||
|
.collect()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// ── Pair primitives ───────────────────────────────────────────────────────
|
// ── Pair primitives ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||||
@@ -421,6 +437,10 @@ impl PersistentCompactIntMatrix {
|
|||||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn count_nonzero(&self) -> Array1<u64> {
|
||||||
|
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
||||||
|
}
|
||||||
|
|
||||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||||
}
|
}
|
||||||
@@ -451,6 +471,7 @@ use crate::traits::{ColumnWeights, CountPartials};
|
|||||||
|
|
||||||
impl ColumnWeights for PersistentCompactIntMatrix {
|
impl ColumnWeights for PersistentCompactIntMatrix {
|
||||||
fn col_weights(&self) -> Array1<u64> { self.sum() }
|
fn col_weights(&self) -> Array1<u64> { self.sum() }
|
||||||
|
fn partial_kmer_counts(&self) -> Array1<u64> { self.count_nonzero() }
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CountPartials for PersistentCompactIntMatrix {
|
impl CountPartials for PersistentCompactIntMatrix {
|
||||||
|
|||||||
@@ -133,11 +133,15 @@ impl PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
/// Returns the sum of all values in the compact int vector.
|
|
||||||
pub fn sum(&self) -> u64 {
|
pub fn sum(&self) -> u64 {
|
||||||
self.iter().map(|v| v as u64).sum()
|
self.iter().map(|v| v as u64).sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn count_nonzero(&self) -> u64 {
|
||||||
|
self.iter().filter(|&v| v > 0).count() as u64
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
/// Returns the Bray-Curtis distance between two compact int vectors.
|
/// Returns the Bray-Curtis distance between two compact int vectors.
|
||||||
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
|
|||||||
@@ -2,8 +2,16 @@ use ndarray::{Array1, Array2};
|
|||||||
|
|
||||||
/// Column-level weight statistic — total count or presence count per column.
|
/// Column-level weight statistic — total count or presence count per column.
|
||||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||||
|
///
|
||||||
|
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
|
||||||
|
/// column (presence = 1 entries; count > 0 entries). For presence matrices this
|
||||||
|
/// equals `col_weights`; for count matrices it differs (count_nonzero vs sum).
|
||||||
pub trait ColumnWeights: Send + Sync {
|
pub trait ColumnWeights: Send + Sync {
|
||||||
fn col_weights(&self) -> Array1<u64>;
|
fn col_weights(&self) -> Array1<u64>;
|
||||||
|
|
||||||
|
fn partial_kmer_counts(&self) -> Array1<u64> {
|
||||||
|
self.col_weights()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
|
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use obicompactvec::LayerMeta;
|
use obicompactvec::{LayerMeta, PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||||
|
use obicompactvec::traits::ColumnWeights;
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
|
||||||
@@ -124,4 +125,68 @@ impl KmerIndex {
|
|||||||
total: bpk(mphf_b + evidence_b + matrix_b),
|
total: bpk(mphf_b + evidence_b + matrix_b),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return `(total_distinct_kmers, per_genome_kmer_counts)`.
|
||||||
|
///
|
||||||
|
/// For each genome, the count is the number of distinct k-mers for which
|
||||||
|
/// that genome has a non-zero value (presence = 1, count > 0).
|
||||||
|
/// Partitions are scanned in parallel; results are summed across partitions.
|
||||||
|
pub fn genome_kmer_counts(&self) -> OKIResult<(usize, Vec<u64>)> {
|
||||||
|
let n = self.n_partitions();
|
||||||
|
let n_genomes = self.meta.genomes.len();
|
||||||
|
|
||||||
|
let partials: Vec<(usize, Vec<u64>)> = (0..n)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let mut counts = vec![0u64; n_genomes];
|
||||||
|
let mut n_kmers = 0usize;
|
||||||
|
|
||||||
|
let index_dir = self.partition.part_dir(i).join("index");
|
||||||
|
if !index_dir.exists() { return (0, counts); }
|
||||||
|
|
||||||
|
let n_layers = PartitionMeta::load(&index_dir)
|
||||||
|
.map(|m| m.n_layers)
|
||||||
|
.unwrap_or(0);
|
||||||
|
|
||||||
|
for l in 0..n_layers {
|
||||||
|
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||||
|
if !layer_dir.exists() { continue; }
|
||||||
|
|
||||||
|
n_kmers += LayerMeta::load(&layer_dir).map(|m| m.n).unwrap_or(0);
|
||||||
|
|
||||||
|
let mat: Box<dyn ColumnWeights> =
|
||||||
|
if layer_dir.join("counts").exists()
|
||||||
|
&& !layer_dir.join("presence").exists()
|
||||||
|
{
|
||||||
|
match PersistentCompactIntMatrix::open(&layer_dir) {
|
||||||
|
Ok(m) => Box::new(m),
|
||||||
|
Err(_) => continue,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
match PersistentBitMatrix::open(&layer_dir) {
|
||||||
|
Ok(m) => Box::new(m),
|
||||||
|
Err(_) => continue,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let col_counts = mat.partial_kmer_counts();
|
||||||
|
|
||||||
|
for (c, &v) in col_counts.iter().enumerate() {
|
||||||
|
if c < n_genomes { counts[c] += v; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(n_kmers, counts)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let total_kmers: usize = partials.iter().map(|(n, _)| n).sum();
|
||||||
|
let mut total_counts = vec![0u64; n_genomes];
|
||||||
|
for (_, counts) in partials {
|
||||||
|
for (i, v) in counts.into_iter().enumerate() {
|
||||||
|
total_counts[i] += v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((total_kmers, total_counts))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,6 +20,10 @@ pub struct UtilsArgs {
|
|||||||
/// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
|
/// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub bits_per_kmer: bool,
|
pub bits_per_kmer: bool,
|
||||||
|
|
||||||
|
/// Print per-genome k-mer counts as CSV (one row per genome + total line)
|
||||||
|
#[arg(long)]
|
||||||
|
pub stats: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn run(args: UtilsArgs) {
|
pub fn run(args: UtilsArgs) {
|
||||||
@@ -40,12 +44,33 @@ pub fn run(args: UtilsArgs) {
|
|||||||
run_bits_per_kmer(&args.index);
|
run_bits_per_kmer(&args.index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if args.stats {
|
||||||
|
any = true;
|
||||||
|
run_stats(&args.index);
|
||||||
|
}
|
||||||
|
|
||||||
if !any {
|
if !any {
|
||||||
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer");
|
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer, --stats");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn run_stats(index_path: &PathBuf) {
|
||||||
|
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening index: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
let (total, per_genome) = idx.genome_kmer_counts().unwrap_or_else(|e| {
|
||||||
|
eprintln!("error computing stats: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
println!("genome,n_kmers");
|
||||||
|
for (g, &n) in idx.meta().genomes.iter().zip(per_genome.iter()) {
|
||||||
|
println!("{},{}", g.label, n);
|
||||||
|
}
|
||||||
|
println!("total,{total}");
|
||||||
|
}
|
||||||
|
|
||||||
fn run_bits_per_kmer(index_path: &PathBuf) {
|
fn run_bits_per_kmer(index_path: &PathBuf) {
|
||||||
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||||
eprintln!("error opening index: {e}");
|
eprintln!("error opening index: {e}");
|
||||||
|
|||||||
Reference in New Issue
Block a user