diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index 37547c6..8db78da 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -54,6 +54,14 @@ impl ColumnarCompactIntMatrix { Array1::from_vec(sums) } + pub(crate) fn count_nonzero(&self) -> Array1 { + let counts: Vec = (0..self.n_cols()) + .into_par_iter() + .map(|c| self.col(c).count_nonzero()) + .collect(); + Array1::from_vec(counts) + } + pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j))) } @@ -234,6 +242,14 @@ impl PackedCompactIntMatrix { ) } + pub(crate) fn count_nonzero(&self) -> Array1 { + Array1::from_vec( + (0..self.n_cols).into_par_iter() + .map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64) + .collect() + ) + } + // ── Pair primitives ─────────────────────────────────────────────────────── fn pair_partial_bray(&self, i: usize, j: usize) -> u64 { @@ -421,6 +437,10 @@ impl PersistentCompactIntMatrix { match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() } } + pub fn count_nonzero(&self) -> Array1 { + match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() } + } + pub fn partial_bray_dist_matrix(&self) -> Array2 { match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() } } @@ -451,6 +471,7 @@ use crate::traits::{ColumnWeights, CountPartials}; impl ColumnWeights for PersistentCompactIntMatrix { fn col_weights(&self) -> Array1 { self.sum() } + fn partial_kmer_counts(&self) -> Array1 { self.count_nonzero() } } impl CountPartials for PersistentCompactIntMatrix { diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index 0f4ce25..057ce29 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -133,11 +133,15 @@ impl PersistentCompactIntVec { } #[inline] - /// Returns the sum of all values in the compact int vector. pub fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() } + #[inline] + pub fn count_nonzero(&self) -> u64 { + self.iter().filter(|&v| v > 0).count() as u64 + } + #[inline] /// Returns the Bray-Curtis distance between two compact int vectors. pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index f65ad9d..b61e69b 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -2,8 +2,16 @@ use ndarray::{Array1, Array2}; /// Column-level weight statistic — total count or presence count per column. /// Additive across layers and partitions; used as denominator in normalised distances. +/// +/// `partial_kmer_counts` returns the number of **distinct k-mers** present per +/// column (presence = 1 entries; count > 0 entries). For presence matrices this +/// equals `col_weights`; for count matrices it differs (count_nonzero vs sum). pub trait ColumnWeights: Send + Sync { fn col_weights(&self) -> Array1; + + fn partial_kmer_counts(&self) -> Array1 { + self.col_weights() + } } /// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`). diff --git a/src/obikindex/src/stats.rs b/src/obikindex/src/stats.rs index e3dffc9..434af17 100644 --- a/src/obikindex/src/stats.rs +++ b/src/obikindex/src/stats.rs @@ -1,7 +1,8 @@ use std::fs; use std::path::Path; -use obicompactvec::LayerMeta; +use obicompactvec::{LayerMeta, PersistentBitMatrix, PersistentCompactIntMatrix}; +use obicompactvec::traits::ColumnWeights; use obilayeredmap::meta::PartitionMeta; use rayon::prelude::*; @@ -124,4 +125,68 @@ impl KmerIndex { total: bpk(mphf_b + evidence_b + matrix_b), }) } + + /// Return `(total_distinct_kmers, per_genome_kmer_counts)`. + /// + /// For each genome, the count is the number of distinct k-mers for which + /// that genome has a non-zero value (presence = 1, count > 0). + /// Partitions are scanned in parallel; results are summed across partitions. + pub fn genome_kmer_counts(&self) -> OKIResult<(usize, Vec)> { + let n = self.n_partitions(); + let n_genomes = self.meta.genomes.len(); + + let partials: Vec<(usize, Vec)> = (0..n) + .into_par_iter() + .map(|i| { + let mut counts = vec![0u64; n_genomes]; + let mut n_kmers = 0usize; + + let index_dir = self.partition.part_dir(i).join("index"); + if !index_dir.exists() { return (0, counts); } + + let n_layers = PartitionMeta::load(&index_dir) + .map(|m| m.n_layers) + .unwrap_or(0); + + for l in 0..n_layers { + let layer_dir = index_dir.join(format!("layer_{l}")); + if !layer_dir.exists() { continue; } + + n_kmers += LayerMeta::load(&layer_dir).map(|m| m.n).unwrap_or(0); + + let mat: Box = + if layer_dir.join("counts").exists() + && !layer_dir.join("presence").exists() + { + match PersistentCompactIntMatrix::open(&layer_dir) { + Ok(m) => Box::new(m), + Err(_) => continue, + } + } else { + match PersistentBitMatrix::open(&layer_dir) { + Ok(m) => Box::new(m), + Err(_) => continue, + } + }; + let col_counts = mat.partial_kmer_counts(); + + for (c, &v) in col_counts.iter().enumerate() { + if c < n_genomes { counts[c] += v; } + } + } + + (n_kmers, counts) + }) + .collect(); + + let total_kmers: usize = partials.iter().map(|(n, _)| n).sum(); + let mut total_counts = vec![0u64; n_genomes]; + for (_, counts) in partials { + for (i, v) in counts.into_iter().enumerate() { + total_counts[i] += v; + } + } + + Ok((total_kmers, total_counts)) + } } diff --git a/src/obikmer/src/cmd/utils.rs b/src/obikmer/src/cmd/utils.rs index 5aa564a..d4dc2f6 100644 --- a/src/obikmer/src/cmd/utils.rs +++ b/src/obikmer/src/cmd/utils.rs @@ -20,6 +20,10 @@ pub struct UtilsArgs { /// Print bits-per-kmer statistics (MPHF, evidence, matrix, total) #[arg(long)] pub bits_per_kmer: bool, + + /// Print per-genome k-mer counts as CSV (one row per genome + total line) + #[arg(long)] + pub stats: bool, } pub fn run(args: UtilsArgs) { @@ -40,12 +44,33 @@ pub fn run(args: UtilsArgs) { run_bits_per_kmer(&args.index); } + if args.stats { + any = true; + run_stats(&args.index); + } + if !any { - eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer"); + eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer, --stats"); std::process::exit(1); } } +fn run_stats(index_path: &PathBuf) { + let idx = KmerIndex::open(index_path).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + let (total, per_genome) = idx.genome_kmer_counts().unwrap_or_else(|e| { + eprintln!("error computing stats: {e}"); + std::process::exit(1); + }); + println!("genome,n_kmers"); + for (g, &n) in idx.meta().genomes.iter().zip(per_genome.iter()) { + println!("{},{}", g.label, n); + } + println!("total,{total}"); +} + fn run_bits_per_kmer(index_path: &PathBuf) { let idx = KmerIndex::open(index_path).unwrap_or_else(|e| { eprintln!("error opening index: {e}");