Push ylnwstyzqwrt #22

Merged
coissac merged 2 commits from push-ylnwstyzqwrt into main 2026-06-12 10:10:04 +00:00
5 changed files with 126 additions and 3 deletions
Showing only changes of commit f44fe042bc - Show all commits
+21
View File
@@ -54,6 +54,14 @@ impl ColumnarCompactIntMatrix {
Array1::from_vec(sums) Array1::from_vec(sums)
} }
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
let counts: Vec<u64> = (0..self.n_cols())
.into_par_iter()
.map(|c| self.col(c).count_nonzero())
.collect();
Array1::from_vec(counts)
}
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> { pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j))) self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
} }
@@ -234,6 +242,14 @@ impl PackedCompactIntMatrix {
) )
} }
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
Array1::from_vec(
(0..self.n_cols).into_par_iter()
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
.collect()
)
}
// ── Pair primitives ─────────────────────────────────────────────────────── // ── Pair primitives ───────────────────────────────────────────────────────
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 { fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
@@ -421,6 +437,10 @@ impl PersistentCompactIntMatrix {
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() } match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
} }
pub fn count_nonzero(&self) -> Array1<u64> {
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
}
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> { pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() } match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
} }
@@ -451,6 +471,7 @@ use crate::traits::{ColumnWeights, CountPartials};
impl ColumnWeights for PersistentCompactIntMatrix { impl ColumnWeights for PersistentCompactIntMatrix {
fn col_weights(&self) -> Array1<u64> { self.sum() } fn col_weights(&self) -> Array1<u64> { self.sum() }
fn partial_kmer_counts(&self) -> Array1<u64> { self.count_nonzero() }
} }
impl CountPartials for PersistentCompactIntMatrix { impl CountPartials for PersistentCompactIntMatrix {
+5 -1
View File
@@ -133,11 +133,15 @@ impl PersistentCompactIntVec {
} }
#[inline] #[inline]
/// Returns the sum of all values in the compact int vector.
pub fn sum(&self) -> u64 { pub fn sum(&self) -> u64 {
self.iter().map(|v| v as u64).sum() self.iter().map(|v| v as u64).sum()
} }
#[inline]
pub fn count_nonzero(&self) -> u64 {
self.iter().filter(|&v| v > 0).count() as u64
}
#[inline] #[inline]
/// Returns the Bray-Curtis distance between two compact int vectors. /// Returns the Bray-Curtis distance between two compact int vectors.
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
+8
View File
@@ -2,8 +2,16 @@ use ndarray::{Array1, Array2};
/// Column-level weight statistic — total count or presence count per column. /// Column-level weight statistic — total count or presence count per column.
/// Additive across layers and partitions; used as denominator in normalised distances. /// Additive across layers and partitions; used as denominator in normalised distances.
///
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
/// column (presence = 1 entries; count > 0 entries). For presence matrices this
/// equals `col_weights`; for count matrices it differs (count_nonzero vs sum).
pub trait ColumnWeights: Send + Sync { pub trait ColumnWeights: Send + Sync {
fn col_weights(&self) -> Array1<u64>; fn col_weights(&self) -> Array1<u64>;
fn partial_kmer_counts(&self) -> Array1<u64> {
self.col_weights()
}
} }
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`). /// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
+66 -1
View File
@@ -1,7 +1,8 @@
use std::fs; use std::fs;
use std::path::Path; use std::path::Path;
use obicompactvec::LayerMeta; use obicompactvec::{LayerMeta, PersistentBitMatrix, PersistentCompactIntMatrix};
use obicompactvec::traits::ColumnWeights;
use obilayeredmap::meta::PartitionMeta; use obilayeredmap::meta::PartitionMeta;
use rayon::prelude::*; use rayon::prelude::*;
@@ -124,4 +125,68 @@ impl KmerIndex {
total: bpk(mphf_b + evidence_b + matrix_b), total: bpk(mphf_b + evidence_b + matrix_b),
}) })
} }
/// Return `(total_distinct_kmers, per_genome_kmer_counts)`.
///
/// For each genome, the count is the number of distinct k-mers for which
/// that genome has a non-zero value (presence = 1, count > 0).
/// Partitions are scanned in parallel; results are summed across partitions.
pub fn genome_kmer_counts(&self) -> OKIResult<(usize, Vec<u64>)> {
let n = self.n_partitions();
let n_genomes = self.meta.genomes.len();
let partials: Vec<(usize, Vec<u64>)> = (0..n)
.into_par_iter()
.map(|i| {
let mut counts = vec![0u64; n_genomes];
let mut n_kmers = 0usize;
let index_dir = self.partition.part_dir(i).join("index");
if !index_dir.exists() { return (0, counts); }
let n_layers = PartitionMeta::load(&index_dir)
.map(|m| m.n_layers)
.unwrap_or(0);
for l in 0..n_layers {
let layer_dir = index_dir.join(format!("layer_{l}"));
if !layer_dir.exists() { continue; }
n_kmers += LayerMeta::load(&layer_dir).map(|m| m.n).unwrap_or(0);
let mat: Box<dyn ColumnWeights> =
if layer_dir.join("counts").exists()
&& !layer_dir.join("presence").exists()
{
match PersistentCompactIntMatrix::open(&layer_dir) {
Ok(m) => Box::new(m),
Err(_) => continue,
}
} else {
match PersistentBitMatrix::open(&layer_dir) {
Ok(m) => Box::new(m),
Err(_) => continue,
}
};
let col_counts = mat.partial_kmer_counts();
for (c, &v) in col_counts.iter().enumerate() {
if c < n_genomes { counts[c] += v; }
}
}
(n_kmers, counts)
})
.collect();
let total_kmers: usize = partials.iter().map(|(n, _)| n).sum();
let mut total_counts = vec![0u64; n_genomes];
for (_, counts) in partials {
for (i, v) in counts.into_iter().enumerate() {
total_counts[i] += v;
}
}
Ok((total_kmers, total_counts))
}
} }
+26 -1
View File
@@ -20,6 +20,10 @@ pub struct UtilsArgs {
/// Print bits-per-kmer statistics (MPHF, evidence, matrix, total) /// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
#[arg(long)] #[arg(long)]
pub bits_per_kmer: bool, pub bits_per_kmer: bool,
/// Print per-genome k-mer counts as CSV (one row per genome + total line)
#[arg(long)]
pub stats: bool,
} }
pub fn run(args: UtilsArgs) { pub fn run(args: UtilsArgs) {
@@ -40,12 +44,33 @@ pub fn run(args: UtilsArgs) {
run_bits_per_kmer(&args.index); run_bits_per_kmer(&args.index);
} }
if args.stats {
any = true;
run_stats(&args.index);
}
if !any { if !any {
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer"); eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer, --stats");
std::process::exit(1); std::process::exit(1);
} }
} }
fn run_stats(index_path: &PathBuf) {
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
eprintln!("error opening index: {e}");
std::process::exit(1);
});
let (total, per_genome) = idx.genome_kmer_counts().unwrap_or_else(|e| {
eprintln!("error computing stats: {e}");
std::process::exit(1);
});
println!("genome,n_kmers");
for (g, &n) in idx.meta().genomes.iter().zip(per_genome.iter()) {
println!("{},{}", g.label, n);
}
println!("total,{total}");
}
fn run_bits_per_kmer(index_path: &PathBuf) { fn run_bits_per_kmer(index_path: &PathBuf) {
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| { let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
eprintln!("error opening index: {e}"); eprintln!("error opening index: {e}");