diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index b5e881e..f5f20ae 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -7,6 +7,7 @@ mod index; mod merge; mod rebuild; mod reindex; +mod stats; pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; @@ -14,3 +15,4 @@ pub use index::KmerIndex; pub use merge::MergeMode; pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; +pub use stats::IndexBitsPerKmer; diff --git a/src/obikindex/src/stats.rs b/src/obikindex/src/stats.rs new file mode 100644 index 0000000..e3dffc9 --- /dev/null +++ b/src/obikindex/src/stats.rs @@ -0,0 +1,127 @@ +use std::fs; +use std::path::Path; + +use obicompactvec::LayerMeta; +use obilayeredmap::meta::PartitionMeta; +use rayon::prelude::*; + +use crate::error::OKIResult; +use crate::index::KmerIndex; + +/// Bits per kmer broken down by index component. +pub struct IndexBitsPerKmer { + /// Total distinct k-mers across all partitions and layers. + pub n_kmers: usize, + /// Number of genomes in the index. + pub n_genomes: usize, + /// Bits used by the minimal perfect hash function (`mphf.bin`). + pub mphf: f64, + /// Bits used by the evidence files (`evidence.bin`, `unitigs.bin*`, + /// `fingerprint.bin`). + pub evidence: f64, + /// Bits used by the count/presence matrices (`counts/` and `presence/`), + /// normalised by k-mers only. + pub matrix: f64, + /// `matrix` divided by the number of genomes — intrinsic encoding + /// efficiency, independent of index size. + pub matrix_per_genome: f64, + /// Sum of mphf + evidence + matrix. + pub total: f64, +} + +// ── File-size helpers ───────────────────────────────────────────────────────── + +fn file_bytes(path: &Path) -> u64 { + fs::metadata(path).map(|m| m.len()).unwrap_or(0) +} + +fn dir_bytes(dir: &Path) -> u64 { + if !dir.exists() { return 0; } + fs::read_dir(dir) + .map(|entries| { + entries + .filter_map(|e| e.ok()) + .filter_map(|e| e.metadata().ok()) + .filter(|m| m.is_file()) + .map(|m| m.len()) + .sum() + }) + .unwrap_or(0) +} + +// ── Per-layer accounting ────────────────────────────────────────────────────── + +struct LayerBytes { + n_kmers: usize, + mphf: u64, + evidence: u64, + matrix: u64, +} + +fn layer_bytes(layer_dir: &Path) -> LayerBytes { + let n_kmers = LayerMeta::load(layer_dir).map(|m| m.n).unwrap_or(0); + + let mphf = file_bytes(&layer_dir.join("mphf.bin")); + + let evidence = file_bytes(&layer_dir.join("unitigs.bin")) + + file_bytes(&layer_dir.join("unitigs.bin.idx")) + + file_bytes(&layer_dir.join("evidence.bin")) + + file_bytes(&layer_dir.join("fingerprint.bin")); + + let matrix = dir_bytes(&layer_dir.join("counts")) + + dir_bytes(&layer_dir.join("presence")); + + LayerBytes { n_kmers, mphf, evidence, matrix } +} + +// ── KmerIndex::bits_per_kmer ────────────────────────────────────────────────── + +impl KmerIndex { + /// Compute bits-per-kmer statistics for the built index. + /// + /// File sizes are read directly from disk; kmer counts come from + /// `layer_meta.json` (no need to scan the MPHF or unitig files). + /// Computation is parallelised across partitions. + pub fn bits_per_kmer(&self) -> OKIResult { + let n = self.n_partitions(); + let n_genomes = self.meta().genomes.len().max(1); + + let (n_kmers, mphf_b, evidence_b, matrix_b) = (0..n) + .into_par_iter() + .map(|i| { + let index_dir = self.partition.part_dir(i).join("index"); + if !index_dir.exists() { return (0usize, 0u64, 0u64, 0u64); } + + let n_layers = PartitionMeta::load(&index_dir) + .map(|m| m.n_layers) + .unwrap_or(0); + + (0..n_layers).fold((0usize, 0u64, 0u64, 0u64), |acc, l| { + let lb = layer_bytes(&index_dir.join(format!("layer_{l}"))); + (acc.0 + lb.n_kmers, acc.1 + lb.mphf, acc.2 + lb.evidence, acc.3 + lb.matrix) + }) + }) + .reduce(|| (0, 0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2, a.3 + b.3)); + + if n_kmers == 0 { + return Ok(IndexBitsPerKmer { + n_kmers: 0, n_genomes, + mphf: 0.0, evidence: 0.0, + matrix: 0.0, matrix_per_genome: 0.0, total: 0.0, + }); + } + + let bpk = |bytes: u64| bytes as f64 * 8.0 / n_kmers as f64; + let matrix = bpk(matrix_b); + + Ok(IndexBitsPerKmer { + n_kmers, + n_genomes, + mphf: bpk(mphf_b), + evidence: bpk(evidence_b), + matrix, + matrix_per_genome: matrix / n_genomes as f64, + total: bpk(mphf_b + evidence_b + matrix_b), + }) + } +} diff --git a/src/obikmer/src/cmd/utils.rs b/src/obikmer/src/cmd/utils.rs index c5fa3e7..5aa564a 100644 --- a/src/obikmer/src/cmd/utils.rs +++ b/src/obikmer/src/cmd/utils.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use clap::Args; -use obikindex::{validate_label, KmerIndex}; +use obikindex::{validate_label, IndexBitsPerKmer, KmerIndex}; use tracing::info; #[derive(Args)] @@ -16,6 +16,10 @@ pub struct UtilsArgs { /// Add missing layer_meta.json files to each layer (required after upgrading from old indexes) #[arg(long)] pub upgrade_index: bool, + + /// Print bits-per-kmer statistics (MPHF, evidence, matrix, total) + #[arg(long)] + pub bits_per_kmer: bool, } pub fn run(args: UtilsArgs) { @@ -31,12 +35,35 @@ pub fn run(args: UtilsArgs) { run_upgrade_index(&args.index); } + if args.bits_per_kmer { + any = true; + run_bits_per_kmer(&args.index); + } + if !any { - eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index"); + eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer"); std::process::exit(1); } } +fn run_bits_per_kmer(index_path: &PathBuf) { + let idx = KmerIndex::open(index_path).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + let stats: IndexBitsPerKmer = idx.bits_per_kmer().unwrap_or_else(|e| { + eprintln!("error computing bits/kmer: {e}"); + std::process::exit(1); + }); + println!("k-mers : {}", stats.n_kmers); + println!("genomes : {}", stats.n_genomes); + println!("mphf : {:6.2} bits/kmer", stats.mphf); + println!("evidence : {:6.2} bits/kmer", stats.evidence); + println!("matrix : {:6.2} bits/kmer ({:.2} bits/kmer/genome)", + stats.matrix, stats.matrix_per_genome); + println!("total : {:6.2} bits/kmer", stats.total); +} + fn run_upgrade_index(index_path: &PathBuf) { let idx = KmerIndex::open(index_path).unwrap_or_else(|e| { eprintln!("error opening index: {e}");