feat: add bits-per-kmer diagnostic and stats module
Introduce a `stats` module to compute normalized storage efficiency metrics. The new `KmerIndex::bits_per_kmer()` method parallelizes disk I/O across partitions to aggregate file sizes for MPHF, evidence, and matrix components. Publicly export `IndexBitsPerKmer` and add a `--bits-per-kmer` CLI flag to trigger the diagnostic routine and print detailed statistics.
This commit is contained in:
@@ -7,6 +7,7 @@ mod index;
|
|||||||
mod merge;
|
mod merge;
|
||||||
mod rebuild;
|
mod rebuild;
|
||||||
mod reindex;
|
mod reindex;
|
||||||
|
mod stats;
|
||||||
|
|
||||||
pub use error::{OKIError, OKIResult};
|
pub use error::{OKIError, OKIResult};
|
||||||
pub use distance::{DistanceMetric, DistanceOutput};
|
pub use distance::{DistanceMetric, DistanceOutput};
|
||||||
@@ -14,3 +15,4 @@ pub use index::KmerIndex;
|
|||||||
pub use merge::MergeMode;
|
pub use merge::MergeMode;
|
||||||
pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
|
pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
|
||||||
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||||
|
pub use stats::IndexBitsPerKmer;
|
||||||
|
|||||||
@@ -0,0 +1,127 @@
|
|||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use obicompactvec::LayerMeta;
|
||||||
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
|
use rayon::prelude::*;
|
||||||
|
|
||||||
|
use crate::error::OKIResult;
|
||||||
|
use crate::index::KmerIndex;
|
||||||
|
|
||||||
|
/// Bits per kmer broken down by index component.
|
||||||
|
pub struct IndexBitsPerKmer {
|
||||||
|
/// Total distinct k-mers across all partitions and layers.
|
||||||
|
pub n_kmers: usize,
|
||||||
|
/// Number of genomes in the index.
|
||||||
|
pub n_genomes: usize,
|
||||||
|
/// Bits used by the minimal perfect hash function (`mphf.bin`).
|
||||||
|
pub mphf: f64,
|
||||||
|
/// Bits used by the evidence files (`evidence.bin`, `unitigs.bin*`,
|
||||||
|
/// `fingerprint.bin`).
|
||||||
|
pub evidence: f64,
|
||||||
|
/// Bits used by the count/presence matrices (`counts/` and `presence/`),
|
||||||
|
/// normalised by k-mers only.
|
||||||
|
pub matrix: f64,
|
||||||
|
/// `matrix` divided by the number of genomes — intrinsic encoding
|
||||||
|
/// efficiency, independent of index size.
|
||||||
|
pub matrix_per_genome: f64,
|
||||||
|
/// Sum of mphf + evidence + matrix.
|
||||||
|
pub total: f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── File-size helpers ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn file_bytes(path: &Path) -> u64 {
|
||||||
|
fs::metadata(path).map(|m| m.len()).unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn dir_bytes(dir: &Path) -> u64 {
|
||||||
|
if !dir.exists() { return 0; }
|
||||||
|
fs::read_dir(dir)
|
||||||
|
.map(|entries| {
|
||||||
|
entries
|
||||||
|
.filter_map(|e| e.ok())
|
||||||
|
.filter_map(|e| e.metadata().ok())
|
||||||
|
.filter(|m| m.is_file())
|
||||||
|
.map(|m| m.len())
|
||||||
|
.sum()
|
||||||
|
})
|
||||||
|
.unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Per-layer accounting ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
struct LayerBytes {
|
||||||
|
n_kmers: usize,
|
||||||
|
mphf: u64,
|
||||||
|
evidence: u64,
|
||||||
|
matrix: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn layer_bytes(layer_dir: &Path) -> LayerBytes {
|
||||||
|
let n_kmers = LayerMeta::load(layer_dir).map(|m| m.n).unwrap_or(0);
|
||||||
|
|
||||||
|
let mphf = file_bytes(&layer_dir.join("mphf.bin"));
|
||||||
|
|
||||||
|
let evidence = file_bytes(&layer_dir.join("unitigs.bin"))
|
||||||
|
+ file_bytes(&layer_dir.join("unitigs.bin.idx"))
|
||||||
|
+ file_bytes(&layer_dir.join("evidence.bin"))
|
||||||
|
+ file_bytes(&layer_dir.join("fingerprint.bin"));
|
||||||
|
|
||||||
|
let matrix = dir_bytes(&layer_dir.join("counts"))
|
||||||
|
+ dir_bytes(&layer_dir.join("presence"));
|
||||||
|
|
||||||
|
LayerBytes { n_kmers, mphf, evidence, matrix }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── KmerIndex::bits_per_kmer ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
impl KmerIndex {
|
||||||
|
/// Compute bits-per-kmer statistics for the built index.
|
||||||
|
///
|
||||||
|
/// File sizes are read directly from disk; kmer counts come from
|
||||||
|
/// `layer_meta.json` (no need to scan the MPHF or unitig files).
|
||||||
|
/// Computation is parallelised across partitions.
|
||||||
|
pub fn bits_per_kmer(&self) -> OKIResult<IndexBitsPerKmer> {
|
||||||
|
let n = self.n_partitions();
|
||||||
|
let n_genomes = self.meta().genomes.len().max(1);
|
||||||
|
|
||||||
|
let (n_kmers, mphf_b, evidence_b, matrix_b) = (0..n)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let index_dir = self.partition.part_dir(i).join("index");
|
||||||
|
if !index_dir.exists() { return (0usize, 0u64, 0u64, 0u64); }
|
||||||
|
|
||||||
|
let n_layers = PartitionMeta::load(&index_dir)
|
||||||
|
.map(|m| m.n_layers)
|
||||||
|
.unwrap_or(0);
|
||||||
|
|
||||||
|
(0..n_layers).fold((0usize, 0u64, 0u64, 0u64), |acc, l| {
|
||||||
|
let lb = layer_bytes(&index_dir.join(format!("layer_{l}")));
|
||||||
|
(acc.0 + lb.n_kmers, acc.1 + lb.mphf, acc.2 + lb.evidence, acc.3 + lb.matrix)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.reduce(|| (0, 0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2, a.3 + b.3));
|
||||||
|
|
||||||
|
if n_kmers == 0 {
|
||||||
|
return Ok(IndexBitsPerKmer {
|
||||||
|
n_kmers: 0, n_genomes,
|
||||||
|
mphf: 0.0, evidence: 0.0,
|
||||||
|
matrix: 0.0, matrix_per_genome: 0.0, total: 0.0,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let bpk = |bytes: u64| bytes as f64 * 8.0 / n_kmers as f64;
|
||||||
|
let matrix = bpk(matrix_b);
|
||||||
|
|
||||||
|
Ok(IndexBitsPerKmer {
|
||||||
|
n_kmers,
|
||||||
|
n_genomes,
|
||||||
|
mphf: bpk(mphf_b),
|
||||||
|
evidence: bpk(evidence_b),
|
||||||
|
matrix,
|
||||||
|
matrix_per_genome: matrix / n_genomes as f64,
|
||||||
|
total: bpk(mphf_b + evidence_b + matrix_b),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obikindex::{validate_label, KmerIndex};
|
use obikindex::{validate_label, IndexBitsPerKmer, KmerIndex};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
#[derive(Args)]
|
#[derive(Args)]
|
||||||
@@ -16,6 +16,10 @@ pub struct UtilsArgs {
|
|||||||
/// Add missing layer_meta.json files to each layer (required after upgrading from old indexes)
|
/// Add missing layer_meta.json files to each layer (required after upgrading from old indexes)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub upgrade_index: bool,
|
pub upgrade_index: bool,
|
||||||
|
|
||||||
|
/// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
|
||||||
|
#[arg(long)]
|
||||||
|
pub bits_per_kmer: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn run(args: UtilsArgs) {
|
pub fn run(args: UtilsArgs) {
|
||||||
@@ -31,12 +35,35 @@ pub fn run(args: UtilsArgs) {
|
|||||||
run_upgrade_index(&args.index);
|
run_upgrade_index(&args.index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if args.bits_per_kmer {
|
||||||
|
any = true;
|
||||||
|
run_bits_per_kmer(&args.index);
|
||||||
|
}
|
||||||
|
|
||||||
if !any {
|
if !any {
|
||||||
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index");
|
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn run_bits_per_kmer(index_path: &PathBuf) {
|
||||||
|
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening index: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
let stats: IndexBitsPerKmer = idx.bits_per_kmer().unwrap_or_else(|e| {
|
||||||
|
eprintln!("error computing bits/kmer: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
println!("k-mers : {}", stats.n_kmers);
|
||||||
|
println!("genomes : {}", stats.n_genomes);
|
||||||
|
println!("mphf : {:6.2} bits/kmer", stats.mphf);
|
||||||
|
println!("evidence : {:6.2} bits/kmer", stats.evidence);
|
||||||
|
println!("matrix : {:6.2} bits/kmer ({:.2} bits/kmer/genome)",
|
||||||
|
stats.matrix, stats.matrix_per_genome);
|
||||||
|
println!("total : {:6.2} bits/kmer", stats.total);
|
||||||
|
}
|
||||||
|
|
||||||
fn run_upgrade_index(index_path: &PathBuf) {
|
fn run_upgrade_index(index_path: &PathBuf) {
|
||||||
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||||
eprintln!("error opening index: {e}");
|
eprintln!("error opening index: {e}");
|
||||||
|
|||||||
Reference in New Issue
Block a user