Push ruqusmkoyvwm #16

Merged
coissac merged 10 commits from push-ruqusmkoyvwm into main 2026-06-05 08:41:08 +00:00
3 changed files with 158 additions and 2 deletions
Showing only changes of commit edd5e3f8ee - Show all commits
+2
View File
@@ -7,6 +7,7 @@ mod index;
mod merge;
mod rebuild;
mod reindex;
mod stats;
pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput};
@@ -14,3 +15,4 @@ pub use index::KmerIndex;
pub use merge::MergeMode;
pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
pub use stats::IndexBitsPerKmer;
+127
View File
@@ -0,0 +1,127 @@
use std::fs;
use std::path::Path;
use obicompactvec::LayerMeta;
use obilayeredmap::meta::PartitionMeta;
use rayon::prelude::*;
use crate::error::OKIResult;
use crate::index::KmerIndex;
/// Bits per kmer broken down by index component.
pub struct IndexBitsPerKmer {
/// Total distinct k-mers across all partitions and layers.
pub n_kmers: usize,
/// Number of genomes in the index.
pub n_genomes: usize,
/// Bits used by the minimal perfect hash function (`mphf.bin`).
pub mphf: f64,
/// Bits used by the evidence files (`evidence.bin`, `unitigs.bin*`,
/// `fingerprint.bin`).
pub evidence: f64,
/// Bits used by the count/presence matrices (`counts/` and `presence/`),
/// normalised by k-mers only.
pub matrix: f64,
/// `matrix` divided by the number of genomes — intrinsic encoding
/// efficiency, independent of index size.
pub matrix_per_genome: f64,
/// Sum of mphf + evidence + matrix.
pub total: f64,
}
// ── File-size helpers ─────────────────────────────────────────────────────────
fn file_bytes(path: &Path) -> u64 {
fs::metadata(path).map(|m| m.len()).unwrap_or(0)
}
fn dir_bytes(dir: &Path) -> u64 {
if !dir.exists() { return 0; }
fs::read_dir(dir)
.map(|entries| {
entries
.filter_map(|e| e.ok())
.filter_map(|e| e.metadata().ok())
.filter(|m| m.is_file())
.map(|m| m.len())
.sum()
})
.unwrap_or(0)
}
// ── Per-layer accounting ──────────────────────────────────────────────────────
struct LayerBytes {
n_kmers: usize,
mphf: u64,
evidence: u64,
matrix: u64,
}
fn layer_bytes(layer_dir: &Path) -> LayerBytes {
let n_kmers = LayerMeta::load(layer_dir).map(|m| m.n).unwrap_or(0);
let mphf = file_bytes(&layer_dir.join("mphf.bin"));
let evidence = file_bytes(&layer_dir.join("unitigs.bin"))
+ file_bytes(&layer_dir.join("unitigs.bin.idx"))
+ file_bytes(&layer_dir.join("evidence.bin"))
+ file_bytes(&layer_dir.join("fingerprint.bin"));
let matrix = dir_bytes(&layer_dir.join("counts"))
+ dir_bytes(&layer_dir.join("presence"));
LayerBytes { n_kmers, mphf, evidence, matrix }
}
// ── KmerIndex::bits_per_kmer ──────────────────────────────────────────────────
impl KmerIndex {
/// Compute bits-per-kmer statistics for the built index.
///
/// File sizes are read directly from disk; kmer counts come from
/// `layer_meta.json` (no need to scan the MPHF or unitig files).
/// Computation is parallelised across partitions.
pub fn bits_per_kmer(&self) -> OKIResult<IndexBitsPerKmer> {
let n = self.n_partitions();
let n_genomes = self.meta().genomes.len().max(1);
let (n_kmers, mphf_b, evidence_b, matrix_b) = (0..n)
.into_par_iter()
.map(|i| {
let index_dir = self.partition.part_dir(i).join("index");
if !index_dir.exists() { return (0usize, 0u64, 0u64, 0u64); }
let n_layers = PartitionMeta::load(&index_dir)
.map(|m| m.n_layers)
.unwrap_or(0);
(0..n_layers).fold((0usize, 0u64, 0u64, 0u64), |acc, l| {
let lb = layer_bytes(&index_dir.join(format!("layer_{l}")));
(acc.0 + lb.n_kmers, acc.1 + lb.mphf, acc.2 + lb.evidence, acc.3 + lb.matrix)
})
})
.reduce(|| (0, 0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2, a.3 + b.3));
if n_kmers == 0 {
return Ok(IndexBitsPerKmer {
n_kmers: 0, n_genomes,
mphf: 0.0, evidence: 0.0,
matrix: 0.0, matrix_per_genome: 0.0, total: 0.0,
});
}
let bpk = |bytes: u64| bytes as f64 * 8.0 / n_kmers as f64;
let matrix = bpk(matrix_b);
Ok(IndexBitsPerKmer {
n_kmers,
n_genomes,
mphf: bpk(mphf_b),
evidence: bpk(evidence_b),
matrix,
matrix_per_genome: matrix / n_genomes as f64,
total: bpk(mphf_b + evidence_b + matrix_b),
})
}
}
+29 -2
View File
@@ -1,7 +1,7 @@
use std::path::PathBuf;
use clap::Args;
use obikindex::{validate_label, KmerIndex};
use obikindex::{validate_label, IndexBitsPerKmer, KmerIndex};
use tracing::info;
#[derive(Args)]
@@ -16,6 +16,10 @@ pub struct UtilsArgs {
/// Add missing layer_meta.json files to each layer (required after upgrading from old indexes)
#[arg(long)]
pub upgrade_index: bool,
/// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
#[arg(long)]
pub bits_per_kmer: bool,
}
pub fn run(args: UtilsArgs) {
@@ -31,12 +35,35 @@ pub fn run(args: UtilsArgs) {
run_upgrade_index(&args.index);
}
if args.bits_per_kmer {
any = true;
run_bits_per_kmer(&args.index);
}
if !any {
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index");
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer");
std::process::exit(1);
}
}
fn run_bits_per_kmer(index_path: &PathBuf) {
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
eprintln!("error opening index: {e}");
std::process::exit(1);
});
let stats: IndexBitsPerKmer = idx.bits_per_kmer().unwrap_or_else(|e| {
eprintln!("error computing bits/kmer: {e}");
std::process::exit(1);
});
println!("k-mers : {}", stats.n_kmers);
println!("genomes : {}", stats.n_genomes);
println!("mphf : {:6.2} bits/kmer", stats.mphf);
println!("evidence : {:6.2} bits/kmer", stats.evidence);
println!("matrix : {:6.2} bits/kmer ({:.2} bits/kmer/genome)",
stats.matrix, stats.matrix_per_genome);
println!("total : {:6.2} bits/kmer", stats.total);
}
fn run_upgrade_index(index_path: &PathBuf) {
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
eprintln!("error opening index: {e}");