Push ruqusmkoyvwm #16
@@ -7,6 +7,7 @@ mod index;
|
||||
mod merge;
|
||||
mod rebuild;
|
||||
mod reindex;
|
||||
mod stats;
|
||||
|
||||
pub use error::{OKIError, OKIResult};
|
||||
pub use distance::{DistanceMetric, DistanceOutput};
|
||||
@@ -14,3 +15,4 @@ pub use index::KmerIndex;
|
||||
pub use merge::MergeMode;
|
||||
pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
|
||||
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||
pub use stats::IndexBitsPerKmer;
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use obicompactvec::LayerMeta;
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::error::OKIResult;
|
||||
use crate::index::KmerIndex;
|
||||
|
||||
/// Bits per kmer broken down by index component.
|
||||
pub struct IndexBitsPerKmer {
|
||||
/// Total distinct k-mers across all partitions and layers.
|
||||
pub n_kmers: usize,
|
||||
/// Number of genomes in the index.
|
||||
pub n_genomes: usize,
|
||||
/// Bits used by the minimal perfect hash function (`mphf.bin`).
|
||||
pub mphf: f64,
|
||||
/// Bits used by the evidence files (`evidence.bin`, `unitigs.bin*`,
|
||||
/// `fingerprint.bin`).
|
||||
pub evidence: f64,
|
||||
/// Bits used by the count/presence matrices (`counts/` and `presence/`),
|
||||
/// normalised by k-mers only.
|
||||
pub matrix: f64,
|
||||
/// `matrix` divided by the number of genomes — intrinsic encoding
|
||||
/// efficiency, independent of index size.
|
||||
pub matrix_per_genome: f64,
|
||||
/// Sum of mphf + evidence + matrix.
|
||||
pub total: f64,
|
||||
}
|
||||
|
||||
// ── File-size helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
fn file_bytes(path: &Path) -> u64 {
|
||||
fs::metadata(path).map(|m| m.len()).unwrap_or(0)
|
||||
}
|
||||
|
||||
fn dir_bytes(dir: &Path) -> u64 {
|
||||
if !dir.exists() { return 0; }
|
||||
fs::read_dir(dir)
|
||||
.map(|entries| {
|
||||
entries
|
||||
.filter_map(|e| e.ok())
|
||||
.filter_map(|e| e.metadata().ok())
|
||||
.filter(|m| m.is_file())
|
||||
.map(|m| m.len())
|
||||
.sum()
|
||||
})
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
// ── Per-layer accounting ──────────────────────────────────────────────────────
|
||||
|
||||
struct LayerBytes {
|
||||
n_kmers: usize,
|
||||
mphf: u64,
|
||||
evidence: u64,
|
||||
matrix: u64,
|
||||
}
|
||||
|
||||
fn layer_bytes(layer_dir: &Path) -> LayerBytes {
|
||||
let n_kmers = LayerMeta::load(layer_dir).map(|m| m.n).unwrap_or(0);
|
||||
|
||||
let mphf = file_bytes(&layer_dir.join("mphf.bin"));
|
||||
|
||||
let evidence = file_bytes(&layer_dir.join("unitigs.bin"))
|
||||
+ file_bytes(&layer_dir.join("unitigs.bin.idx"))
|
||||
+ file_bytes(&layer_dir.join("evidence.bin"))
|
||||
+ file_bytes(&layer_dir.join("fingerprint.bin"));
|
||||
|
||||
let matrix = dir_bytes(&layer_dir.join("counts"))
|
||||
+ dir_bytes(&layer_dir.join("presence"));
|
||||
|
||||
LayerBytes { n_kmers, mphf, evidence, matrix }
|
||||
}
|
||||
|
||||
// ── KmerIndex::bits_per_kmer ──────────────────────────────────────────────────
|
||||
|
||||
impl KmerIndex {
|
||||
/// Compute bits-per-kmer statistics for the built index.
|
||||
///
|
||||
/// File sizes are read directly from disk; kmer counts come from
|
||||
/// `layer_meta.json` (no need to scan the MPHF or unitig files).
|
||||
/// Computation is parallelised across partitions.
|
||||
pub fn bits_per_kmer(&self) -> OKIResult<IndexBitsPerKmer> {
|
||||
let n = self.n_partitions();
|
||||
let n_genomes = self.meta().genomes.len().max(1);
|
||||
|
||||
let (n_kmers, mphf_b, evidence_b, matrix_b) = (0..n)
|
||||
.into_par_iter()
|
||||
.map(|i| {
|
||||
let index_dir = self.partition.part_dir(i).join("index");
|
||||
if !index_dir.exists() { return (0usize, 0u64, 0u64, 0u64); }
|
||||
|
||||
let n_layers = PartitionMeta::load(&index_dir)
|
||||
.map(|m| m.n_layers)
|
||||
.unwrap_or(0);
|
||||
|
||||
(0..n_layers).fold((0usize, 0u64, 0u64, 0u64), |acc, l| {
|
||||
let lb = layer_bytes(&index_dir.join(format!("layer_{l}")));
|
||||
(acc.0 + lb.n_kmers, acc.1 + lb.mphf, acc.2 + lb.evidence, acc.3 + lb.matrix)
|
||||
})
|
||||
})
|
||||
.reduce(|| (0, 0, 0, 0), |a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2, a.3 + b.3));
|
||||
|
||||
if n_kmers == 0 {
|
||||
return Ok(IndexBitsPerKmer {
|
||||
n_kmers: 0, n_genomes,
|
||||
mphf: 0.0, evidence: 0.0,
|
||||
matrix: 0.0, matrix_per_genome: 0.0, total: 0.0,
|
||||
});
|
||||
}
|
||||
|
||||
let bpk = |bytes: u64| bytes as f64 * 8.0 / n_kmers as f64;
|
||||
let matrix = bpk(matrix_b);
|
||||
|
||||
Ok(IndexBitsPerKmer {
|
||||
n_kmers,
|
||||
n_genomes,
|
||||
mphf: bpk(mphf_b),
|
||||
evidence: bpk(evidence_b),
|
||||
matrix,
|
||||
matrix_per_genome: matrix / n_genomes as f64,
|
||||
total: bpk(mphf_b + evidence_b + matrix_b),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obikindex::{validate_label, KmerIndex};
|
||||
use obikindex::{validate_label, IndexBitsPerKmer, KmerIndex};
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
@@ -16,6 +16,10 @@ pub struct UtilsArgs {
|
||||
/// Add missing layer_meta.json files to each layer (required after upgrading from old indexes)
|
||||
#[arg(long)]
|
||||
pub upgrade_index: bool,
|
||||
|
||||
/// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
|
||||
#[arg(long)]
|
||||
pub bits_per_kmer: bool,
|
||||
}
|
||||
|
||||
pub fn run(args: UtilsArgs) {
|
||||
@@ -31,12 +35,35 @@ pub fn run(args: UtilsArgs) {
|
||||
run_upgrade_index(&args.index);
|
||||
}
|
||||
|
||||
if args.bits_per_kmer {
|
||||
any = true;
|
||||
run_bits_per_kmer(&args.index);
|
||||
}
|
||||
|
||||
if !any {
|
||||
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index");
|
||||
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
fn run_bits_per_kmer(index_path: &PathBuf) {
|
||||
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening index: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
let stats: IndexBitsPerKmer = idx.bits_per_kmer().unwrap_or_else(|e| {
|
||||
eprintln!("error computing bits/kmer: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
println!("k-mers : {}", stats.n_kmers);
|
||||
println!("genomes : {}", stats.n_genomes);
|
||||
println!("mphf : {:6.2} bits/kmer", stats.mphf);
|
||||
println!("evidence : {:6.2} bits/kmer", stats.evidence);
|
||||
println!("matrix : {:6.2} bits/kmer ({:.2} bits/kmer/genome)",
|
||||
stats.matrix, stats.matrix_per_genome);
|
||||
println!("total : {:6.2} bits/kmer", stats.total);
|
||||
}
|
||||
|
||||
fn run_upgrade_index(index_path: &PathBuf) {
|
||||
let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening index: {e}");
|
||||
|
||||
Reference in New Issue
Block a user