From bc51cd9861d889911fdf505f7d98ec0a8c7fea96 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sat, 23 May 2026 12:50:03 +0200 Subject: [PATCH] feat: add configurable block sizes and in-place reindex command Propagate configurable block size (`block_bits`) through index and layer construction to control unitig chunking and optimize memory/performance trade-offs. Introduce an in-place `reindex` command and library method to convert indices between exact and approximate evidence formats. Add validation to reject merging indexes with mismatched evidence types, and update parallel kmer counting to use `AtomicUsize` for thread-safe aggregation. Includes CLI argument parsing, metadata persistence, and updated tests. --- src/obikindex/src/error.rs | 5 +- src/obikindex/src/index.rs | 3 +- src/obikindex/src/lib.rs | 1 + src/obikindex/src/merge.rs | 37 ++++++- src/obikindex/src/meta.rs | 5 + src/obikindex/src/rebuild.rs | 3 +- src/obikindex/src/reindex.rs | 126 ++++++++++++++++++++++ src/obikmer/src/cli.rs | 6 ++ src/obikmer/src/cmd/index.rs | 9 +- src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/reindex.rs | 68 ++++++++++++ src/obikmer/src/cmd/unitig.rs | 6 +- src/obikmer/src/main.rs | 3 + src/obikpartitionner/src/index_layer.rs | 5 +- src/obikpartitionner/src/merge_layer.rs | 3 +- src/obikpartitionner/src/rebuild_layer.rs | 3 +- src/obilayeredmap/src/layer.rs | 44 ++++---- src/obilayeredmap/src/map.rs | 6 +- src/obilayeredmap/src/mphf_layer.rs | 20 ++-- src/obilayeredmap/src/tests/layer.rs | 8 +- src/obiskio/src/unitig_index.rs | 7 +- 21 files changed, 318 insertions(+), 51 deletions(-) create mode 100644 src/obikindex/src/reindex.rs create mode 100644 src/obikmer/src/cmd/reindex.rs diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs index 96834cb..f9ddf7e 100644 --- a/src/obikindex/src/error.rs +++ b/src/obikindex/src/error.rs @@ -18,6 +18,8 @@ pub enum OKIError { DuplicateGenomeLabel(String), /// Operation not valid for this index configuration. InvalidInput(String), + /// Sources mix exact and approximate evidence, or use incompatible approx parameters. + IncompatibleEvidence(String), } pub type OKIResult = Result; @@ -32,7 +34,8 @@ impl fmt::Display for OKIError { OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"), OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"), OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"), - OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"), + OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"), + OKIError::IncompatibleEvidence(m) => write!(f, "incompatible evidence: {m}"), } } } diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index 321d905..8e207a8 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -145,6 +145,7 @@ impl KmerIndex { let t = Stage::start("index"); let with_counts = self.meta.config.with_counts; let evidence = self.meta.config.evidence.clone(); + let block_bits = self.meta.config.block_bits; let total_kmers = AtomicUsize::new(0); let pb = Arc::new(Mutex::new( @@ -154,7 +155,7 @@ impl KmerIndex { )); (0..n).into_par_iter().for_each(|i| { - match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence) { + match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits) { Ok(0) => {} Ok(n_kmers) => { total_kmers.fetch_add(n_kmers, Ordering::Relaxed); diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index 5c9bc85..9f84178 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -6,6 +6,7 @@ mod dump; mod index; mod merge; mod rebuild; +mod reindex; pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index bb51629..88dead2 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -9,6 +9,8 @@ use obisys::{Reporter, Stage}; use rayon::prelude::*; use tracing::info; +use obilayeredmap::EvidenceKind; + use crate::error::{OKIError, OKIResult}; use crate::index::KmerIndex; use crate::meta::{GenomeInfo, IndexMeta}; @@ -61,6 +63,9 @@ impl KmerIndex { } } + // ── Validate evidence compatibility ─────────────────────────────────── + let evidence = validate_evidence_compat(sources)?; + // ── Compute final genome labels (rename duplicates if requested) ─────── let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?; @@ -91,6 +96,7 @@ impl KmerIndex { let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?; meta.genomes = all_genomes; meta.config.with_counts = mode == MergeMode::Count; + meta.config.evidence = evidence; meta.write(output)?; // In presence/absence mode, purge counts/ directories inherited from @@ -134,13 +140,14 @@ impl KmerIndex { let pb = partition_bar(n_partitions as u64); let dst_partition = &dst.partition; + let block_bits = dst.meta.config.block_bits; let errors: Vec = (0..n_partitions) .into_par_iter() .filter_map(|i| { let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> = remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect(); - let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err(); + let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err(); pb.inc(1); result }) @@ -258,6 +265,34 @@ fn partition_bar(n: u64) -> ProgressBar { pb } +/// Check that all sources share the same evidence kind. +/// +/// Rules: +/// - all `Exact` → OK, returns `Exact` +/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }` +/// - mixed exact/approx or different approx params → `IncompatibleEvidence` +fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult { + let ref_ev = &sources[0].meta.config.evidence; + for src in &sources[1..] { + let ev = &src.meta.config.evidence; + let compat = match (ref_ev, ev) { + (EvidenceKind::Exact, EvidenceKind::Exact) => true, + (EvidenceKind::Approx { b: b1, z: z1 }, + EvidenceKind::Approx { b: b2, z: z2 }) => b1 == b2 && z1 == z2, + _ => false, + }; + if !compat { + return Err(OKIError::IncompatibleEvidence(format!( + "source {:?} has evidence {:?}, expected {:?} — \ + convert all sources to the same evidence kind first \ + (use the `reindex` command)", + src.root_path.display(), ev, ref_ev, + ))); + } + } + Ok(ref_ev.clone()) +} + fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> { fs::create_dir_all(dst)?; for entry in fs::read_dir(src)? { diff --git a/src/obikindex/src/meta.rs b/src/obikindex/src/meta.rs index 6d691f8..686c420 100644 --- a/src/obikindex/src/meta.rs +++ b/src/obikindex/src/meta.rs @@ -31,6 +31,11 @@ pub struct IndexConfig { pub with_counts: bool, #[serde(default)] pub evidence: EvidenceKind, + /// Block size for the unitig index as a power-of-two exponent. + /// The `.idx` block covers 2^block_bits consecutive unitigs. + /// 0 = one entry per unitig (O(1) access, largest `.idx`). + #[serde(default)] + pub block_bits: u8, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/src/obikindex/src/rebuild.rs b/src/obikindex/src/rebuild.rs index b360030..95dc4b8 100644 --- a/src/obikindex/src/rebuild.rs +++ b/src/obikindex/src/rebuild.rs @@ -88,12 +88,13 @@ impl KmerIndex { pb.enable_steady_tick(Duration::from_millis(100)); let src_partition = &src.partition; + let block_bits = meta.config.block_bits; let errors: Vec = (0..n_partitions) .into_par_iter() .filter_map(|i| { let result = dst_partition - .rebuild_partition(src_partition, i, filters, mode, n_genomes) + .rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits) .err(); pb.inc(1); result diff --git a/src/obikindex/src/reindex.rs b/src/obikindex/src/reindex.rs new file mode 100644 index 0000000..674bf4a --- /dev/null +++ b/src/obikindex/src/reindex.rs @@ -0,0 +1,126 @@ +use std::fs; +use std::path::Path; +use std::time::Duration; + +use indicatif::{ProgressBar, ProgressStyle}; +use obilayeredmap::{EvidenceKind, layer::Layer}; +use obilayeredmap::meta::PartitionMeta; +use obisys::{Reporter, Stage}; +use rayon::prelude::*; +use tracing::info; + +use crate::error::{OKIError, OKIResult}; +use crate::index::KmerIndex; +use crate::state::IndexState; + +const EVIDENCE_FILE: &str = "evidence.bin"; +const FINGERPRINT_FILE: &str = "fingerprint.bin"; +const UNITIG_IDX_FILE: &str = "unitigs.bin.idx"; + +fn olm_to_oki(e: obilayeredmap::OLMError) -> OKIError { + OKIError::InvalidInput(e.to_string()) +} + +impl KmerIndex { + /// Convert every layer's evidence bundle to `target` in-place. + /// + /// - `Exact` → builds `evidence.bin` + `unitigs.bin.idx`, removes `fingerprint.bin` + /// - `Approx` → builds `fingerprint.bin`, removes `evidence.bin` + `unitigs.bin.idx` + /// + /// The MPHF (`mphf.bin`) and unitigs (`unitigs.bin`) are never touched. + /// `index.meta` is updated with the new evidence kind on success. + pub fn reindex( + &mut self, + target: EvidenceKind, + block_bits: u8, + rep: &mut Reporter, + ) -> OKIResult<()> { + if self.state() != IndexState::Indexed { + return Err(OKIError::NotIndexed(self.root_path.clone())); + } + + let n = self.partition.n_partitions(); + info!( + "reindex {} partition(s): {:?} → {:?}", + n, self.meta.config.evidence, target, + ); + + let t = Stage::start("reindex"); + let pb = ProgressBar::new(n as u64).with_style( + ProgressStyle::with_template( + "reindex — [{bar:20}] {pos}/{len} | {msg}", + ) + .unwrap() + .tick_strings(&["⠋","⠙","⠹","⠸","⠼","⠴","⠦","⠧","⠇","⠏"]), + ); + pb.enable_steady_tick(Duration::from_millis(80)); + + let errors: Vec = (0..n) + .into_par_iter() + .filter_map(|i| { + let res = reindex_partition( + &self.partition.part_dir(i).join("index"), + &target, + block_bits, + ); + pb.inc(1); + res.err().map(|e| format!("partition {i}: {e}")) + }) + .collect(); + + pb.finish_and_clear(); + + if let Some(e) = errors.into_iter().next() { + return Err(OKIError::InvalidInput(e)); + } + + self.meta.config.evidence = target; + if matches!(self.meta.config.evidence, EvidenceKind::Exact) { + self.meta.config.block_bits = block_bits; + } + self.meta.write(&self.root_path)?; + rep.push(t.stop()); + Ok(()) + } +} + +/// Process all layers of one partition's index directory. +fn reindex_partition(index_dir: &Path, target: &EvidenceKind, block_bits: u8) -> OKIResult<()> { + if !index_dir.exists() { + return Ok(()); + } + let pm = PartitionMeta::load(index_dir).map_err(olm_to_oki)?; + for layer_idx in 0..pm.n_layers { + let layer_dir = index_dir.join(format!("layer_{layer_idx}")); + reindex_layer(&layer_dir, target, block_bits)?; + } + Ok(()) +} + +fn reindex_layer(layer_dir: &Path, target: &EvidenceKind, block_bits: u8) -> OKIResult<()> { + Layer::<()>::build_evidence(layer_dir, target, block_bits).map_err(olm_to_oki)?; + remove_stale_evidence(layer_dir, target) +} + +fn remove_stale_evidence(layer_dir: &Path, target: &EvidenceKind) -> OKIResult<()> { + match target { + EvidenceKind::Exact => { + // fingerprint.bin is no longer valid + remove_if_exists(&layer_dir.join(FINGERPRINT_FILE)); + } + EvidenceKind::Approx { .. } => { + // exact bundle is no longer valid + remove_if_exists(&layer_dir.join(EVIDENCE_FILE)); + remove_if_exists(&layer_dir.join(UNITIG_IDX_FILE)); + } + } + Ok(()) +} + +fn remove_if_exists(path: &Path) { + if let Err(e) = fs::remove_file(path) { + if e.kind() != std::io::ErrorKind::NotFound { + eprintln!("warning: could not remove {}: {e}", path.display()); + } + } +} diff --git a/src/obikmer/src/cli.rs b/src/obikmer/src/cli.rs index b20ae6b..83308d1 100644 --- a/src/obikmer/src/cli.rs +++ b/src/obikmer/src/cli.rs @@ -55,6 +55,12 @@ pub fn partitions_to_bits(n: usize) -> usize { n.max(1).next_power_of_two().trailing_zeros() as usize } +/// Convert a block size (number of unitigs per block) to its `block_bits` exponent. +/// `block_size=1` → `block_bits=0` (one entry per unitig, O(1) random access). +pub fn block_size_to_bits(n: usize) -> u8 { + n.max(1).next_power_of_two().trailing_zeros() as u8 +} + impl CommonArgs { pub fn seqfile_paths(&self) -> obiread::PathIter { let paths = self.inputs.iter().map(PathBuf::from).collect(); diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index d5c9999..4b53ea3 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -12,7 +12,7 @@ use obikseq::{set_k, set_m}; use obisys::Reporter; use tracing::info; -use crate::cli::{CommonArgs, partitions_to_bits}; +use crate::cli::{CommonArgs, block_size_to_bits, partitions_to_bits}; use crate::steps::scatter; #[derive(Args)] @@ -68,6 +68,11 @@ pub struct IndexArgs { #[arg(long, default_value = None)] pub fp: Option, + /// Block size for exact evidence `.idx` (number of unitigs per block). + /// Must be a power of two; rounded up if not. Default 1 = O(1) random access. + #[arg(long, default_value_t = 1)] + pub block_size: usize, + #[command(flatten)] pub common: CommonArgs, } @@ -179,12 +184,14 @@ pub fn run(args: IndexArgs) { if effective != args.common.partitions { info!("partitions: {} → {} (next power of 2)", args.common.partitions, effective); } + let block_bits = block_size_to_bits(args.block_size); let config = IndexConfig { kmer_size: args.common.kmer_size, minimizer_size: args.common.minimizer_size, n_bits, with_counts: args.with_counts, evidence: evidence.clone(), + block_bits, }; let genome_info = args.label.as_ref().map(|label| { let mut info = GenomeInfo::new(label.clone()); diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 1645c23..39e9a89 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -6,5 +6,6 @@ pub mod index; pub mod merge; pub mod query; pub mod rebuild; +pub mod reindex; pub mod superkmer; pub mod unitig; diff --git a/src/obikmer/src/cmd/reindex.rs b/src/obikmer/src/cmd/reindex.rs new file mode 100644 index 0000000..5d0152b --- /dev/null +++ b/src/obikmer/src/cmd/reindex.rs @@ -0,0 +1,68 @@ +use std::path::PathBuf; + +use clap::Args; +use obikindex::KmerIndex; +use obilayeredmap::EvidenceKind; +use obisys::Reporter; +use tracing::info; + +use crate::cli::block_size_to_bits; +use super::index::resolve_approx_params; + +#[derive(Args)] +pub struct ReindexArgs { + /// Index directory to convert (modified in-place) + pub index: PathBuf, + + /// Convert to approximate evidence (default: convert to exact). + /// Requires --evidence-bits and/or -z and/or --fp. + #[arg(long, default_value_t = false)] + pub approx: bool, + + /// Findere z parameter (≥1). + #[arg(short = 'z', long, default_value = None)] + pub findere_z: Option, + + /// Fingerprint bits per slot (b). + #[arg(long, default_value = None)] + pub evidence_bits: Option, + + /// Target false-positive rate per z-window. + #[arg(long, default_value = None)] + pub fp: Option, + + /// Block size for exact evidence `.idx` (number of unitigs per block). + /// Ignored when converting to approximate evidence. + #[arg(long, default_value_t = 1)] + pub block_size: usize, +} + +pub fn run(args: ReindexArgs) { + let target = if args.approx { + let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp); + info!("target: approximate evidence — b={b}, z={z}, fp={fp:.2e}"); + EvidenceKind::Approx { b, z } + } else { + info!("target: exact evidence"); + EvidenceKind::Exact + }; + + let mut idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + info!( + "current evidence: {:?}", + idx.meta().config.evidence, + ); + + let block_bits = block_size_to_bits(args.block_size); + let mut rep = Reporter::new(); + idx.reindex(target, block_bits, &mut rep).unwrap_or_else(|e| { + eprintln!("reindex error: {e}"); + std::process::exit(1); + }); + + rep.print(); +} diff --git a/src/obikmer/src/cmd/unitig.rs b/src/obikmer/src/cmd/unitig.rs index ceb71f8..82fe6bc 100644 --- a/src/obikmer/src/cmd/unitig.rs +++ b/src/obikmer/src/cmd/unitig.rs @@ -35,13 +35,13 @@ pub fn run(args: UnitigArgs) { return; } - let reader = UnitigFileReader::open(&path).unwrap_or_else(|e| { + // open_sequential: works with and without .idx (approx or exact index) + let reader = UnitigFileReader::open_sequential(&path).unwrap_or_else(|e| { eprintln!("error opening unitigs (partition {i}): {e}"); std::process::exit(1) }); - for j in 0..reader.len() { - let unitig = reader.unitig(j); + for (j, unitig) in reader.iter_unitigs() { let mut out = stdout.lock().unwrap(); write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| { eprintln!("write error: {e}"); diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 66665f8..e6700e1 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -34,6 +34,8 @@ enum Commands { Unitig(cmd::unitig::UnitigArgs), /// Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing Estimate(cmd::estimate::EstimateArgs), + /// Convert an index's evidence in-place: exact ↔ approx + Reindex(cmd::reindex::ReindexArgs), } fn main() { @@ -65,6 +67,7 @@ fn main() { Commands::Distance(args) => cmd::distance::run(args), Commands::Unitig(args) => cmd::unitig::run(args), Commands::Estimate(args) => cmd::estimate::run(args), + Commands::Reindex(args) => cmd::reindex::run(args), } #[cfg(feature = "profiling")] diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs index e51f7cc..4665197 100644 --- a/src/obikpartitionner/src/index_layer.rs +++ b/src/obikpartitionner/src/index_layer.rs @@ -45,6 +45,7 @@ impl KmerPartition { max_ab: Option, with_counts: bool, evidence: &EvidenceKind, + block_bits: u8, ) -> Result { let part_dir = self.part_dir(i); let dedup_path = part_dir.join("dereplicated.skmer.zst"); @@ -109,7 +110,7 @@ impl KmerPartition { uw.close()?; if with_counts { - Layer::::build(&layer_dir, |kmer| { + Layer::::build(&layer_dir, block_bits, |kmer| { match (&mphf1_opt, &counts1_opt) { (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())), _ => 1, @@ -117,7 +118,7 @@ impl KmerPartition { }) .map_err(olm_to_sk)?; } else { - Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?; + Layer::<()>::build(&layer_dir, block_bits).map_err(olm_to_sk)?; } // For approximate evidence: replace the exact evidence bundle with a diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs index 21785d0..c07de97 100644 --- a/src/obikpartitionner/src/merge_layer.rs +++ b/src/obikpartitionner/src/merge_layer.rs @@ -161,6 +161,7 @@ impl KmerPartition { sources: &[(&KmerPartition, usize)], mode: MergeMode, n_dst_genomes: usize, + block_bits: u8, ) -> SKResult<()> { let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR); if !dst_index_dir.exists() { @@ -216,7 +217,7 @@ impl KmerPartition { uw.write(&unitig)?; } uw.close()?; - Layer::<()>::build(&new_layer_dir).map_err(olm_to_sk)?; + Layer::<()>::build(&new_layer_dir, block_bits).map_err(olm_to_sk)?; } drop(g); diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs index fa581f9..6ba32e4 100644 --- a/src/obikpartitionner/src/rebuild_layer.rs +++ b/src/obikpartitionner/src/rebuild_layer.rs @@ -96,6 +96,7 @@ impl KmerPartition { filters: &[Box], mode: MergeMode, n_genomes: usize, + block_bits: u8, ) -> SKResult<()> { let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR); if !src_index_dir.exists() { @@ -145,7 +146,7 @@ impl KmerPartition { uw.close()?; drop(g); - Layer::<()>::build(&dst_layer_dir).map_err(olm_to_sk)?; + Layer::<()>::build(&dst_layer_dir, block_bits).map_err(olm_to_sk)?; let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?; // ── Prepare matrix builders (one column per genome) ─────────────────── diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index e5f531b..6a4f06f 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -80,39 +80,33 @@ impl Layer { /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and /// `mphf.bin` already present in `layer_dir`. - /// - /// See [`MphfLayer::build_exact_evidence`] for the full contract. - pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult { - MphfLayer::build_exact_evidence(layer_dir) + /// `block_bits` controls the `.idx` block size (2^block_bits chunks/block). + pub fn build_exact_evidence(layer_dir: &Path, block_bits: u8) -> OLMResult { + MphfLayer::build_exact_evidence(layer_dir, block_bits) } /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already /// present in `layer_dir`. `b` — fingerprint bits (1..=64); `z` — Findere /// consecutive k-mer parameter (≥1). - /// - /// See [`MphfLayer::build_approx_evidence`] for the full contract. pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult { MphfLayer::build_approx_evidence(layer_dir, b, z) } - /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on - /// `kind`. - pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind) -> OLMResult { - MphfLayer::build_evidence(layer_dir, kind) + /// Dispatch to `build_exact_evidence` or `build_approx_evidence`. + /// `block_bits` is forwarded to exact evidence only. + pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult { + MphfLayer::build_evidence(layer_dir, kind, block_bits) } } // ── Mode 1 — set membership ─────────────────────────────────────────────────── impl Layer<()> { - pub fn build(out_dir: &Path) -> OLMResult { - MphfLayer::build(out_dir, &mut |_, _| Ok(())) + pub fn build(out_dir: &Path, block_bits: u8) -> OLMResult { + MphfLayer::build(out_dir, block_bits, &mut |_, _| Ok(())) } /// Create a presence matrix for a set-membership layer (first merge). - /// - /// All `n_kmers` slots are set to `true`: every kmer in this layer belongs - /// to genome_0, so genome_0 is present at every slot. pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> { let presence_dir = layer_dir.join(PRESENCE_DIR); fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?; @@ -126,16 +120,20 @@ impl Layer<()> { } } -// ── Mode 2 — count matrix (1 column per layer) ──────────────────────────────── +// ── Mode 2 — count matrix ───────────────────────────────────────────────────── impl Layer { - pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { + pub fn build( + out_dir: &Path, + block_bits: u8, + count_of: impl Fn(CanonicalKmer) -> u32, + ) -> OLMResult { let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers(); let counts_dir = out_dir.join(COUNTS_DIR); let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir) .map_err(OLMError::Io)?; let mut col = mb.add_col().map_err(OLMError::Io)?; - let n_built = MphfLayer::build(out_dir, &mut |slot, kmer| { + let n_built = MphfLayer::build(out_dir, block_bits, &mut |slot, kmer| { col.set(slot, count_of(kmer)); Ok(()) })?; @@ -146,16 +144,16 @@ impl Layer { pub fn build_from_map( out_dir: &Path, + block_bits: u8, counts: &HashMap, ) -> OLMResult { - Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0)) + Self::build(out_dir, block_bits, |kmer| counts.get(&kmer).copied().unwrap_or(0)) } } // ── Mode 2 — count matrix column append ────────────────────────────────────── impl Layer { - /// Append a genome column to an existing count matrix. pub fn append_genome_column( layer_dir: &Path, value_of: impl Fn(usize) -> u32, @@ -165,10 +163,9 @@ impl Layer { } } -// ── Mode 3 — presence/absence matrix (1 column per genome) ─────────────────── +// ── Mode 3 — presence/absence matrix ───────────────────────────────────────── impl Layer { - /// Append a genome column to an existing presence matrix. pub fn append_genome_column( layer_dir: &Path, value_of: impl Fn(usize) -> bool, @@ -179,6 +176,7 @@ impl Layer { pub fn build_presence( out_dir: &Path, + block_bits: u8, n_genomes: usize, present_in: impl Fn(CanonicalKmer, usize) -> bool, ) -> OLMResult { @@ -188,7 +186,7 @@ impl Layer { let mut cols: Vec<_> = (0..n_genomes) .map(|_| mb.add_col().map_err(OLMError::Io)) .collect::>()?; - let n_built = MphfLayer::build(out_dir, &mut |slot, kmer| { + let n_built = MphfLayer::build(out_dir, block_bits, &mut |slot, kmer| { for (g, col) in cols.iter_mut().enumerate() { col.set(slot, present_in(kmer, g)); } diff --git a/src/obilayeredmap/src/map.rs b/src/obilayeredmap/src/map.rs index e9cb591..da01aee 100644 --- a/src/obilayeredmap/src/map.rs +++ b/src/obilayeredmap/src/map.rs @@ -4,7 +4,7 @@ use std::path::{Path, PathBuf}; use obicompactvec::PersistentCompactIntMatrix; use obikseq::CanonicalKmer; -use obiskio::UnitigFileWriter; +use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS}; use crate::error::OLMResult; use crate::layer::{Hit, Layer, LayerData}; @@ -90,7 +90,7 @@ impl LayeredMap<()> { pub fn push_layer(&mut self) -> OLMResult { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - Layer::<()>::build(&dir)?; + Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS)?; self.append_layer()?; Ok(i) } @@ -102,7 +102,7 @@ impl LayeredMap { pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - Layer::::build(&dir, count_of)?; + Layer::::build(&dir, DEFAULT_BLOCK_BITS, count_of)?; self.append_layer()?; Ok(i) } diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index d6b9510..3bac003 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -4,7 +4,7 @@ use std::path::Path; use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; use obikseq::CanonicalKmer; -use obiskio::{UnitigFileReader, UnitigFileWriter, build_unitig_idx, DEFAULT_BLOCK_BITS}; +use obiskio::{UnitigFileReader, UnitigFileWriter, build_unitig_idx}; use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; use crate::error::{OLMError, OLMResult}; @@ -110,25 +110,26 @@ impl MphfLayer { } /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on - /// `kind`. - pub fn build_evidence(dir: &Path, kind: &EvidenceKind) -> OLMResult { + /// `kind`. `block_bits` is forwarded to exact evidence only. + pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult { match kind { - EvidenceKind::Exact => Self::build_exact_evidence(dir), - EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z), + EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits), + EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z), } } /// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`. /// + /// `block_bits` controls the `.idx` block size (2^block_bits chunks per block). /// Uses sequential iteration — no `.idx` required on entry. - pub fn build_exact_evidence(dir: &Path) -> OLMResult { + pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult { let unitig_path = dir.join(UNITIGS_FILE); let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; let n = unitigs.n_kmers(); if n == 0 { fs::File::create(dir.join(EVIDENCE_FILE))?; - build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + build_unitig_idx(&unitig_path, block_bits)?; LayerMeta::exact().save(dir)?; return Ok(0); } @@ -154,7 +155,7 @@ impl MphfLayer { } ev.write(&dir.join(EVIDENCE_FILE))?; - build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + build_unitig_idx(&unitig_path, block_bits)?; LayerMeta::exact().save(dir)?; Ok(n) } @@ -202,13 +203,14 @@ impl MphfLayer { /// population. Returns the number of kmers indexed. pub(crate) fn build( dir: &Path, + block_bits: u8, fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>, ) -> OLMResult { use rayon::prelude::*; let unitig_path = dir.join(UNITIGS_FILE); - build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + build_unitig_idx(&unitig_path, block_bits)?; let unitigs = UnitigFileReader::open(&unitig_path)?; let n = unitigs.n_kmers(); diff --git a/src/obilayeredmap/src/tests/layer.rs b/src/obilayeredmap/src/tests/layer.rs index 47f6e82..2a9c3da 100644 --- a/src/obilayeredmap/src/tests/layer.rs +++ b/src/obilayeredmap/src/tests/layer.rs @@ -1,6 +1,7 @@ use super::*; use obicompactvec::PersistentCompactIntMatrix; use obikseq::{set_k, Kmer, Sequence as _, Unitig}; +use obiskio::DEFAULT_BLOCK_BITS; use tempfile::tempdir; fn write_unitigs(dir: &Path, seqs: &[&[u8]]) { @@ -23,7 +24,7 @@ fn build_and_query_all_kmers_found() { let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); let kmers = all_canonical_kmers(dir.path()); - Layer::<()>::build(dir.path()).unwrap(); + Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS).unwrap(); let layer = Layer::<()>::open(dir.path()).unwrap(); for kmer in kmers { assert!(layer.query(kmer).is_some(), "kmer should be present"); @@ -40,6 +41,7 @@ fn counts_are_stored_and_retrieved() { kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect(); Layer::::build( dir.path(), + DEFAULT_BLOCK_BITS, |kmer| count_map.get(&kmer).copied().unwrap_or(0), ).unwrap(); let layer = Layer::::open(dir.path()).unwrap(); @@ -54,7 +56,7 @@ fn query_absent_returns_none() { set_k(4); let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); - Layer::<()>::build(dir.path()).unwrap(); + Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS).unwrap(); let layer = Layer::<()>::open(dir.path()).unwrap(); let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical(); assert!(layer.query(absent).is_none()); @@ -65,7 +67,7 @@ fn open_after_build_is_consistent() { set_k(4); let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); - let n = Layer::::build(dir.path(), |_| 7).unwrap(); + let n = Layer::::build(dir.path(), DEFAULT_BLOCK_BITS, |_| 7).unwrap(); assert_eq!(n, 4); let layer = Layer::::open(dir.path()).unwrap(); let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical(); diff --git a/src/obiskio/src/unitig_index.rs b/src/obiskio/src/unitig_index.rs index 75535ae..eb3fa40 100644 --- a/src/obiskio/src/unitig_index.rs +++ b/src/obiskio/src/unitig_index.rs @@ -21,7 +21,7 @@ use crate::error::{SKError, SKResult}; const MAGIC: [u8; 4] = *b"UIX3"; /// Default block granularity used by [`UnitigFileWriter::create`]. -pub const DEFAULT_BLOCK_BITS: u8 = 6; +pub const DEFAULT_BLOCK_BITS: u8 = 0; fn idx_path(path: &Path) -> PathBuf { crate::append_path_suffix(path, ".idx") @@ -325,6 +325,11 @@ impl UnitigFileReader { }) } + /// Iterate all unitigs sequentially. Works without `.idx` (sequential open). + pub fn iter_unitigs(&self) -> impl Iterator + '_ { + self.iter_chunks_sequential() + } + pub fn iter_kmers(&self) -> impl Iterator + '_ { self.iter_chunks_sequential() .flat_map(|(_, u)| u.into_kmers())