From 7501b6e8549d5b00e82d9ccacc79f16710559212 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 10:04:25 +0200 Subject: [PATCH 1/5] refactor: switch indexing to IndexMode and update metadata Replace EvidenceKind with IndexMode (Exact, Approx, Hybrid) across layer construction and query dispatch. Update PartitionMeta and LayerMeta serialization to centralize index-wide configuration. Add flexible push_layer overloads to LayeredMap for dynamic index expansion without full rebuilds. Improve UnitigFileReader to gracefully fallback to sequential scanning when indexes are missing, eliminating panics. --- src/obikpartitionner/src/index_layer.rs | 12 +- src/obikpartitionner/src/merge_layer.rs | 21 +- src/obikpartitionner/src/rebuild_layer.rs | 2 +- src/obilayeredmap/src/layer.rs | 27 +- src/obilayeredmap/src/lib.rs | 2 +- src/obilayeredmap/src/map.rs | 53 ++-- src/obilayeredmap/src/meta.rs | 60 ++--- src/obilayeredmap/src/mphf_layer.rs | 291 ++++++++++------------ src/obiskio/src/unitig_index.rs | 131 ++++++---- 9 files changed, 284 insertions(+), 315 deletions(-) diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs index 39069aa..022ba95 100644 --- a/src/obikpartitionner/src/index_layer.rs +++ b/src/obikpartitionner/src/index_layer.rs @@ -5,7 +5,7 @@ use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec}; use obidebruinj::GraphDeBruijn; -use obilayeredmap::{EvidenceKind, OLMError, layer::Layer}; +use obilayeredmap::{IndexMode, OLMError, layer::Layer}; use obilayeredmap::meta::PartitionMeta; use obiskio::{SKError, SKFileMeta, SKFileReader}; use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64}; @@ -44,7 +44,7 @@ impl KmerPartition { min_ab: u32, max_ab: Option, with_counts: bool, - evidence: &EvidenceKind, + mode: &IndexMode, block_bits: u8, ) -> Result { let part_dir = self.part_dir(i); @@ -110,7 +110,7 @@ impl KmerPartition { uw.close()?; if with_counts { - Layer::::build(&layer_dir, block_bits, evidence, |kmer| { + Layer::::build(&layer_dir, block_bits, mode, |kmer| { match (&mphf1_opt, &counts1_opt) { (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())), _ => 1, @@ -118,13 +118,11 @@ impl KmerPartition { }) .map_err(olm_to_sk)?; } else { - Layer::<()>::build(&layer_dir, block_bits, evidence).map_err(olm_to_sk)?; + Layer::<()>::build(&layer_dir, block_bits, mode).map_err(olm_to_sk)?; } - // Write meta.json in the index/ directory so LayeredMap::open works - // (e.g. for subsequent merge operations). let index_dir = layer_dir.parent().expect("layer_dir has a parent"); - PartitionMeta { n_layers: 1 }.save(index_dir).map_err(olm_to_sk)?; + PartitionMeta { n_layers: 1, mode: mode.clone() }.save(index_dir).map_err(olm_to_sk)?; Ok(n_kmers) } diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs index fa6b1be..6107f0b 100644 --- a/src/obikpartitionner/src/merge_layer.rs +++ b/src/obikpartitionner/src/merge_layer.rs @@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntVecBuilder}; use obikseq::CanonicalKmer; use obiskio::{SKError, SKResult, UnitigFileReader}; -use obilayeredmap::{EvidenceKind, Layer, LayeredMap, MphfLayer, OLMError}; +use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError}; use obilayeredmap::meta::PartitionMeta; use crate::partition::KmerPartition; @@ -52,18 +52,17 @@ pub(crate) enum SrcLayerData { } impl SrcLayerData { - pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult { + pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult { let presence_dir = layer_dir.join("presence"); let counts_dir = layer_dir.join("counts"); - match mode { + match merge_mode { MergeMode::Presence => { if presence_dir.exists() { - let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Presence(mphf, mat)) } else if counts_dir.exists() { - // Source is a count index; treat count > 0 as present via ColBuilder::Bit. - let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Count(mphf, mat)) } else { @@ -72,7 +71,7 @@ impl SrcLayerData { } MergeMode::Count => { if counts_dir.exists() { - let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Count(mphf, mat)) } else { @@ -116,7 +115,7 @@ fn load_meta(dir: &Path) -> SKResult { Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => { let mut n = 0usize; while dir.join(format!("layer_{n}")).exists() { n += 1; } - let m = PartitionMeta { n_layers: n }; + let m = PartitionMeta { n_layers: n, mode: IndexMode::default() }; m.save(dir).map_err(olm_to_sk)?; Ok(m) } @@ -217,12 +216,12 @@ impl KmerPartition { uw.write(&unitig)?; } uw.close()?; - Layer::<()>::build(&new_layer_dir, block_bits, &EvidenceKind::Exact).map_err(olm_to_sk)?; + Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?; } drop(g); let new_mphf = if any_new { - Some(MphfLayer::open(&new_layer_dir).map_err(olm_to_sk)?) + Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?) } else { None }; @@ -304,7 +303,7 @@ impl KmerPartition { for l in 0..src_meta.n_layers { let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?; - let src_data = SrcLayerData::open(&src_layer_dir, mode)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { let values = src_data.lookup(kmer, *src_n); diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs index 29ca5d5..86ee3f5 100644 --- a/src/obikpartitionner/src/rebuild_layer.rs +++ b/src/obikpartitionner/src/rebuild_layer.rs @@ -8,7 +8,7 @@ use obicompactvec::{PersistentBitMatrixBuilder, PersistentCompactIntVecBuilder}; use obidebruinj::GraphDeBruijn; use obiskio::{SKError, SKResult, UnitigFileReader}; -use obilayeredmap::{EvidenceKind, Layer, MphfLayer, OLMError}; +use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError}; use obilayeredmap::meta::PartitionMeta; use crate::filter::KmerFilter; diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index cca2a4f..63b99d2 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -10,7 +10,7 @@ use obikseq::CanonicalKmer; use obiskio::{UnitigFileReader, UnitigFileWriter}; use crate::error::{OLMError, OLMResult}; -use crate::meta::EvidenceKind; +use crate::meta::IndexMode; use crate::mphf_layer::MphfLayer; pub(crate) use crate::mphf_layer::UNITIGS_FILE; @@ -62,8 +62,8 @@ pub struct Hit { // ── Common read path ────────────────────────────────────────────────────────── impl Layer { - pub fn open(path: &Path) -> OLMResult { - let mphf = MphfLayer::open(path)?; + pub fn open(path: &Path, mode: &IndexMode) -> OLMResult { + let mphf = MphfLayer::open(path, mode)?; let data = D::open(path)?; Ok(Self { mphf, data }) } @@ -92,18 +92,13 @@ impl Layer { MphfLayer::build_approx_evidence(layer_dir, b, z) } - /// Dispatch to `build_exact_evidence` or `build_approx_evidence`. - /// `block_bits` is forwarded to exact evidence only. - pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult { - MphfLayer::build_evidence(layer_dir, kind, block_bits) - } } // ── Mode 1 — set membership ─────────────────────────────────────────────────── impl Layer<()> { - pub fn build(out_dir: &Path, block_bits: u8, evidence_kind: &EvidenceKind) -> OLMResult { - MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |_, _| Ok(())) + pub fn build(out_dir: &Path, block_bits: u8, mode: &IndexMode) -> OLMResult { + MphfLayer::build(out_dir, block_bits, mode, &mut |_, _| Ok(())) } /// Create a presence matrix for a set-membership layer (first merge). @@ -126,7 +121,7 @@ impl Layer { pub fn build( out_dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, count_of: impl Fn(CanonicalKmer) -> u32, ) -> OLMResult { let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers(); @@ -134,7 +129,7 @@ impl Layer { let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir) .map_err(OLMError::Io)?; let mut col = mb.add_col().map_err(OLMError::Io)?; - let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| { + let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| { col.set(slot, count_of(kmer)); Ok(()) })?; @@ -146,10 +141,10 @@ impl Layer { pub fn build_from_map( out_dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, counts: &HashMap, ) -> OLMResult { - Self::build(out_dir, block_bits, evidence_kind, |kmer| counts.get(&kmer).copied().unwrap_or(0)) + Self::build(out_dir, block_bits, mode, |kmer| counts.get(&kmer).copied().unwrap_or(0)) } } @@ -179,7 +174,7 @@ impl Layer { pub fn build_presence( out_dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, n_genomes: usize, present_in: impl Fn(CanonicalKmer, usize) -> bool, ) -> OLMResult { @@ -189,7 +184,7 @@ impl Layer { let mut cols: Vec<_> = (0..n_genomes) .map(|_| mb.add_col().map_err(OLMError::Io)) .collect::>()?; - let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| { + let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| { for (g, col) in cols.iter_mut().enumerate() { col.set(slot, present_in(kmer, g)); } diff --git a/src/obilayeredmap/src/lib.rs b/src/obilayeredmap/src/lib.rs index 98ca3c8..9b275a0 100644 --- a/src/obilayeredmap/src/lib.rs +++ b/src/obilayeredmap/src/lib.rs @@ -11,5 +11,5 @@ pub use error::{OLMError, OLMResult}; pub use layer::{Hit, Layer, LayerData}; pub use layered_store::LayeredStore; pub use map::LayeredMap; -pub use meta::{EvidenceKind, LayerMeta}; +pub use meta::{IndexMode, PartitionMeta}; pub use mphf_layer::MphfLayer; diff --git a/src/obilayeredmap/src/map.rs b/src/obilayeredmap/src/map.rs index 391ca99..18d3c55 100644 --- a/src/obilayeredmap/src/map.rs +++ b/src/obilayeredmap/src/map.rs @@ -5,11 +5,10 @@ use std::path::{Path, PathBuf}; use obicompactvec::PersistentCompactIntMatrix; use obikseq::CanonicalKmer; use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS}; -use crate::meta::EvidenceKind; use crate::error::OLMResult; use crate::layer::{Hit, Layer, LayerData}; -use crate::meta::PartitionMeta; +use crate::meta::{IndexMode, PartitionMeta}; /// Layered kmer index for a single partition. /// @@ -17,8 +16,8 @@ use crate::meta::PartitionMeta; /// the first match wins. Adding a dataset appends a new layer without /// rebuilding existing ones. pub struct LayeredMap { - root: PathBuf, - meta: PartitionMeta, + root: PathBuf, + meta: PartitionMeta, layers: Vec>, } @@ -26,39 +25,26 @@ pub struct LayeredMap { impl LayeredMap { /// Open an existing layered index at `root`. + /// The mode is read once from `PartitionMeta` and applied to all layers. pub fn open(root: &Path) -> OLMResult { let meta = PartitionMeta::load(root)?; let layers = (0..meta.n_layers) - .map(|i| Layer::::open(&layer_dir(root, i))) + .map(|i| Layer::::open(&layer_dir(root, i), &meta.mode)) .collect::>>()?; - Ok(Self { - root: root.to_owned(), - meta, - layers, - }) + Ok(Self { root: root.to_owned(), meta, layers }) } - /// Create a new, empty layered index at `root`. - pub fn create(root: &Path) -> OLMResult { + /// Create a new, empty layered index at `root` with the given mode. + pub fn create(root: &Path, mode: IndexMode) -> OLMResult { fs::create_dir_all(root)?; - let meta = PartitionMeta::new(); + let meta = PartitionMeta::new(mode); meta.save(root)?; - Ok(Self { - root: root.to_owned(), - meta, - layers: Vec::new(), - }) + Ok(Self { root: root.to_owned(), meta, layers: Vec::new() }) } - /// Return the number of layers in this index. - pub fn n_layers(&self) -> usize { - self.layers.len() - } - - /// Return a reference to the `i`-th layer. - pub fn layer(&self, i: usize) -> &Layer { - &self.layers[i] - } + pub fn n_layers(&self) -> usize { self.layers.len() } + pub fn layer(&self, i: usize) -> &Layer { &self.layers[i] } + pub fn mode(&self) -> &IndexMode { &self.meta.mode } /// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match. pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit)> { @@ -68,17 +54,15 @@ impl LayeredMap { .find_map(|(i, layer)| layer.query(kmer).map(|hit| (i, hit))) } - /// Return a `UnitigFileWriter` for the next layer to be built. pub fn next_layer_writer(&self) -> OLMResult { let dir = layer_dir(&self.root, self.layers.len()); Layer::::unitig_writer(&dir) } - /// Append a new layer to the index. fn append_layer(&mut self) -> OLMResult<()> { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - self.layers.push(Layer::::open(&dir)?); + self.layers.push(Layer::::open(&dir, &self.meta.mode)?); self.meta.n_layers = self.layers.len(); self.meta.save(&self.root)?; Ok(()) @@ -91,7 +75,7 @@ impl LayeredMap<()> { pub fn push_layer(&mut self) -> OLMResult { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact)?; + Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode)?; self.append_layer()?; Ok(i) } @@ -103,15 +87,12 @@ impl LayeredMap { pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - Layer::::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact, count_of)?; + Layer::::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode, count_of)?; self.append_layer()?; Ok(i) } - pub fn push_layer_from_map( - &mut self, - counts: &HashMap, - ) -> OLMResult { + pub fn push_layer_from_map(&mut self, counts: &HashMap) -> OLMResult { self.push_layer(|kmer| counts.get(&kmer).copied().unwrap_or(0)) } } diff --git a/src/obilayeredmap/src/meta.rs b/src/obilayeredmap/src/meta.rs index 352cbdd..ed567d3 100644 --- a/src/obilayeredmap/src/meta.rs +++ b/src/obilayeredmap/src/meta.rs @@ -5,65 +5,45 @@ use serde::{Deserialize, Serialize}; use crate::error::OLMResult; -const META_FILE: &str = "meta.json"; -const LAYER_META_FILE: &str = "layer_meta.json"; +const META_FILE: &str = "meta.json"; -// ── Layer-level metadata ────────────────────────────────────────────────────── +// ── IndexMode ───────────────────────────────────────────────────────────────── -/// Describes the evidence bundle stored alongside the MPHF for one layer. +/// Evidence mode for an entire partitioned index — homogeneous across all layers. +/// +/// Determined once at build time; stored in `PartitionMeta` (`meta.json`). +/// All layers within an index share the same mode. #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] -pub enum EvidenceKind { +pub enum IndexMode { /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives. Exact, /// Approximate evidence: `fingerprint.bin` only. - /// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b. - /// `z` — consecutive k-mers that must all match (Findere trick); - /// effective FP rate per sequencing read ≈ W / 2^(b·z) - /// where W = L - k - z + 2 is the number of windows in a read of length L. + /// `b` — fingerprint bits per slot; false-positive rate ≈ 1/2^b per query. + /// `z` — Findere consecutive-kmer parameter (build-time only; not used at query time). Approx { b: u8, z: u8 }, + /// Hybrid: both `fingerprint.bin` and `evidence.bin` + `unitigs.bin.idx`. + /// `find()` uses the fingerprint (O(1), approx); `find_strict()` uses exact evidence. + Hybrid { b: u8, z: u8 }, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerMeta { - pub evidence: EvidenceKind, -} - -impl Default for EvidenceKind { +impl Default for IndexMode { fn default() -> Self { Self::Exact } } -impl LayerMeta { - pub fn exact() -> Self { - Self { evidence: EvidenceKind::Exact } - } - - pub fn approx(b: u8, z: u8) -> Self { - Self { evidence: EvidenceKind::Approx { b, z } } - } - - pub fn load(layer_dir: &Path) -> OLMResult { - let f = File::open(layer_dir.join(LAYER_META_FILE))?; - Ok(serde_json::from_reader(f)?) - } - - pub fn save(&self, layer_dir: &Path) -> OLMResult<()> { - let f = File::create(layer_dir.join(LAYER_META_FILE))?; - serde_json::to_writer_pretty(f, self)?; - Ok(()) - } -} - -// ── Partition-level metadata ────────────────────────────────────────────────── +// ── PartitionMeta ───────────────────────────────────────────────────────────── +/// Index-level metadata stored in `meta.json` at the root of a partition index. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PartitionMeta { pub n_layers: usize, + #[serde(default)] + pub mode: IndexMode, } impl PartitionMeta { - pub fn new() -> Self { - Self { n_layers: 0 } + pub fn new(mode: IndexMode) -> Self { + Self { n_layers: 0, mode } } pub fn load(dir: &Path) -> OLMResult { @@ -79,5 +59,5 @@ impl PartitionMeta { } impl Default for PartitionMeta { - fn default() -> Self { Self::new() } + fn default() -> Self { Self::new(IndexMode::Exact) } } diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index c94ce69..877e564 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -1,5 +1,5 @@ use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; @@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; use crate::error::{OLMError, OLMResult}; use crate::evidence::{Evidence, EvidenceWriter}; use crate::fingerprint::{FingerprintVec, FingerprintVecWriter}; -use crate::meta::{EvidenceKind, LayerMeta}; +use crate::meta::IndexMode; pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; @@ -19,19 +19,22 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin"; pub(crate) type Mphf = PtrHash>, Xx64, Vec>; -// ── Evidence store ──────────────────────────────────────────────────────────── +// ── LayerEvidence ───────────────────────────────────────────────────────────── enum LayerEvidence { - Exact { evidence: Evidence, unitigs: UnitigFileReader }, - Approx { fingerprint: FingerprintVec }, + Exact { evidence: Evidence, unitigs: UnitigFileReader }, + Approx { fingerprint: FingerprintVec, unitigs_path: PathBuf }, + Hybrid { evidence: Evidence, unitigs: UnitigFileReader, fingerprint: FingerprintVec }, } // ── MphfLayer ───────────────────────────────────────────────────────────────── /// Autonomous kmer → slot mapping for one layer. /// -/// Dispatches queries to exact or approximate evidence transparently based on -/// the `layer_meta.json` written at build time. +/// Two query methods: +/// - [`find`](Self::find) — O(1), uses fingerprint (Approx/Hybrid) or exact evidence (Exact). +/// - [`find_strict`](Self::find_strict) — always exact; O(1) on Exact/Hybrid layers, +/// O(n) sequential scan on Approx layers. pub struct MphfLayer { mphf: Mphf, ev: LayerEvidence, @@ -39,21 +42,31 @@ pub struct MphfLayer { } impl MphfLayer { - pub fn open(dir: &Path) -> OLMResult { - let meta = LayerMeta::load(dir)?; + /// Open a layer using the index-level `mode` determined at `LayeredMap` open time. + /// No per-layer metadata file is read. + pub fn open(dir: &Path, mode: &IndexMode) -> OLMResult { let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE)) .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - let (ev, n) = match meta.evidence { - EvidenceKind::Exact => { + let (ev, n) = match mode { + IndexMode::Exact => { let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?; let n = evidence.len(); + // open() auto-detects: uses direct access since exact layers always have .idx let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; (LayerEvidence::Exact { evidence, unitigs }, n) } - EvidenceKind::Approx { .. } => { + IndexMode::Approx { .. } => { let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?; let n = fingerprint.n(); - (LayerEvidence::Approx { fingerprint }, n) + let unitigs_path = dir.join(UNITIGS_FILE); + (LayerEvidence::Approx { fingerprint, unitigs_path }, n) + } + IndexMode::Hybrid { .. } => { + let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?; + let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?; + let n = evidence.len(); + let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; + (LayerEvidence::Hybrid { evidence, unitigs, fingerprint }, n) } }; Ok(Self { mphf, ev, n }) @@ -61,45 +74,60 @@ impl MphfLayer { // ── Query API ───────────────────────────────────────────────────────────── - /// Transparent dispatch: routes to `find_exact` or `find_approx` based on - /// the evidence loaded at `open` time. + /// O(1) lookup — dispatches automatically: + /// - Exact: evidence + `verify_canonical_kmer`, zero false positives. + /// - Approx: fingerprint check, false-positive rate ≈ 1/2^b. + /// - Hybrid: fingerprint check (fast path), zero false positives via `find_strict`. #[inline] pub fn find(&self, kmer: CanonicalKmer) -> Option { + let slot = self.mphf.index(&kmer.raw()); + if slot >= self.n { return None; } match &self.ev { - LayerEvidence::Exact { .. } => self.find_exact(kmer), - LayerEvidence::Approx { .. } => self.find_approx(kmer), + LayerEvidence::Exact { evidence, unitigs } => { + let (chunk_id, rank) = evidence.decode(slot); + if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { + Some(slot) + } else { + None + } + } + LayerEvidence::Approx { fingerprint, .. } | + LayerEvidence::Hybrid { fingerprint, .. } => { + if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None } + } } } - /// Exact lookup: zero false positives. Panics if the layer was opened with - /// approximate evidence. - #[inline] - pub fn find_exact(&self, kmer: CanonicalKmer) -> Option { - let LayerEvidence::Exact { evidence, unitigs } = &self.ev else { - panic!("find_exact called on an approximate layer"); - }; + /// Always-exact lookup — zero false positives regardless of mode. + /// + /// - Exact/Hybrid: O(1) via evidence + `verify_canonical_kmer`. + /// - Approx: O(n) sequential scan of `unitigs.bin` to confirm the kmer + /// that owns the slot, then exact comparison. + pub fn find_strict(&self, kmer: CanonicalKmer) -> Option { let slot = self.mphf.index(&kmer.raw()); if slot >= self.n { return None; } - let (chunk_id, rank) = evidence.decode(slot); - if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { - Some(slot) - } else { - None + match &self.ev { + LayerEvidence::Exact { evidence, unitigs } | + LayerEvidence::Hybrid { evidence, unitigs, .. } => { + let (chunk_id, rank) = evidence.decode(slot); + if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { + Some(slot) + } else { + None + } + } + LayerEvidence::Approx { unitigs_path, .. } => { + let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?; + for stored in reader.iter_canonical_kmers() { + if self.mphf.index(&stored.raw()) == slot { + return if stored == kmer { Some(slot) } else { None }; + } + } + None + } } } - /// Approximate lookup: false-positive rate 1/2^b per k-mer query. Panics - /// if the layer was opened with exact evidence. - #[inline] - pub fn find_approx(&self, kmer: CanonicalKmer) -> Option { - let LayerEvidence::Approx { fingerprint } = &self.ev else { - panic!("find_approx called on an exact layer"); - }; - let slot = self.mphf.index(&kmer.raw()); - if slot >= self.n { return None; } - if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None } - } - pub fn n(&self) -> usize { self.n } // ── Build helpers ───────────────────────────────────────────────────────── @@ -109,19 +137,7 @@ impl MphfLayer { Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?) } - /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on - /// `kind`. `block_bits` is forwarded to exact evidence only. - pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult { - match kind { - EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits), - EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z), - } - } - /// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`. - /// - /// `block_bits` controls the `.idx` block size (2^block_bits chunks per block). - /// Uses sequential iteration — no `.idx` required on entry. pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult { let unitig_path = dir.join(UNITIGS_FILE); let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; @@ -130,7 +146,6 @@ impl MphfLayer { if n == 0 { fs::File::create(dir.join(EVIDENCE_FILE))?; build_unitig_idx(&unitig_path, block_bits)?; - LayerMeta::exact().save(dir)?; return Ok(0); } @@ -156,13 +171,10 @@ impl MphfLayer { ev.write(&dir.join(EVIDENCE_FILE))?; build_unitig_idx(&unitig_path, block_bits)?; - LayerMeta::exact().save(dir)?; Ok(n) } /// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`. - /// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer - /// parameter (≥1). No `.idx` is written. pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult { if b == 0 || b > 64 { return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into())); @@ -176,7 +188,6 @@ impl MphfLayer { if n == 0 { FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?; - LayerMeta::approx(b, z).save(dir)?; return Ok(0); } @@ -194,139 +205,113 @@ impl MphfLayer { } fw.write(&dir.join(FINGERPRINT_FILE))?; - LayerMeta::approx(b, z).save(dir)?; Ok(n) } - /// Build MPHF then evidence from the unitigs file already present in `dir`. + /// Build MPHF + evidence from `unitigs.bin` already present in `dir`. /// - /// - Exact: `.idx` is built for pass-1 parallel construction and kept for - /// query-time kmer verification. `evidence.bin` is written. - /// - Approx: pass-1 uses `open_sequential` + `par_bridge` — no `.idx` is - /// ever created. `fingerprint.bin` is written. - /// - /// `fill_slot(slot, kmer)` is called once per kmer in both modes. + /// `fill_slot(slot, kmer)` is called once per kmer in all modes. + /// No `layer_meta.json` is written — the mode is an index-level property + /// stored in `PartitionMeta`. pub(crate) fn build( dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>, ) -> OLMResult { use rayon::prelude::*; let unitig_path = dir.join(UNITIGS_FILE); + let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers(); - match evidence_kind { - // ── Exact path ──────────────────────────────────────────────────── - // .idx is built LAST, once evidence.bin is written, so it is never - // present during construction — only at query time. - EvidenceKind::Exact => { - let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers(); - let keys = CanonicalKmerIter::new(&unitig_path) - .map_err(|e| match e { - obiskio::SKError::Io(io) => OLMError::Io(io), - e => OLMError::InvalidLayer(e.to_string()), - })?; + let sk_to_olm = |e: obiskio::SKError| match e { + obiskio::SKError::Io(io) => OLMError::Io(io), + e => OLMError::InvalidLayer(e.to_string()), + }; - if n == 0 { + // ── Empty layer ─────────────────────────────────────────────────────── + if n == 0 { + let mphf: Mphf = + Mphf::try_new(&[] as &[u64], PtrHashParams::::default()) + .ok_or_else(|| OLMError::Mphf("construction failed".into()))?; + mphf.store(&dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + match mode { + IndexMode::Exact | IndexMode::Hybrid { .. } => { fs::File::create(dir.join(EVIDENCE_FILE))?; - let mphf: Mphf = - Mphf::try_new(&[] as &[u64], PtrHashParams::::default()) - .ok_or_else(|| OLMError::Mphf("construction failed".into()))?; - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - LayerMeta::exact().save(dir)?; build_unitig_idx(&unitig_path, block_bits)?; - return Ok(0); } + IndexMode::Approx { b, .. } => { + FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?; + } + } + if let IndexMode::Hybrid { b, .. } = mode { + FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?; + } + return Ok(0); + } - // Pass 1 — MPHF construction via clonable mmap iterator - let mphf: Mphf = - Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::::default()); - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + // ── Pass 1: MPHF via clonable mmap iterator ─────────────────────────── + let keys = CanonicalKmerIter::new(&unitig_path).map_err(sk_to_olm)?; + let mphf: Mphf = + Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), + PtrHashParams::::default()); + mphf.store(&dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - // Pass 2 — sequential: fill evidence.bin + callback - let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?; - let mut ev = EvidenceWriter::new(n); - let mut seen = vec![0u8; (n + 7) / 8]; + // ── Pass 2: fill evidence files + callback ──────────────────────────── + let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?; + let mut seen = vec![0u8; (n + 7) / 8]; + match mode { + IndexMode::Exact => { + let mut ev = EvidenceWriter::new(n); for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() { let slot = mphf.index(&kmer.raw()); - if slot >= n { - return Err(OLMError::Mphf("slot out of bounds".into())); - } - let byte = slot / 8; - let bit = 1u8 << (slot % 8); - if seen[byte] & bit != 0 { - return Err(OLMError::Mphf("duplicate slot".into())); - } + if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); } + let byte = slot / 8; let bit = 1u8 << (slot % 8); + if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); } seen[byte] |= bit; ev.set(slot, chunk_id as u32, rank as u8); fill_slot(slot, kmer)?; } - ev.write(&dir.join(EVIDENCE_FILE))?; - LayerMeta::exact().save(dir)?; - // .idx built last: strictly for query-time kmer verification build_unitig_idx(&unitig_path, block_bits)?; - Ok(n) } - // ── Approx path ─────────────────────────────────────────────────── - // No .idx is created at any point. - EvidenceKind::Approx { b, z } => { - let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; - let n = unitigs.n_kmers(); - - if n == 0 { - FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?; - let mphf: Mphf = - Mphf::try_new(&[] as &[u64], PtrHashParams::::default()) - .ok_or_else(|| OLMError::Mphf("construction failed".into()))?; - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - LayerMeta::approx(*b, *z).save(dir)?; - return Ok(0); - } - - // Pass 1 — MPHF construction via mmap-backed clonable iterator. - // No .idx is created. par_bridge() parallelises the sequential scan; - // Clone on CanonicalKmerRawIter shares the Arc and resets to pos 0. - let keys = CanonicalKmerIter::new(&unitig_path) - .map_err(|e| match e { - obiskio::SKError::Io(io) => OLMError::Io(io), - e => OLMError::InvalidLayer(e.to_string()), - })?; - let mphf: Mphf = - Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::::default()); - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - - // Pass 2 — sequential: fill fingerprint.bin + callback - let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?; - let mut fw = FingerprintVecWriter::new(n, *b); - let mut seen = vec![0u8; (n + 7) / 8]; - + IndexMode::Approx { b, .. } => { + let mut fw = FingerprintVecWriter::new(n, *b); for kmer in unitigs2.iter_canonical_kmers() { let slot = mphf.index(&kmer.raw()); - if slot >= n { - return Err(OLMError::Mphf("slot out of bounds".into())); - } - let byte = slot / 8; - let bit = 1u8 << (slot % 8); - if seen[byte] & bit != 0 { - return Err(OLMError::Mphf("duplicate slot".into())); - } + if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); } + let byte = slot / 8; let bit = 1u8 << (slot % 8); + if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); } seen[byte] |= bit; fw.set(slot, kmer.seq_hash()); fill_slot(slot, kmer)?; } - fw.write(&dir.join(FINGERPRINT_FILE))?; - LayerMeta::approx(*b, *z).save(dir)?; - Ok(n) + } + + IndexMode::Hybrid { b, .. } => { + let mut ev = EvidenceWriter::new(n); + let mut fw = FingerprintVecWriter::new(n, *b); + for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() { + let slot = mphf.index(&kmer.raw()); + if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); } + let byte = slot / 8; let bit = 1u8 << (slot % 8); + if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); } + seen[byte] |= bit; + ev.set(slot, chunk_id as u32, rank as u8); + fw.set(slot, kmer.seq_hash()); + fill_slot(slot, kmer)?; + } + ev.write(&dir.join(EVIDENCE_FILE))?; + fw.write(&dir.join(FINGERPRINT_FILE))?; + build_unitig_idx(&unitig_path, block_bits)?; } } + + Ok(n) } } diff --git a/src/obiskio/src/unitig_index.rs b/src/obiskio/src/unitig_index.rs index f5bf310..fd46892 100644 --- a/src/obiskio/src/unitig_index.rs +++ b/src/obiskio/src/unitig_index.rs @@ -198,11 +198,16 @@ pub fn build_unitig_idx(unitigs_path: &Path, block_bits: u8) -> SKResult<()> { // ── Reader ──────────────────────────────────────────────────────────────────── -/// Read-only random-access view of a unitig file. +/// Memory-mapped view of a unitig file, with optional direct-access index. /// -/// The sequence file is memory-mapped; the block offset table is loaded into RAM -/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap -/// reads. Sequential iteration: O(n) via a running-offset cursor. +/// Three constructors select the operating mode: +/// - [`open`](Self::open) — smart default: direct access if `.idx` exists, sequential otherwise. +/// - [`open_sequential`](Self::open_sequential) — always sequential, ignores `.idx`. +/// - [`open_direct_access`](Self::open_direct_access) — requires `.idx`, errors if absent. +/// +/// All positional methods (`chunk_start`, `verify_canonical_kmer`, …) work in +/// both modes. Without `.idx` they fall back to an O(i) sequential scan — +/// correct but slower. pub struct UnitigFileReader { mmap: Mmap, block_offsets: Vec, @@ -214,8 +219,52 @@ pub struct UnitigFileReader { } impl UnitigFileReader { - /// Open with `.idx` — enables both sequential iteration and random access. + /// Smart default: opens with direct access if `.idx` is present, sequential otherwise. pub fn open(path: &Path) -> SKResult { + if idx_path(path).exists() { + Self::open_direct_access(path) + } else { + Self::open_sequential(path) + } + } + + /// Always sequential — never reads `.idx` even if present. + /// + /// Scans the binary file once to count chunks and k-mers. + /// Positional access (`chunk_start`, `verify_canonical_kmer`) falls back to + /// O(i) sequential scan. + pub fn open_sequential(path: &Path) -> SKResult { + let file = File::open(path).map_err(SKError::Io)?; + let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; + let k = obikseq::params::k(); + + let mut offset = 0usize; + let mut n_unitigs = 0usize; + let mut n_kmers = 0usize; + while offset < mmap.len() { + let seql_minus_k = mmap[offset] as usize; + n_kmers += seql_minus_k + 1; + offset += 1 + (seql_minus_k + k + 3) / 4; + n_unitigs += 1; + } + + Ok(Self { + mmap, + block_offsets: Vec::new(), + n_unitigs, + n_kmers, + k, + block_bits: DEFAULT_BLOCK_BITS, + mask: (1usize << DEFAULT_BLOCK_BITS) - 1, + }) + } + + /// Requires `.idx` — errors if the companion index file is absent. + /// + /// Enables O(1 << block_bits) positional access to any chunk. + /// Use only when direct access is architecturally required (query-time + /// verification on an exact-evidence layer). + pub fn open_direct_access(path: &Path) -> SKResult { let file = File::open(path).map_err(SKError::Io)?; let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?; @@ -231,58 +280,38 @@ impl UnitigFileReader { }) } - /// Open without `.idx` — sequential iteration only, no random access. - /// - /// Scans the binary file once to count chunks and k-mers. Use when only - /// [`Self::iter_kmers`], [`Self::iter_canonical_kmers`], or - /// [`Self::iter_indexed_canonical_kmers`] are needed. - pub fn open_sequential(path: &Path) -> SKResult { - let file = File::open(path).map_err(SKError::Io)?; - let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; - let k = obikseq::params::k(); - - let mut offset = 0usize; - let mut n_unitigs = 0usize; - let mut n_kmers = 0usize; - while offset < mmap.len() { - let seql_minus_k = mmap[offset] as usize; - n_kmers += seql_minus_k + 1; - offset += 1 + (seql_minus_k + k + 3) / 4; - n_unitigs += 1; - } - - Ok(Self { - mmap, - block_offsets: Vec::new(), // empty → random access disabled - n_unitigs, - n_kmers, - k, - block_bits: DEFAULT_BLOCK_BITS, - mask: (1usize << DEFAULT_BLOCK_BITS) - 1, - }) - } - pub fn len(&self) -> usize { self.n_unitigs } pub fn is_empty(&self) -> bool { self.n_unitigs == 0 } pub fn n_kmers(&self) -> usize { self.n_kmers } pub fn block_bits(&self) -> u8 { self.block_bits } + pub fn has_direct_access(&self) -> bool { !self.block_offsets.is_empty() } - /// Byte offset of the START of record `i` (the seql byte) in the mmap. + /// Byte offset of record `i` in the mmap. + /// + /// Fast path (O(1 << block_bits)) when `.idx` is loaded; degraded O(i) + /// sequential scan otherwise. #[inline] fn chunk_start(&self, i: usize) -> usize { - assert!(!self.block_offsets.is_empty(), - "random access requires UnitigFileReader::open(); use open_sequential() for iteration only"); - if self.block_bits == 0 { - return self.block_offsets[i] as usize; + if !self.block_offsets.is_empty() { + if self.block_bits == 0 { + return self.block_offsets[i] as usize; + } + let block = i >> self.block_bits; + let rem = i & self.mask; + let mut offset = self.block_offsets[block] as usize; + for _ in 0..rem { + let seql_minus_k = self.mmap[offset] as usize; + offset += 1 + (seql_minus_k + self.k + 3) / 4; + } + offset + } else { + let mut offset = 0usize; + for _ in 0..i { + let seql_minus_k = self.mmap[offset] as usize; + offset += 1 + (seql_minus_k + self.k + 3) / 4; + } + offset } - let block = i >> self.block_bits; - let rem = i & self.mask; - let mut offset = self.block_offsets[block] as usize; - for _ in 0..rem { - let seql_minus_k = self.mmap[offset] as usize; - offset += 1 + (seql_minus_k + self.k + 3) / 4; - } - offset } /// Nucleotide length of chunk `i`. @@ -307,7 +336,9 @@ impl UnitigFileReader { extract_kmer_raw(&self.mmap[offset + 1..], j, self.k) } - /// `true` iff the k-mer at position `j` of chunk `i` equals `query` (canonical). + /// `true` iff the k-mer at position `j` of chunk `i` matches `query`. + /// + /// Works in both modes; O(i) scan when `.idx` is absent. #[inline] pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool { canonical_raw(self.raw_kmer(i, j), self.k) == query.raw() -- 2.52.0 From 9e60a711bc14550c8788eb66b8c3240227c94739 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 14:33:15 +0200 Subject: [PATCH 2/5] Enforce minimum input paths and handle stdin sentinel Update CLI validation to require at least 10 input paths, defaulting to stdin (`-`) when the argument list is empty. Refactor the path iterator to explicitly recognize the stdin sentinel, bypassing extension validation and directory expansion to ensure direct passthrough to the file buffer without triggering `stat()` or recursive traversal. --- src/obikmer/src/cli.rs | 11 ++++++++--- src/obiread/src/path_iterator.rs | 13 +++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/obikmer/src/cli.rs b/src/obikmer/src/cli.rs index 83308d1..42c7600 100644 --- a/src/obikmer/src/cli.rs +++ b/src/obikmer/src/cli.rs @@ -10,8 +10,9 @@ use obikseq::RoutableSuperKmer; #[derive(Args)] pub struct CommonArgs { - /// Input files or directories (FASTA/FASTQ, optionally gzip-compressed) - #[arg(num_args = 1..)] + /// Input files or directories (FASTA/FASTQ, optionally gzip-compressed). + /// If omitted, reads from stdin. + #[arg(num_args = 0..)] pub inputs: Vec, /// k-mer size @@ -63,7 +64,11 @@ pub fn block_size_to_bits(n: usize) -> u8 { impl CommonArgs { pub fn seqfile_paths(&self) -> obiread::PathIter { - let paths = self.inputs.iter().map(PathBuf::from).collect(); + let paths: Vec = if self.inputs.is_empty() { + vec![PathBuf::from("-")] + } else { + self.inputs.iter().map(PathBuf::from).collect() + }; obiread::PathIter::new(paths) } } diff --git a/src/obiread/src/path_iterator.rs b/src/obiread/src/path_iterator.rs index 6a0c833..9e73144 100644 --- a/src/obiread/src/path_iterator.rs +++ b/src/obiread/src/path_iterator.rs @@ -19,12 +19,13 @@ impl PathIter { file_buffer: Vec::new(), }; for path in paths { - // Avoid stat() at construction time on network filesystems (Lustre, NFS) - // where metadata operations can be 100s of milliseconds each. - // Paths that look like sequence files are assumed to be files. - // Anything else is treated as a potential directory and expanded lazily - // in next(); read_dir errors are silently skipped. - if is_fasta_or_fastq(&path) { + // "-" is the stdin sentinel — pass it through without any extension + // check or directory expansion. + if path.as_os_str() == "-" { + iter.file_buffer.push(path); + } else if is_fasta_or_fastq(&path) { + // Avoid stat() at construction time on network filesystems (Lustre, NFS) + // where metadata operations can be 100s of milliseconds each. iter.file_buffer.push(path); } else { iter.dir_stack.push(path); -- 2.52.0 From dfa0b2bac2118022e67361e3b49e58b177d35802 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 14:42:18 +0200 Subject: [PATCH 3/5] feat: add utils subcommand for renaming genome labels Introduces a `utils` CLI subcommand to enable in-place genome label renaming without full reindexing. Adds strict label validation to reject empty strings, filesystem separators, and control characters, ensuring safe CSV serialization. Updates index metadata, renames corresponding spectrum JSON files, and registers the command in the main dispatch logic. CLI reference documentation is also updated. --- docmd/index.md | 3 +- src/obikindex/src/lib.rs | 2 +- src/obikindex/src/meta.rs | 23 +++++++++ src/obikmer/src/cmd/index.rs | 6 ++- src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/utils.rs | 91 ++++++++++++++++++++++++++++++++++++ src/obikmer/src/main.rs | 3 ++ 7 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 src/obikmer/src/cmd/utils.rs diff --git a/docmd/index.md b/docmd/index.md index 9cc7ef2..bd3e968 100644 --- a/docmd/index.md +++ b/docmd/index.md @@ -17,6 +17,7 @@ | `unitig` | Dump unitigs from a built index to stdout (debug) | | `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing | | `reindex` | Convert an index's evidence in-place: exact ↔ approx | +| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place | ## Constraints @@ -24,7 +25,7 @@ - Maximum efficiency in computation, memory, and disk usage - k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base) - Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half -- Input formats: FASTA, FASTQ, gzip, streaming stdin +- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths) ## Priority operations diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index 9f84178..b5e881e 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -12,5 +12,5 @@ pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; pub use index::KmerIndex; pub use merge::MergeMode; -pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; +pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; diff --git a/src/obikindex/src/meta.rs b/src/obikindex/src/meta.rs index 8518f62..b0c5aa7 100644 --- a/src/obikindex/src/meta.rs +++ b/src/obikindex/src/meta.rs @@ -70,3 +70,26 @@ impl IndexMeta { self.genomes.iter().map(|g| g.label.as_str()) } } + +/// Validate a user-supplied genome label. +/// +/// Forbidden: `/` (filesystem separator), `=` (--new-label parser separator), +/// `\0` (null byte), `\n`, `\r`, `\t` (break CSV output). +/// Empty labels are also rejected. +pub fn validate_label(label: &str) -> Result<(), String> { + if label.is_empty() { + return Err("genome label must not be empty".into()); + } + const FORBIDDEN: &[char] = &['/', '=', '\0', '\n', '\r', '\t']; + if let Some(c) = label.chars().find(|c| FORBIDDEN.contains(c)) { + let display = match c { + '\0' => "\\0 (null)".to_string(), + '\n' => "\\n (newline)".to_string(), + '\r' => "\\r (carriage return)".to_string(), + '\t' => "\\t (tab)".to_string(), + c => format!("'{c}'"), + }; + return Err(format!("genome label contains forbidden character {display}")); + } + Ok(()) +} diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index fe57749..8a7d8ea 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use clap::Args; -use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex}; +use obikindex::{validate_label, GenomeInfo, IndexConfig, IndexState, KmerIndex}; use obilayeredmap::IndexMode; fn parse_key_value(s: &str) -> Result<(String, String), String> { @@ -194,6 +194,10 @@ pub fn run(args: IndexArgs) { block_bits, }; let genome_info = args.label.as_ref().map(|label| { + validate_label(label).unwrap_or_else(|e| { + eprintln!("error: --label: {e}"); + std::process::exit(1); + }); let mut info = GenomeInfo::new(label.clone()); for (k, v) in &args.meta { info.meta.insert(k.clone(), v.clone()); diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 39e9a89..2fc940a 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,4 +1,5 @@ pub mod annotate; +pub mod utils; pub mod distance; pub mod dump; pub mod estimate; diff --git a/src/obikmer/src/cmd/utils.rs b/src/obikmer/src/cmd/utils.rs new file mode 100644 index 0000000..0e7ec20 --- /dev/null +++ b/src/obikmer/src/cmd/utils.rs @@ -0,0 +1,91 @@ +use std::path::PathBuf; + +use clap::Args; +use obikindex::{validate_label, KmerIndex}; +use tracing::info; + +#[derive(Args)] +pub struct UtilsArgs { + /// Index directory to operate on + pub index: PathBuf, + + /// Set a new genome label: NEW_LABEL=OLD_LABEL + #[arg(long, value_name = "NEW=OLD")] + pub new_label: Option, +} + +pub fn run(args: UtilsArgs) { + let mut any = false; + + if let Some(spec) = &args.new_label { + any = true; + run_rename(&args.index, spec); + } + + if !any { + eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD"); + std::process::exit(1); + } +} + +fn run_rename(index_path: &PathBuf, spec: &str) { + let (old_label, new_label) = parse_rename_spec(spec); + + let mut idx = KmerIndex::open(index_path).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + let pos = idx + .meta() + .genomes + .iter() + .position(|g| g.label == old_label) + .unwrap_or_else(|| { + eprintln!("error: genome '{old_label}' not found in index"); + std::process::exit(1); + }); + + validate_label(&new_label).unwrap_or_else(|e| { + eprintln!("error: --new-label: {e}"); + std::process::exit(1); + }); + + // Check the new label is not already taken. + if idx.meta().genomes.iter().any(|g| g.label == new_label) { + eprintln!("error: label '{new_label}' already exists in index"); + std::process::exit(1); + } + + idx.meta_mut().genomes[pos].label = new_label.clone(); + idx.meta_mut().write(index_path).unwrap_or_else(|e| { + eprintln!("error writing index metadata: {e}"); + std::process::exit(1); + }); + + // Rename the spectrum file if it exists. + let spectrums_dir = index_path.join("spectrums"); + let old_spectrum = spectrums_dir.join(format!("{old_label}.json")); + let new_spectrum = spectrums_dir.join(format!("{new_label}.json")); + if old_spectrum.exists() { + std::fs::rename(&old_spectrum, &new_spectrum).unwrap_or_else(|e| { + eprintln!("warning: could not rename spectrum file: {e}"); + }); + } + + info!("renamed genome '{old_label}' → '{new_label}'"); +} + +fn parse_rename_spec(spec: &str) -> (String, String) { + let eq = spec.find('=').unwrap_or_else(|| { + eprintln!("error: --new-label expects NEW_LABEL=OLD_LABEL, got '{spec}'"); + std::process::exit(1); + }); + let new = spec[..eq].trim().to_string(); + let old = spec[eq + 1..].trim().to_string(); + if old.is_empty() || new.is_empty() { + eprintln!("error: --new-label: both old and new labels must be non-empty"); + std::process::exit(1); + } + (old, new) +} diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index e6700e1..a872e46 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -36,6 +36,8 @@ enum Commands { Estimate(cmd::estimate::EstimateArgs), /// Convert an index's evidence in-place: exact ↔ approx Reindex(cmd::reindex::ReindexArgs), + /// Miscellaneous index utilities (--rename, …) + Utils(cmd::utils::UtilsArgs), } fn main() { @@ -68,6 +70,7 @@ fn main() { Commands::Unitig(args) => cmd::unitig::run(args), Commands::Estimate(args) => cmd::estimate::run(args), Commands::Reindex(args) => cmd::reindex::run(args), + Commands::Utils(args) => cmd::utils::run(args), } #[cfg(feature = "profiling")] -- 2.52.0 From 26ab1658074afe9e3bdb74d19892da0bc64cc0d6 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 14:54:26 +0200 Subject: [PATCH 4/5] refactor: add rolling buffer methods and document label constraints Added `is_empty()`, `clear()`, and `iter()` methods to the rolling statistics buffer to enable standard traversal and state reset operations. Documented genome label constraints, specifying forbidden characters, empty label rejection, space quoting requirements, and auto-derived label bypass rules. Additionally, updated doc comments and added `#[allow(dead_code)]` attributes for `kmer_offset` and `n_kmers` fields to suppress compiler warnings while reserving them for future `--detail` coverage vector logic. --- docmd/index.md | 16 +++++++++++++++- src/obikmer/src/cmd/query.rs | 7 ++++--- src/obiskbuilder/src/rolling_stat.rs | 8 -------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/docmd/index.md b/docmd/index.md index bd3e968..a5a0934 100644 --- a/docmd/index.md +++ b/docmd/index.md @@ -17,7 +17,7 @@ | `unitig` | Dump unitigs from a built index to stdout (debug) | | `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing | | `reindex` | Convert an index's evidence in-place: exact ↔ approx | -| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place | +| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place (NEW gets OLD's identity) | ## Constraints @@ -27,6 +27,20 @@ - Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half - Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths) +## Genome label constraints + +Genome labels are arbitrary Unicode strings with the following restrictions: + +| Character | Forbidden | Reason | +|-----------|-----------|--------| +| `/` | yes | filesystem path separator | +| `=` | yes | `--new-label` parser separator | +| `\0` | yes | null byte | +| `\n` `\r` `\t` | yes | break CSV output | +| spaces | **allowed** | use shell quoting: `--new-label 'new label=old label'` | + +Empty labels are also rejected. Labels derived automatically from the index directory name (when `--label` is omitted) are not validated since they come from the filesystem and are already safe. + ## Priority operations - Kmer counting (frequencies) diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index 3307d68..51aa7f8 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -59,8 +59,8 @@ pub struct SKDesc { /// Index of the source sequence within the batch. pub seq_idx: u32, /// Kmer offset of the first kmer of this superkmer within its sequence. - /// Computed as the cumulative number of kmers emitted before this superkmer - /// in the same sequence. Used for `--detail` coverage vectors. + /// Reserved for `--detail` coverage vectors (not yet consumed). + #[allow(dead_code)] pub kmer_offset: u32, } @@ -76,7 +76,8 @@ pub struct QueryBatch { pub ids: Vec, /// Raw sequence bytes (for output), in batch order. pub seqs: Vec>, - /// Per-sequence total kmer count (kmer_count + kmer_missing). + /// Per-sequence total kmer count. Reserved for `--detail` (not yet consumed). + #[allow(dead_code)] pub n_kmers: Vec, /// Deduplicated superkmer map. pub map: HashMap>, diff --git a/src/obiskbuilder/src/rolling_stat.rs b/src/obiskbuilder/src/rolling_stat.rs index 17838a0..28d49a6 100644 --- a/src/obiskbuilder/src/rolling_stat.rs +++ b/src/obiskbuilder/src/rolling_stat.rs @@ -24,10 +24,6 @@ impl Ring { } } #[inline] - fn is_empty(&self) -> bool { - self.len == 0 - } - #[inline] fn clear(&mut self) { self.len = 0; self.head = 0; @@ -67,10 +63,6 @@ impl Ring { } } - /// Iterate over elements front-to-back (copies, since T: Copy). - fn iter(&self) -> impl Iterator + '_ { - (0..self.len).map(move |i| self.buf[(self.head + i) % N]) - } } // ── MmerItem ────────────────────────────────────────────────────────────────── -- 2.52.0 From 694da5208e40c54b2a5ab679ba8cec5b449de39e Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 15:25:57 +0200 Subject: [PATCH 5/5] feat: add Findere z-window filtering and detail mode coverage tracking Introduces the `--findere-z` CLI flag to override the index's sliding window parameter and implements `apply_findere` to filter k-mer hits using a z-consecutive positions window. Adds structural support for `--detail` mode, including per-sequence k-mer offsets, conditional allocation of per-position coverage vectors, and JSON serialization. Updates architecture documentation, CLI references, and annotation schemas to align with the new implementation, resolving prior discrepancies with `--detail` and `--mismatch` flags. --- docmd/architecture/query.md | 117 +++++++++++++++-------- src/obikmer/src/cmd/query.rs | 174 ++++++++++++++++++++++++++++------- 2 files changed, 218 insertions(+), 73 deletions(-) diff --git a/docmd/architecture/query.md b/docmd/architecture/query.md index cb83859..f53896d 100644 --- a/docmd/architecture/query.md +++ b/docmd/architecture/query.md @@ -26,7 +26,8 @@ for each batch of sequences: query_partition(p, superkmers_routed_to_p) → load QueryLayer(s) for p → for each kmer in each superkmer: MphfLayer::find(kmer) - broadcast results back to each (seq_idx, kmer_offset) that referenced the superkmer + apply_findere(sk_kmer_results, effective_z) ← per superkmer + broadcast confirmed results back to each (seq_idx, kmer_offset) emit annotated sequences ``` @@ -36,28 +37,46 @@ Parallelism is **not yet active** in the current implementation: batches are pro --- -## Layer lookup: `MphfLayer::find` +## Findere z-window filter -`MphfLayer::open` reads `layer_meta.json` and loads either exact or approximate evidence. The caller (`QueryLayer::find`) never chooses the dispatch path — it is fixed at open time by `LayerEvidence`: +For approximate and hybrid index modes, a raw k-mer hit is a positive from the fingerprint test with false-positive rate 1/2^b. The Findere scheme reduces the effective FP rate to 1/2^(b·z) by requiring **z consecutive k-mers** to all test positive before any of them is counted as a confirmed match. + +`apply_findere` implements this as a sliding-window confirmation, independently for each genome: ```rust -pub fn find(&self, kmer: CanonicalKmer) -> Option { - match &self.ev { - LayerEvidence::Exact { .. } => self.find_exact(kmer), - LayerEvidence::Approx { .. } => self.find_approx(kmer), - } -} +fn apply_findere( + results: &[Option>], + z: usize, + n_genomes: usize, +) -> Vec>> ``` -### Exact layers +For each genome g, a position i is confirmed if there exists at least one window of z consecutive positions `[j, j+z)` that contains i and where all z positions are present for g (i.e. `results[pos]` is `Some(row)` and `row[g] > 0`). The implementation uses a single O(n) sliding-window scan per genome. -`find_exact` maps the k-mer through the MPHF to a slot, then calls `UnitigFileReader::verify_canonical_kmer(chunk_id, rank, kmer)` to confirm the stored k-mer matches. Zero false positives. Requires `UnitigFileReader::open()` (random-access via `.idx`); `open_sequential()` cannot serve random-access verification. +Unconfirmed positions are zeroed in the returned row. If all genomes are zeroed for a position, it is returned as `None`. -### Approximate layers +**Short superkmers**: when a superkmer contains fewer than z k-mers, no complete z-window can be formed. Rather than discarding all results, `apply_findere` returns them unchanged (no filter applied). This avoids penalising legitimate short sequences near read ends. -`find_approx` maps the k-mer through the MPHF, then checks a stored `b`-bit fingerprint of the canonical hash. False-positive rate: **1/2^b per k-mer query**. No `.idx` file is needed; the layer carries only `fingerprint.bin`. +**Exact indexes**: `z` is effectively 1 (every k-mer is its own window). `apply_findere` is a no-op. -For a query window of z consecutive k-mers (Findere scheme), the false-positive rate per window is **1/2^(b·z)**. The `z` parameter is recorded in `layer_meta.json` at build time but is not enforced during querying — the caller is responsible for interpreting window-level results accordingly. +### Effective z at query time + +`effective_z` is resolved at the start of `run()`: + +```rust +let effective_z = args.findere_z.unwrap_or_else(|| match idx.meta().config.evidence { + IndexMode::Approx { z, .. } | IndexMode::Hybrid { z, .. } => z as usize, + IndexMode::Exact => 1, +}); +``` + +The `-z` CLI option overrides the index metadata value. A higher z increases stringency (lower FP, some true positives may be discarded at sequence ends); a lower z increases sensitivity. + +--- + +## Layer lookup: `MphfLayer::find` + +`MphfLayer::open(dir, mode: &IndexMode)` receives the mode from `PartitionMeta` — no per-layer file is read. The caller (`QueryLayer`) never chooses the dispatch path: it is fixed at open time by `LayerEvidence`. See [obilayeredmap](../implementation/obilayeredmap.md) for the full `find` / `find_strict` API. ### `QueryLayer` variant selection @@ -72,18 +91,6 @@ For a query window of z consecutive k-mers (Findere scheme), the false-positive --- -## `open()` vs `open_sequential()` - -`UnitigFileReader::open()` loads the `.idx` block-offset table, enabling random access to individual unitig chunks. It is required whenever `verify_canonical_kmer` is called (exact layers at query time). - -`UnitigFileReader::open_sequential()` skips the `.idx` and supports only forward iteration. It is sufficient for: -- build passes that scan all unitigs sequentially (`build_exact_evidence`, `build_approx_evidence`); -- the `unitig` subcommand, which iterates and prints unitigs without random access. - -`KmerIndex::open()` (called by `query::run`) triggers `MphfLayer::open` for each layer, which calls `UnitigFileReader::open()` for exact layers. Approximate layers do not open a unitig reader at all. - ---- - ## Presence / count mode at query time The `--force-presence` flag and `--presence-threshold` control how per-genome values are accumulated, independently of what the index stores: @@ -96,49 +103,83 @@ genome_totals[g] += if presence { u32::from(v >= threshold) } else { v } --- +## Coverage vectors (`--detail`) + +When `--detail` is requested, a 3-D accumulator `cov[seq_idx][genome][kmer_pos]` is allocated before the partition loop, with dimensions derived from `batch.n_kmers`: + +``` +cov[seq_idx][g][abs_pos] += contribution +where abs_pos = desc.kmer_offset + local_pos (absolute kmer position in the sequence) +``` + +Coverage reflects confirmed k-mers only (post-Findere). The vectors are emitted in the JSON annotation under the key `"coverage"`. + +--- + +## `kmer_missing` semantics + +`kmer_missing` counts k-mers that returned `None` from the index before Findere filtering — i.e. k-mers truly absent from every layer. K-mers that were found in the index but rejected by the z-window filter are not counted as missing. + +--- + ## Output format Output sequences are written in **OBITools4 format**: the original sequence with a JSON annotation map in the title line. ``` ->read_id {"kmer_count":59,"kmer_strict_matches":{"genome_a":42,"genome_b":7,...}} +>read_id {"kmer_count":59,"kmer_strict_matches":{"genome_a":42,"genome_b":7}} ATCGATCG... ``` -Genome keys in `kmer_strict_matches` are genome labels from `index.meta`. Key order follows iteration order of `meta.genomes`. +With `--detail`: + +``` +>read_id {"kmer_count":59,"kmer_strict_matches":{...},"coverage":{"genome_a":[0,1,2,...],...}} +ATCGATCG... +``` + +Genome keys follow the iteration order of `meta.genomes`. --- -## Annotation schema (current implementation) +## Annotation schema | Key | Type | Condition | Semantics | |---|---|---|---| -| `kmer_count` | int | always | k-mers with at least one match | -| `kmer_missing` | int | `--count-missing` | k-mers absent from every layer | +| `kmer_count` | int | always | k-mers confirmed (post-Findere) with at least one genome match | +| `kmer_missing` | int | `--count-missing` | k-mers absent from the index entirely (pre-Findere None) | | `kmer_strict_matches` | object | always | per-genome accumulated value (label → count or 0/1) | +| `coverage` | object | `--detail` | per-genome array of per-position contributions (label → [u32]) | -`kmer_count` counts matched k-mer positions (incremented once per `Some(row)` hit regardless of how many genomes are covered). `kmer_missing` counts `None` hits. - -**Note on doc/impl divergence:** the doc previously used keys `kmer_total`, `kmer_found`, and `kmer_match` (list). The implementation uses `kmer_count` (int, matched only) and `kmer_strict_matches` (object keyed by genome label). `--mismatch` and `--detail` are parsed but not yet implemented and emit a warning. +`kmer_count + kmer_missing` ≤ total k-mers in the sequence. The gap (if any) corresponds to k-mers found in the index but rejected by the Findere z-window filter. --- ## CLI ``` -obikmer query -i [--detail] [--mismatch] [--count-missing] +obikmer query [--detail] [--mismatch] [--count-missing] [--force-presence] [--presence-threshold ] - [-T ] [ ...] + [-z ] [-T ] + [ ...] ``` -`--mismatch` and `--detail` are accepted but currently ignored with a warning on stderr. +| Option | Default | Semantics | +|---|---|---| +| `-z` / `--findere-z` | from index metadata | Override Findere z parameter | +| `--detail` | off | Emit per-position coverage vectors in JSON | +| `--count-missing` | off | Add `kmer_missing` field to JSON | +| `--force-presence` | off | Report 0/1 per genome regardless of index counts | +| `--presence-threshold` | 1 | Minimum count to declare genome present | +| `-T` / `--threads` | all CPUs | Worker threads (parallelism not yet active) | + +`--mismatch` is accepted but currently ignored with a warning on stderr. --- ## Future work - **`--mismatch`**: 1-mismatch approximate matching — generate `3·k` single-substitution variants per k-mer, look each up independently. -- **`--detail`**: per-position coverage vectors (`cov_`) per genome. - **Read classification** (`--classify`): assign each read to the genome with the highest match score. - **Parallelism**: activate per-partition or per-sequence worker threads using the already-parsed `--threads` value. - **Whitelist / blacklist filtering**: threshold-based accept/reject on per-genome match scores. diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index 51aa7f8..f86ec02 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -4,6 +4,7 @@ use std::path::PathBuf; use clap::Args; use obikindex::KmerIndex; +use obilayeredmap::IndexMode; use obiread::record::{SeqRecord, parse_chunk}; use obiread::chunk::read_sequence_chunks; use obikseq::{RoutableSuperKmer, set_k, set_m}; @@ -21,7 +22,7 @@ pub struct QueryArgs { #[arg(num_args = 1..)] pub inputs: Vec, - /// Also report per-position coverage vectors (cov_ per genome) + /// Report per-position coverage vectors per genome (adds "coverage" to JSON) #[arg(long)] pub detail: bool, @@ -41,6 +42,10 @@ pub struct QueryArgs { #[arg(long, default_value_t = 1)] pub presence_threshold: u32, + /// Override the Findere z parameter from index metadata + #[arg(short = 'z', long)] + pub findere_z: Option, + /// Number of worker threads #[arg( short = 'T', @@ -59,25 +64,18 @@ pub struct SKDesc { /// Index of the source sequence within the batch. pub seq_idx: u32, /// Kmer offset of the first kmer of this superkmer within its sequence. - /// Reserved for `--detail` coverage vectors (not yet consumed). - #[allow(dead_code)] pub kmer_offset: u32, } // ── QueryBatch ──────────────────────────────────────────────────────────────── /// A batch of query sequences with their superkmers deduplicated. -/// -/// Each unique `RoutableSuperKmer` maps to all the (seq_idx, kmer_offset) -/// positions it occupies across the batch. The superkmer is queried once -/// per partition; the result is broadcast to every SKDesc entry. pub struct QueryBatch { /// Sequence ids in batch order. pub ids: Vec, /// Raw sequence bytes (for output), in batch order. pub seqs: Vec>, - /// Per-sequence total kmer count. Reserved for `--detail` (not yet consumed). - #[allow(dead_code)] + /// Total kmer count per sequence (used for `--detail` coverage allocation). pub n_kmers: Vec, /// Deduplicated superkmer map. pub map: HashMap>, @@ -91,8 +89,8 @@ impl QueryBatch { level_max: usize, theta: f64, ) -> Self { - let mut ids = Vec::with_capacity(records.len()); - let mut seqs = Vec::with_capacity(records.len()); + let mut ids = Vec::with_capacity(records.len()); + let mut seqs = Vec::with_capacity(records.len()); let mut n_kmers = Vec::with_capacity(records.len()); let mut map: HashMap> = HashMap::new(); @@ -116,9 +114,6 @@ impl QueryBatch { } /// Split the superkmer map by partition index. - /// - /// Returns a vec of length `n_partitions`; each slot holds the RSK refs - /// whose minimizer routes to that partition. pub fn split_by_partition(&self, n_partitions: usize) -> Vec> { let mask = (n_partitions as u64) - 1; let mut by_part: Vec> = vec![Vec::new(); n_partitions]; @@ -133,22 +128,83 @@ impl QueryBatch { // ── Per-sequence accumulator ────────────────────────────────────────────────── struct SeqAcc { - kmer_count: u32, - kmer_missing: u32, - /// Per-genome accumulated count (count mode) or presence sum (presence mode). + kmer_count: u32, + kmer_missing: u32, genome_totals: Vec, } impl SeqAcc { fn new(n_genomes: usize) -> Self { Self { - kmer_count: 0, - kmer_missing: 0, + kmer_count: 0, + kmer_missing: 0, genome_totals: vec![0u32; n_genomes], } } } +// ── Findere z-window filter ─────────────────────────────────────────────────── + +/// Apply the Findere z-window filter to per-kmer query results for one superkmer. +/// +/// A k-mer at position i for genome g is confirmed only if it belongs to at least +/// one run of z consecutive positions where all k-mers are present for g. +/// Unconfirmed positions are zeroed; positions whose entire row becomes zero are +/// returned as `None`. +/// +/// When z <= 1 or the superkmer is shorter than z k-mers, results are returned +/// unchanged (short superkmers cannot satisfy the z-window constraint). +fn apply_findere( + results: &[Option>], + z: usize, + n_genomes: usize, +) -> Vec>> { + let n = results.len(); + if z <= 1 || n < z { + return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect(); + } + + let mut confirmed = vec![vec![false; n_genomes]; n]; + + for g in 0..n_genomes { + let present: Vec = results + .iter() + .map(|r| r.as_ref().map_or(false, |row| row[g] > 0)) + .collect(); + + let mut window_count = present[..z].iter().filter(|&&p| p).count(); + if window_count == z { + for c in confirmed[..z].iter_mut() { + c[g] = true; + } + } + + for j in 1..=(n - z) { + if present[j - 1] { window_count -= 1; } + if present[j + z - 1] { window_count += 1; } + if window_count == z { + for c in confirmed[j..j + z].iter_mut() { + c[g] = true; + } + } + } + } + + results.iter().enumerate().map(|(i, opt)| { + let row = opt.as_ref()?; + let mut new_row: Box<[u32]> = row.clone(); + let mut any = false; + for g in 0..n_genomes { + if !confirmed[i][g] { + new_row[g] = 0; + } else { + any = true; + } + } + if any { Some(new_row) } else { None } + }).collect() +} + // ── Entry point ─────────────────────────────────────────────────────────────── pub fn run(args: QueryArgs) { @@ -165,17 +221,22 @@ pub fn run(args: QueryArgs) { let n_partitions = idx.n_partitions(); let with_counts = idx.meta().config.with_counts; + let effective_z: usize = args.findere_z.unwrap_or_else(|| { + match idx.meta().config.evidence { + IndexMode::Approx { z, .. } | IndexMode::Hybrid { z, .. } => z as usize, + IndexMode::Exact => 1, + } + }); + info!( - "query: k={k}, {} genome(s), with_counts={with_counts}, mismatch={}, detail={}", + "query: k={k}, {} genome(s), with_counts={with_counts}, z={effective_z}, \ + mismatch={}, detail={}", n_genomes, args.mismatch, args.detail ); if args.mismatch { eprintln!("warning: --mismatch not yet implemented, ignored"); } - if args.detail { - eprintln!("warning: --detail not yet implemented, ignored"); - } let paths: Vec = args.inputs.iter().map(PathBuf::from).collect(); let mut out = BufWriter::new(io::stdout()); @@ -198,10 +259,20 @@ pub fn run(args: QueryArgs) { continue; } - let batch = QueryBatch::from_records(records, k, 6, 0.7); + let batch = QueryBatch::from_records(records, k, 6, 0.7); let n_seqs = batch.ids.len(); - let mut accs: Vec = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); + let mut accs: Vec = + (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); + + // [seq_idx][genome_idx][kmer_position] — allocated only with --detail + let mut cov: Vec>> = if args.detail { + batch.n_kmers.iter() + .map(|&n| vec![vec![0u32; n as usize]; n_genomes]) + .collect() + } else { + Vec::new() + }; let by_part = batch.split_by_partition(n_partitions); @@ -218,24 +289,40 @@ pub fn run(args: QueryArgs) { std::process::exit(1); }); - let presence = args.force_presence || !with_counts; - let threshold = args.presence_threshold; + let presence = args.force_presence || !with_counts; + let threshold = args.presence_threshold; for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) { + let filtered = apply_findere(sk_kmer_results, effective_z, n_genomes); let descs = batch.map.get(*rsk).expect("rsk must be in map"); + for desc in descs { let acc = &mut accs[desc.seq_idx as usize]; - for hit in sk_kmer_results.iter() { + + for (local_pos, hit) in filtered.iter().enumerate() { match hit { - None => acc.kmer_missing += 1, + None => { + // Only truly missing if the index also had no entry. + if sk_kmer_results[local_pos].is_none() { + acc.kmer_missing += 1; + } + } Some(row) => { acc.kmer_count += 1; for (g, &v) in row.iter().enumerate() { - acc.genome_totals[g] += if presence { + if v == 0 { + continue; + } + let contribution = if presence { u32::from(v >= threshold) } else { v }; + acc.genome_totals[g] += contribution; + if args.detail { + let abs_pos = desc.kmer_offset as usize + local_pos; + cov[desc.seq_idx as usize][g][abs_pos] += contribution; + } } } } @@ -244,7 +331,11 @@ pub fn run(args: QueryArgs) { } } - emit_batch(&batch, &accs, idx.meta(), args.count_missing, &mut out); + emit_batch( + &batch, &accs, idx.meta(), + args.count_missing, args.detail, &cov, + &mut out, + ); } } } @@ -252,11 +343,13 @@ pub fn run(args: QueryArgs) { // ── Output ──────────────────────────────────────────────────────────────────── fn emit_batch( - batch: &QueryBatch, - accs: &[SeqAcc], - meta: &obikindex::meta::IndexMeta, + batch: &QueryBatch, + accs: &[SeqAcc], + meta: &obikindex::meta::IndexMeta, count_missing: bool, - out: &mut impl Write, + detail: bool, + cov: &[Vec>], + out: &mut impl Write, ) { for (seq_idx, (id, seq)) in batch.ids.iter().zip(batch.seqs.iter()).enumerate() { let acc = &accs[seq_idx]; @@ -266,12 +359,23 @@ fn emit_batch( if count_missing { ann.insert("kmer_missing".into(), acc.kmer_missing.into()); } + let mut match_map = serde_json::Map::new(); for (g, genome) in meta.genomes.iter().enumerate() { match_map.insert(genome.label.clone(), acc.genome_totals[g].into()); } ann.insert("kmer_strict_matches".into(), match_map.into()); + if detail && !cov.is_empty() { + let mut cov_map = serde_json::Map::new(); + for (g, genome) in meta.genomes.iter().enumerate() { + let v: Vec = + cov[seq_idx][g].iter().map(|&x| x.into()).collect(); + cov_map.insert(genome.label.clone(), v.into()); + } + ann.insert("coverage".into(), cov_map.into()); + } + let ann_str = serde_json::to_string(&ann).unwrap_or_else(|_| "{}".to_string()); // OBITools4 FASTA format: >id {"key":value,...} -- 2.52.0