diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index 9f41675..a25ee62 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -76,6 +76,14 @@ impl Layer { pub fn unitig_writer(out_dir: &Path) -> OLMResult { MphfLayer::unitig_writer(out_dir) } + + /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and + /// `mphf.bin` already present in `layer_dir`. + /// + /// See [`MphfLayer::build_exact_evidence`] for the full contract. + pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult { + MphfLayer::build_exact_evidence(layer_dir) + } } // ── Mode 1 — set membership ─────────────────────────────────────────────────── @@ -106,7 +114,7 @@ impl Layer<()> { impl Layer { pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { - let n = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?.n_kmers(); + let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers(); let counts_dir = out_dir.join(COUNTS_DIR); let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir) .map_err(OLMError::Io)?; @@ -158,7 +166,7 @@ impl Layer { n_genomes: usize, present_in: impl Fn(CanonicalKmer, usize) -> bool, ) -> OLMResult { - let n = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?.n_kmers(); + let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers(); let presence_dir = out_dir.join(PRESENCE_DIR); let mut mb = PersistentBitMatrixBuilder::new(n, &presence_dir).map_err(OLMError::Io)?; let mut cols: Vec<_> = (0..n_genomes) diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index 0844237..1713bce 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -4,7 +4,7 @@ use std::path::Path; use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; use obikseq::CanonicalKmer; -use obiskio::{UnitigFileReader, UnitigFileWriter}; +use obiskio::{UnitigFileReader, UnitigFileWriter, build_unitig_idx, DEFAULT_BLOCK_BITS}; use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; use crate::error::{OLMError, OLMResult}; @@ -41,7 +41,6 @@ impl MphfLayer { #[inline] pub fn find(&self, kmer: CanonicalKmer) -> Option { let slot = self.mphf.index(&kmer.raw()); - // PtrHash guarantees slot < n only for its key set; arbitrary queries may exceed bounds. if slot >= self.n { return None; } let (chunk_id, rank) = self.evidence.decode(slot); if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { @@ -58,16 +57,73 @@ impl MphfLayer { Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?) } - /// Builds the MPHF and evidence from the unitigs file already present in `dir`. - /// Calls `fill_slot(slot, kmer)` once per kmer (second pass) for DataStore population. - /// Returns the number of kmers indexed. + /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and + /// `mphf.bin` already present in `dir`. + /// + /// This is the exact-evidence construction route. It can be called: + /// - after the initial build (via [`Self::build`] which calls it internally) + /// - standalone to promote an existing (unitigs + mphf) into an exact index + /// - standalone to rebuild evidence after a format change + /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and + /// `mphf.bin` already present in `dir`. + /// + /// Uses sequential iteration — no `.idx` is required on entry. + /// Writes both `evidence.bin` and `unitigs.bin.idx` on success. + pub fn build_exact_evidence(dir: &Path) -> OLMResult { + let unitig_path = dir.join(UNITIGS_FILE); + + // Sequential scan — no .idx required for iteration + let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; + let n = unitigs.n_kmers(); + + if n == 0 { + fs::File::create(dir.join(EVIDENCE_FILE))?; + build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + return Ok(0); + } + + let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + + let mut ev = EvidenceWriter::new(n); + let mut seen = vec![0u8; (n + 7) / 8]; + + for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() { + let slot = mphf.index(&kmer.raw()); + if slot >= n { + return Err(OLMError::Mphf("slot out of bounds".into())); + } + let byte = slot / 8; + let bit = 1u8 << (slot % 8); + if seen[byte] & bit != 0 { + return Err(OLMError::Mphf("duplicate slot".into())); + } + seen[byte] |= bit; + ev.set(slot, chunk_id as u32, rank as u8); + } + + ev.write(&dir.join(EVIDENCE_FILE))?; + // Write .idx last: it is only needed for random access (queries). + build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + Ok(n) + } + + /// Build MPHF and exact evidence from the unitigs file already present in + /// `dir`. Calls `fill_slot(slot, kmer)` once per kmer for DataStore + /// population. Returns the number of kmers indexed. pub(crate) fn build( dir: &Path, fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>, ) -> OLMResult { use rayon::prelude::*; - let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; + let unitig_path = dir.join(UNITIGS_FILE); + + // Write .idx so that UnitigFileReader::open succeeds and parallel + // random access is available for MPHF construction. + build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + + let unitigs = UnitigFileReader::open(&unitig_path)?; let n = unitigs.n_kmers(); if n == 0 { @@ -80,7 +136,7 @@ impl MphfLayer { return Ok(0); } - // Pass 1 — build MPHF + // Pass 1 — build MPHF (parallel, uses random access via .idx) let keys = (0..unitigs.len()) .into_par_iter() .flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw())); @@ -90,7 +146,7 @@ impl MphfLayer { .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; // Pass 2 — fill evidence + mode-specific data via callback - let unitigs2 = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; + let unitigs2 = UnitigFileReader::open(&unitig_path)?; let mut ev = EvidenceWriter::new(n); let mut seen = vec![0u8; (n + 7) / 8]; diff --git a/src/obilayeredmap/src/tests/layer.rs b/src/obilayeredmap/src/tests/layer.rs index 7b0ec32..47f6e82 100644 --- a/src/obilayeredmap/src/tests/layer.rs +++ b/src/obilayeredmap/src/tests/layer.rs @@ -11,16 +11,10 @@ fn write_unitigs(dir: &Path, seqs: &[&[u8]]) { w.close().unwrap(); } -fn all_canonical_kmers(dir: &Path, k: usize) -> Vec { - let r = UnitigFileReader::open(&dir.join(UNITIGS_FILE)).unwrap(); - let mut out = Vec::new(); - for ci in 0..r.len() { - let n = r.seql(ci) - k + 1; - for rank in 0..n { - out.push(Kmer::from_raw(r.raw_kmer(ci, rank)).canonical()); - } - } - out +fn all_canonical_kmers(dir: &Path) -> Vec { + UnitigFileReader::open_sequential(&dir.join(UNITIGS_FILE)).unwrap() + .iter_canonical_kmers() + .collect() } #[test] @@ -28,7 +22,7 @@ fn build_and_query_all_kmers_found() { set_k(4); let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); - let kmers = all_canonical_kmers(dir.path(), 4); + let kmers = all_canonical_kmers(dir.path()); Layer::<()>::build(dir.path()).unwrap(); let layer = Layer::<()>::open(dir.path()).unwrap(); for kmer in kmers { @@ -41,7 +35,7 @@ fn counts_are_stored_and_retrieved() { set_k(4); let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); - let kmers = all_canonical_kmers(dir.path(), 4); + let kmers = all_canonical_kmers(dir.path()); let count_map: HashMap = kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect(); Layer::::build( diff --git a/src/obiskio/src/lib.rs b/src/obiskio/src/lib.rs index 8f32ca2..1d960cc 100644 --- a/src/obiskio/src/lib.rs +++ b/src/obiskio/src/lib.rs @@ -8,7 +8,7 @@ pub use error::{SKError, SKResult}; pub use meta::SKFileMeta; pub use pool::{SKFilePool, SKFileWriter, SharedPool, create_token, create_token_with}; pub use reader::{SKFileIter, SKFileReader}; -pub use unitig_index::{UnitigFileReader, UnitigFileWriter}; +pub use unitig_index::{UnitigFileReader, UnitigFileWriter, build_unitig_idx, DEFAULT_BLOCK_BITS}; use std::path::{Path, PathBuf}; diff --git a/src/obiskio/src/tests/unitig_index.rs b/src/obiskio/src/tests/unitig_index.rs index 89e9a4f..5553357 100644 --- a/src/obiskio/src/tests/unitig_index.rs +++ b/src/obiskio/src/tests/unitig_index.rs @@ -18,6 +18,7 @@ fn write_read(seqs: &[&[u8]]) -> (tempfile::TempDir, UnitigFileReader) { w.write(&make_unitig(s)).unwrap(); } w.close().unwrap(); + super::build_unitig_idx(&path, super::DEFAULT_BLOCK_BITS).unwrap(); let r = UnitigFileReader::open(&path).unwrap(); (dir, r) } @@ -31,6 +32,7 @@ fn roundtrip_empty_index() { let path = dir.path().join("unitigs.bin"); let w = UnitigFileWriter::create(&path).unwrap(); w.close().unwrap(); + super::build_unitig_idx(&path, super::DEFAULT_BLOCK_BITS).unwrap(); let r = UnitigFileReader::open(&path).unwrap(); assert_eq!(r.len(), 0); } @@ -145,6 +147,7 @@ fn iter_kmers_empty_file() { let dir = tempdir().unwrap(); let path = dir.path().join("unitigs.bin"); UnitigFileWriter::create(&path).unwrap().close().unwrap(); + super::build_unitig_idx(&path, super::DEFAULT_BLOCK_BITS).unwrap(); let r = UnitigFileReader::open(&path).unwrap(); assert_eq!(r.iter_kmers().count(), 0); } @@ -280,6 +283,7 @@ fn write_read_bb(seqs: &[&[u8]], block_bits: u8) -> (tempfile::TempDir, UnitigFi w.write(&make_unitig(s)).unwrap(); } w.close().unwrap(); + super::build_unitig_idx(&path, block_bits).unwrap(); let r = UnitigFileReader::open(&path).unwrap(); (dir, r) } @@ -293,6 +297,7 @@ fn block_bits_stored_and_read_back() { let w = UnitigFileWriter::create_with_block_bits(&path, bb).unwrap(); assert_eq!(w.block_bits(), bb); w.close().unwrap(); + super::build_unitig_idx(&path, bb).unwrap(); let r = UnitigFileReader::open(&path).unwrap(); assert_eq!(r.block_bits(), bb, "block_bits={bb} not preserved"); } diff --git a/src/obiskio/src/unitig_index.rs b/src/obiskio/src/unitig_index.rs index 074dc32..75535ae 100644 --- a/src/obiskio/src/unitig_index.rs +++ b/src/obiskio/src/unitig_index.rs @@ -39,7 +39,6 @@ fn idx_path(path: &Path) -> PathBuf { /// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split /// into overlapping chunks (k−1 nucleotide overlap) so no k-mer is lost. pub struct UnitigFileWriter { - path: PathBuf, file: BufWriter, block_offsets: Vec, chunk_count: usize, @@ -64,7 +63,6 @@ impl UnitigFileWriter { assert!(block_bits <= 31, "block_bits must be ≤ 31"); let file = File::create(path).map_err(SKError::Io)?; Ok(Self { - path: path.to_owned(), file: BufWriter::new(file), block_offsets: Vec::new(), chunk_count: 0, @@ -122,19 +120,14 @@ impl UnitigFileWriter { Ok(()) } + /// Flush and close the binary sequence file. + /// + /// The companion `.idx` file is **not** written here; call + /// [`build_unitig_idx`] separately when exact evidence is needed. pub fn close(mut self) -> SKResult<()> { self.file.flush().map_err(SKError::Io)?; drop(self.file); - - self.block_offsets.push(self.next_offset); - - write_idx( - &idx_path(&self.path), - self.chunk_count as u32, - self.n_kmers as u64, - self.block_bits, - &self.block_offsets, - ) + Ok(()) } pub fn len(&self) -> usize { self.chunk_count } @@ -160,6 +153,48 @@ fn write_idx( w.flush().map_err(SKError::Io) } +/// Scan an existing `unitigs.bin` file and write its companion `.idx`. +/// +/// Called by the exact-evidence construction route after the sequence file is +/// closed. `block_bits` controls index granularity (1 << block_bits chunks per +/// offset entry); use [`DEFAULT_BLOCK_BITS`] for the default. +pub fn build_unitig_idx(unitigs_path: &Path, block_bits: u8) -> SKResult<()> { + assert!(block_bits <= 31, "block_bits must be ≤ 31"); + + let file = File::open(unitigs_path).map_err(SKError::Io)?; + let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; + + let k = obikseq::params::k(); + let block_size = 1usize << block_bits; + let mask = block_size - 1; + + let mut block_offsets: Vec = Vec::new(); + let mut offset = 0usize; + let mut chunk_count = 0usize; + let mut n_kmers = 0usize; + + while offset < mmap.len() { + if chunk_count & mask == 0 { + block_offsets.push(offset as u32); + } + let seql_minus_k = mmap[offset] as usize; + let byte_len = (seql_minus_k + k + 3) / 4; + n_kmers += seql_minus_k + 1; + offset += 1 + byte_len; + chunk_count += 1; + } + + block_offsets.push(offset as u32); // sentinel + + write_idx( + &idx_path(unitigs_path), + chunk_count as u32, + n_kmers as u64, + block_bits, + &block_offsets, + ) +} + // ── Reader ──────────────────────────────────────────────────────────────────── /// Read-only random-access view of a unitig file. @@ -178,6 +213,7 @@ pub struct UnitigFileReader { } impl UnitigFileReader { + /// Open with `.idx` — enables both sequential iteration and random access. pub fn open(path: &Path) -> SKResult { let file = File::open(path).map_err(SKError::Io)?; let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; @@ -194,6 +230,37 @@ impl UnitigFileReader { }) } + /// Open without `.idx` — sequential iteration only, no random access. + /// + /// Scans the binary file once to count chunks and k-mers. Use when only + /// [`Self::iter_kmers`], [`Self::iter_canonical_kmers`], or + /// [`Self::iter_indexed_canonical_kmers`] are needed. + pub fn open_sequential(path: &Path) -> SKResult { + let file = File::open(path).map_err(SKError::Io)?; + let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; + let k = obikseq::params::k(); + + let mut offset = 0usize; + let mut n_unitigs = 0usize; + let mut n_kmers = 0usize; + while offset < mmap.len() { + let seql_minus_k = mmap[offset] as usize; + n_kmers += seql_minus_k + 1; + offset += 1 + (seql_minus_k + k + 3) / 4; + n_unitigs += 1; + } + + Ok(Self { + mmap, + block_offsets: Vec::new(), // empty → random access disabled + n_unitigs, + n_kmers, + k, + block_bits: DEFAULT_BLOCK_BITS, + mask: (1usize << DEFAULT_BLOCK_BITS) - 1, + }) + } + pub fn len(&self) -> usize { self.n_unitigs } pub fn is_empty(&self) -> bool { self.n_unitigs == 0 } pub fn n_kmers(&self) -> usize { self.n_kmers } @@ -202,6 +269,8 @@ impl UnitigFileReader { /// Byte offset of the START of record `i` (the seql byte) in the mmap. #[inline] fn chunk_start(&self, i: usize) -> usize { + assert!(!self.block_offsets.is_empty(), + "random access requires UnitigFileReader::open(); use open_sequential() for iteration only"); let block = i >> self.block_bits; let rem = i & self.mask; let mut offset = self.block_offsets[block] as usize;