diff --git a/src/obiskio/src/tests/unitig_index.rs b/src/obiskio/src/tests/unitig_index.rs index 4223249..89e9a4f 100644 --- a/src/obiskio/src/tests/unitig_index.rs +++ b/src/obiskio/src/tests/unitig_index.rs @@ -269,3 +269,94 @@ fn long_unitig_split_no_kmer_lost() { // First kmer of chunk 1 = original nucl 256..260 — a different, adjacent kmer. assert!(r.verify_canonical_kmer(1, 0, canonical_of(&seq[256..260]))); } + +// ── block_bits parametrisation ──────────────────────────────────────────────── + +fn write_read_bb(seqs: &[&[u8]], block_bits: u8) -> (tempfile::TempDir, UnitigFileReader) { + let dir = tempdir().unwrap(); + let path = dir.path().join("unitigs.bin"); + let mut w = UnitigFileWriter::create_with_block_bits(&path, block_bits).unwrap(); + for s in seqs { + w.write(&make_unitig(s)).unwrap(); + } + w.close().unwrap(); + let r = UnitigFileReader::open(&path).unwrap(); + (dir, r) +} + +#[test] +fn block_bits_stored_and_read_back() { + set_k(4); + for bb in [0u8, 1, 2, 3, 6] { + let dir = tempdir().unwrap(); + let path = dir.path().join("unitigs.bin"); + let w = UnitigFileWriter::create_with_block_bits(&path, bb).unwrap(); + assert_eq!(w.block_bits(), bb); + w.close().unwrap(); + let r = UnitigFileReader::open(&path).unwrap(); + assert_eq!(r.block_bits(), bb, "block_bits={bb} not preserved"); + } +} + +#[test] +fn block_bits_zero_exact_offsets() { + // block_bits=0 → one offset per chunk, no sequential scan in chunk_start + set_k(4); + let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG"]; + let (_dir, r) = write_read_bb(seqs, 0); + assert_eq!(r.len(), seqs.len()); + for (i, s) in seqs.iter().enumerate() { + assert_eq!(r.unitig(i), make_unitig(s), "block_bits=0: unitig {i} mismatch"); + } +} + +#[test] +fn block_bits_one_block_size_two() { + // block_bits=1 → BLOCK_SIZE=2: every other chunk gets an offset + set_k(4); + let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG", b"CCCCGT"]; + let (_dir, r) = write_read_bb(seqs, 1); + assert_eq!(r.len(), seqs.len()); + for (i, s) in seqs.iter().enumerate() { + assert_eq!(r.unitig(i), make_unitig(s), "block_bits=1: unitig {i} mismatch"); + } +} + +#[test] +fn block_bits_larger_than_chunk_count() { + // block_bits=6 (BLOCK_SIZE=64) with only 3 chunks: all in block 0 + set_k(4); + let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA"]; + let (_dir, r) = write_read_bb(seqs, 6); + assert_eq!(r.len(), seqs.len()); + for (i, s) in seqs.iter().enumerate() { + assert_eq!(r.unitig(i), make_unitig(s), "block_bits=6: unitig {i} mismatch"); + } +} + +#[test] +fn block_bits_random_access_matches_sequential() { + // Write many chunks with block_bits=2 (BLOCK_SIZE=4), verify random access + // against sequential iteration for every chunk. + set_k(4); + let seqs: Vec> = (0..20_usize) + .map(|i| (0..8_usize).map(|j| b"ACGT"[(i + j) % 4]).collect()) + .collect(); + let seq_refs: Vec<&[u8]> = seqs.iter().map(|s| s.as_slice()).collect(); + let (_dir, r) = write_read_bb(&seq_refs, 2); + assert_eq!(r.len(), seqs.len()); + let sequential: Vec = r.iter_chunks_sequential().map(|(_, u)| u).collect(); + for i in 0..seqs.len() { + assert_eq!(r.unitig(i), sequential[i], "random vs sequential mismatch at chunk {i}"); + } +} + +#[test] +fn block_bits_kmer_count_preserved() { + set_k(4); + // "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers; total = 6 + for bb in [0u8, 1, 2, 6] { + let (_dir, r) = write_read_bb(&[b"AAAACG", b"CCCCAG"], bb); + assert_eq!(r.n_kmers(), 6, "block_bits={bb}: n_kmers mismatch"); + } +} diff --git a/src/obiskio/src/unitig_index.rs b/src/obiskio/src/unitig_index.rs index 0fe4444..074dc32 100644 --- a/src/obiskio/src/unitig_index.rs +++ b/src/obiskio/src/unitig_index.rs @@ -11,18 +11,17 @@ use crate::error::{SKError, SKResult}; // ── Block index parameters ──────────────────────────────────────────────────── // -// One offset entry per BLOCK_SIZE chunks. BLOCK_SIZE must be a power of two -// so that block = i >> LOG2_BLOCK_SIZE and rem = i & (BLOCK_SIZE − 1) are -// branchless shifts/masks rather than divisions. +// BLOCK_SIZE = 1 << block_bits chunks share one offset entry in the index. +// block_bits=0 → one entry per chunk (exact offsets, no scan). +// block_bits=6 → one entry per 64 chunks (default; O(64) scan per lookup). // -// With BLOCK_SIZE = 64 and an average chunk size of ~10 bytes, a random lookup -// scans at most 63 × 10 = 630 bytes sequentially — negligible next to the MPHF -// lookup that precedes it. The index file shrinks from ~5 bytes/chunk to -// ~1/64 bytes/chunk (≈ 300× for typical workloads). +// block_bits is stored in the index file so the reader derives all parameters +// at runtime — no compile-time constant constrains the format. -const MAGIC: [u8; 4] = *b"UIX2"; -const BLOCK_SIZE: usize = 64; -const LOG2_BLOCK_SIZE: u32 = 6; // 2^6 = BLOCK_SIZE +const MAGIC: [u8; 4] = *b"UIX3"; + +/// Default block granularity used by [`UnitigFileWriter::create`]. +pub const DEFAULT_BLOCK_BITS: u8 = 6; fn idx_path(path: &Path) -> PathBuf { crate::append_path_suffix(path, ".idx") @@ -33,23 +32,36 @@ fn idx_path(path: &Path) -> PathBuf { /// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds /// a block-sampled offset index at close time. /// -/// One offset is stored every [`BLOCK_SIZE`] chunks; random access to chunk `i` -/// costs at most `BLOCK_SIZE − 1` sequential chunk scans after the block lookup. +/// One offset is stored every `1 << block_bits` chunks; random access to chunk +/// `i` costs at most `(1 << block_bits) − 1` sequential chunk scans after the +/// block lookup. /// /// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split /// into overlapping chunks (k−1 nucleotide overlap) so no k-mer is lost. pub struct UnitigFileWriter { path: PathBuf, file: BufWriter, - block_offsets: Vec, // byte offset of first record in each block + block_offsets: Vec, chunk_count: usize, - next_offset: u32, // byte offset of the START of the next record + next_offset: u32, n_kmers: usize, k: usize, + block_bits: u8, + mask: usize, // (1 << block_bits) - 1 } impl UnitigFileWriter { + /// Create a writer with the default block size (`DEFAULT_BLOCK_BITS = 6`). pub fn create(path: &Path) -> SKResult { + Self::create_with_block_bits(path, DEFAULT_BLOCK_BITS) + } + + /// Create a writer with a custom block size. + /// + /// `block_bits` must be in 0..=31. `block_bits=0` stores one offset per + /// chunk (exact, no scan); larger values trade index size for scan length. + pub fn create_with_block_bits(path: &Path, block_bits: u8) -> SKResult { + assert!(block_bits <= 31, "block_bits must be ≤ 31"); let file = File::create(path).map_err(SKError::Io)?; Ok(Self { path: path.to_owned(), @@ -59,6 +71,8 @@ impl UnitigFileWriter { next_offset: 0, n_kmers: 0, k: obikseq::params::k(), + block_bits, + mask: (1usize << block_bits) - 1, }) } @@ -95,8 +109,7 @@ impl UnitigFileWriter { debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK"); - // Record a block offset at the start of every BLOCK_SIZE-th chunk. - if self.chunk_count & (BLOCK_SIZE - 1) == 0 { + if self.chunk_count & self.mask == 0 { self.block_offsets.push(self.next_offset); } @@ -113,25 +126,32 @@ impl UnitigFileWriter { self.file.flush().map_err(SKError::Io)?; drop(self.file); - // Sentinel: byte offset past the last record (needed for end-of-file detection). self.block_offsets.push(self.next_offset); write_idx( &idx_path(&self.path), self.chunk_count as u32, self.n_kmers as u64, + self.block_bits, &self.block_offsets, ) } pub fn len(&self) -> usize { self.chunk_count } pub fn is_empty(&self) -> bool { self.chunk_count == 0 } + pub fn block_bits(&self) -> u8 { self.block_bits } } -fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -> SKResult<()> { +fn write_idx( + path: &Path, + n_unitigs: u32, + n_kmers: u64, + block_bits: u8, + block_offsets: &[u32], +) -> SKResult<()> { let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?); w.write_all(&MAGIC).map_err(SKError::Io)?; - w.write_all(&(BLOCK_SIZE as u32).to_le_bytes()).map_err(SKError::Io)?; + w.write_all(&(block_bits as u32).to_le_bytes()).map_err(SKError::Io)?; w.write_all(&n_unitigs.to_le_bytes()).map_err(SKError::Io)?; w.write_all(&n_kmers.to_le_bytes()).map_err(SKError::Io)?; for &off in block_offsets { @@ -145,39 +165,45 @@ fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) - /// Read-only random-access view of a unitig file. /// /// The sequence file is memory-mapped; the block offset table is loaded into RAM -/// on open (≈ n_chunks / BLOCK_SIZE entries, negligible memory). -/// -/// Random access to chunk `i`: O(BLOCK_SIZE) sequential mmap reads — branchless -/// shift/mask arithmetic, cache-friendly, negligible versus the MPHF lookup. -/// -/// Sequential iteration: O(n) via a running-offset cursor (no per-chunk overhead). +/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap +/// reads. Sequential iteration: O(n) via a running-offset cursor. pub struct UnitigFileReader { mmap: Mmap, block_offsets: Vec, n_unitigs: usize, n_kmers: usize, k: usize, + block_bits: u8, + mask: usize, // (1 << block_bits) - 1 } impl UnitigFileReader { pub fn open(path: &Path) -> SKResult { let file = File::open(path).map_err(SKError::Io)?; let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; - let (n_unitigs, n_kmers, block_offsets) = read_idx(&idx_path(path))?; + let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?; let k = obikseq::params::k(); - Ok(Self { mmap, block_offsets, n_unitigs, n_kmers, k }) + Ok(Self { + mmap, + block_offsets, + n_unitigs, + n_kmers, + k, + block_bits, + mask: (1usize << block_bits) - 1, + }) } pub fn len(&self) -> usize { self.n_unitigs } pub fn is_empty(&self) -> bool { self.n_unitigs == 0 } pub fn n_kmers(&self) -> usize { self.n_kmers } + pub fn block_bits(&self) -> u8 { self.block_bits } /// Byte offset of the START of record `i` (the seql byte) in the mmap. - /// O(BLOCK_SIZE) sequential scan within the block. #[inline] fn chunk_start(&self, i: usize) -> usize { - let block = i >> LOG2_BLOCK_SIZE; - let rem = i & (BLOCK_SIZE - 1); + let block = i >> self.block_bits; + let rem = i & self.mask; let mut offset = self.block_offsets[block] as usize; for _ in 0..rem { let seql_minus_k = self.mmap[offset] as usize; @@ -216,7 +242,6 @@ impl UnitigFileReader { // ── Sequential iterators (O(n) running-offset cursor) ───────────────────── - /// Iterate all chunks in file order with a running byte offset — O(n) total. fn iter_chunks_sequential(&self) -> impl Iterator + '_ { let k = self.k; let mmap = &*self.mmap; @@ -253,7 +278,7 @@ impl UnitigFileReader { } } -fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec)> { +fn read_idx(path: &Path) -> SKResult<(usize, usize, u8, Vec)> { let data = std::fs::read(path).map_err(SKError::Io)?; let mut pos = 0; @@ -261,22 +286,22 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec)> { .ok_or(SKError::Truncated { context: "unitig index: magic" })?; if magic_bytes != &MAGIC { return Err(SKError::BadMagic { - expected: "UIX2", + expected: "UIX3", got: magic_bytes.try_into().unwrap(), }); } pos += 4; - // block_size stored for forward-compatibility verification - let bs_bytes = data.get(pos..pos + 4) - .ok_or(SKError::Truncated { context: "unitig index: block_size" })?; - let stored_bs = u32::from_le_bytes(bs_bytes.try_into().unwrap()) as usize; - if stored_bs != BLOCK_SIZE { + let bb_bytes = data.get(pos..pos + 4) + .ok_or(SKError::Truncated { context: "unitig index: block_bits" })?; + let block_bits_u32 = u32::from_le_bytes(bb_bytes.try_into().unwrap()); + if block_bits_u32 > 31 { return Err(SKError::InvalidData { context: "unitig index", - detail: format!("block_size mismatch: file={stored_bs} code={BLOCK_SIZE}"), + detail: format!("block_bits out of range: {block_bits_u32}"), }); } + let block_bits = block_bits_u32 as u8; pos += 4; let n_bytes = data.get(pos..pos + 4) @@ -289,8 +314,9 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec)> { let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize; pos += 8; - let n_blocks = (n_unitigs + BLOCK_SIZE - 1) >> LOG2_BLOCK_SIZE; - let n_offsets = n_blocks + 1; // +1 for sentinel + let block_size = 1usize << block_bits; + let n_blocks = (n_unitigs + block_size - 1) >> block_bits; + let n_offsets = n_blocks + 1; let mut block_offsets = Vec::with_capacity(n_offsets); for _ in 0..n_offsets { let off_bytes = data.get(pos..pos + 4) @@ -299,7 +325,7 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec)> { pos += 4; } - Ok((n_unitigs, n_kmers, block_offsets)) + Ok((n_unitigs, n_kmers, block_bits, block_offsets)) } // ── Kmer utilities ────────────────────────────────────────────────────────────