feat: make index granularity configurable via block_bits

Replaces the hardcoded BLOCK_SIZE constant with a configurable block_bits parameter (BLOCK_SIZE = 1 << block_bits), enabling variable index granularity to balance index size against sequential scan cost. Both the reader and the writer now store block_bits and a precomputed mask for branchless offset arithmetic, and the index file format is upgraded to UIX3 to persist the configuration. Because the magic changes, index files written in the previous UIX2 format are rejected with a BadMagic error and must be regenerated. Comprehensive unit tests verify serialization, chunk offset indexing, random access consistency, and kmer count accuracy across various block sizes.
This commit is contained in:
Eric Coissac
2026-05-23 08:06:32 +02:00
parent 4a5ab0b8c2
commit 8478072b78
2 changed files with 159 additions and 42 deletions
+91
View File
@@ -269,3 +269,94 @@ fn long_unitig_split_no_kmer_lost() {
// First kmer of chunk 1 = original nucl 256..260 — a different, adjacent kmer. // First kmer of chunk 1 = original nucl 256..260 — a different, adjacent kmer.
assert!(r.verify_canonical_kmer(1, 0, canonical_of(&seq[256..260]))); assert!(r.verify_canonical_kmer(1, 0, canonical_of(&seq[256..260])));
} }
// ── block_bits parametrisation ────────────────────────────────────────────────
/// Round-trip helper for the `block_bits` tests.
///
/// Writes every sequence of `seqs` (converted with `make_unitig`) into a fresh
/// `unitigs.bin` inside a new temporary directory, using a writer created with
/// the requested `block_bits`, closes the writer, then reopens the file with
/// [`UnitigFileReader::open`].
///
/// The temporary directory handle is returned together with the reader so the
/// caller decides how long the files stay on disk. Any failure along the way
/// panics, which is the desired behaviour inside a test.
fn write_read_bb(seqs: &[&[u8]], block_bits: u8) -> (tempfile::TempDir, UnitigFileReader) {
    let tmp = tempdir().unwrap();
    let bin_path = tmp.path().join("unitigs.bin");

    let mut writer = UnitigFileWriter::create_with_block_bits(&bin_path, block_bits).unwrap();
    for seq in seqs.iter() {
        writer.write(&make_unitig(seq)).unwrap();
    }
    // The index is only produced by `close`, so it must run before reopening.
    writer.close().unwrap();

    let reader = UnitigFileReader::open(&bin_path).unwrap();
    (tmp, reader)
}
/// The `block_bits` value given to the writer must be reported by the writer
/// itself and by a reader opened on the resulting (empty) unitig file.
#[test]
fn block_bits_stored_and_read_back() {
    const CASES: [u8; 5] = [0, 1, 2, 3, 6];

    set_k(4);
    for bb in CASES {
        let tmp = tempdir().unwrap();
        let bin_path = tmp.path().join("unitigs.bin");

        // Writer side: the accessor echoes the constructor argument.
        let writer = UnitigFileWriter::create_with_block_bits(&bin_path, bb).unwrap();
        assert_eq!(writer.block_bits(), bb);
        writer.close().unwrap();

        // Reader side: the value comes back from the index written by `close`.
        let reader = UnitigFileReader::open(&bin_path).unwrap();
        assert_eq!(reader.block_bits(), bb, "block_bits={bb} not preserved");
    }
}
/// With `block_bits = 0` every chunk has its own offset entry, so random
/// access needs no sequential scan in `chunk_start`; each unitig must still
/// read back identical to what was written.
#[test]
fn block_bits_zero_exact_offsets() {
    set_k(4);

    let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG"];
    let (_dir, reader) = write_read_bb(seqs, 0);

    assert_eq!(reader.len(), seqs.len());
    seqs.iter().enumerate().for_each(|(i, s)| {
        assert_eq!(reader.unitig(i), make_unitig(s), "block_bits=0: unitig {i} mismatch");
    });
}
/// With `block_bits = 1` the block size is 2, so only every other chunk gets
/// an offset entry. Five chunks leave a last block that is only half full.
#[test]
fn block_bits_one_block_size_two() {
    set_k(4);

    let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG", b"CCCCGT"];
    let (_dir, reader) = write_read_bb(seqs, 1);

    assert_eq!(reader.len(), seqs.len());
    for (i, s) in (0..).zip(seqs.iter()) {
        assert_eq!(reader.unitig(i), make_unitig(s), "block_bits=1: unitig {i} mismatch");
    }
}
/// With `block_bits = 6` the block size is 64; three chunks therefore all
/// live in block 0 and chunks 1 and 2 are only reachable by scanning forward
/// from the block's single offset.
#[test]
fn block_bits_larger_than_chunk_count() {
    set_k(4);

    let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA"];
    let (_dir, reader) = write_read_bb(seqs, 6);

    assert_eq!(reader.len(), seqs.len());
    for i in 0..seqs.len() {
        let s = &seqs[i];
        assert_eq!(reader.unitig(i), make_unitig(s), "block_bits=6: unitig {i} mismatch");
    }
}
/// Random access through the block index must return, for every chunk, the
/// same unitig as the sequential iterator. Uses `block_bits = 2` (block size
/// 4) and 20 chunks so that several blocks are exercised.
#[test]
fn block_bits_random_access_matches_sequential() {
    set_k(4);

    // Sequence `i` is 8 nucleotides of the cyclic "ACGT" pattern, starting at
    // position `i % 4` of that pattern.
    let mut seqs: Vec<Vec<u8>> = Vec::new();
    for i in 0..20_usize {
        let seq: Vec<u8> = (0..8_usize).map(|j| b"ACGT"[(i + j) % 4]).collect();
        seqs.push(seq);
    }
    let seq_refs: Vec<&[u8]> = seqs.iter().map(Vec::as_slice).collect();

    let (_dir, reader) = write_read_bb(&seq_refs, 2);
    assert_eq!(reader.len(), seqs.len());

    // Reference result: all chunks in file order, positions discarded.
    let mut sequential: Vec<Unitig> = Vec::new();
    for (_, unitig) in reader.iter_chunks_sequential() {
        sequential.push(unitig);
    }

    for i in 0..seqs.len() {
        assert_eq!(reader.unitig(i), sequential[i], "random vs sequential mismatch at chunk {i}");
    }
}
/// The k-mer total reported by the reader must not depend on the index
/// granularity chosen at write time.
#[test]
fn block_bits_kmer_count_preserved() {
    set_k(4);

    // With k = 4 each 6-nucleotide sequence holds 3 kmers, so the total is 6.
    let seqs: [&[u8]; 2] = [b"AAAACG", b"CCCCAG"];
    for bb in [0u8, 1, 2, 6] {
        let (_dir, reader) = write_read_bb(&seqs, bb);
        let total = reader.n_kmers();
        assert_eq!(total, 6, "block_bits={bb}: n_kmers mismatch");
    }
}
+68 -42
View File
@@ -11,18 +11,17 @@ use crate::error::{SKError, SKResult};
// ── Block index parameters ──────────────────────────────────────────────────── // ── Block index parameters ────────────────────────────────────────────────────
// //
// One offset entry per BLOCK_SIZE chunks. BLOCK_SIZE must be a power of two // BLOCK_SIZE = 1 << block_bits chunks share one offset entry in the index.
// so that block = i >> LOG2_BLOCK_SIZE and rem = i & (BLOCK_SIZE - 1) are // block_bits=0 → one entry per chunk (exact offsets, no scan).
// branchless shifts/masks rather than divisions. // block_bits=6 → one entry per 64 chunks (default; O(64) scan per lookup).
// //
// With BLOCK_SIZE = 64 and an average chunk size of ~10 bytes, a random lookup // block_bits is stored in the index file so the reader derives all parameters
// scans at most 63 × 10 = 630 bytes sequentially — negligible next to the MPHF // at runtime — no compile-time constant constrains the format.
// lookup that precedes it. The index file shrinks from ~5 bytes/chunk to
// ~1/64 bytes/chunk (≈ 300× for typical workloads).
const MAGIC: [u8; 4] = *b"UIX2"; const MAGIC: [u8; 4] = *b"UIX3";
const BLOCK_SIZE: usize = 64;
const LOG2_BLOCK_SIZE: u32 = 6; // 2^6 = BLOCK_SIZE /// Default block granularity used by [`UnitigFileWriter::create`].
pub const DEFAULT_BLOCK_BITS: u8 = 6;
fn idx_path(path: &Path) -> PathBuf { fn idx_path(path: &Path) -> PathBuf {
crate::append_path_suffix(path, ".idx") crate::append_path_suffix(path, ".idx")
@@ -33,23 +32,36 @@ fn idx_path(path: &Path) -> PathBuf {
/// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds /// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds
/// a block-sampled offset index at close time. /// a block-sampled offset index at close time.
/// ///
/// One offset is stored every [`BLOCK_SIZE`] chunks; random access to chunk `i` /// One offset is stored every `1 << block_bits` chunks; random access to chunk
/// costs at most `BLOCK_SIZE - 1` sequential chunk scans after the block lookup. /// `i` costs at most `(1 << block_bits) - 1` sequential chunk scans after the
/// block lookup.
/// ///
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split /// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split
/// into overlapping chunks (k-1 nucleotide overlap) so no k-mer is lost. /// into overlapping chunks (k-1 nucleotide overlap) so no k-mer is lost.
pub struct UnitigFileWriter { pub struct UnitigFileWriter {
path: PathBuf, path: PathBuf,
file: BufWriter<File>, file: BufWriter<File>,
block_offsets: Vec<u32>, // byte offset of first record in each block block_offsets: Vec<u32>,
chunk_count: usize, chunk_count: usize,
next_offset: u32, // byte offset of the START of the next record next_offset: u32,
n_kmers: usize, n_kmers: usize,
k: usize, k: usize,
block_bits: u8,
mask: usize, // (1 << block_bits) - 1
} }
impl UnitigFileWriter { impl UnitigFileWriter {
/// Create a writer with the default block size (`DEFAULT_BLOCK_BITS = 6`).
pub fn create(path: &Path) -> SKResult<Self> { pub fn create(path: &Path) -> SKResult<Self> {
Self::create_with_block_bits(path, DEFAULT_BLOCK_BITS)
}
/// Create a writer with a custom block size.
///
/// `block_bits` must be in 0..=31. `block_bits=0` stores one offset per
/// chunk (exact, no scan); larger values trade index size for scan length.
pub fn create_with_block_bits(path: &Path, block_bits: u8) -> SKResult<Self> {
assert!(block_bits <= 31, "block_bits must be ≤ 31");
let file = File::create(path).map_err(SKError::Io)?; let file = File::create(path).map_err(SKError::Io)?;
Ok(Self { Ok(Self {
path: path.to_owned(), path: path.to_owned(),
@@ -59,6 +71,8 @@ impl UnitigFileWriter {
next_offset: 0, next_offset: 0,
n_kmers: 0, n_kmers: 0,
k: obikseq::params::k(), k: obikseq::params::k(),
block_bits,
mask: (1usize << block_bits) - 1,
}) })
} }
@@ -95,8 +109,7 @@ impl UnitigFileWriter {
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK"); debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
// Record a block offset at the start of every BLOCK_SIZE-th chunk. if self.chunk_count & self.mask == 0 {
if self.chunk_count & (BLOCK_SIZE - 1) == 0 {
self.block_offsets.push(self.next_offset); self.block_offsets.push(self.next_offset);
} }
@@ -113,25 +126,32 @@ impl UnitigFileWriter {
self.file.flush().map_err(SKError::Io)?; self.file.flush().map_err(SKError::Io)?;
drop(self.file); drop(self.file);
// Sentinel: byte offset past the last record (needed for end-of-file detection).
self.block_offsets.push(self.next_offset); self.block_offsets.push(self.next_offset);
write_idx( write_idx(
&idx_path(&self.path), &idx_path(&self.path),
self.chunk_count as u32, self.chunk_count as u32,
self.n_kmers as u64, self.n_kmers as u64,
self.block_bits,
&self.block_offsets, &self.block_offsets,
) )
} }
pub fn len(&self) -> usize { self.chunk_count } pub fn len(&self) -> usize { self.chunk_count }
pub fn is_empty(&self) -> bool { self.chunk_count == 0 } pub fn is_empty(&self) -> bool { self.chunk_count == 0 }
pub fn block_bits(&self) -> u8 { self.block_bits }
} }
fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -> SKResult<()> { fn write_idx(
path: &Path,
n_unitigs: u32,
n_kmers: u64,
block_bits: u8,
block_offsets: &[u32],
) -> SKResult<()> {
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?); let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
w.write_all(&MAGIC).map_err(SKError::Io)?; w.write_all(&MAGIC).map_err(SKError::Io)?;
w.write_all(&(BLOCK_SIZE as u32).to_le_bytes()).map_err(SKError::Io)?; w.write_all(&(block_bits as u32).to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&n_unitigs.to_le_bytes()).map_err(SKError::Io)?; w.write_all(&n_unitigs.to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&n_kmers.to_le_bytes()).map_err(SKError::Io)?; w.write_all(&n_kmers.to_le_bytes()).map_err(SKError::Io)?;
for &off in block_offsets { for &off in block_offsets {
@@ -145,39 +165,45 @@ fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -
/// Read-only random-access view of a unitig file. /// Read-only random-access view of a unitig file.
/// ///
/// The sequence file is memory-mapped; the block offset table is loaded into RAM /// The sequence file is memory-mapped; the block offset table is loaded into RAM
/// on open (≈ n_chunks / BLOCK_SIZE entries, negligible memory). /// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap
/// /// reads. Sequential iteration: O(n) via a running-offset cursor.
/// Random access to chunk `i`: O(BLOCK_SIZE) sequential mmap reads — branchless
/// shift/mask arithmetic, cache-friendly, negligible versus the MPHF lookup.
///
/// Sequential iteration: O(n) via a running-offset cursor (no per-chunk overhead).
pub struct UnitigFileReader { pub struct UnitigFileReader {
mmap: Mmap, mmap: Mmap,
block_offsets: Vec<u32>, block_offsets: Vec<u32>,
n_unitigs: usize, n_unitigs: usize,
n_kmers: usize, n_kmers: usize,
k: usize, k: usize,
block_bits: u8,
mask: usize, // (1 << block_bits) - 1
} }
impl UnitigFileReader { impl UnitigFileReader {
pub fn open(path: &Path) -> SKResult<Self> { pub fn open(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?; let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let (n_unitigs, n_kmers, block_offsets) = read_idx(&idx_path(path))?; let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?;
let k = obikseq::params::k(); let k = obikseq::params::k();
Ok(Self { mmap, block_offsets, n_unitigs, n_kmers, k }) Ok(Self {
mmap,
block_offsets,
n_unitigs,
n_kmers,
k,
block_bits,
mask: (1usize << block_bits) - 1,
})
} }
pub fn len(&self) -> usize { self.n_unitigs } pub fn len(&self) -> usize { self.n_unitigs }
pub fn is_empty(&self) -> bool { self.n_unitigs == 0 } pub fn is_empty(&self) -> bool { self.n_unitigs == 0 }
pub fn n_kmers(&self) -> usize { self.n_kmers } pub fn n_kmers(&self) -> usize { self.n_kmers }
pub fn block_bits(&self) -> u8 { self.block_bits }
/// Byte offset of the START of record `i` (the seql byte) in the mmap. /// Byte offset of the START of record `i` (the seql byte) in the mmap.
/// O(BLOCK_SIZE) sequential scan within the block.
#[inline] #[inline]
fn chunk_start(&self, i: usize) -> usize { fn chunk_start(&self, i: usize) -> usize {
let block = i >> LOG2_BLOCK_SIZE; let block = i >> self.block_bits;
let rem = i & (BLOCK_SIZE - 1); let rem = i & self.mask;
let mut offset = self.block_offsets[block] as usize; let mut offset = self.block_offsets[block] as usize;
for _ in 0..rem { for _ in 0..rem {
let seql_minus_k = self.mmap[offset] as usize; let seql_minus_k = self.mmap[offset] as usize;
@@ -216,7 +242,6 @@ impl UnitigFileReader {
// ── Sequential iterators (O(n) running-offset cursor) ───────────────────── // ── Sequential iterators (O(n) running-offset cursor) ─────────────────────
/// Iterate all chunks in file order with a running byte offset — O(n) total.
fn iter_chunks_sequential(&self) -> impl Iterator<Item = (usize, Unitig)> + '_ { fn iter_chunks_sequential(&self) -> impl Iterator<Item = (usize, Unitig)> + '_ {
let k = self.k; let k = self.k;
let mmap = &*self.mmap; let mmap = &*self.mmap;
@@ -253,7 +278,7 @@ impl UnitigFileReader {
} }
} }
fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> { fn read_idx(path: &Path) -> SKResult<(usize, usize, u8, Vec<u32>)> {
let data = std::fs::read(path).map_err(SKError::Io)?; let data = std::fs::read(path).map_err(SKError::Io)?;
let mut pos = 0; let mut pos = 0;
@@ -261,22 +286,22 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
.ok_or(SKError::Truncated { context: "unitig index: magic" })?; .ok_or(SKError::Truncated { context: "unitig index: magic" })?;
if magic_bytes != &MAGIC { if magic_bytes != &MAGIC {
return Err(SKError::BadMagic { return Err(SKError::BadMagic {
expected: "UIX2", expected: "UIX3",
got: magic_bytes.try_into().unwrap(), got: magic_bytes.try_into().unwrap(),
}); });
} }
pos += 4; pos += 4;
// block_size stored for forward-compatibility verification let bb_bytes = data.get(pos..pos + 4)
let bs_bytes = data.get(pos..pos + 4) .ok_or(SKError::Truncated { context: "unitig index: block_bits" })?;
.ok_or(SKError::Truncated { context: "unitig index: block_size" })?; let block_bits_u32 = u32::from_le_bytes(bb_bytes.try_into().unwrap());
let stored_bs = u32::from_le_bytes(bs_bytes.try_into().unwrap()) as usize; if block_bits_u32 > 31 {
if stored_bs != BLOCK_SIZE {
return Err(SKError::InvalidData { return Err(SKError::InvalidData {
context: "unitig index", context: "unitig index",
detail: format!("block_size mismatch: file={stored_bs} code={BLOCK_SIZE}"), detail: format!("block_bits out of range: {block_bits_u32}"),
}); });
} }
let block_bits = block_bits_u32 as u8;
pos += 4; pos += 4;
let n_bytes = data.get(pos..pos + 4) let n_bytes = data.get(pos..pos + 4)
@@ -289,8 +314,9 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize; let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
pos += 8; pos += 8;
let n_blocks = (n_unitigs + BLOCK_SIZE - 1) >> LOG2_BLOCK_SIZE; let block_size = 1usize << block_bits;
let n_offsets = n_blocks + 1; // +1 for sentinel let n_blocks = (n_unitigs + block_size - 1) >> block_bits;
let n_offsets = n_blocks + 1;
let mut block_offsets = Vec::with_capacity(n_offsets); let mut block_offsets = Vec::with_capacity(n_offsets);
for _ in 0..n_offsets { for _ in 0..n_offsets {
let off_bytes = data.get(pos..pos + 4) let off_bytes = data.get(pos..pos + 4)
@@ -299,7 +325,7 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
pos += 4; pos += 4;
} }
Ok((n_unitigs, n_kmers, block_offsets)) Ok((n_unitigs, n_kmers, block_bits, block_offsets))
} }
// ── Kmer utilities ──────────────────────────────────────────────────────────── // ── Kmer utilities ────────────────────────────────────────────────────────────