feat: make index granularity configurable via block_bits
Replaces the hardcoded BLOCK_SIZE constant with a configurable block_bits parameter, enabling variable index granularity to balance index size and sequential scan cost. Both the reader and writer now store block_bits and a precomputed mask for branchless offset arithmetic, while the index file format is upgraded to UIX3 to persist the configuration. Comprehensive unit tests verify serialization, chunk offset indexing, random access consistency, and kmer count accuracy across various block sizes.
This commit is contained in:
@@ -11,18 +11,17 @@ use crate::error::{SKError, SKResult};
|
||||
|
||||
// ── Block index parameters ────────────────────────────────────────────────────
|
||||
//
|
||||
// One offset entry per BLOCK_SIZE chunks. BLOCK_SIZE must be a power of two
|
||||
// so that block = i >> LOG2_BLOCK_SIZE and rem = i & (BLOCK_SIZE − 1) are
|
||||
// branchless shifts/masks rather than divisions.
|
||||
// Each run of `1 << block_bits` consecutive chunks shares one offset entry in the index.
|
||||
// block_bits=0 → one entry per chunk (exact offsets, no scan).
|
||||
// block_bits=6 → one entry per 64 chunks (default; at most 63 chunks scanned per lookup).
|
||||
//
|
||||
// With BLOCK_SIZE = 64 and an average chunk size of ~10 bytes, a random lookup
|
||||
// scans at most 63 × 10 = 630 bytes sequentially — negligible next to the MPHF
|
||||
// lookup that precedes it. The index file shrinks from ~5 bytes/chunk to
|
||||
// ~1/64 bytes/chunk (≈ 300× for typical workloads).
|
||||
// block_bits is stored in the index file so the reader derives all parameters
|
||||
// at runtime — no compile-time constant constrains the format.
|
||||
|
||||
const MAGIC: [u8; 4] = *b"UIX2";
|
||||
const BLOCK_SIZE: usize = 64;
|
||||
const LOG2_BLOCK_SIZE: u32 = 6; // 2^6 = BLOCK_SIZE
|
||||
const MAGIC: [u8; 4] = *b"UIX3";
|
||||
|
||||
/// Default block granularity used by [`UnitigFileWriter::create`].
|
||||
pub const DEFAULT_BLOCK_BITS: u8 = 6;
|
||||
|
||||
fn idx_path(path: &Path) -> PathBuf {
|
||||
crate::append_path_suffix(path, ".idx")
|
||||
@@ -33,23 +32,36 @@ fn idx_path(path: &Path) -> PathBuf {
|
||||
/// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds
|
||||
/// a block-sampled offset index at close time.
|
||||
///
|
||||
/// One offset is stored every [`BLOCK_SIZE`] chunks; random access to chunk `i`
|
||||
/// costs at most `BLOCK_SIZE − 1` sequential chunk scans after the block lookup.
|
||||
/// One offset is stored every `1 << block_bits` chunks; random access to chunk
|
||||
/// `i` costs at most `(1 << block_bits) − 1` sequential chunk scans after the
|
||||
/// block lookup.
|
||||
///
|
||||
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split
|
||||
/// into overlapping chunks (k−1 nucleotide overlap) so no k-mer is lost.
|
||||
pub struct UnitigFileWriter {
|
||||
path: PathBuf,
|
||||
file: BufWriter<File>,
|
||||
block_offsets: Vec<u32>, // byte offset of first record in each block
|
||||
block_offsets: Vec<u32>,
|
||||
chunk_count: usize,
|
||||
next_offset: u32, // byte offset of the START of the next record
|
||||
next_offset: u32,
|
||||
n_kmers: usize,
|
||||
k: usize,
|
||||
block_bits: u8,
|
||||
mask: usize, // (1 << block_bits) - 1
|
||||
}
|
||||
|
||||
impl UnitigFileWriter {
|
||||
/// Create a writer with the default block size (`DEFAULT_BLOCK_BITS = 6`).
|
||||
pub fn create(path: &Path) -> SKResult<Self> {
|
||||
Self::create_with_block_bits(path, DEFAULT_BLOCK_BITS)
|
||||
}
|
||||
|
||||
/// Create a writer with a custom block size.
|
||||
///
|
||||
/// `block_bits` must be in 0..=31 (larger values trigger a panic). `block_bits=0` stores one offset per
|
||||
/// chunk (exact, no scan); larger values trade index size for scan length.
|
||||
pub fn create_with_block_bits(path: &Path, block_bits: u8) -> SKResult<Self> {
|
||||
assert!(block_bits <= 31, "block_bits must be ≤ 31");
|
||||
let file = File::create(path).map_err(SKError::Io)?;
|
||||
Ok(Self {
|
||||
path: path.to_owned(),
|
||||
@@ -59,6 +71,8 @@ impl UnitigFileWriter {
|
||||
next_offset: 0,
|
||||
n_kmers: 0,
|
||||
k: obikseq::params::k(),
|
||||
block_bits,
|
||||
mask: (1usize << block_bits) - 1,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -95,8 +109,7 @@ impl UnitigFileWriter {
|
||||
|
||||
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
|
||||
|
||||
// Record a block offset at the start of every BLOCK_SIZE-th chunk.
|
||||
if self.chunk_count & (BLOCK_SIZE - 1) == 0 {
|
||||
if self.chunk_count & self.mask == 0 {
|
||||
self.block_offsets.push(self.next_offset);
|
||||
}
|
||||
|
||||
@@ -113,25 +126,32 @@ impl UnitigFileWriter {
|
||||
self.file.flush().map_err(SKError::Io)?;
|
||||
drop(self.file);
|
||||
|
||||
// Sentinel: byte offset past the last record (needed for end-of-file detection).
|
||||
self.block_offsets.push(self.next_offset);
|
||||
|
||||
write_idx(
|
||||
&idx_path(&self.path),
|
||||
self.chunk_count as u32,
|
||||
self.n_kmers as u64,
|
||||
self.block_bits,
|
||||
&self.block_offsets,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.chunk_count }
|
||||
pub fn is_empty(&self) -> bool { self.chunk_count == 0 }
|
||||
pub fn block_bits(&self) -> u8 { self.block_bits }
|
||||
}
|
||||
|
||||
fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -> SKResult<()> {
|
||||
fn write_idx(
|
||||
path: &Path,
|
||||
n_unitigs: u32,
|
||||
n_kmers: u64,
|
||||
block_bits: u8,
|
||||
block_offsets: &[u32],
|
||||
) -> SKResult<()> {
|
||||
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
|
||||
w.write_all(&MAGIC).map_err(SKError::Io)?;
|
||||
w.write_all(&(BLOCK_SIZE as u32).to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(&(block_bits as u32).to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(&n_unitigs.to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(&n_kmers.to_le_bytes()).map_err(SKError::Io)?;
|
||||
for &off in block_offsets {
|
||||
@@ -145,39 +165,45 @@ fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -
|
||||
/// Read-only random-access view of a unitig file.
|
||||
///
|
||||
/// The sequence file is memory-mapped; the block offset table is loaded into RAM
|
||||
/// on open (≈ n_chunks / BLOCK_SIZE entries, negligible memory).
|
||||
///
|
||||
/// Random access to chunk `i`: O(BLOCK_SIZE) sequential mmap reads — branchless
|
||||
/// shift/mask arithmetic, cache-friendly, negligible versus the MPHF lookup.
|
||||
///
|
||||
/// Sequential iteration: O(n) via a running-offset cursor (no per-chunk overhead).
|
||||
/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap
|
||||
/// reads. Sequential iteration: O(n) via a running-offset cursor.
|
||||
pub struct UnitigFileReader {
|
||||
mmap: Mmap,
|
||||
block_offsets: Vec<u32>,
|
||||
n_unitigs: usize,
|
||||
n_kmers: usize,
|
||||
k: usize,
|
||||
block_bits: u8,
|
||||
mask: usize, // (1 << block_bits) - 1
|
||||
}
|
||||
|
||||
impl UnitigFileReader {
|
||||
pub fn open(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
let (n_unitigs, n_kmers, block_offsets) = read_idx(&idx_path(path))?;
|
||||
let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?;
|
||||
let k = obikseq::params::k();
|
||||
Ok(Self { mmap, block_offsets, n_unitigs, n_kmers, k })
|
||||
Ok(Self {
|
||||
mmap,
|
||||
block_offsets,
|
||||
n_unitigs,
|
||||
n_kmers,
|
||||
k,
|
||||
block_bits,
|
||||
mask: (1usize << block_bits) - 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.n_unitigs }
|
||||
pub fn is_empty(&self) -> bool { self.n_unitigs == 0 }
|
||||
pub fn n_kmers(&self) -> usize { self.n_kmers }
|
||||
pub fn block_bits(&self) -> u8 { self.block_bits }
|
||||
|
||||
/// Byte offset of the START of record `i` (the seql byte) in the mmap.
|
||||
/// Scans at most `(1 << block_bits) - 1` records sequentially within the block.
|
||||
#[inline]
|
||||
fn chunk_start(&self, i: usize) -> usize {
|
||||
let block = i >> LOG2_BLOCK_SIZE;
|
||||
let rem = i & (BLOCK_SIZE - 1);
|
||||
let block = i >> self.block_bits;
|
||||
let rem = i & self.mask;
|
||||
let mut offset = self.block_offsets[block] as usize;
|
||||
for _ in 0..rem {
|
||||
let seql_minus_k = self.mmap[offset] as usize;
|
||||
@@ -216,7 +242,6 @@ impl UnitigFileReader {
|
||||
|
||||
// ── Sequential iterators (O(n) running-offset cursor) ─────────────────────
|
||||
|
||||
/// Iterate all chunks in file order with a running byte offset — O(n) total.
|
||||
fn iter_chunks_sequential(&self) -> impl Iterator<Item = (usize, Unitig)> + '_ {
|
||||
let k = self.k;
|
||||
let mmap = &*self.mmap;
|
||||
@@ -253,7 +278,7 @@ impl UnitigFileReader {
|
||||
}
|
||||
}
|
||||
|
||||
fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
fn read_idx(path: &Path) -> SKResult<(usize, usize, u8, Vec<u32>)> {
|
||||
let data = std::fs::read(path).map_err(SKError::Io)?;
|
||||
let mut pos = 0;
|
||||
|
||||
@@ -261,22 +286,22 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
.ok_or(SKError::Truncated { context: "unitig index: magic" })?;
|
||||
if magic_bytes != &MAGIC {
|
||||
return Err(SKError::BadMagic {
|
||||
expected: "UIX2",
|
||||
expected: "UIX3",
|
||||
got: magic_bytes.try_into().unwrap(),
|
||||
});
|
||||
}
|
||||
pos += 4;
|
||||
|
||||
// block_size stored for forward-compatibility verification
|
||||
let bs_bytes = data.get(pos..pos + 4)
|
||||
.ok_or(SKError::Truncated { context: "unitig index: block_size" })?;
|
||||
let stored_bs = u32::from_le_bytes(bs_bytes.try_into().unwrap()) as usize;
|
||||
if stored_bs != BLOCK_SIZE {
|
||||
let bb_bytes = data.get(pos..pos + 4)
|
||||
.ok_or(SKError::Truncated { context: "unitig index: block_bits" })?;
|
||||
let block_bits_u32 = u32::from_le_bytes(bb_bytes.try_into().unwrap());
|
||||
if block_bits_u32 > 31 {
|
||||
return Err(SKError::InvalidData {
|
||||
context: "unitig index",
|
||||
detail: format!("block_size mismatch: file={stored_bs} code={BLOCK_SIZE}"),
|
||||
detail: format!("block_bits out of range: {block_bits_u32}"),
|
||||
});
|
||||
}
|
||||
let block_bits = block_bits_u32 as u8;
|
||||
pos += 4;
|
||||
|
||||
let n_bytes = data.get(pos..pos + 4)
|
||||
@@ -289,8 +314,9 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
|
||||
pos += 8;
|
||||
|
||||
let n_blocks = (n_unitigs + BLOCK_SIZE - 1) >> LOG2_BLOCK_SIZE;
|
||||
let n_offsets = n_blocks + 1; // +1 for sentinel
|
||||
let block_size = 1usize << block_bits;
|
||||
let n_blocks = (n_unitigs + block_size - 1) >> block_bits;
|
||||
let n_offsets = n_blocks + 1;
|
||||
let mut block_offsets = Vec::with_capacity(n_offsets);
|
||||
for _ in 0..n_offsets {
|
||||
let off_bytes = data.get(pos..pos + 4)
|
||||
@@ -299,7 +325,7 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
pos += 4;
|
||||
}
|
||||
|
||||
Ok((n_unitigs, n_kmers, block_offsets))
|
||||
Ok((n_unitigs, n_kmers, block_bits, block_offsets))
|
||||
}
|
||||
|
||||
// ── Kmer utilities ────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user