feat: optimize unitig index and document evidence elimination

Replace the dense per-chunk offset index with a sparse block-sampled structure (64 chunks per block), reducing the index file size by approximately 300× while preserving O(1) k-mer extraction. Introduce a design document for eliminating the `evidence.bin` file, which accounts for ~66% of the lookup layer, by transitioning to fingerprint-based approximate indexing and value-based MPHF lookups. Update MkDocs navigation to include the new documentation and add a file count tracker to the scatter step progress bar for improved observability.
This commit is contained in:
Eric Coissac
2026-05-23 07:51:59 +02:00
parent 9b700ff4a4
commit 4a5ab0b8c2
5 changed files with 488 additions and 151 deletions
+148 -144
View File
@@ -9,21 +9,20 @@ pub use obikseq::MAX_KMERS_PER_CHUNK;
use crate::error::{SKError, SKResult};
// ── Index file format ─────────────────────────────────────────────────────────
// ── Block index parameters ────────────────────────────────────────────────────
//
// magic: [u8; 4] = b"UIDX"
// n_unitigs: u32 LE
// n_kmers: u64 LE total kmer count across all chunks
// seqls: [u8; n_unitigs] max kmer index per chunk (= n_kmers 1)
// packed_offsets: [u32; n_unitigs + 1] byte offsets to packed bytes in the
// sequence file; last entry is sentinel
// One offset entry per BLOCK_SIZE chunks. BLOCK_SIZE must be a power of two
// so that block = i >> LOG2_BLOCK_SIZE and rem = i & (BLOCK_SIZE 1) are
// branchless shifts/masks rather than divisions.
//
// Each sequence record in the binary file: [u8: n_kmers1][packed bytes].
// Offsets point to the first packed byte of each record, past the leading u8.
// Unitigs with more than MAX_KMERS_PER_CHUNK kmers are transparently split by the
// writer into overlapping chunks (k-1 nucleotide overlap) so no kmer is lost.
// With BLOCK_SIZE = 64 and an average chunk size of ~10 bytes, a random lookup
// scans at most 63 × 10 = 630 bytes sequentially — negligible next to the MPHF
// lookup that precedes it. The index file shrinks from ~5 bytes/chunk to
// ~1/64 bytes/chunk (≈ 300× for typical workloads).
const MAGIC: [u8; 4] = *b"UIDX";
const MAGIC: [u8; 4] = *b"UIX2";
const BLOCK_SIZE: usize = 64;
const LOG2_BLOCK_SIZE: u32 = 6; // 2^6 = BLOCK_SIZE
fn idx_path(path: &Path) -> PathBuf {
crate::append_path_suffix(path, ".idx")
@@ -32,21 +31,21 @@ fn idx_path(path: &Path) -> PathBuf {
// ── Writer ────────────────────────────────────────────────────────────────────
/// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds
/// an offset index at close time.
/// a block-sampled offset index at close time.
///
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] kmers are transparently split
/// into overlapping chunks (k-1 nucleotide overlap) so no kmer is lost.
/// One offset is stored every [`BLOCK_SIZE`] chunks; random access to chunk `i`
/// costs at most `BLOCK_SIZE 1` sequential chunk scans after the block lookup.
///
/// The companion index file (`path.idx`) is written on [`close`].
/// The binary format per record is `[u8: n_kmers1][packed 2-bit bytes]`.
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split
/// into overlapping chunks (k1 nucleotide overlap) so no k-mer is lost.
pub struct UnitigFileWriter {
path: PathBuf,
file: BufWriter<File>,
seqls: Vec<u8>,
packed_offsets: Vec<u32>,
next_offset: u32,
n_kmers: usize,
k: usize,
path: PathBuf,
file: BufWriter<File>,
block_offsets: Vec<u32>, // byte offset of first record in each block
chunk_count: usize,
next_offset: u32, // byte offset of the START of the next record
n_kmers: usize,
k: usize,
}
impl UnitigFileWriter {
@@ -55,15 +54,16 @@ impl UnitigFileWriter {
Ok(Self {
path: path.to_owned(),
file: BufWriter::new(file),
seqls: Vec::new(),
packed_offsets: Vec::new(),
block_offsets: Vec::new(),
chunk_count: 0,
next_offset: 0,
n_kmers: 0,
k: obikseq::params::k(),
})
}
/// Write a unitig, splitting it into chunks if it exceeds [`MAX_KMERS_PER_CHUNK`].
/// Write a unitig, splitting into overlapping chunks if it exceeds
/// [`MAX_KMERS_PER_CHUNK`].
pub fn write(&mut self, unitig: &Unitig) -> SKResult<()> {
let seql = unitig.seql();
let k = self.k;
@@ -77,17 +77,13 @@ impl UnitigFileWriter {
return self.write_chunk(unitig);
}
// Split into overlapping chunks of MAX_KMERS_PER_CHUNK kmers.
// Overlap of k-1 nucleotides ensures no kmer is lost at boundaries.
let chunk_nucl = MAX_KMERS_PER_CHUNK + k - 1;
let stride = MAX_KMERS_PER_CHUNK;
let mut start = 0;
while start < seql {
let end = (start + chunk_nucl).min(seql);
self.write_chunk(&unitig.sub(start, end))?;
if end == seql {
break;
}
if end == seql { break; }
start += stride;
}
Ok(())
@@ -97,54 +93,48 @@ impl UnitigFileWriter {
let seql = unitig.seql();
let byte_len = (seql + 3) / 4;
// Header is 1 byte (u8: n_kmers 1 = seql k); packed bytes follow.
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
self.packed_offsets.push(self.next_offset + 1);
self.seqls.push((seql - self.k) as u8);
self.n_kmers += seql - self.k + 1;
unitig
.write_to_binary(&mut self.file)
.map_err(SKError::Io)?;
// Record a block offset at the start of every BLOCK_SIZE-th chunk.
if self.chunk_count & (BLOCK_SIZE - 1) == 0 {
self.block_offsets.push(self.next_offset);
}
self.n_kmers += seql - self.k + 1;
self.chunk_count += 1;
unitig.write_to_binary(&mut self.file).map_err(SKError::Io)?;
self.next_offset += 1 + byte_len as u32;
Ok(())
}
/// Flush the sequence file and write the companion `.idx`.
pub fn close(mut self) -> SKResult<()> {
self.file.flush().map_err(SKError::Io)?;
drop(self.file);
// Sentinel: byte offset past the last record's packed bytes.
let sentinel = match (self.packed_offsets.last(), self.seqls.last()) {
(Some(&last_off), Some(&last_seql)) => {
let seql = last_seql as u32 + self.k as u32;
last_off + (seql + 3) / 4
}
_ => 0,
};
self.packed_offsets.push(sentinel);
// Sentinel: byte offset past the last record (needed for end-of-file detection).
self.block_offsets.push(self.next_offset);
write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets, self.n_kmers)
write_idx(
&idx_path(&self.path),
self.chunk_count as u32,
self.n_kmers as u64,
&self.block_offsets,
)
}
pub fn len(&self) -> usize {
self.seqls.len()
}
pub fn is_empty(&self) -> bool {
self.seqls.is_empty()
}
pub fn len(&self) -> usize { self.chunk_count }
pub fn is_empty(&self) -> bool { self.chunk_count == 0 }
}
fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32], n_kmers: usize) -> SKResult<()> {
fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -> SKResult<()> {
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
w.write_all(&MAGIC).map_err(SKError::Io)?;
w.write_all(&(seqls.len() as u32).to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&(n_kmers as u64).to_le_bytes()).map_err(SKError::Io)?;
w.write_all(seqls).map_err(SKError::Io)?;
for &off in packed_offsets {
w.write_all(&(BLOCK_SIZE as u32).to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&n_unitigs.to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&n_kmers.to_le_bytes()).map_err(SKError::Io)?;
for &off in block_offsets {
w.write_all(&off.to_le_bytes()).map_err(SKError::Io)?;
}
w.flush().map_err(SKError::Io)
@@ -154,105 +144,116 @@ fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32], n_kmers: usize)
/// Read-only random-access view of a unitig file.
///
/// The sequence file is memory-mapped; the index is loaded into RAM on open.
/// All per-kmer operations are O(1) and allocation-free.
/// The sequence file is memory-mapped; the block offset table is loaded into RAM
/// on open (≈ n_chunks / BLOCK_SIZE entries, negligible memory).
///
/// Random access to chunk `i`: O(BLOCK_SIZE) sequential mmap reads — branchless
/// shift/mask arithmetic, cache-friendly, negligible versus the MPHF lookup.
///
/// Sequential iteration: O(n) via a running-offset cursor (no per-chunk overhead).
pub struct UnitigFileReader {
mmap: Mmap,
seqls: Vec<u8>,
packed_offsets: Vec<u32>,
n_kmers: usize,
k: usize,
mmap: Mmap,
block_offsets: Vec<u32>,
n_unitigs: usize,
n_kmers: usize,
k: usize,
}
impl UnitigFileReader {
pub fn open(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let (seqls, packed_offsets, n_kmers) = read_idx(&idx_path(path))?;
let (n_unitigs, n_kmers, block_offsets) = read_idx(&idx_path(path))?;
let k = obikseq::params::k();
Ok(Self { mmap, seqls, packed_offsets, n_kmers, k })
Ok(Self { mmap, block_offsets, n_unitigs, n_kmers, k })
}
pub fn len(&self) -> usize {
self.seqls.len()
pub fn len(&self) -> usize { self.n_unitigs }
pub fn is_empty(&self) -> bool { self.n_unitigs == 0 }
pub fn n_kmers(&self) -> usize { self.n_kmers }
/// Byte offset of the START of record `i` (the seql byte) in the mmap.
/// O(BLOCK_SIZE) sequential scan within the block.
#[inline]
fn chunk_start(&self, i: usize) -> usize {
let block = i >> LOG2_BLOCK_SIZE;
let rem = i & (BLOCK_SIZE - 1);
let mut offset = self.block_offsets[block] as usize;
for _ in 0..rem {
let seql_minus_k = self.mmap[offset] as usize;
offset += 1 + (seql_minus_k + self.k + 3) / 4;
}
offset
}
pub fn is_empty(&self) -> bool {
self.seqls.is_empty()
}
/// Total number of kmers across all chunks.
pub fn n_kmers(&self) -> usize {
self.n_kmers
}
/// Return the nucleotide length of chunk `i`.
/// Nucleotide length of chunk `i`.
#[inline]
pub fn seql(&self, i: usize) -> usize {
self.seqls[i] as usize + self.k
self.mmap[self.chunk_start(i)] as usize + self.k
}
/// Reconstruct chunk `i` as a [`Unitig`]. Allocates a copy of the packed bytes.
/// Reconstruct chunk `i` as a [`Unitig`].
pub fn unitig(&self, i: usize) -> Unitig {
let seql = self.seqls[i] as usize + self.k;
let start = self.packed_offsets[i] as usize;
let offset = self.chunk_start(i);
let seql = self.mmap[offset] as usize + self.k;
let byte_len = (seql + 3) / 4;
let tail = (seql % 4) as u8;
let bytes = self.mmap[start..start + byte_len].to_vec().into_boxed_slice();
Unitig::new(tail, bytes)
let bytes = self.mmap[offset + 1..offset + 1 + byte_len].to_vec().into_boxed_slice();
Unitig::new((seql % 4) as u8, bytes)
}
/// Extract the raw left-aligned u64 of the kmer at position `j` within chunk `i`.
/// Raw left-aligned u64 of the k-mer at position `j` within chunk `i`.
#[inline]
pub fn raw_kmer(&self, i: usize, j: usize) -> u64 {
let start = self.packed_offsets[i] as usize;
extract_kmer_raw(&self.mmap[start..], j, self.k)
let offset = self.chunk_start(i);
extract_kmer_raw(&self.mmap[offset + 1..], j, self.k)
}
/// Return `true` iff the kmer at position `j` of chunk `i` equals `query`.
///
/// O(1), zero allocation. The chunk may store either orientation of the kmer;
/// canonicalization is applied before comparison.
/// `true` iff the k-mer at position `j` of chunk `i` equals `query` (canonical).
#[inline]
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
}
/// Iterate over all kmers in file order (all positions of chunk 0, then chunk 1, …).
///
/// Each chunk is copied from the mmap once; iteration within the chunk is
/// zero-allocation (sliding-window via [`OwnedPackedSeqKmerIter`]).
// ── Sequential iterators (O(n) running-offset cursor) ─────────────────────
/// Iterate all chunks in file order with a running byte offset — O(n) total.
fn iter_chunks_sequential(&self) -> impl Iterator<Item = (usize, Unitig)> + '_ {
let k = self.k;
let mmap = &*self.mmap;
let n = self.n_unitigs;
let mut offset = 0usize;
(0..n).map(move |chunk_id| {
let seql = mmap[offset] as usize + k;
let byte_len = (seql + 3) / 4;
let bytes = mmap[offset + 1..offset + 1 + byte_len].to_vec().into_boxed_slice();
offset += 1 + byte_len;
(chunk_id, Unitig::new((seql % 4) as u8, bytes))
})
}
pub fn iter_kmers(&self) -> impl Iterator<Item = Kmer> + '_ {
(0..self.len()).flat_map(move |i| self.unitig(i).into_kmers())
self.iter_chunks_sequential()
.flat_map(|(_, u)| u.into_kmers())
}
/// Iterate over all canonical kmers in file order.
///
/// Equivalent to `iter_kmers().map(|km| km.canonical())` but uses the
/// built-in canonical iterator on each chunk, which avoids a separate
/// canonicalization pass.
pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
(0..self.len()).flat_map(move |i| self.unitig(i).into_canonical_kmers())
self.iter_chunks_sequential()
.flat_map(|(_, u)| u.into_canonical_kmers())
}
/// Iterate over `(kmer, chunk_id, rank)` for every canonical kmer in the file.
///
/// `chunk_id` is the index of the chunk within this file; `rank` is the
/// 0-based position of the kmer within that chunk. Used to build the
/// evidence table in `obilayeredmap`.
pub fn iter_indexed_canonical_kmers(
&self,
) -> impl Iterator<Item = (CanonicalKmer, usize, usize)> + '_ {
(0..self.len()).flat_map(move |chunk_id| {
self.unitig(chunk_id)
.into_canonical_kmers()
.enumerate()
.map(move |(rank, kmer)| (kmer, chunk_id, rank))
})
self.iter_chunks_sequential()
.flat_map(|(chunk_id, u)| {
u.into_canonical_kmers()
.enumerate()
.map(move |(rank, kmer)| (kmer, chunk_id, rank))
})
}
}
fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>, usize)> {
fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
let data = std::fs::read(path).map_err(SKError::Io)?;
let mut pos = 0;
@@ -260,15 +261,27 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>, usize)> {
.ok_or(SKError::Truncated { context: "unitig index: magic" })?;
if magic_bytes != &MAGIC {
return Err(SKError::BadMagic {
expected: "UIDX",
expected: "UIX2",
got: magic_bytes.try_into().unwrap(),
});
}
pos += 4;
// block_size stored for forward-compatibility verification
let bs_bytes = data.get(pos..pos + 4)
.ok_or(SKError::Truncated { context: "unitig index: block_size" })?;
let stored_bs = u32::from_le_bytes(bs_bytes.try_into().unwrap()) as usize;
if stored_bs != BLOCK_SIZE {
return Err(SKError::InvalidData {
context: "unitig index",
detail: format!("block_size mismatch: file={stored_bs} code={BLOCK_SIZE}"),
});
}
pos += 4;
let n_bytes = data.get(pos..pos + 4)
.ok_or(SKError::Truncated { context: "unitig index: n_unitigs" })?;
let n = u32::from_le_bytes(n_bytes.try_into().unwrap()) as usize;
let n_unitigs = u32::from_le_bytes(n_bytes.try_into().unwrap()) as usize;
pos += 4;
let nk_bytes = data.get(pos..pos + 8)
@@ -276,25 +289,21 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>, usize)> {
let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
pos += 8;
let seqls = data.get(pos..pos + n)
.ok_or(SKError::Truncated { context: "unitig index: seqls" })?
.to_vec();
pos += n;
let mut packed_offsets = Vec::with_capacity(n + 1);
for _ in 0..=n {
let n_blocks = (n_unitigs + BLOCK_SIZE - 1) >> LOG2_BLOCK_SIZE;
let n_offsets = n_blocks + 1; // +1 for sentinel
let mut block_offsets = Vec::with_capacity(n_offsets);
for _ in 0..n_offsets {
let off_bytes = data.get(pos..pos + 4)
.ok_or(SKError::Truncated { context: "unitig index: packed_offsets" })?;
packed_offsets.push(u32::from_le_bytes(off_bytes.try_into().unwrap()));
.ok_or(SKError::Truncated { context: "unitig index: block_offsets" })?;
block_offsets.push(u32::from_le_bytes(off_bytes.try_into().unwrap()));
pos += 4;
}
Ok((seqls, packed_offsets, n_kmers))
Ok((n_unitigs, n_kmers, block_offsets))
}
// ── Kmer utilities ────────────────────────────────────────────────────────────
/// Reverse complement of a left-aligned 2-bit kmer (same algorithm as [`KmerOf::revcomp`]).
#[inline]
fn revcomp_raw(raw: u64, k: usize) -> u64 {
let x = !raw;
@@ -304,22 +313,17 @@ fn revcomp_raw(raw: u64, k: usize) -> u64 {
x << (64 - 2 * k)
}
/// Canonical form of a left-aligned 2-bit kmer: `min(kmer, revcomp(kmer))`.
#[inline]
fn canonical_raw(raw: u64, k: usize) -> u64 {
raw.min(revcomp_raw(raw, k))
}
// ── Bit extraction ────────────────────────────────────────────────────────────
/// Extract the kmer at nucleotide position `j` from MSB-first 2-bit packed `bytes`.
/// Returns a left-aligned u64 matching [`KmerOf`]'s internal representation.
#[inline]
fn extract_kmer_raw(bytes: &[u8], j: usize, k: usize) -> u64 {
let bit_start = j * 2;
let byte_start = bit_start / 8;
let bit_offset = bit_start % 8; // always 0, 2, 4, or 6
let bytes_needed = (bit_offset + 2 * k + 7) / 8; // ≤ 9 for k ≤ 32
let bit_start = j * 2;
let byte_start = bit_start / 8;
let bit_offset = bit_start % 8;
let bytes_needed = (bit_offset + 2 * k + 7) / 8;
let mut acc = 0u128;
for idx in 0..bytes_needed {
@@ -327,8 +331,8 @@ fn extract_kmer_raw(bytes: &[u8], j: usize, k: usize) -> u64 {
}
let shift = bytes_needed * 8 - bit_offset - 2 * k;
let mask = !0u64 >> (64 - 2 * k);
let raw = (acc >> shift) as u64 & mask;
let mask = !0u64 >> (64 - 2 * k);
let raw = (acc >> shift) as u64 & mask;
raw << (64 - 2 * k)
}