feat: implement persistent layered index and chunked binary format

Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
This commit is contained in:
Eric Coissac
2026-05-09 17:20:08 +08:00
parent 8c17bf958b
commit 5169f65dc9
24 changed files with 1342 additions and 382 deletions
+1
View File
@@ -21,6 +21,7 @@ pub mod unitig;
pub use annotations::Annotation;
pub use kmer::{CanonicalKmer, Kmer, Minimizer, hash_kmer};
pub use packed_seq::MAX_KMERS_PER_CHUNK;
pub use params::{k, m, set_k, set_m};
pub use routable::RoutableSuperKmer;
pub use sequence::Sequence;
+26 -9
View File
@@ -22,6 +22,9 @@ use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen
use crate::params::k;
use crate::revcomp_lookup::REVCOMP4;
/// Maximum kmers per stored chunk. Enforces the u8 max-kmer-index field in the binary format.
pub const MAX_KMERS_PER_CHUNK: usize = 256;
// ── PackedSeq ─────────────────────────────────────────────────────────────────
/// 2-bit packed DNA sequence of arbitrary length ≥ 1.
@@ -229,22 +232,36 @@ impl PackedSeq {
self.iter_kmers().map(|km| km.canonical())
}
/// Serialise to a compact binary representation.
/// Copy the nucleotides in the half-open range `[start, end)` into a freshly
/// built [`PackedSeq`].
///
/// Allocates: the selected nucleotides are first gathered into a temporary
/// `Vec<u8>` and then handed to [`PackedSeq::from_nucleotides`].
///
/// The range is expected to be non-empty and to lie inside the sequence
/// (`start < end <= self.seql()`); this is only checked in debug builds.
pub fn sub(&self, start: usize, end: usize) -> Self {
    debug_assert!(end > start && end <= self.seql());
    // `saturating_sub` keeps the capacity computation panic-free even when the
    // debug-only check above is compiled out and the range is inverted.
    let mut picked: Vec<u8> = Vec::with_capacity(end.saturating_sub(start));
    picked.extend((start..end).map(|pos| self.nucleotide(pos)));
    Self::from_nucleotides(&picked)
}
/// Serialise one chunk to binary.
///
/// Format: `[u8: n_kmers-1][packed bytes]`, where `n_kmers = seql - k + 1`.
/// Use [`SuperKmer::write_to_binary`] for sequences that may exceed one chunk.
///
/// # Errors
///
/// Returns an [`io::ErrorKind::InvalidInput`] error, before writing anything,
/// when `seql < k` or when `seql - k + 1 > MAX_KMERS_PER_CHUNK`. Any error
/// reported by `w` is propagated unchanged.
pub fn write_to_binary<W: Write>(&self, w: &mut W) -> io::Result<()> {
    let k = crate::params::k();
    let seql = self.seql();

    // Checked in every build profile: with a debug-only assertion a release
    // build would wrap `seql - k` or truncate the header byte and silently
    // emit a record that cannot be read back.
    let extra_kmers = seql.checked_sub(k).ok_or_else(|| {
        io::Error::new(io::ErrorKind::InvalidInput, "sequence shorter than k")
    })?;
    if extra_kmers >= MAX_KMERS_PER_CHUNK || extra_kmers > usize::from(u8::MAX) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "chunk exceeds MAX_KMERS_PER_CHUNK; split before calling write_to_binary",
        ));
    }

    // Header byte stores n_kmers - 1; the range check above makes the cast lossless.
    w.write_all(&[extra_kmers as u8])?;
    w.write_all(&self.seq)
}
/// Deserialise from the compact binary format produced by [`write_to_binary`].
/// Deserialise one chunk from the binary format produced by [`write_to_binary`].
/// Allocates exactly one `Box<[u8]>` for the packed bytes.
pub fn read_from_binary<R: Read>(r: &mut R) -> io::Result<Self> {
let seql = read_varint(r)? as usize;
if seql == 0 {
return Err(io::Error::new(io::ErrorKind::InvalidData, "empty sequence"));
}
let mut buf = [0u8; 1];
r.read_exact(&mut buf)?;
let seql = buf[0] as usize + crate::params::k();
let byte_len = (seql + 3) / 4;
let tail = (seql % 4) as u8;
let mut seq = vec![0u8; byte_len];
+29 -5
View File
@@ -12,7 +12,7 @@ use xxhash_rust::xxh3::xxh3_64;
use crate::Annotation;
use crate::Sequence;
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
use crate::packed_seq::{PackedSeq, read_varint, write_varint};
use crate::packed_seq::{MAX_KMERS_PER_CHUNK, PackedSeq, read_varint, write_varint};
// ── SKAnnotation ──────────────────────────────────────────────────────────────
@@ -91,13 +91,37 @@ impl SuperKmer {
Self { count: 1, inner }
}
/// Serialise to compact binary: `[varint(count)][u8: n_kmers-1][packed bytes]` per chunk.
///
/// Sequences with more than [`MAX_KMERS_PER_CHUNK`] kmers are transparently split into
/// overlapping chunks (k-1 nucleotide overlap, same count per chunk). Each chunk is an
/// independent, self-contained record; one [`Self::read_from_binary`] call reads exactly one.
///
/// A sequence that fits in one chunk is written as stored. In the split path every
/// chunk is extracted with `sub` and passed through `canonicalize()` before being written.
///
/// # Errors
///
/// Returns an [`io::ErrorKind::InvalidInput`] error, before writing anything, when the
/// super-kmer is shorter than `k`. Any error from the underlying writer is propagated.
pub fn write_to_binary<W: Write>(&self, w: &mut W) -> io::Result<()> {
    let k = crate::params::k();
    let seql = self.seql();
    // Checked in every build profile: `seql - k + 1` would wrap around in a
    // release build if this were only a debug assertion.
    if seql < k {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "super-kmer shorter than k",
        ));
    }

    let count = self.count as u64;
    let n_kmers = seql - k + 1;

    // Common case: the whole sequence fits in a single record, no copy needed.
    if n_kmers <= MAX_KMERS_PER_CHUNK {
        write_varint(w, count)?;
        return self.inner.write_to_binary(w);
    }

    // Each full chunk holds MAX_KMERS_PER_CHUNK kmers, i.e. that many
    // nucleotides plus the k-1 needed to complete the last kmer.
    let chunk_nucl = MAX_KMERS_PER_CHUNK + k - 1;
    let mut start = 0;
    loop {
        let end = (start + chunk_nucl).min(seql);
        let mut chunk = self.inner.sub(start, end);
        chunk.canonicalize();
        write_varint(w, count)?;
        chunk.write_to_binary(w)?;
        if end == seql {
            return Ok(());
        }
        // Advancing by the kmer count (not the nucleotide count) leaves the
        // k-1 nucleotide overlap, so no kmer is lost between chunks.
        start += MAX_KMERS_PER_CHUNK;
    }
}
/// Deserialise from the binary format produced by [`write_to_binary`].
/// Deserialise one chunk from the binary format produced by [`write_to_binary`].
/// Allocates exactly one `Box<[u8]>` for the packed bytes.
pub fn read_from_binary<R: Read>(r: &mut R) -> io::Result<Self> {
let count = read_varint(r)? as u32;
+39 -8
View File
@@ -67,27 +67,57 @@ fn seql_roundtrip() {
// ── binary serialisation ──────────────────────────────────────────────────────
/// Sequence lengths exercised by the single-chunk binary round-trip tests.
///
/// Yields `k..=k + 5`, then the fixed lengths 255, 256 and 257, then the
/// largest single-chunk length `MAX_KMERS_PER_CHUNK + k - 1`.
fn binary_test_lengths(k: usize) -> Vec<usize> {
    use crate::packed_seq::MAX_KMERS_PER_CHUNK;
    // Only single-chunk lengths: seql in [k, MAX_KMERS_PER_CHUNK+k-1].
    let largest_single_chunk = MAX_KMERS_PER_CHUNK + k - 1;
    let mut lengths: Vec<usize> = (k..=k + 5).collect();
    lengths.extend([255, 256, 257, largest_single_chunk]);
    lengths
}
/// Round-trips single-chunk super-kmers through the binary format and checks
/// that sequence and count survive and that exactly one record was written.
#[test]
fn binary_roundtrip() {
    set_k(4);
    let k = crate::params::k();
    for len in binary_test_lengths(k) {
        let mut sk = SuperKmer::from_ascii(&make_seq(len));
        sk.set_count(42);
        let mut buf = Vec::new();
        sk.write_to_binary(&mut buf).unwrap();

        // Read through a cursor-like slice so leftover bytes can be detected.
        let mut rest = buf.as_slice();
        let sk2 = SuperKmer::read_from_binary(&mut rest).unwrap();
        assert!(rest.is_empty(), "unexpected trailing bytes for len={len}");
        assert_eq!(sk.to_ascii(), sk2.to_ascii(), "sequence mismatch for len={len}");
        assert_eq!(sk2.count(), 42, "count mismatch for len={len}");
    }
}
/// A super-kmer holding more than `MAX_KMERS_PER_CHUNK` kmers must be written
/// as several records, each carrying the original count.
#[test]
fn binary_split_roundtrip() {
    use crate::packed_seq::MAX_KMERS_PER_CHUNK;
    set_k(4);
    let k = crate::params::k();

    // One nucleotide past the single-chunk limit:
    // seql = MAX_KMERS_PER_CHUNK + k = 260 → n_kmers = 257 > 256 → 2 chunks
    let seql = MAX_KMERS_PER_CHUNK + k;
    let mut original = SuperKmer::from_ascii(&make_seq(seql));
    original.set_count(7);

    let mut encoded = Vec::new();
    original.write_to_binary(&mut encoded).unwrap();

    // Decode the two expected records in order from the same byte stream.
    let mut rest = encoded.as_slice();
    let chunks: Vec<SuperKmer> = (0..2)
        .map(|_| SuperKmer::read_from_binary(&mut rest).unwrap())
        .collect();
    assert!(rest.is_empty(), "unexpected trailing bytes");

    for chunk in &chunks {
        assert_eq!(chunk.count(), 7);
    }

    // Chunks cover the original sequence with k-1 overlap — no kmer lost.
    assert_eq!(chunks[0].seql(), MAX_KMERS_PER_CHUNK + k - 1); // 259
    assert_eq!(chunks[1].seql(), k); // 4 (1 kmer)
}
#[test]
fn binary_packed_seq_roundtrip() {
use crate::packed_seq::PackedSeq;
for len in all_lengths() {
set_k(4);
let k = crate::params::k();
for len in binary_test_lengths(k) {
let ps = PackedSeq::from_ascii(&make_seq(len));
let mut buf = Vec::new();
ps.write_to_binary(&mut buf).unwrap();
@@ -98,7 +128,8 @@ fn binary_packed_seq_roundtrip() {
#[test]
fn binary_size_is_compact() {
// seql=4 (1 byte packed): varint(count=1, 1 byte) + varint((1<<2)|0=4, 1 byte) + 1 byte = 3 bytes
// ACGT with k=4: varint(count=1, 1 byte) + u8(n_kmers-1=0, 1 byte) + 1 packed byte = 3 bytes
set_k(4);
let sk = SuperKmer::from_ascii(b"ACGT");
let mut buf = Vec::new();
sk.write_to_binary(&mut buf).unwrap();
+6 -1
View File
@@ -160,7 +160,12 @@ mod tests {
#[test]
fn binary_roundtrip_all_lengths() {
for len in test_lengths() {
// write_to_binary encodes a single chunk: seql must be in [k, MAX_KMERS_PER_CHUNK+k-1].
use crate::packed_seq::MAX_KMERS_PER_CHUNK;
set_k(4);
let k = crate::params::k();
let valid_lengths: Vec<usize> = (k..=9).chain([255, 256, 257, MAX_KMERS_PER_CHUNK + k - 1]).collect();
for len in valid_lengths {
let u = Unitig::from_ascii(&make_seq(len));
let mut buf = Vec::new();
u.write_to_binary(&mut buf).unwrap();