feat: implement persistent layered index and chunked binary format
Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
This commit is contained in:
@@ -21,6 +21,7 @@ pub mod unitig;
|
||||
|
||||
pub use annotations::Annotation;
|
||||
pub use kmer::{CanonicalKmer, Kmer, Minimizer, hash_kmer};
|
||||
pub use packed_seq::MAX_KMERS_PER_CHUNK;
|
||||
pub use params::{k, m, set_k, set_m};
|
||||
pub use routable::RoutableSuperKmer;
|
||||
pub use sequence::Sequence;
|
||||
|
||||
@@ -22,6 +22,9 @@ use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen
|
||||
use crate::params::k;
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
|
||||
/// Maximum number of k-mers per stored chunk. The binary format stores `n_kmers − 1` in a single `u8`, so one chunk holds at most 256 k-mers.
|
||||
pub const MAX_KMERS_PER_CHUNK: usize = 256;
|
||||
|
||||
// ── PackedSeq ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// 2-bit packed DNA sequence of arbitrary length ≥ 1.
|
||||
@@ -229,22 +232,36 @@ impl PackedSeq {
|
||||
self.iter_kmers().map(|km| km.canonical())
|
||||
}
|
||||
|
||||
/// Serialise to a compact binary representation.
|
||||
/// Extract nucleotides `[start, end)` as a new [`PackedSeq`]. Allocates.
|
||||
pub fn sub(&self, start: usize, end: usize) -> Self {
|
||||
debug_assert!(end > start && end <= self.seql());
|
||||
let nucs: Vec<u8> = (start..end).map(|i| self.nucleotide(i)).collect();
|
||||
Self::from_nucleotides(&nucs)
|
||||
}
|
||||
|
||||
/// Serialise one chunk to binary.
|
||||
///
|
||||
/// Format: varint(seql) followed by raw packed bytes.
|
||||
/// `tail` and `byte_len` are both derivable from `seql` and need not be stored.
|
||||
/// Format: `[u8: n_kmers−1][packed bytes]`.
|
||||
/// The caller must ensure `seql ≥ k` and `seql − k + 1 ≤ MAX_KMERS_PER_CHUNK`.
|
||||
/// Use [`SuperKmer::write_to_binary`] for sequences that may exceed one chunk.
|
||||
pub fn write_to_binary<W: Write>(&self, w: &mut W) -> io::Result<()> {
|
||||
write_varint(w, self.seql() as u64)?;
|
||||
let k = crate::params::k();
|
||||
let seql = self.seql();
|
||||
debug_assert!(seql >= k, "sequence shorter than k");
|
||||
debug_assert!(
|
||||
seql - k + 1 <= MAX_KMERS_PER_CHUNK,
|
||||
"chunk exceeds MAX_KMERS_PER_CHUNK; split before calling write_to_binary"
|
||||
);
|
||||
w.write_all(&[(seql - k) as u8])?;
|
||||
w.write_all(&self.seq)
|
||||
}
|
||||
|
||||
/// Deserialise from the compact binary format produced by [`write_to_binary`].
|
||||
/// Deserialise one chunk from the binary format produced by [`write_to_binary`].
|
||||
/// Allocates exactly one `Box<[u8]>` for the packed bytes.
|
||||
pub fn read_from_binary<R: Read>(r: &mut R) -> io::Result<Self> {
|
||||
let seql = read_varint(r)? as usize;
|
||||
if seql == 0 {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "empty sequence"));
|
||||
}
|
||||
let mut buf = [0u8; 1];
|
||||
r.read_exact(&mut buf)?;
|
||||
let seql = buf[0] as usize + crate::params::k();
|
||||
let byte_len = (seql + 3) / 4;
|
||||
let tail = (seql % 4) as u8;
|
||||
let mut seq = vec![0u8; byte_len];
|
||||
|
||||
@@ -12,7 +12,7 @@ use xxhash_rust::xxh3::xxh3_64;
|
||||
use crate::Annotation;
|
||||
use crate::Sequence;
|
||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
|
||||
use crate::packed_seq::{PackedSeq, read_varint, write_varint};
|
||||
use crate::packed_seq::{MAX_KMERS_PER_CHUNK, PackedSeq, read_varint, write_varint};
|
||||
|
||||
// ── SKAnnotation ──────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -91,13 +91,37 @@ impl SuperKmer {
|
||||
Self { count: 1, inner }
|
||||
}
|
||||
|
||||
/// Serialise to compact binary. Format: varint(count) + varint((byte_len << 2) | tail) + bytes.
|
||||
/// Serialise to compact binary: `[varint(count)][u8: n_kmers−1][packed bytes]` per chunk.
|
||||
///
|
||||
/// Sequences with more than [`MAX_KMERS_PER_CHUNK`] kmers are transparently split into
|
||||
/// overlapping chunks (k−1 nucleotide overlap, same count per chunk). Each chunk is an
|
||||
/// independent, self-contained record — one [`read_from_binary`] call reads exactly one.
|
||||
pub fn write_to_binary<W: Write>(&self, w: &mut W) -> io::Result<()> {
|
||||
write_varint(w, self.count as u64)?;
|
||||
self.inner.write_to_binary(w)
|
||||
let k = crate::params::k();
|
||||
let seql = self.seql();
|
||||
debug_assert!(seql >= k, "super-kmer shorter than k");
|
||||
let n_kmers = seql - k + 1;
|
||||
if n_kmers <= MAX_KMERS_PER_CHUNK {
|
||||
write_varint(w, self.count as u64)?;
|
||||
self.inner.write_to_binary(w)
|
||||
} else {
|
||||
let chunk_nucl = MAX_KMERS_PER_CHUNK + k - 1;
|
||||
let stride = MAX_KMERS_PER_CHUNK;
|
||||
let mut start = 0;
|
||||
loop {
|
||||
let end = (start + chunk_nucl).min(seql);
|
||||
let mut chunk = self.inner.sub(start, end);
|
||||
chunk.canonicalize();
|
||||
write_varint(w, self.count as u64)?;
|
||||
chunk.write_to_binary(w)?;
|
||||
if end == seql { break; }
|
||||
start += stride;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Deserialise from the binary format produced by [`write_to_binary`].
|
||||
/// Deserialise one chunk from the binary format produced by [`write_to_binary`].
|
||||
/// Allocates exactly one `Box<[u8]>` for the packed bytes.
|
||||
pub fn read_from_binary<R: Read>(r: &mut R) -> io::Result<Self> {
|
||||
let count = read_varint(r)? as u32;
|
||||
|
||||
@@ -67,27 +67,57 @@ fn seql_roundtrip() {
|
||||
|
||||
// ── binary serialisation ──────────────────────────────────────────────────────
|
||||
|
||||
fn binary_test_lengths(k: usize) -> Vec<usize> {
|
||||
use crate::packed_seq::MAX_KMERS_PER_CHUNK;
|
||||
// Only single-chunk lengths: seql in [k, MAX_KMERS_PER_CHUNK+k-1].
|
||||
(k..=k + 5).chain([255, 256, 257, MAX_KMERS_PER_CHUNK + k - 1]).collect()
|
||||
}
|
||||
|
||||
/// A single-chunk super-kmer survives a write/read cycle: the decoded record
/// has the same sequence and count, and the reader consumes the whole record.
#[test]
fn binary_roundtrip() {
    set_k(4);
    let k = crate::params::k();
    for len in binary_test_lengths(k) {
        let mut sk = SuperKmer::from_ascii(&make_seq(len));
        sk.set_count(42);
        let mut buf = Vec::new();
        sk.write_to_binary(&mut buf).unwrap();
        // Read through a cursor so leftover bytes can be detected afterwards.
        let mut rest = buf.as_slice();
        let sk2 = SuperKmer::read_from_binary(&mut rest).unwrap();
        assert!(rest.is_empty(), "unexpected trailing bytes for len={len}");
        assert_eq!(sk.to_ascii(), sk2.to_ascii(), "sequence mismatch for len={len}");
        assert_eq!(sk2.count(), 42, "count mismatch for len={len}");
    }
}
|
||||
|
||||
/// A super-kmer holding one kmer more than a chunk can store is written as
/// two records, each carrying the original count.
#[test]
fn binary_split_roundtrip() {
    use crate::packed_seq::MAX_KMERS_PER_CHUNK;
    set_k(4);
    let k = crate::params::k();

    // With k = 4: MAX_KMERS_PER_CHUNK + k = 260 nucleotides → 257 kmers,
    // one more than a single chunk can hold → two records.
    let total_len = MAX_KMERS_PER_CHUNK + k;
    let mut original = SuperKmer::from_ascii(&make_seq(total_len));
    original.set_count(7);

    let mut encoded = Vec::new();
    original.write_to_binary(&mut encoded).unwrap();

    // Decode both records from the same cursor; nothing may remain afterwards.
    let mut cursor = encoded.as_slice();
    let first = SuperKmer::read_from_binary(&mut cursor).unwrap();
    let second = SuperKmer::read_from_binary(&mut cursor).unwrap();
    assert!(cursor.is_empty(), "unexpected trailing bytes");

    assert_eq!(first.count(), 7);
    assert_eq!(second.count(), 7);

    // The first record is a full chunk (259 nucleotides for k = 4); the second
    // holds the single remaining kmer (k nucleotides).
    assert_eq!(first.seql(), MAX_KMERS_PER_CHUNK + k - 1);
    assert_eq!(second.seql(), k);
}
|
||||
|
||||
#[test]
|
||||
fn binary_packed_seq_roundtrip() {
|
||||
use crate::packed_seq::PackedSeq;
|
||||
for len in all_lengths() {
|
||||
set_k(4);
|
||||
let k = crate::params::k();
|
||||
for len in binary_test_lengths(k) {
|
||||
let ps = PackedSeq::from_ascii(&make_seq(len));
|
||||
let mut buf = Vec::new();
|
||||
ps.write_to_binary(&mut buf).unwrap();
|
||||
@@ -98,7 +128,8 @@ fn binary_packed_seq_roundtrip() {
|
||||
|
||||
#[test]
|
||||
fn binary_size_is_compact() {
|
||||
// seql=4 (1 byte packed): varint(count=1, 1 byte) + varint((1<<2)|0=4, 1 byte) + 1 byte = 3 bytes
|
||||
// ACGT with k=4: varint(count=1, 1 byte) + u8(n_kmers-1=0, 1 byte) + 1 packed byte = 3 bytes
|
||||
set_k(4);
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let mut buf = Vec::new();
|
||||
sk.write_to_binary(&mut buf).unwrap();
|
||||
|
||||
@@ -160,7 +160,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn binary_roundtrip_all_lengths() {
|
||||
for len in test_lengths() {
|
||||
// write_to_binary encodes a single chunk: seql must be in [k, MAX_KMERS_PER_CHUNK+k-1].
|
||||
use crate::packed_seq::MAX_KMERS_PER_CHUNK;
|
||||
set_k(4);
|
||||
let k = crate::params::k();
|
||||
let valid_lengths: Vec<usize> = (k..=9).chain([255, 256, 257, MAX_KMERS_PER_CHUNK + k - 1]).collect();
|
||||
for len in valid_lengths {
|
||||
let u = Unitig::from_ascii(&make_seq(len));
|
||||
let mut buf = Vec::new();
|
||||
u.write_to_binary(&mut buf).unwrap();
|
||||
|
||||
Reference in New Issue
Block a user