feat: implement persistent layered index and chunked binary format

Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
This commit is contained in:
Eric Coissac
2026-05-09 17:20:08 +08:00
parent 8c17bf958b
commit 5169f65dc9
24 changed files with 1342 additions and 382 deletions
+65 -1
View File
@@ -96,7 +96,7 @@ impl Iterator for SuperKmerIter<'_> {
}
// ── 1. Entropy check ─────────────────────────────────────────────
if self.stat.normalized_entropy().unwrap_or(1.0) <= self.theta {
if self.stat.normalized_entropy().unwrap_or(1.0) < self.theta {
let result = self.try_emit();
self.cursor.rewind(self.k - 1).ok();
self.reset();
@@ -168,6 +168,70 @@ mod tests {
// k=11, m=5 — valeurs minimales du projet (k ∈ [11,31])
const K: usize = 11;
/// Collect the set of canonical k-mers from a raw ASCII sequence (no NUL).
fn direct_canonical_kmers(seq: &[u8]) -> std::collections::HashSet<Vec<u8>> {
(0..seq.len().saturating_sub(K - 1))
.map(|i| obikseq::SuperKmer::from_ascii(&seq[i..i + K]).to_ascii())
.collect()
}
/// Collect the set of canonical k-mers emitted by SuperKmerIter over a rope.
fn iter_canonical_kmers(rope: &Rope) -> std::collections::HashSet<Vec<u8>> {
SuperKmerIter::new(rope, K, 1, 0.0)
.flat_map(|rsk| {
rsk.superkmer()
.iter_canonical_kmers()
.map(|km| km.to_ascii())
.collect::<Vec<_>>()
})
.collect()
}
#[test]
fn coverage_single_segment() {
setup();
let seq = b"ACGTACGTACGTACGTACGT";
let rope = make_rope(&[seq.as_ref(), b"\x00"].concat());
let direct = direct_canonical_kmers(seq);
let from_iter = iter_canonical_kmers(&rope);
let missing: Vec<_> = direct.difference(&from_iter).collect();
assert!(
missing.is_empty(),
"k-mers perdus dans segment unique : {missing:?}"
);
}
#[test]
fn coverage_two_segments() {
setup();
let seg1 = b"ACGTACGTACGTACGTACGT";
let seg2 = b"TGCATGCATGCATGCATGCA";
let rope = make_rope(&[seg1.as_ref(), b"\x00", seg2.as_ref(), b"\x00"].concat());
let mut direct = direct_canonical_kmers(seg1);
direct.extend(direct_canonical_kmers(seg2));
let from_iter = iter_canonical_kmers(&rope);
let missing: Vec<_> = direct.difference(&from_iter).collect();
assert!(
missing.is_empty(),
"k-mers perdus dans deux segments : {missing:?}"
);
}
#[test]
fn coverage_minimizer_boundary() {
setup();
// sequence assez longue pour forcer plusieurs changements de minimiseur
let seq: Vec<u8> = (0..80).map(|i| b"ACGT"[i % 4]).collect();
let rope = make_rope(&[seq.as_slice(), b"\x00"].concat());
let direct = direct_canonical_kmers(&seq);
let from_iter = iter_canonical_kmers(&rope);
let missing: Vec<_> = direct.difference(&from_iter).collect();
assert!(
missing.is_empty(),
"k-mers perdus à la frontière de minimiseur : {missing:?}"
);
}
#[test]
fn single_segment_one_superkmer() {
setup();