feat: implement persistent layered index and chunked binary format
Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
This commit is contained in:
@@ -96,7 +96,7 @@ impl Iterator for SuperKmerIter<'_> {
|
||||
}
|
||||
|
||||
// ── 1. Entropy check ─────────────────────────────────────────────
|
||||
if self.stat.normalized_entropy().unwrap_or(1.0) <= self.theta {
|
||||
if self.stat.normalized_entropy().unwrap_or(1.0) < self.theta {
|
||||
let result = self.try_emit();
|
||||
self.cursor.rewind(self.k - 1).ok();
|
||||
self.reset();
|
||||
@@ -168,6 +168,70 @@ mod tests {
|
||||
// k=11, m=5 — minimum values used by the project (k ∈ [11,31])
|
||||
const K: usize = 11;
|
||||
|
||||
/// Collect the set of k-mers obtained by sliding a window of length `K` over
/// a raw ASCII sequence (no NUL).
///
/// Each window is round-tripped through
/// `obikseq::SuperKmer::from_ascii(..).to_ascii()` and the resulting byte
/// vectors are gathered into a `HashSet`, so duplicates collapse.
/// NOTE(review): presumably that round-trip yields the canonical form of the
/// k-mer — confirm against `obikseq`.
///
/// Returns an empty set when `seq` is shorter than `K`.
|
||||
fn direct_canonical_kmers(seq: &[u8]) -> std::collections::HashSet<Vec<u8>> {
|
||||
// `saturating_sub` avoids underflow on short input; the range of window
// start positions is empty whenever `seq.len() < K`.
(0..seq.len().saturating_sub(K - 1))
|
||||
.map(|i| obikseq::SuperKmer::from_ascii(&seq[i..i + K]).to_ascii())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Collect the set of canonical k-mers emitted by SuperKmerIter over a rope.
///
/// Every item yielded by the iterator is expanded through
/// `superkmer().iter_canonical_kmers()`, each k-mer is converted with
/// `to_ascii()`, and all of them are merged into one `HashSet`.
///
/// NOTE(review): the literal arguments `1` and `0.0` passed to
/// `SuperKmerIter::new` are not explained here; `0.0` is presumably the
/// entropy threshold (theta) — confirm against the constructor.
|
||||
fn iter_canonical_kmers(rope: &Rope) -> std::collections::HashSet<Vec<u8>> {
|
||||
SuperKmerIter::new(rope, K, 1, 0.0)
|
||||
.flat_map(|rsk| {
|
||||
rsk.superkmer()
|
||||
.iter_canonical_kmers()
|
||||
.map(|km| km.to_ascii())
|
||||
// Materialize the k-mers of this item into a Vec before handing them
// to `flat_map`.
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Checks that no k-mer is lost on a single segment: every k-mer found by
/// direct windowing of `seq` must also be produced by `SuperKmerIter`.
///
/// Only one direction is asserted (direct ⊆ iterator); extra k-mers coming
/// from the iterator would not fail this test.
#[test]
|
||||
fn coverage_single_segment() {
|
||||
setup();
|
||||
let seq = b"ACGTACGTACGTACGTACGT";
|
||||
// The rope holds the sequence followed by a single NUL byte.
let rope = make_rope(&[seq.as_ref(), b"\x00"].concat());
|
||||
let direct = direct_canonical_kmers(seq);
|
||||
let from_iter = iter_canonical_kmers(&rope);
|
||||
// K-mers expected from direct windowing but absent from the iterator output.
let missing: Vec<_> = direct.difference(&from_iter).collect();
|
||||
assert!(
|
||||
missing.is_empty(),
|
||||
"k-mers perdus dans segment unique : {missing:?}"
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that no k-mer is lost when the rope holds two segments, each
/// followed by a NUL byte.
///
/// The expected set is the union of the k-mers of each segment taken
/// separately, so windows that would straddle the two segments are not
/// expected. Only the direction direct ⊆ iterator is asserted.
#[test]
|
||||
fn coverage_two_segments() {
|
||||
setup();
|
||||
let seg1 = b"ACGTACGTACGTACGTACGT";
|
||||
let seg2 = b"TGCATGCATGCATGCATGCA";
|
||||
// Rope layout: seg1, NUL, seg2, NUL.
let rope = make_rope(&[seg1.as_ref(), b"\x00", seg2.as_ref(), b"\x00"].concat());
|
||||
let mut direct = direct_canonical_kmers(seg1);
|
||||
direct.extend(direct_canonical_kmers(seg2));
|
||||
let from_iter = iter_canonical_kmers(&rope);
|
||||
// K-mers expected from direct windowing but absent from the iterator output.
let missing: Vec<_> = direct.difference(&from_iter).collect();
|
||||
assert!(
|
||||
missing.is_empty(),
|
||||
"k-mers perdus dans deux segments : {missing:?}"
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that no k-mer is lost on a longer single segment: an 80-base
/// sequence cycling through `ACGT`, followed by a NUL byte.
///
/// Only the direction direct ⊆ iterator is asserted.
#[test]
|
||||
fn coverage_minimizer_boundary() {
|
||||
setup();
|
||||
// sequence long enough to force several minimizer changes
// NOTE(review): that is the stated intent; whether this periodic input
// really triggers minimizer changes is not verifiable from this test alone.
|
||||
let seq: Vec<u8> = (0..80).map(|i| b"ACGT"[i % 4]).collect();
|
||||
let rope = make_rope(&[seq.as_slice(), b"\x00"].concat());
|
||||
let direct = direct_canonical_kmers(&seq);
|
||||
let from_iter = iter_canonical_kmers(&rope);
|
||||
// K-mers expected from direct windowing but absent from the iterator output.
let missing: Vec<_> = direct.difference(&from_iter).collect();
|
||||
assert!(
|
||||
missing.is_empty(),
|
||||
"k-mers perdus à la frontière de minimiseur : {missing:?}"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn single_segment_one_superkmer() {
|
||||
setup();
|
||||
|
||||
Reference in New Issue
Block a user