feat: implement persistent layered index and chunked binary format

Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
This commit is contained in:
Eric Coissac
2026-05-09 17:20:08 +08:00
parent 8c17bf958b
commit 5169f65dc9
24 changed files with 1342 additions and 382 deletions
+7
View File
@@ -3,6 +3,13 @@ name = "obikpartitionner"
version = "0.1.0"
edition = "2024"
[dev-dependencies]
tempfile = "3"
obikseq = { path = "../obikseq", features = ["test-utils"] }
obiskbuilder = { path = "../obiskbuilder" }
obiread = { path = "../obiread" }
obikrope = { path = "../obikrope" }
[dependencies]
niffler = "3.0.0"
remove_dir_all = "0.8"
+116
View File
@@ -616,3 +616,119 @@ impl Drop for KmerPartition {
let _ = self.close();
}
}
// ── integration tests ─────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
use obikrope::Rope;
use obikseq::SuperKmer;
use obiskbuilder::build_superkmers;
const K: usize = 11;
const M: usize = 5;
/// Configure the `obikseq` k-mer parameters used by the tests in this module.
///
/// Passes [`K`] to `obikseq::params::set_k` and [`M`] to
/// `obikseq::params::set_m`, in that order.
// NOTE(review): these look like process-wide settings; every test writes the
// same values, so parallel test threads should agree — confirm the setters
// tolerate being called more than once.
fn setup() {
obikseq::params::set_k(K);
obikseq::params::set_m(M);
}
/// Reference k-mer tally computed straight from ASCII sequences.
///
/// Every window of `K` consecutive bytes of every sequence is round-tripped
/// through `SuperKmer::from_ascii(..).to_ascii()` and counted under the
/// resulting byte string. Sequences shorter than `K` contribute nothing.
///
/// Returns `(f0, f1)`: the number of distinct keys and the total number of
/// windows counted.
// NOTE(review): the round trip is presumably what maps a k-mer to its
// canonical form — confirm against `obikseq::SuperKmer`.
fn direct_counts(seqs: &[&[u8]]) -> (u64, u64) {
    let mut tally: HashMap<Vec<u8>, u64> = HashMap::new();
    for seq in seqs {
        // `windows(K)` yields exactly `len - K + 1` slices, or none when the
        // sequence is shorter than `K`.
        for window in seq.windows(K) {
            let key = SuperKmer::from_ascii(window).to_ascii();
            *tally.entry(key).or_default() += 1;
        }
    }
    let distinct = tally.len() as u64;
    let total: u64 = tally.values().sum();
    (distinct, total)
}
/// Push `seqs` through the on-disk partitioning pipeline and report the
/// `(f0, f1)` values found in the resulting `kmer_spectrum_raw.json`.
///
/// Steps, in order:
/// 1. call `setup()`;
/// 2. concatenate the sequences, each followed by a `0x00` byte, into one
///    buffer and push it into a fresh `Rope`;
/// 3. build super-kmers with `build_superkmers(rope, K, 1, 0.0)`;
/// 4. create a `KmerPartition` in a temporary directory, write the batch,
///    close it and dereplicate;
/// 5. if `part_00000/dereplicated.skmer.zst` is absent, return `(0, 0)`;
/// 6. otherwise run `count_partition` and read `f0` / `f1` from the JSON
///    spectrum (a missing or non-integer field reads as `0`).
///
/// Panics if any pipeline step or file access fails.
// NOTE(review): the meaning of the `1` and `0.0` arguments to
// `build_superkmers`, and of `0` / `true` passed to `KmerPartition::create`,
// is not visible here — confirm against their definitions.
fn pipeline_counts(seqs: &[&[u8]]) -> (u64, u64) {
    setup();

    // One contiguous buffer: every sequence followed by a 0x00 byte.
    let buffer: Vec<u8> = seqs
        .iter()
        .flat_map(|seq| seq.iter().copied().chain(std::iter::once(0x00)))
        .collect();
    let mut rope = Rope::new(None);
    rope.push(buffer);
    let superkmers: Vec<_> = build_superkmers(rope, K, 1, 0.0);

    let workdir = tempfile::tempdir().unwrap();
    let mut partition = KmerPartition::create(workdir.path(), 0, K, M, true).unwrap();
    partition.write_batch(superkmers).unwrap();
    partition.close().unwrap();
    partition.dereplicate().unwrap();

    let part_dir = workdir.path().join("part_00000");
    let dedup_path = part_dir.join("dereplicated.skmer.zst");
    if !dedup_path.exists() {
        // Nothing was dereplicated into this partition: report an empty spectrum.
        return (0, 0);
    }
    count_partition(&part_dir, &dedup_path, K).unwrap();

    let spectrum_file = fs::File::open(part_dir.join("kmer_spectrum_raw.json")).unwrap();
    let spectrum: serde_json::Value = serde_json::from_reader(spectrum_file).unwrap();
    let field = |name: &str| spectrum[name].as_u64().unwrap_or(0);
    (field("f0"), field("f1"))
}
/// A single 20-base sequence: the pipeline's `(f0, f1)` must equal the
/// values computed directly by `direct_counts`.
#[test]
fn single_sequence_f0_f1_match() {
    let input: [&[u8]; 1] = [b"ACGTACGTACGTACGTACGT"];

    let (ef0, ef1) = direct_counts(&input);
    let (gf0, gf1) = pipeline_counts(&input);

    assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
    assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
}
/// Two different 20-base sequences: the pipeline's `(f0, f1)` must equal the
/// values computed directly by `direct_counts`.
#[test]
fn two_sequences_f0_f1_match() {
    let input: [&[u8]; 2] = [b"ACGTACGTACGTACGTACGT", b"TGCATGCATGCATGCATGCA"];

    let (ef0, ef1) = direct_counts(&input);
    let (gf0, gf1) = pipeline_counts(&input);

    assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
    assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
}
/// Feeding the same sequence twice must leave `f0` unchanged and double `f1`.
///
/// The pipeline result is checked both against `direct_counts` on the
/// duplicated input and against the single-copy counts, so the property in
/// the test name is asserted explicitly rather than only implied.
#[test]
fn repeated_sequence_f1_doubles() {
    let seq = b"ACGTACGTACGTACGTACGT";
    let once: &[&[u8]] = &[seq];
    let seqs: &[&[u8]] = &[seq, seq];

    let (sf0, sf1) = direct_counts(once);
    let (ef0, ef1) = direct_counts(seqs);
    let (gf0, gf1) = pipeline_counts(seqs);

    // Guard against a vacuous pass: the sequence must yield at least one k-mer.
    assert!(sf1 > 0, "reference sequence produced no k-mers");

    assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
    assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");

    // The property this test is named after.
    assert_eq!(gf0, sf0, "f0 changed on repetition: single {sf0}, got {gf0}");
    assert_eq!(gf1, 2 * sf1, "f1 did not double: single {sf1}, got {gf1}");
}
/// Twenty pseudo-random 40-base sequences: the pipeline's `(f0, f1)` must
/// equal the values computed directly by `direct_counts`.
///
/// The previous generator, `bases[(i * 7 + j * 3) % 4]`, reduces to
/// `bases[3 * (i + j) % 4]`: it produced only four different sequences, all
/// rotations of the period-4 repeat `ATGC`, so the input was neither distinct
/// nor varied. A fixed-seed linear congruential generator is used instead so
/// the data is varied yet fully reproducible.
#[test]
fn many_sequences_f0_f1_match() {
    const N_SEQS: usize = 20;
    const SEQ_LEN: usize = 40;

    let bases = b"ACGT";
    // 64-bit LCG (Knuth's MMIX multiplier/increment); only the two top bits,
    // which have the longest period, select the base.
    let mut state: u64 = 0x9E37_79B9_7F4A_7C15;
    let seqs: Vec<Vec<u8>> = (0..N_SEQS)
        .map(|_| {
            (0..SEQ_LEN)
                .map(|_| {
                    state = state
                        .wrapping_mul(6_364_136_223_846_793_005)
                        .wrapping_add(1_442_695_040_888_963_407);
                    bases[(state >> 62) as usize]
                })
                .collect()
        })
        .collect();

    // Make the premise of the test explicit: all generated sequences differ.
    let distinct: std::collections::HashSet<&[u8]> =
        seqs.iter().map(Vec::as_slice).collect();
    assert_eq!(distinct.len(), N_SEQS, "generator produced duplicate sequences");

    let seq_refs: Vec<&[u8]> = seqs.iter().map(|v| v.as_slice()).collect();
    let (ef0, ef1) = direct_counts(&seq_refs);
    let (gf0, gf1) = pipeline_counts(&seq_refs);
    assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
    assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
}
}