feat: implement persistent layered index and chunked binary format

Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
2026-05-09 17:20:08 +08:00
parent 8c17bf958b
commit 5169f65dc9
24 changed files with 1342 additions and 382 deletions
@@ -3,6 +3,13 @@ name = "obikpartitionner"
 version = "0.1.0"
 edition = "2024"

+[dev-dependencies]
+tempfile = "3"
+obikseq   = { path = "../obikseq",   features = ["test-utils"] }
+obiskbuilder = { path = "../obiskbuilder" }
+obiread   = { path = "../obiread" }
+obikrope  = { path = "../obikrope" }
+
 [dependencies]
 niffler = "3.0.0"
 remove_dir_all = "0.8"
@@ -616,3 +616,119 @@ impl Drop for KmerPartition {
        let _ = self.close();
    }
 }
+
+// ── integration tests ─────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+
+    use obikrope::Rope;
+    use obikseq::SuperKmer;
+    use obiskbuilder::build_superkmers;
+
+    const K: usize = 11;
+    const M: usize = 5;
+
+    /// Pushes the test constants `K` and `M` into `obikseq::params` via
+    /// `set_k` / `set_m`.
+    ///
+    /// Called at the start of `pipeline_counts`, so every test re-applies the
+    /// same two values.
+    // NOTE(review): presumably these are process-wide k-mer / minimizer sizes
+    // shared by all tests in this binary — confirm against `obikseq::params`.
+    fn setup() {
+        obikseq::params::set_k(K);
+        obikseq::params::set_m(M);
+    }
+
+    /// Ground-truth `(f0, f1)` computed directly from ASCII sequences.
+    ///
+    /// Every window of `K` bases is passed through
+    /// `SuperKmer::from_ascii(..).to_ascii()` and tallied. `f0` is the number of
+    /// distinct keys, `f1` the total number of windows counted. Sequences
+    /// shorter than `K` contribute nothing.
+    // NOTE(review): the "canonical" ground truth relies on the
+    // `from_ascii` / `to_ascii` round trip normalising strand — confirm in
+    // `obikseq`.
+    fn direct_counts(seqs: &[&[u8]]) -> (u64, u64) {
+        // The tests compute this reference *before* calling `pipeline_counts`,
+        // which is where `setup()` otherwise runs. Apply the parameters here too
+        // so the result never depends on test or call order, in case `SuperKmer`
+        // construction consults them.
+        setup();
+
+        let mut counts: HashMap<Vec<u8>, u64> = HashMap::new();
+        // `windows(K)` yields nothing for inputs shorter than `K`.
+        for window in seqs.iter().flat_map(|seq| seq.windows(K)) {
+            let km = SuperKmer::from_ascii(window).to_ascii();
+            *counts.entry(km).or_default() += 1;
+        }
+        let f0 = counts.len() as u64;
+        let f1: u64 = counts.values().sum();
+        (f0, f1)
+    }
+
+    /// Drives the on-disk pipeline for `seqs` and reports `(f0, f1)`.
+    ///
+    /// Steps, in order: apply the test parameters, pack the sequences into a
+    /// `Rope` (each one followed by a `0x00` byte), build super-kmers, write
+    /// them into a fresh `KmerPartition` under a temporary directory, close,
+    /// dereplicate, then run `count_partition` on `part_00000` and read the
+    /// `f0` / `f1` fields of `kmer_spectrum_raw.json`.
+    ///
+    /// Returns `(0, 0)` when `dereplicated.skmer.zst` does not exist; a JSON
+    /// field that is missing or not an unsigned integer is read as `0`.
+    fn pipeline_counts(seqs: &[&[u8]]) -> (u64, u64) {
+        setup();
+
+        // Concatenate all sequences, terminating each with a NUL byte.
+        let packed: Vec<u8> = seqs
+            .iter()
+            .flat_map(|s| s.iter().copied().chain(std::iter::once(0x00)))
+            .collect();
+        let mut rope = Rope::new(None);
+        rope.push(packed);
+
+        // NOTE(review): meaning of the trailing `1, 0.0` arguments is not
+        // visible here — see `obiskbuilder::build_superkmers`.
+        let skmers: Vec<_> = build_superkmers(rope, K, 1, 0.0);
+
+        // The temp dir guard must outlive every path derived from it.
+        let workdir = tempfile::tempdir().unwrap();
+        let mut partition = KmerPartition::create(workdir.path(), 0, K, M, true).unwrap();
+        partition.write_batch(skmers).unwrap();
+        partition.close().unwrap();
+        partition.dereplicate().unwrap();
+
+        let part_dir = workdir.path().join("part_00000");
+        let dedup_path = part_dir.join("dereplicated.skmer.zst");
+        if dedup_path.exists() {
+            count_partition(&part_dir, &dedup_path, K).unwrap();
+
+            let json = fs::File::open(part_dir.join("kmer_spectrum_raw.json")).unwrap();
+            let spectrum: serde_json::Value = serde_json::from_reader(json).unwrap();
+            let field = |name: &str| spectrum[name].as_u64().unwrap_or(0);
+            (field("f0"), field("f1"))
+        } else {
+            (0, 0)
+        }
+    }
+
+    /// A single 20-base input: the pipeline's `(f0, f1)` must equal the values
+    /// from `direct_counts`.
+    #[test]
+    fn single_sequence_f0_f1_match() {
+        let input: [&[u8]; 1] = [b"ACGTACGTACGTACGTACGT"];
+        let (ef0, ef1) = direct_counts(&input);
+        let (gf0, gf1) = pipeline_counts(&input);
+        assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
+        assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
+    }
+
+    /// Two different 20-base inputs fed together: the pipeline's `(f0, f1)`
+    /// must equal the values from `direct_counts`.
+    #[test]
+    fn two_sequences_f0_f1_match() {
+        let first: &[u8] = b"ACGTACGTACGTACGTACGT";
+        let second: &[u8] = b"TGCATGCATGCATGCATGCA";
+        let input = [first, second];
+        let (ef0, ef1) = direct_counts(&input);
+        let (gf0, gf1) = pipeline_counts(&input);
+        assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
+        assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
+    }
+
+    /// The same sequence supplied twice: the pipeline's `(f0, f1)` must equal
+    /// the values `direct_counts` reports for the duplicated input.
+    #[test]
+    fn repeated_sequence_f1_doubles() {
+        let seq: &[u8] = b"ACGTACGTACGTACGTACGT";
+        let input = [seq, seq];
+        let (ef0, ef1) = direct_counts(&input);
+        let (gf0, gf1) = pipeline_counts(&input);
+        assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
+        assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
+    }
+
+    /// Twenty distinct pseudo-random 40-base inputs: the pipeline's `(f0, f1)`
+    /// must equal the values from `direct_counts`.
+    #[test]
+    fn many_sequences_f0_f1_match() {
+        // 20 distinct sequences of length 40, generated by a fixed-seed LCG so
+        // the fixture is reproducible. The previous formula
+        // `(i * 7 + j * 3) % 4` reduces to `3 * (i + j) mod 4`, which yields
+        // only four distinct sequences, all rotations of the period-4 string
+        // "ATGC" — far less varied than the test intends.
+        let bases = b"ACGT";
+        let mut state: u64 = 0x9E37_79B9_7F4A_7C15;
+        let mut next_base = || {
+            state = state
+                .wrapping_mul(6_364_136_223_846_793_005)
+                .wrapping_add(1_442_695_040_888_963_407);
+            // The top two bits of an LCG state are its best-mixed bits.
+            bases[(state >> 62) as usize]
+        };
+        let seqs: Vec<Vec<u8>> = (0..20)
+            .map(|_| (0..40).map(|_| next_base()).collect())
+            .collect();
+
+        // Guard the fixture's own premise so a future generator change cannot
+        // silently collapse it back to a handful of repeated inputs.
+        let unique: std::collections::HashSet<&[u8]> =
+            seqs.iter().map(|v| v.as_slice()).collect();
+        assert_eq!(unique.len(), seqs.len(), "fixture sequences must be distinct");
+
+        let seq_refs: Vec<&[u8]> = seqs.iter().map(|v| v.as_slice()).collect();
+        let (ef0, ef1) = direct_counts(&seq_refs);
+        let (gf0, gf1) = pipeline_counts(&seq_refs);
+        assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
+        assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
+    }
+}