feat: implement persistent layered index and chunked binary format
Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
This commit is contained in:
@@ -3,6 +3,13 @@ name = "obikpartitionner"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
obikseq = { path = "../obikseq", features = ["test-utils"] }
|
||||
obiskbuilder = { path = "../obiskbuilder" }
|
||||
obiread = { path = "../obiread" }
|
||||
obikrope = { path = "../obikrope" }
|
||||
|
||||
[dependencies]
|
||||
niffler = "3.0.0"
|
||||
remove_dir_all = "0.8"
|
||||
|
||||
@@ -616,3 +616,119 @@ impl Drop for KmerPartition {
|
||||
let _ = self.close();
|
||||
}
|
||||
}
|
||||
|
||||
// ── integration tests ─────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use obikrope::Rope;
|
||||
use obikseq::SuperKmer;
|
||||
use obiskbuilder::build_superkmers;
|
||||
|
||||
/// K-mer length shared by the helpers and tests of this module.
const K: usize = 11;
|
||||
/// Second size parameter handed to `set_m` and `KmerPartition::create`
/// (presumably the minimizer length — confirm against `obikseq::params`).
const M: usize = 5;
|
||||
|
||||
fn setup() {
|
||||
obikseq::params::set_k(K);
|
||||
obikseq::params::set_m(M);
|
||||
}
|
||||
|
||||
/// Direct canonical k-mer counts from ASCII sequences — ground truth.
|
||||
fn direct_counts(seqs: &[&[u8]]) -> (u64, u64) {
|
||||
let mut counts: HashMap<Vec<u8>, u64> = HashMap::new();
|
||||
for seq in seqs {
|
||||
for i in 0..seq.len().saturating_sub(K - 1) {
|
||||
let km = SuperKmer::from_ascii(&seq[i..i + K]).to_ascii();
|
||||
*counts.entry(km).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
let f0 = counts.len() as u64;
|
||||
let f1: u64 = counts.values().sum();
|
||||
(f0, f1)
|
||||
}
|
||||
|
||||
/// Run the full pipeline on a list of sequences and return (f0, f1) from
|
||||
/// the `kmer_spectrum_raw.json` produced by `count_partition`.
|
||||
fn pipeline_counts(seqs: &[&[u8]]) -> (u64, u64) {
|
||||
setup();
|
||||
|
||||
let mut rope_data: Vec<u8> = Vec::new();
|
||||
for seq in seqs {
|
||||
rope_data.extend_from_slice(seq);
|
||||
rope_data.push(0x00);
|
||||
}
|
||||
let mut rope = Rope::new(None);
|
||||
rope.push(rope_data);
|
||||
|
||||
let superkmers: Vec<_> = build_superkmers(rope, K, 1, 0.0);
|
||||
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let mut kp = KmerPartition::create(dir.path(), 0, K, M, true).unwrap();
|
||||
kp.write_batch(superkmers).unwrap();
|
||||
kp.close().unwrap();
|
||||
kp.dereplicate().unwrap();
|
||||
|
||||
let part_dir = dir.path().join("part_00000");
|
||||
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !dedup_path.exists() {
|
||||
return (0, 0);
|
||||
}
|
||||
count_partition(&part_dir, &dedup_path, K).unwrap();
|
||||
|
||||
let spec: serde_json::Value = serde_json::from_reader(
|
||||
fs::File::open(part_dir.join("kmer_spectrum_raw.json")).unwrap(),
|
||||
).unwrap();
|
||||
let f0 = spec["f0"].as_u64().unwrap_or(0);
|
||||
let f1 = spec["f1"].as_u64().unwrap_or(0);
|
||||
(f0, f1)
|
||||
}
|
||||
|
||||
/// A single 20-base sequence: pipeline counts must equal the direct tally.
#[test]
fn single_sequence_f0_f1_match() {
    let input: [&[u8]; 1] = [b"ACGTACGTACGTACGTACGT"];

    let expected = direct_counts(&input);
    let observed = pipeline_counts(&input);

    assert_eq!(
        observed.0, expected.0,
        "f0 wrong: expected {ef0}, got {gf0}",
        ef0 = expected.0,
        gf0 = observed.0
    );
    assert_eq!(
        observed.1, expected.1,
        "f1 wrong: expected {ef1}, got {gf1}",
        ef1 = expected.1,
        gf1 = observed.1
    );
}
|
||||
|
||||
/// Two different 20-base sequences: pipeline counts must equal the direct
/// tally.
#[test]
fn two_sequences_f0_f1_match() {
    let input: [&[u8]; 2] = [b"ACGTACGTACGTACGTACGT", b"TGCATGCATGCATGCATGCA"];

    let expected = direct_counts(&input);
    let observed = pipeline_counts(&input);

    assert_eq!(
        observed.0, expected.0,
        "f0 wrong: expected {ef0}, got {gf0}",
        ef0 = expected.0,
        gf0 = observed.0
    );
    assert_eq!(
        observed.1, expected.1,
        "f1 wrong: expected {ef1}, got {gf1}",
        ef1 = expected.1,
        gf1 = observed.1
    );
}
|
||||
|
||||
/// The same sequence submitted twice: pipeline counts must equal the direct
/// tally computed over both copies.
#[test]
fn repeated_sequence_f1_doubles() {
    let seq: &[u8] = b"ACGTACGTACGTACGTACGT";
    let input = [seq; 2];

    let expected = direct_counts(&input);
    let observed = pipeline_counts(&input);

    assert_eq!(
        observed.0, expected.0,
        "f0 wrong: expected {ef0}, got {gf0}",
        ef0 = expected.0,
        gf0 = observed.0
    );
    assert_eq!(
        observed.1, expected.1,
        "f1 wrong: expected {ef1}, got {gf1}",
        ef1 = expected.1,
        gf1 = observed.1
    );
}
|
||||
|
||||
/// Twenty genuinely distinct, non-periodic 40-base sequences: pipeline counts
/// must equal the direct tally.
///
/// The previous generator `bases[(i * 7 + j * 3) % 4]` is congruent to
/// `3 * (i + j) mod 4`, so it only ever produced the four rotations of the
/// period-4 string "ATGC" (each repeated five times). The inputs were neither
/// distinct nor more varied than those of the smaller tests. A fixed-seed
/// generator is used instead so the data is varied yet fully reproducible.
#[test]
fn many_sequences_f0_f1_match() {
    const BASES: &[u8; 4] = b"ACGT";
    const N_SEQS: usize = 20;
    const SEQ_LEN: usize = 40;

    // 64-bit linear congruential generator (Knuth's MMIX constants) with a
    // fixed seed: deterministic and needs no extra dependency.
    let mut state: u64 = 0x9E37_79B9_7F4A_7C15;
    let mut next_base = || {
        state = state
            .wrapping_mul(6_364_136_223_846_793_005)
            .wrapping_add(1_442_695_040_888_963_407);
        // The two highest bits are the best-mixed ones; the value is in 0..4.
        BASES[(state >> 62) as usize]
    };

    let mut seqs: Vec<Vec<u8>> = Vec::with_capacity(N_SEQS);
    for _ in 0..N_SEQS {
        let seq: Vec<u8> = (0..SEQ_LEN).map(|_| next_base()).collect();
        seqs.push(seq);
    }

    // Guard the premise of the test: all generated sequences differ.
    let mut unique = seqs.clone();
    unique.sort_unstable();
    unique.dedup();
    assert_eq!(unique.len(), N_SEQS, "generator produced duplicate sequences");

    let seq_refs: Vec<&[u8]> = seqs.iter().map(Vec::as_slice).collect();
    let (ef0, ef1) = direct_counts(&seq_refs);
    let (gf0, gf1) = pipeline_counts(&seq_refs);

    assert_eq!(gf0, ef0, "f0 wrong: expected {ef0}, got {gf0}");
    assert_eq!(gf1, ef1, "f1 wrong: expected {ef1}, got {gf1}");
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user