From 9c41891cc85834f076bed1c2b960e650c0c6e601 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 12 May 2026 14:05:18 +0800 Subject: [PATCH] feat: add obilayeredmap crate for disk-backed k-mer indexing Introduces the `obilayeredmap` crate (v0.1.0), implementing an append-only, disk-backed k-mer index using a minimal perfect hash function (MPHF). The module features memory-mapped reads, buffered writes, custom error handling, partition metadata persistence, and unit tests for the layer and map modules. Also switches the existing `obikseq` reverse-complement benchmark from `SuperKmer` to `PackedSeq::revcomp_inplace`, registers the new crate in the workspace `Cargo.toml`, and updates `Cargo.lock` with the new dependencies. --- .DS_Store | Bin 6148 -> 8196 bytes src/Cargo.lock | 13 +++ src/Cargo.toml | 2 +- src/obikseq/benches/superkmer.rs | 10 ++- src/obilayeredmap/Cargo.toml | 16 ++++ src/obilayeredmap/src/counts.rs | 54 ++++++++++++ src/obilayeredmap/src/error.rs | 50 +++++++++++ src/obilayeredmap/src/evidence.rs | 60 +++++++++++++ src/obilayeredmap/src/layer.rs | 122 +++++++++++++++++++++++++++ src/obilayeredmap/src/lib.rs | 10 +++ src/obilayeredmap/src/map.rs | 94 +++++++++++++++++++++ src/obilayeredmap/src/meta.rs | 34 ++++++++ src/obilayeredmap/src/tests/layer.rs | 77 +++++++++++++++++ src/obilayeredmap/src/tests/map.rs | 100 ++++++++++++++++++++++ 14 files changed, 637 insertions(+), 5 deletions(-) create mode 100644 src/obilayeredmap/Cargo.toml create mode 100644 src/obilayeredmap/src/counts.rs create mode 100644 src/obilayeredmap/src/error.rs create mode 100644 src/obilayeredmap/src/evidence.rs create mode 100644 src/obilayeredmap/src/layer.rs create mode 100644 src/obilayeredmap/src/lib.rs create mode 100644 src/obilayeredmap/src/map.rs create mode 100644 src/obilayeredmap/src/meta.rs create mode 100644 src/obilayeredmap/src/tests/layer.rs create mode 100644 src/obilayeredmap/src/tests/map.rs diff --git a/.DS_Store b/.DS_Store index 2eab9d69cd2ce0ce11e11f6f840c71aa4377774e..0c99d24b907f53caa448c049fbb1825b743a74a1 100644 GIT binary patch literal 8196 
zcmeHMUu+ab7@yy^(A|Z~0Hw6I0WLHGmLjD&jVizNC*z9}hDOEpQ5J!59B>`&wzFL#q}+z=xW zBM>7HBM>7HBXDORKzFuC>MZxZ?2XG9ff#|i5&>~OBqTIm~1p%d_!{`2CaIgk$?bYwKkwRn>Fm*3@V<^J*SVAN9)VAR83@ z?rhXay>ho#bWK0BP0J2VQ?(3l zIB}w_qrD}$am&_|Ey)udZLKZIP3@abp48MOP3yPq&m42cUGD|C2t)?~tDUi^)0Hc7 z{8EqnB2m>iy`n48id0RXXAF<%qZz)aMAIIa2yD;w4_F1CWQH&BX?gv5&wZp|iT>fd z=M=|XKf@R1vrfL~WCN?qanklvmOm0$#{=;+>U)KtCsfREUi7W~7U|g~tCzI>5u08o z)aqtT8`^vGmn>VkYHjQ0&Koe`ZD~L{lEUF^f&GQ|t^o%g(X$>|^#N`-*+T zuCnjh59}KIk^RhmWq+Xx^KlOrfTIzsum<;`85^+!DRf~s2CxtNk;WLZ$isvM7XeBr z<2atd^Eiz&coDDT4ZMwa@Gj2d0zStVxQMSYh0FL3S8xr#;dlIjzZF%{6kS=Y)G77K zQl(j0t87v_lukK!W!hy9RdP@1q52Jn0J%)`5l@-8mCGb{KB)KK5iWCezI2ksbxW2v zty$OBzAbf~O_F}ojJuJ?A$cn(ACfoR=^3n>lJ2B0oA$JselPi)Z0k9d`=HYYY~Du=9W)jKuvC|)hA59m8I z@^Q|>YN9*QtCCZ2^}loVefBB4$S$!d;_6St(qGs=P>G-QND@O=6F=8t9on%49q7b% z^kFA0|NDmk%Z%^; M`2G*McQ+UR0%J}grT_o{ delta 125 zcmZp1XfcprU|?W$DortDU=RQ@Ie-{MGjdEU6q~50$jCG?VE1GL8J5ZX0(UpB6pLfr z*szynF*^r`ATv-I5D0Js30IJgjfLNtC-ci#Du9F;85o$LGz*9ZazJK-tYg?5&ohS^ E0BV>MEC2ui diff --git a/src/Cargo.lock b/src/Cargo.lock index 9761b26..c41e900 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1649,6 +1649,19 @@ dependencies = [ "xxhash-rust", ] +[[package]] +name = "obilayeredmap" +version = "0.1.0" +dependencies = [ + "memmap2", + "obikseq", + "obiskio", + "ph", + "serde", + "serde_json", + "tempfile", +] + [[package]] name = "obipipeline" version = "0.1.0" diff --git a/src/Cargo.toml b/src/Cargo.toml index 3a7a35e..997fbfe 100644 --- a/src/Cargo.toml +++ b/src/Cargo.toml @@ -1,5 +1,5 @@ [workspace] resolver = "3" -members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj"] +members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap"] [profile.release] debug = 1 diff --git a/src/obikseq/benches/superkmer.rs b/src/obikseq/benches/superkmer.rs index f2160a4..0464390 
100644 --- a/src/obikseq/benches/superkmer.rs +++ b/src/obikseq/benches/superkmer.rs @@ -1,5 +1,7 @@ use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use obikseq::packed_seq::PackedSeq; use obikseq::superkmer::SuperKmer; +use obikseq::Sequence; const LENGTHS: &[usize] = &[1, 4, 8, 16, 40, 64, 128, 255, 256]; @@ -50,12 +52,12 @@ fn bench_write_ascii(c: &mut Criterion) { fn bench_revcomp(c: &mut Criterion) { let mut group = c.benchmark_group("revcomp"); for &len in LENGTHS { - let sk = SuperKmer::from_ascii(&make_ascii(len)); + let seq = PackedSeq::from_ascii(&make_ascii(len)); group.throughput(Throughput::Bytes(len as u64)); - group.bench_with_input(BenchmarkId::from_parameter(len), &sk, |b, sk| { + group.bench_with_input(BenchmarkId::from_parameter(len), &seq, |b, seq| { b.iter_batched( - || sk.clone(), - |mut s| { std::hint::black_box(&mut s).revcomp(); s }, + || seq.clone(), + |s| { let mut s = std::hint::black_box(s); s.revcomp_inplace(); s }, BatchSize::SmallInput, ); }); diff --git a/src/obilayeredmap/Cargo.toml b/src/obilayeredmap/Cargo.toml new file mode 100644 index 0000000..2487fc3 --- /dev/null +++ b/src/obilayeredmap/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "obilayeredmap" +version = "0.1.0" +edition = "2024" + +[dependencies] +obikseq = { path = "../obikseq" } +obiskio = { path = "../obiskio" } +ph = "0.11" +memmap2 = "0.9" +serde = { version = "1", features = ["derive"] } +serde_json = "1" + +[dev-dependencies] +tempfile = "3" +obikseq = { path = "../obikseq", features = ["test-utils"] } diff --git a/src/obilayeredmap/src/counts.rs b/src/obilayeredmap/src/counts.rs new file mode 100644 index 0000000..cd258d9 --- /dev/null +++ b/src/obilayeredmap/src/counts.rs @@ -0,0 +1,54 @@ +// u32 per MPHF slot: raw occurrence count for the kmer at that slot. 
+
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+
+use memmap2::Mmap;
+
+use crate::error::{OLMError, OLMResult};
+
+/// Read-only, memory-mapped view of a counts file: one little-endian `u32`
+/// per MPHF slot.
+pub struct Counts {
+    mmap: Mmap,
+}
+
+impl Counts {
+    /// Memory-maps the counts file at `path`.
+    ///
+    /// # Errors
+    /// Returns `OLMError::Io` if the file cannot be opened or mapped, and
+    /// `OLMError::InvalidLayer` if its size is not a multiple of 4 bytes.
+    pub fn open(path: &Path) -> OLMResult<Self> {
+        let f = File::open(path)?;
+        // SAFETY: the mapping is only ever read; soundness relies on no other
+        // process truncating or rewriting the file while it is mapped.
+        let mmap = unsafe { Mmap::map(&f)? };
+        if mmap.len() % 4 != 0 {
+            return Err(OLMError::InvalidLayer(format!(
+                "{}: size {} is not a multiple of 4 bytes",
+                path.display(),
+                mmap.len()
+            )));
+        }
+        Ok(Self { mmap })
+    }
+
+    /// Returns the count stored at `slot`.
+    ///
+    /// # Panics
+    /// Panics if `slot >= self.len()`.
+    #[inline]
+    pub fn get(&self, slot: usize) -> u32 {
+        let off = slot * 4;
+        u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
+    }
+
+    /// Number of `u32` slots held by the file.
+    pub fn len(&self) -> usize {
+        self.mmap.len() / 4
+    }
+
+    /// Returns `true` when the file holds no slot.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+/// In-memory builder for a counts file; every slot starts at 0.
+pub struct CountsWriter {
+    buf: Vec<u32>,
+}
+
+impl CountsWriter {
+    /// Allocates `n_slots` zeroed counters.
+    pub fn new(n_slots: usize) -> Self {
+        Self { buf: vec![0u32; n_slots] }
+    }
+
+    /// Sets the count of `slot`.
+    ///
+    /// # Panics
+    /// Panics if `slot` is not below the `n_slots` passed to `new`.
+    #[inline]
+    pub fn set(&mut self, slot: usize, count: u32) {
+        self.buf[slot] = count;
+    }
+
+    /// Writes every counter to `path` as a little-endian `u32`.
+    pub fn write(self, path: &Path) -> OLMResult<()> {
+        let mut f = BufWriter::new(File::create(path)?);
+        for v in self.buf {
+            f.write_all(&v.to_le_bytes()).map_err(OLMError::Io)?;
+        }
+        // Flush explicitly: an error raised while `BufWriter` flushes on drop
+        // is silently discarded.
+        f.flush()?;
+        Ok(())
+    }
+}
diff --git a/src/obilayeredmap/src/error.rs b/src/obilayeredmap/src/error.rs
new file mode 100644
index 0000000..c6b4e7f
--- /dev/null
+++ b/src/obilayeredmap/src/error.rs
@@ -0,0 +1,50 @@
+use std::fmt;
+use std::io;
+
+/// Errors reported by the layered-map crate.
+#[derive(Debug)]
+pub enum OLMError {
+    /// Underlying file-system or mapping failure.
+    Io(io::Error),
+    /// Failure while (de)serialising JSON.
+    Json(serde_json::Error),
+    /// Inconsistency reported around the minimal perfect hash function.
+    Mphf(String),
+    /// A layer's files are malformed or inconsistent with each other.
+    InvalidLayer(String),
+}
+
+/// Result alias used throughout the crate.
+pub type OLMResult<T> = Result<T, OLMError>;
+
+impl fmt::Display for OLMError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            OLMError::Io(e) => write!(f, "I/O error: {e}"),
+            OLMError::Json(e) => write!(f, "JSON error: {e}"),
+            OLMError::Mphf(s) => write!(f, "MPHF error: {s}"),
+            OLMError::InvalidLayer(s) => write!(f, "invalid layer: {s}"),
+        }
+    }
+}
+
+impl std::error::Error for OLMError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            OLMError::Io(e) => Some(e),
+            OLMError::Json(e) => Some(e),
+            // Listed explicitly so that a new variant forces a decision here.
+            OLMError::Mphf(_) | OLMError::InvalidLayer(_) => None,
+        }
+    }
+}
+
+impl From<io::Error> for OLMError {
+    fn from(e: io::Error) -> Self { OLMError::Io(e) }
+}
+
+impl From<serde_json::Error> for OLMError {
+    fn from(e: serde_json::Error) -> Self { OLMError::Json(e) }
+}
+
+impl From<obiskio::SKError> for OLMError {
+    /// Keeps I/O failures as `Io`; every other `SKError` becomes
+    /// `InvalidLayer` carrying the original message.
+    fn from(e: obiskio::SKError) -> Self {
+        match e {
+            obiskio::SKError::Io(io_err) => OLMError::Io(io_err),
+            other => OLMError::InvalidLayer(other.to_string()),
+        }
+    }
+}
diff --git a/src/obilayeredmap/src/evidence.rs b/src/obilayeredmap/src/evidence.rs
new file mode 100644
index 0000000..3bc8e86
--- /dev/null
+++ b/src/obilayeredmap/src/evidence.rs
@@ -0,0 +1,60 @@
+// u32 per MPHF slot: bits [31:7] = chunk_id (25 bits), bits [6:0] = rank (7 bits).
+
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+
+use memmap2::Mmap;
+
+use crate::error::{OLMError, OLMResult};
+
+/// Read-only, memory-mapped view of an evidence file: one little-endian
+/// `u32` per MPHF slot, packed as described at the top of this file.
+pub struct Evidence {
+    mmap: Mmap,
+}
+
+impl Evidence {
+    /// Memory-maps the evidence file at `path`.
+    ///
+    /// # Errors
+    /// Returns `OLMError::Io` if the file cannot be opened or mapped, and
+    /// `OLMError::InvalidLayer` if its size is not a multiple of 4 bytes.
+    pub fn open(path: &Path) -> OLMResult<Self> {
+        let f = File::open(path)?;
+        // SAFETY: the mapping is only ever read; soundness relies on no other
+        // process truncating or rewriting the file while it is mapped.
+        let mmap = unsafe { Mmap::map(&f)? };
+        if mmap.len() % 4 != 0 {
+            return Err(OLMError::InvalidLayer(format!(
+                "{}: size {} is not a multiple of 4 bytes",
+                path.display(),
+                mmap.len()
+            )));
+        }
+        Ok(Self { mmap })
+    }
+
+    /// Returns `(chunk_id, rank)` stored at `slot`.
+    ///
+    /// # Panics
+    /// Panics if `slot >= self.len()`.
+    #[inline]
+    pub fn decode(&self, slot: usize) -> (u32, u8) {
+        let off = slot * 4;
+        let raw = u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap());
+        (raw >> 7, (raw & 0x7F) as u8)
+    }
+
+    /// Number of `u32` slots held by the file.
+    pub fn len(&self) -> usize {
+        self.mmap.len() / 4
+    }
+
+    /// Returns `true` when the file holds no slot.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+/// Packs `chunk_id` (25 bits) and `rank` (7 bits) into one evidence word.
+///
+/// Bits that do not fit their field are dropped; debug builds assert that
+/// both values are in range so that such a loss is caught early.
+#[inline]
+pub fn encode(chunk_id: u32, rank: u8) -> u32 {
+    debug_assert!(chunk_id >> 25 == 0, "chunk_id {chunk_id} does not fit in 25 bits");
+    debug_assert!(rank <= 0x7F, "rank {rank} does not fit in 7 bits");
+    (chunk_id << 7) | (rank as u32 & 0x7F)
+}
+
+/// In-memory builder for an evidence file; every slot starts at 0.
+pub struct EvidenceWriter {
+    buf: Vec<u32>,
+}
+
+impl EvidenceWriter {
+    /// Allocates `n_slots` zeroed evidence words.
+    pub fn new(n_slots: usize) -> Self {
+        Self { buf: vec![0u32; n_slots] }
+    }
+
+    /// Stores the packed `(chunk_id, rank)` pair at `slot`.
+    ///
+    /// # Panics
+    /// Panics if `slot` is not below the `n_slots` passed to `new`.
+    #[inline]
+    pub fn set(&mut self, slot: usize, chunk_id: u32, rank: u8) {
+        self.buf[slot] = encode(chunk_id, rank);
+    }
+
+    /// Writes every evidence word to `path` as a little-endian `u32`.
+    pub fn write(self, path: &Path) -> OLMResult<()> {
+        let mut f = BufWriter::new(File::create(path)?);
+        for v in self.buf {
+            f.write_all(&v.to_le_bytes()).map_err(OLMError::Io)?;
+        }
+        // Flush explicitly: an error raised while `BufWriter` flushes on drop
+        // is silently discarded.
+        f.flush()?;
+        Ok(())
+    }
+}
diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs
new file mode 100644
index 0000000..d42f289
--- /dev/null
+++ b/src/obilayeredmap/src/layer.rs
@@ -0,0 +1,122 @@
+use std::collections::HashMap;
+use std::fs;
+use std::io::{BufReader, BufWriter};
+use std::path::Path;
+
+use obikseq::{CanonicalKmer, Kmer, Sequence};
+use obiskio::{UnitigFileReader, UnitigFileWriter};
+use ph::fmph;
+
+use crate::counts::{Counts, CountsWriter};
+use crate::error::{OLMError, OLMResult};
+use crate::evidence::{Evidence, EvidenceWriter};
+
+const MPHF_FILE: &str = "mphf.bin";
+const UNITIGS_FILE: &str = "unitigs.bin";
+const EVIDENCE_FILE: &str = "evidence.bin";
+const COUNTS_FILE: &str = "counts.bin";
+
+/// One immutable layer of the index: an MPHF over canonical kmers plus, per
+/// slot, the evidence locating the kmer in the unitig file and its count.
+pub struct Layer {
+    mphf: fmph::Function,
+    evidence: Evidence,
+    unitigs: UnitigFileReader,
+    counts: Counts,
+}
+
+/// Result of a successful lookup: the MPHF slot and the count stored there.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Hit {
+    pub slot: usize,
+    pub count: u32,
+}
+
+impl Layer {
+    /// Opens the layer stored in directory `path`.
+    ///
+    /// # Errors
+    /// Propagates I/O failures, and returns `OLMError::InvalidLayer` when the
+    /// evidence and counts files do not hold the same number of slots.
+    pub fn open(path: &Path) -> OLMResult<Self> {
+        // Buffered so that deserialising the MPHF does not hit the file
+        // once per small read.
+        let mut mphf_in = BufReader::new(fs::File::open(path.join(MPHF_FILE))?);
+        let mphf = fmph::Function::read(&mut mphf_in).map_err(OLMError::Io)?;
+
+        let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
+        let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
+        let counts = Counts::open(&path.join(COUNTS_FILE))?;
+
+        // `query` indexes both tables with the same slot.
+        if evidence.len() != counts.len() {
+            return Err(OLMError::InvalidLayer(format!(
+                "{}: {} evidence slots but {} count slots",
+                path.display(),
+                evidence.len(),
+                counts.len()
+            )));
+        }
+
+        Ok(Self { mphf, evidence, unitigs, counts })
+    }
+
+    /// Looks `kmer` up in this layer.
+    ///
+    /// The slot proposed by the MPHF is only accepted once the kmer found in
+    /// the unitig file at the recorded `(chunk_id, rank)` is verified against
+    /// `kmer`; otherwise `None` is returned.
+    pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
+        let slot = self.mphf.get(&kmer.raw())? as usize;
+        let (chunk_id, rank) = self.evidence.decode(slot);
+        if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
+            Some(Hit { slot, count: self.counts.get(slot) })
+        } else {
+            None
+        }
+    }
+
+    /// Build a layer from unitigs already written to `out_dir/unitigs.bin`.
+    ///
+    /// `count_of` maps each canonical kmer to its occurrence count.
+    /// Returns the number of kmers indexed.
+    ///
+    /// Chunks shorter than `k` contribute no kmer.
+    ///
+    /// # Errors
+    /// Returns `OLMError::InvalidLayer` when a chunk id or a kmer rank does
+    /// not fit the 25-bit / 7-bit fields of an evidence entry, and propagates
+    /// I/O failures met while reading the unitigs or writing the layer files.
+    pub fn build(
+        out_dir: &Path,
+        count_of: impl Fn(CanonicalKmer) -> u32,
+    ) -> OLMResult<usize> {
+        // Capacity of the two bit fields of an evidence word.
+        const MAX_CHUNKS: usize = 1 << 25;
+        const MAX_KMERS_PER_CHUNK: usize = 1 << 7;
+
+        let k = obikseq::params::k();
+        let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
+
+        if unitigs.len() > MAX_CHUNKS {
+            return Err(OLMError::InvalidLayer(format!(
+                "{} unitig chunks do not fit the 25-bit chunk id of an evidence entry",
+                unitigs.len()
+            )));
+        }
+
+        let mut entries: Vec<(u64, u32, u8)> = Vec::new();
+        for chunk_id in 0..unitigs.len() {
+            // `seql - k + 1` would underflow for a chunk shorter than k.
+            let n_kmers = (unitigs.seql(chunk_id) + 1).saturating_sub(k);
+            if n_kmers > MAX_KMERS_PER_CHUNK {
+                return Err(OLMError::InvalidLayer(format!(
+                    "chunk {chunk_id} holds {n_kmers} kmers; \
+                     the 7-bit rank of an evidence entry allows at most {MAX_KMERS_PER_CHUNK}"
+                )));
+            }
+            for rank in 0..n_kmers {
+                let raw = unitigs.raw_kmer(chunk_id, rank);
+                let canonical: CanonicalKmer = Kmer::from_raw(raw).canonical();
+                // Both casts are lossless thanks to the range checks above.
+                entries.push((canonical.raw(), chunk_id as u32, rank as u8));
+            }
+        }
+
+        let n = entries.len();
+
+        // An empty key set goes through the same path: it yields an empty
+        // MPHF and zero-length evidence and counts files.
+        let keys: Vec<u64> = entries.iter().map(|&(key, _, _)| key).collect();
+        let mphf = fmph::Function::new(keys);
+        let mut mphf_out = BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?);
+        mphf.write(&mut mphf_out)?;
+        // Flush explicitly: an error raised while `BufWriter` flushes on drop
+        // is silently discarded.
+        std::io::Write::flush(&mut mphf_out)?;
+
+        let mut ev = EvidenceWriter::new(n);
+        let mut cnt = CountsWriter::new(n);
+
+        for &(key, chunk_id, rank) in &entries {
+            let slot = mphf.get(&key).ok_or_else(|| {
+                OLMError::Mphf(format!("key {key:#x} is missing from the freshly built MPHF"))
+            })? as usize;
+            ev.set(slot, chunk_id, rank);
+            let kmer = CanonicalKmer::from_raw_unchecked(key);
+            cnt.set(slot, count_of(kmer));
+        }
+
+        ev.write(&out_dir.join(EVIDENCE_FILE))?;
+        cnt.write(&out_dir.join(COUNTS_FILE))?;
+
+        Ok(n)
+    }
+
+    /// Convenience variant of `build` that accepts a `HashMap`.
+    ///
+    /// Kmers absent from `counts` are stored with a count of 0.
+    pub fn build_from_map(
+        out_dir: &Path,
+        counts: &HashMap<CanonicalKmer, u32>,
+    ) -> OLMResult<usize> {
+        Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
+    }
+
+    /// Return a `UnitigFileWriter` targeting this layer's `unitigs.bin`.
+    /// The caller writes unitigs, then calls `Layer::build` to finish the layer.
+    ///
+    /// `out_dir` is created (with its parents) if it does not exist yet.
+    pub fn unitig_writer(out_dir: &Path) -> OLMResult<UnitigFileWriter> {
+        fs::create_dir_all(out_dir)?;
+        Ok(UnitigFileWriter::create(&out_dir.join(UNITIGS_FILE))?)
+    }
+}
+
+#[cfg(test)]
+#[path = "tests/layer.rs"]
+mod tests;
diff --git a/src/obilayeredmap/src/lib.rs b/src/obilayeredmap/src/lib.rs
new file mode 100644
index 0000000..babd20e
--- /dev/null
+++ b/src/obilayeredmap/src/lib.rs
@@ -0,0 +1,10 @@
+pub mod counts;
+pub mod error;
+pub mod evidence;
+pub mod layer;
+pub mod map;
+pub mod meta;
+
+pub use error::{OLMError, OLMResult};
+pub use layer::{Hit, Layer};
+pub use map::LayeredMap;
diff --git a/src/obilayeredmap/src/map.rs b/src/obilayeredmap/src/map.rs
new file mode 100644
index 0000000..2c8a349
--- /dev/null
+++ b/src/obilayeredmap/src/map.rs
@@ -0,0 +1,94 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use obikseq::CanonicalKmer;
+use obiskio::UnitigFileWriter;
+
+use crate::error::OLMResult;
+use crate::layer::{Hit, Layer};
+use crate::meta::PartitionMeta;
+
+/// Layered kmer index for a single partition.
+///
+/// Each layer covers a disjoint kmer set. Queries probe layers in order;
+/// the first match wins. Adding a dataset appends a new layer without
+/// rebuilding existing ones.
+pub struct LayeredMap {
+    root: PathBuf,
+    meta: PartitionMeta,
+    layers: Vec<Layer>,
+}
+
+impl LayeredMap {
+    /// Open an existing layered index at `root`.
+    ///
+    /// Reads the partition metadata, then opens `n_layers` layer
+    /// directories; the first failure aborts the whole open.
+    pub fn open(root: &Path) -> OLMResult<Self> {
+        let meta = PartitionMeta::load(root)?;
+        let layers = (0..meta.n_layers)
+            .map(|i| Layer::open(&layer_dir(root, i)))
+            .collect::<OLMResult<Vec<_>>>()?;
+        Ok(Self { root: root.to_owned(), meta, layers })
+    }
+
+    /// Create a new, empty layered index at `root`.
+    ///
+    /// The metadata file is (re)written with zero layers, so calling this on
+    /// a directory that already holds an index resets its layer count.
+    pub fn create(root: &Path) -> OLMResult<Self> {
+        fs::create_dir_all(root)?;
+        let meta = PartitionMeta::new();
+        meta.save(root)?;
+        Ok(Self { root: root.to_owned(), meta, layers: Vec::new() })
+    }
+
+    /// Number of layers currently loaded.
+    pub fn n_layers(&self) -> usize {
+        self.layers.len()
+    }
+
+    /// Returns layer `i`.
+    ///
+    /// # Panics
+    /// Panics if `i >= self.n_layers()`.
+    pub fn layer(&self, i: usize) -> &Layer {
+        &self.layers[i]
+    }
+
+    /// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match.
+    pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit)> {
+        // Layers are probed in insertion order; the first hit wins.
+        self.layers.iter().enumerate().find_map(|(i, layer)| {
+            layer.query(kmer).map(|hit| (i, hit))
+        })
+    }
+
+    /// Return a `UnitigFileWriter` for the next layer to be built.
+    /// The caller writes unitigs, calls `.close()` on the writer,
+    /// then calls `push_layer` to finish.
+    pub fn next_layer_writer(&self) -> OLMResult<UnitigFileWriter> {
+        let dir = layer_dir(&self.root, self.layers.len());
+        Layer::unitig_writer(&dir)
+    }
+
+    /// Build and append the next layer from a count closure.
+    /// Unitigs must already have been written via `next_layer_writer`.
+    ///
+    /// Returns the index of the new layer. The metadata file is rewritten
+    /// only after the layer has been built and reopened successfully.
+    pub fn push_layer(
+        &mut self,
+        count_of: impl Fn(CanonicalKmer) -> u32,
+    ) -> OLMResult<usize> {
+        let i = self.layers.len();
+        let dir = layer_dir(&self.root, i);
+        Layer::build(&dir, count_of)?;
+        self.layers.push(Layer::open(&dir)?);
+        self.meta.n_layers = self.layers.len();
+        self.meta.save(&self.root)?;
+        Ok(i)
+    }
+
+    /// Convenience variant of `push_layer` that accepts a `HashMap`.
+    ///
+    /// Kmers absent from `counts` are stored with a count of 0.
+    pub fn push_layer_from_map(
+        &mut self,
+        counts: &HashMap<CanonicalKmer, u32>,
+    ) -> OLMResult<usize> {
+        self.push_layer(|kmer| counts.get(&kmer).copied().unwrap_or(0))
+    }
+}
+
+/// Directory of layer `i` under `root` (`root/layer_<i>`).
+fn layer_dir(root: &Path, i: usize) -> PathBuf {
+    root.join(format!("layer_{i}"))
+}
+
+#[cfg(test)]
+#[path = "tests/map.rs"]
+mod tests;
diff --git a/src/obilayeredmap/src/meta.rs b/src/obilayeredmap/src/meta.rs
new file mode 100644
index 0000000..12e17f7
--- /dev/null
+++ b/src/obilayeredmap/src/meta.rs
@@ -0,0 +1,34 @@
+use std::fs::File;
+use std::io::{BufReader, BufWriter, Write};
+use std::path::Path;
+
+use serde::{Deserialize, Serialize};
+
+use crate::error::OLMResult;
+
+const META_FILE: &str = "meta.json";
+
+/// Metadata persisted as `meta.json` at the root of a partition.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PartitionMeta {
+    /// Number of layers recorded for the partition.
+    pub n_layers: usize,
+}
+
+impl PartitionMeta {
+    /// Metadata of an empty partition (zero layers).
+    pub fn new() -> Self {
+        Self { n_layers: 0 }
+    }
+
+    /// Reads `meta.json` from `dir`.
+    pub fn load(dir: &Path) -> OLMResult<Self> {
+        // `serde_json::from_reader` does no buffering of its own.
+        let f = BufReader::new(File::open(dir.join(META_FILE))?);
+        Ok(serde_json::from_reader(f)?)
+    }
+
+    /// Writes `meta.json` into `dir`, replacing any previous content.
+    pub fn save(&self, dir: &Path) -> OLMResult<()> {
+        let mut f = BufWriter::new(File::create(dir.join(META_FILE))?);
+        serde_json::to_writer_pretty(&mut f, self)?;
+        // Flush explicitly so that a late write error is reported, not
+        // swallowed when the `BufWriter` is dropped.
+        f.flush()?;
+        Ok(())
+    }
+}
+
+impl Default for PartitionMeta {
+    fn default() -> Self { Self::new() }
+}
diff --git a/src/obilayeredmap/src/tests/layer.rs b/src/obilayeredmap/src/tests/layer.rs
new file mode 100644
index 0000000..516062c
--- /dev/null
+++ b/src/obilayeredmap/src/tests/layer.rs
@@ -0,0 +1,77 @@
+use super::*;
+use obikseq::{set_k, Unitig};
+use tempfile::tempdir;
+
+// Writes `seqs` as unitigs into `dir/unitigs.bin`.
+fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
+    let mut w = UnitigFileWriter::create(&dir.join(UNITIGS_FILE)).unwrap();
+    for s in seqs {
+        w.write(&Unitig::from_ascii(s)).unwrap();
+    }
+    w.close().unwrap();
+}
+
+// Enumerates the canonical form of every kmer stored in `dir/unitigs.bin`.
+fn all_canonical_kmers(dir: &Path, k: usize) -> Vec<CanonicalKmer> {
+    let r = UnitigFileReader::open(&dir.join(UNITIGS_FILE)).unwrap();
+    let mut out = Vec::new();
+    for ci in 0..r.len() {
+        let n = r.seql(ci) - k + 1;
+        for rank in 0..n {
+            out.push(Kmer::from_raw(r.raw_kmer(ci, rank)).canonical());
+        }
+    }
+    out
+}
+
+#[test]
+fn build_and_query_all_kmers_found() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    write_unitigs(dir.path(), &[b"AAAACGT"]);
+    let kmers = all_canonical_kmers(dir.path(), 4);
+    Layer::build(dir.path(), |_| 1).unwrap();
+    let layer = Layer::open(dir.path()).unwrap();
+    for kmer in kmers {
+        assert!(layer.query(kmer).is_some(), "kmer should be present");
+    }
+}
+
+#[test]
+fn counts_are_stored_and_retrieved() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    write_unitigs(dir.path(), &[b"AAAACGT"]);
+    let kmers = all_canonical_kmers(dir.path(), 4);
+    let count_map: HashMap<CanonicalKmer, u32> =
+        kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect();
+    Layer::build(dir.path(), |kmer| count_map.get(&kmer).copied().unwrap_or(0)).unwrap();
+    let layer = Layer::open(dir.path()).unwrap();
+    for kmer in &kmers {
+        let hit = layer.query(*kmer).expect("kmer must be present");
+        assert_eq!(hit.count, count_map[kmer]);
+    }
+}
+
+#[test]
+fn query_absent_returns_none() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    write_unitigs(dir.path(), &[b"AAAACGT"]);
+    Layer::build(dir.path(), |_| 1).unwrap();
+    let layer = Layer::open(dir.path()).unwrap();
+    let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
+    assert!(layer.query(absent).is_none());
+}
+
+#[test]
+fn open_after_build_is_consistent() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    // "AAAACGT": 7 nucl → 4 kmers, all with distinct canonical forms
+    write_unitigs(dir.path(), &[b"AAAACGT"]);
+    let n = Layer::build(dir.path(), |_| 7).unwrap();
+    assert_eq!(n, 4);
+    let layer = Layer::open(dir.path()).unwrap();
+    let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
+    let hit = layer.query(kmer).expect("AAAA must be present");
+    assert_eq!(hit.count, 7);
+}
diff --git a/src/obilayeredmap/src/tests/map.rs b/src/obilayeredmap/src/tests/map.rs
new file mode 100644
index 0000000..932a048
--- /dev/null
+++ b/src/obilayeredmap/src/tests/map.rs
@@ -0,0 +1,100 @@
+use super::*;
+use obikseq::{set_k, Sequence as _, Unitig};
+use tempfile::tempdir;
+
+// Writes `seqs` as the unitigs of the next layer, then builds that layer
+// with the same `count` for every kmer.
+fn push_unitigs_and_layer(
+    map: &mut LayeredMap,
+    seqs: &[&[u8]],
+    count: u32,
+) {
+    let mut w = map.next_layer_writer().unwrap();
+    for s in seqs {
+        w.write(&Unitig::from_ascii(s)).unwrap();
+    }
+    w.close().unwrap();
+    map.push_layer(|_| count).unwrap();
+}
+
+fn canonical(ascii: &[u8]) -> CanonicalKmer {
+    obikseq::Kmer::from_ascii(ascii).unwrap().canonical()
+}
+
+#[test]
+fn create_empty_map() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    let map = LayeredMap::create(dir.path()).unwrap();
+    assert_eq!(map.n_layers(), 0);
+}
+
+#[test]
+fn open_reloads_layer_count() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    {
+        let mut map = LayeredMap::create(dir.path()).unwrap();
+        push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
+    }
+    let map = LayeredMap::open(dir.path()).unwrap();
+    assert_eq!(map.n_layers(), 1);
+}
+
+#[test]
+fn query_finds_kmer_in_layer_zero() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    let mut map = LayeredMap::create(dir.path()).unwrap();
+    push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);
+    let kmer = canonical(b"AAAC");
+    let (layer_idx, hit) = map.query(kmer).expect("kmer must be found");
+    assert_eq!(layer_idx, 0);
+    assert_eq!(hit.count, 3);
+}
+
+#[test]
+fn query_finds_kmer_in_correct_layer() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    let mut map = LayeredMap::create(dir.path()).unwrap();
+    // Layer 0: AAAACGT
+    push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
+    // Layer 1: GGGACGT. Both unitigs end in ACGT, so the layers are not
+    // disjoint; the probes below use kmers that occur in one layer only.
+    push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2);
+    assert_eq!(map.n_layers(), 2);
+
+    // AAAA is in layer 0
+    let (li, hit) = map.query(canonical(b"AAAA")).expect("AAAA must be found");
+    assert_eq!(li, 0);
+    assert_eq!(hit.count, 1);
+
+    // GGGA is in layer 1
+    let (li, hit) = map.query(canonical(b"GGGA")).expect("GGGA must be found");
+    assert_eq!(li, 1);
+    assert_eq!(hit.count, 2);
+}
+
+#[test]
+fn query_absent_returns_none() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    let mut map = LayeredMap::create(dir.path()).unwrap();
+    push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
+    let absent = canonical(b"CCCC");
+    assert!(map.query(absent).is_none());
+}
+
+#[test]
+fn push_layer_from_map_convenience() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    let mut map = LayeredMap::create(dir.path()).unwrap();
+    let mut w = map.next_layer_writer().unwrap();
+    w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
+    w.close().unwrap();
+    let counts: HashMap<CanonicalKmer, u32> = vec![
+        (canonical(b"AAAA"), 10u32),
+    ].into_iter().collect();
+    map.push_layer_from_map(&counts).unwrap();
+    let (_, hit) = map.query(canonical(b"AAAA")).unwrap();
+    assert_eq!(hit.count, 10);
+}