refactor(obilayeredmap): support generic payload types
Replace the hardcoded `Counts` module with a generic `LayerData` trait, parameterizing `Layer` and `LayeredMap` over arbitrary payload types. This decouples read-path access from build-path logic, enabling both set membership and count-based indexing via `PersistentCompactIntVec`. Adds the `obicompactvec` dependency, implements streaming layer construction, and expands test coverage for persistence and multi-layer resolution.
This commit is contained in:
Generated
+1
@@ -1752,6 +1752,7 @@ dependencies = [
|
||||
"cacheline-ef",
|
||||
"epserde 0.8.0",
|
||||
"memmap2",
|
||||
"obicompactvec",
|
||||
"obikseq",
|
||||
"obiskio",
|
||||
"ptr_hash",
|
||||
|
||||
@@ -4,8 +4,9 @@ version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
obikseq = { path = "../obikseq" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obikseq = { path = "../obikseq" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obicompactvec = { path = "../obicompactvec" }
|
||||
ptr_hash = "1.1"
|
||||
cacheline-ef = "1.1"
|
||||
epserde = "0.8"
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
// u32 per MPHF slot: raw occurrence count for the kmer at that slot.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::Path;
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
|
||||
pub struct Counts {
|
||||
mmap: Mmap,
|
||||
}
|
||||
|
||||
impl Counts {
|
||||
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||
let f = File::open(path)?;
|
||||
let mmap = unsafe { Mmap::map(&f)? };
|
||||
Ok(Self { mmap })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
let off = slot * 4;
|
||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.mmap.len() / 4
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CountsWriter {
|
||||
buf: Vec<u32>,
|
||||
}
|
||||
|
||||
impl CountsWriter {
|
||||
pub fn new(n_slots: usize) -> Self {
|
||||
Self { buf: vec![0u32; n_slots] }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn set(&mut self, slot: usize, count: u32) {
|
||||
self.buf[slot] = count;
|
||||
}
|
||||
|
||||
pub fn write(self, path: &Path) -> OLMResult<()> {
|
||||
let mut f = BufWriter::new(File::create(path)?);
|
||||
for v in self.buf {
|
||||
f.write_all(&v.to_le_bytes()).map_err(OLMError::Io)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
+143
-88
@@ -4,130 +4,185 @@ use std::path::Path;
|
||||
|
||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
use obicompactvec::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||
|
||||
use crate::counts::{Counts, CountsWriter};
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
use crate::evidence::{Evidence, EvidenceWriter};
|
||||
|
||||
const MPHF_FILE: &str = "mphf.bin";
|
||||
const UNITIGS_FILE: &str = "unitigs.bin";
|
||||
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
||||
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
||||
const EVIDENCE_FILE: &str = "evidence.bin";
|
||||
const COUNTS_FILE: &str = "counts.bin";
|
||||
const COUNTS_FILE: &str = "counts.pciv";
|
||||
|
||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
pub struct Layer {
|
||||
mphf: Mphf,
|
||||
// ── Trait ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
pub trait LayerData: Sized {
|
||||
type Item;
|
||||
fn open(layer_dir: &Path) -> OLMResult<Self>;
|
||||
fn read(&self, slot: usize) -> Self::Item;
|
||||
}
|
||||
|
||||
impl LayerData for () {
|
||||
type Item = ();
|
||||
fn open(_layer_dir: &Path) -> OLMResult<Self> { Ok(()) }
|
||||
fn read(&self, _slot: usize) {}
|
||||
}
|
||||
|
||||
impl LayerData for PersistentCompactIntVec {
|
||||
type Item = u32;
|
||||
fn open(layer_dir: &Path) -> OLMResult<Self> {
|
||||
PersistentCompactIntVec::open(&layer_dir.join(COUNTS_FILE)).map_err(OLMError::Io)
|
||||
}
|
||||
fn read(&self, slot: usize) -> u32 { self.get(slot) }
|
||||
}
|
||||
|
||||
// ── Structures ────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct Layer<D: LayerData = ()> {
|
||||
mphf: Mphf,
|
||||
evidence: Evidence,
|
||||
unitigs: UnitigFileReader,
|
||||
counts: Counts,
|
||||
unitigs: UnitigFileReader,
|
||||
data: D,
|
||||
}
|
||||
|
||||
pub struct Hit {
|
||||
pub struct Hit<T = ()> {
|
||||
pub slot: usize,
|
||||
pub count: u32,
|
||||
pub data: T,
|
||||
}
|
||||
|
||||
impl Layer {
|
||||
// ── Common read path ──────────────────────────────────────────────────────────
|
||||
|
||||
impl<D: LayerData> Layer<D> {
|
||||
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||
let mphf: Mphf = Mphf::load_full(&path.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
|
||||
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
|
||||
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
|
||||
let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
|
||||
let counts = Counts::open(&path.join(COUNTS_FILE))?;
|
||||
|
||||
Ok(Self { mphf, evidence, unitigs, counts })
|
||||
let data = D::open(path)?;
|
||||
Ok(Self { mphf, evidence, unitigs, data })
|
||||
}
|
||||
|
||||
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
|
||||
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit<D::Item>> {
|
||||
let slot = self.mphf.index(&kmer.raw());
|
||||
let (chunk_id, rank) = self.evidence.decode(slot);
|
||||
if self
|
||||
.unitigs
|
||||
.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer)
|
||||
{
|
||||
Some(Hit { slot, count: self.counts.get(slot) })
|
||||
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
||||
Some(Hit { slot, data: self.data.read(slot) })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a layer from unitigs already written to `out_dir/unitigs.bin`.
|
||||
///
|
||||
/// `count_of` maps each canonical kmer to its occurrence count.
|
||||
/// Returns the number of kmers indexed.
|
||||
pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
use rayon::prelude::*;
|
||||
|
||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||
let n = unitigs.n_kmers();
|
||||
|
||||
if n == 0 {
|
||||
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
|
||||
fs::File::create(out_dir.join(COUNTS_FILE))?;
|
||||
let mphf: Mphf = Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
||||
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
||||
mphf.store(&out_dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
// First pass: build the MPHF from a cloneable parallel iterator.
|
||||
// flat_map_iter: outer chunks in parallel, inner kmer sliding-window sequential.
|
||||
let keys = (0..unitigs.len())
|
||||
.into_par_iter()
|
||||
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
|
||||
let mphf: Mphf = Mphf::new_from_par_iter(n, keys, PtrHashParams::<CubicEps>::default());
|
||||
mphf.store(&out_dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
|
||||
// Second pass: fill evidence and counts; verify MPHF correctness inline.
|
||||
// seen is a compact bitset (n/8 bytes) — no extra iteration needed.
|
||||
let mut ev = EvidenceWriter::new(n);
|
||||
let mut cnt = CountsWriter::new(n);
|
||||
let mut seen = vec![0u8; (n + 7) / 8];
|
||||
|
||||
for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() {
|
||||
let slot = mphf.index(&kmer.raw());
|
||||
if slot >= n {
|
||||
return Err(OLMError::Mphf("MPHF construction failed: slot out of bounds".into()));
|
||||
}
|
||||
let byte = slot / 8;
|
||||
let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 {
|
||||
return Err(OLMError::Mphf("MPHF construction failed: duplicate slot".into()));
|
||||
}
|
||||
seen[byte] |= bit;
|
||||
ev.set(slot, chunk_id as u32, rank as u8);
|
||||
cnt.set(slot, count_of(kmer));
|
||||
}
|
||||
|
||||
ev.write(&out_dir.join(EVIDENCE_FILE))?;
|
||||
cnt.write(&out_dir.join(COUNTS_FILE))?;
|
||||
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
/// Convenience variant of `build` that accepts a `HashMap`.
|
||||
pub fn build_from_map(
|
||||
out_dir: &Path,
|
||||
counts: &HashMap<CanonicalKmer, u32>,
|
||||
) -> OLMResult<usize> {
|
||||
Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||
}
|
||||
|
||||
/// Return a `UnitigFileWriter` targeting this layer's `unitigs.bin`.
|
||||
/// The caller writes unitigs, then calls `Layer::build` to finish the layer.
|
||||
pub fn unitig_writer(out_dir: &Path) -> OLMResult<UnitigFileWriter> {
|
||||
fs::create_dir_all(out_dir)?;
|
||||
Ok(UnitigFileWriter::create(&out_dir.join(UNITIGS_FILE))?)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Build helpers (private) ───────────────────────────────────────────────────
|
||||
|
||||
fn build_mphf(out_dir: &Path, n: usize) -> OLMResult<Mphf> {
|
||||
use rayon::prelude::*;
|
||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||
let keys = (0..unitigs.len())
|
||||
.into_par_iter()
|
||||
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
|
||||
let mphf: Mphf = Mphf::new_from_par_iter(n, keys, PtrHashParams::<CubicEps>::default());
|
||||
mphf.store(&out_dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
Ok(mphf)
|
||||
}
|
||||
|
||||
fn build_second_pass(
|
||||
out_dir: &Path,
|
||||
n: usize,
|
||||
mphf: &Mphf,
|
||||
fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
|
||||
) -> OLMResult<()> {
|
||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||
let mut ev = EvidenceWriter::new(n);
|
||||
let mut seen = vec![0u8; (n + 7) / 8];
|
||||
|
||||
for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() {
|
||||
let slot = mphf.index(&kmer.raw());
|
||||
if slot >= n {
|
||||
return Err(OLMError::Mphf("slot out of bounds".into()));
|
||||
}
|
||||
let byte = slot / 8;
|
||||
let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 {
|
||||
return Err(OLMError::Mphf("duplicate slot".into()));
|
||||
}
|
||||
seen[byte] |= bit;
|
||||
ev.set(slot, chunk_id as u32, rank as u8);
|
||||
fill_slot(slot, kmer)?;
|
||||
}
|
||||
|
||||
ev.write(&out_dir.join(EVIDENCE_FILE))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn empty_layer(out_dir: &Path) -> OLMResult<()> {
|
||||
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
|
||||
let mphf: Mphf = Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
||||
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
||||
mphf.store(&out_dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
||||
|
||||
impl Layer<()> {
|
||||
pub fn build(out_dir: &Path) -> OLMResult<usize> {
|
||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||
let n = unitigs.n_kmers();
|
||||
if n == 0 {
|
||||
empty_layer(out_dir)?;
|
||||
return Ok(0);
|
||||
}
|
||||
let mphf = build_mphf(out_dir, n)?;
|
||||
build_second_pass(out_dir, n, &mphf, &mut |_, _| Ok(()))?;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 2 — counts (PersistentCompactIntVec) ─────────────────────────────────
|
||||
|
||||
impl Layer<PersistentCompactIntVec> {
|
||||
pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||
let n = unitigs.n_kmers();
|
||||
if n == 0 {
|
||||
empty_layer(out_dir)?;
|
||||
PersistentCompactIntVecBuilder::new(0, &out_dir.join(COUNTS_FILE))
|
||||
.and_then(|b| b.close())
|
||||
.map_err(OLMError::Io)?;
|
||||
return Ok(0);
|
||||
}
|
||||
let mphf = build_mphf(out_dir, n)?;
|
||||
let mut cnt = PersistentCompactIntVecBuilder::new(n, &out_dir.join(COUNTS_FILE))
|
||||
.map_err(OLMError::Io)?;
|
||||
build_second_pass(out_dir, n, &mphf, &mut |slot, kmer| {
|
||||
cnt.set(slot, count_of(kmer));
|
||||
Ok(())
|
||||
})?;
|
||||
cnt.close().map_err(OLMError::Io)?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
pub fn build_from_map(
|
||||
out_dir: &Path,
|
||||
counts: &HashMap<CanonicalKmer, u32>,
|
||||
) -> OLMResult<usize> {
|
||||
Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/layer.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
pub mod counts;
|
||||
pub mod error;
|
||||
pub mod evidence;
|
||||
pub mod layer;
|
||||
@@ -6,5 +5,5 @@ pub mod map;
|
||||
pub mod meta;
|
||||
|
||||
pub use error::{OLMError, OLMResult};
|
||||
pub use layer::{Hit, Layer};
|
||||
pub use layer::{Hit, Layer, LayerData};
|
||||
pub use map::LayeredMap;
|
||||
|
||||
@@ -2,11 +2,12 @@ use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use obicompactvec::PersistentCompactIntVec;
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::UnitigFileWriter;
|
||||
|
||||
use crate::error::OLMResult;
|
||||
use crate::layer::{Hit, Layer};
|
||||
use crate::layer::{Hit, Layer, LayerData};
|
||||
use crate::meta::PartitionMeta;
|
||||
|
||||
/// Layered kmer index for a single partition.
|
||||
@@ -14,20 +15,26 @@ use crate::meta::PartitionMeta;
|
||||
/// Each layer covers a disjoint kmer set. Queries probe layers in order;
|
||||
/// the first match wins. Adding a dataset appends a new layer without
|
||||
/// rebuilding existing ones.
|
||||
pub struct LayeredMap {
|
||||
root: PathBuf,
|
||||
meta: PartitionMeta,
|
||||
layers: Vec<Layer>,
|
||||
pub struct LayeredMap<D: LayerData = ()> {
|
||||
root: PathBuf,
|
||||
meta: PartitionMeta,
|
||||
layers: Vec<Layer<D>>,
|
||||
}
|
||||
|
||||
impl LayeredMap {
|
||||
// ── Common methods ────────────────────────────────────────────────────────────
|
||||
|
||||
impl<D: LayerData> LayeredMap<D> {
|
||||
/// Open an existing layered index at `root`.
|
||||
pub fn open(root: &Path) -> OLMResult<Self> {
|
||||
let meta = PartitionMeta::load(root)?;
|
||||
let layers = (0..meta.n_layers)
|
||||
.map(|i| Layer::open(&layer_dir(root, i)))
|
||||
.map(|i| Layer::<D>::open(&layer_dir(root, i)))
|
||||
.collect::<OLMResult<Vec<_>>>()?;
|
||||
Ok(Self { root: root.to_owned(), meta, layers })
|
||||
Ok(Self {
|
||||
root: root.to_owned(),
|
||||
meta,
|
||||
layers,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a new, empty layered index at `root`.
|
||||
@@ -35,48 +42,71 @@ impl LayeredMap {
|
||||
fs::create_dir_all(root)?;
|
||||
let meta = PartitionMeta::new();
|
||||
meta.save(root)?;
|
||||
Ok(Self { root: root.to_owned(), meta, layers: Vec::new() })
|
||||
Ok(Self {
|
||||
root: root.to_owned(),
|
||||
meta,
|
||||
layers: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the number of layers in this index.
|
||||
pub fn n_layers(&self) -> usize {
|
||||
self.layers.len()
|
||||
}
|
||||
|
||||
pub fn layer(&self, i: usize) -> &Layer {
|
||||
/// Return a reference to the `i`-th layer.
|
||||
pub fn layer(&self, i: usize) -> &Layer<D> {
|
||||
&self.layers[i]
|
||||
}
|
||||
|
||||
/// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match.
|
||||
pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit)> {
|
||||
self.layers.iter().enumerate().find_map(|(i, layer)| {
|
||||
layer.query(kmer).map(|hit| (i, hit))
|
||||
})
|
||||
pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit<D::Item>)> {
|
||||
self.layers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find_map(|(i, layer)| layer.query(kmer).map(|hit| (i, hit)))
|
||||
}
|
||||
|
||||
/// Return a `UnitigFileWriter` for the next layer to be built.
|
||||
/// The caller writes unitigs, calls `.close()` on the writer,
|
||||
/// then calls `push_layer` to finish.
|
||||
pub fn next_layer_writer(&self) -> OLMResult<UnitigFileWriter> {
|
||||
let dir = layer_dir(&self.root, self.layers.len());
|
||||
Layer::unitig_writer(&dir)
|
||||
Layer::<D>::unitig_writer(&dir)
|
||||
}
|
||||
|
||||
/// Build and append the next layer from a count closure.
|
||||
/// Unitigs must already have been written via `next_layer_writer`.
|
||||
pub fn push_layer(
|
||||
&mut self,
|
||||
count_of: impl Fn(CanonicalKmer) -> u32,
|
||||
) -> OLMResult<usize> {
|
||||
/// Append a new layer to the index.
|
||||
fn append_layer(&mut self) -> OLMResult<()> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::build(&dir, count_of)?;
|
||||
self.layers.push(Layer::open(&dir)?);
|
||||
self.layers.push(Layer::<D>::open(&dir)?);
|
||||
self.meta.n_layers = self.layers.len();
|
||||
self.meta.save(&self.root)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
||||
|
||||
impl LayeredMap<()> {
|
||||
pub fn push_layer(&mut self) -> OLMResult<usize> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::<()>::build(&dir)?;
|
||||
self.append_layer()?;
|
||||
Ok(i)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 2 — counts ───────────────────────────────────────────────────────────
|
||||
|
||||
impl LayeredMap<PersistentCompactIntVec> {
|
||||
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::<PersistentCompactIntVec>::build(&dir, count_of)?;
|
||||
self.append_layer()?;
|
||||
Ok(i)
|
||||
}
|
||||
|
||||
/// Convenience variant of `push_layer` that accepts a `HashMap`.
|
||||
pub fn push_layer_from_map(
|
||||
&mut self,
|
||||
counts: &HashMap<CanonicalKmer, u32>,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::*;
|
||||
use obicompactvec::PersistentCompactIntVec;
|
||||
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
||||
use tempfile::tempdir;
|
||||
|
||||
@@ -28,8 +29,8 @@ fn build_and_query_all_kmers_found() {
|
||||
let dir = tempdir().unwrap();
|
||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||
let kmers = all_canonical_kmers(dir.path(), 4);
|
||||
Layer::build(dir.path(), |_| 1).unwrap();
|
||||
let layer = Layer::open(dir.path()).unwrap();
|
||||
Layer::<()>::build(dir.path()).unwrap();
|
||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
||||
for kmer in kmers {
|
||||
assert!(layer.query(kmer).is_some(), "kmer should be present");
|
||||
}
|
||||
@@ -43,11 +44,14 @@ fn counts_are_stored_and_retrieved() {
|
||||
let kmers = all_canonical_kmers(dir.path(), 4);
|
||||
let count_map: HashMap<CanonicalKmer, u32> =
|
||||
kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect();
|
||||
Layer::build(dir.path(), |kmer| count_map.get(&kmer).copied().unwrap_or(0)).unwrap();
|
||||
let layer = Layer::open(dir.path()).unwrap();
|
||||
Layer::<PersistentCompactIntVec>::build(
|
||||
dir.path(),
|
||||
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
|
||||
).unwrap();
|
||||
let layer = Layer::<PersistentCompactIntVec>::open(dir.path()).unwrap();
|
||||
for kmer in &kmers {
|
||||
let hit = layer.query(*kmer).expect("kmer must be present");
|
||||
assert_eq!(hit.count, count_map[kmer]);
|
||||
assert_eq!(hit.data, count_map[kmer]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,8 +60,8 @@ fn query_absent_returns_none() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||
Layer::build(dir.path(), |_| 1).unwrap();
|
||||
let layer = Layer::open(dir.path()).unwrap();
|
||||
Layer::<()>::build(dir.path()).unwrap();
|
||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
||||
let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
|
||||
assert!(layer.query(absent).is_none());
|
||||
}
|
||||
@@ -66,12 +70,11 @@ fn query_absent_returns_none() {
|
||||
fn open_after_build_is_consistent() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
// "AAAACGT": 7 nucl → 4 kmers, all with distinct canonical forms
|
||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||
let n = Layer::build(dir.path(), |_| 7).unwrap();
|
||||
let n = Layer::<PersistentCompactIntVec>::build(dir.path(), |_| 7).unwrap();
|
||||
assert_eq!(n, 4);
|
||||
let layer = Layer::open(dir.path()).unwrap();
|
||||
let layer = Layer::<PersistentCompactIntVec>::open(dir.path()).unwrap();
|
||||
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
|
||||
let hit = layer.query(kmer).expect("AAAA must be present");
|
||||
assert_eq!(hit.count, 7);
|
||||
assert_eq!(hit.data, 7);
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use super::*;
|
||||
use obicompactvec::PersistentCompactIntVec;
|
||||
use obikseq::{set_k, Sequence as _, Unitig};
|
||||
use tempfile::tempdir;
|
||||
|
||||
fn push_unitigs_and_layer(
|
||||
map: &mut LayeredMap,
|
||||
map: &mut LayeredMap<PersistentCompactIntVec>,
|
||||
seqs: &[&[u8]],
|
||||
count: u32,
|
||||
) {
|
||||
@@ -23,7 +24,7 @@ fn canonical(ascii: &[u8]) -> CanonicalKmer {
|
||||
fn create_empty_map() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let map = LayeredMap::create(dir.path()).unwrap();
|
||||
let map = LayeredMap::<()>::create(dir.path()).unwrap();
|
||||
assert_eq!(map.n_layers(), 0);
|
||||
}
|
||||
|
||||
@@ -32,10 +33,10 @@ fn open_reloads_layer_count() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
{
|
||||
let mut map = LayeredMap::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||
}
|
||||
let map = LayeredMap::open(dir.path()).unwrap();
|
||||
let map = LayeredMap::<PersistentCompactIntVec>::open(dir.path()).unwrap();
|
||||
assert_eq!(map.n_layers(), 1);
|
||||
}
|
||||
|
||||
@@ -43,41 +44,37 @@ fn open_reloads_layer_count() {
|
||||
fn query_finds_kmer_in_layer_zero() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);
|
||||
let kmer = canonical(b"AAAC");
|
||||
let (layer_idx, hit) = map.query(kmer).expect("kmer must be found");
|
||||
assert_eq!(layer_idx, 0);
|
||||
assert_eq!(hit.count, 3);
|
||||
assert_eq!(hit.data, 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn query_finds_kmer_in_correct_layer() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::create(dir.path()).unwrap();
|
||||
// Layer 0: AAAACGT
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||
// Layer 1: GGGACGT (no kmer overlap with layer 0 by construction)
|
||||
push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2);
|
||||
assert_eq!(map.n_layers(), 2);
|
||||
|
||||
// AAAA is in layer 0
|
||||
let (li, hit) = map.query(canonical(b"AAAA")).expect("AAAA must be found");
|
||||
assert_eq!(li, 0);
|
||||
assert_eq!(hit.count, 1);
|
||||
assert_eq!(hit.data, 1);
|
||||
|
||||
// GGGA is in layer 1
|
||||
let (li, hit) = map.query(canonical(b"GGGA")).expect("GGGA must be found");
|
||||
assert_eq!(li, 1);
|
||||
assert_eq!(hit.count, 2);
|
||||
assert_eq!(hit.data, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn query_absent_returns_none() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||
let absent = canonical(b"CCCC");
|
||||
assert!(map.query(absent).is_none());
|
||||
@@ -87,7 +84,7 @@ fn query_absent_returns_none() {
|
||||
fn push_layer_from_map_convenience() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
let mut w = map.next_layer_writer().unwrap();
|
||||
w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
|
||||
w.close().unwrap();
|
||||
@@ -96,5 +93,5 @@ fn push_layer_from_map_convenience() {
|
||||
].into_iter().collect();
|
||||
map.push_layer_from_map(&counts).unwrap();
|
||||
let (_, hit) = map.query(canonical(b"AAAA")).unwrap();
|
||||
assert_eq!(hit.count, 10);
|
||||
assert_eq!(hit.data, 10);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user