feat: add obilayeredmap crate for disk-backed k-mer indexing
Introduces the `obilayeredmap` crate (v0.1.0), implementing an append-only, disk-backed k-mer index built on a minimal perfect hash function (MPHF). The crate provides memory-mapped reads, buffered writes, a dedicated error type, persistent partition metadata, and unit tests for the layer and layered-map APIs. Also registers the crate in the workspace, switches the existing `obikseq` reverse-complement benchmark from `SuperKmer::revcomp` to `PackedSeq::revcomp_inplace`, and updates `Cargo.lock` with the new dependencies.
This commit is contained in:
Generated
+13
@@ -1649,6 +1649,19 @@ dependencies = [
|
|||||||
"xxhash-rust",
|
"xxhash-rust",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "obilayeredmap"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"memmap2",
|
||||||
|
"obikseq",
|
||||||
|
"obiskio",
|
||||||
|
"ph",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"tempfile",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "obipipeline"
|
name = "obipipeline"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
|||||||
+1
-1
@@ -1,5 +1,5 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
resolver = "3"
|
resolver = "3"
|
||||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj"]
|
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap"]
|
||||||
[profile.release]
|
[profile.release]
|
||||||
debug = 1
|
debug = 1
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
|
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
|
||||||
|
use obikseq::packed_seq::PackedSeq;
|
||||||
use obikseq::superkmer::SuperKmer;
|
use obikseq::superkmer::SuperKmer;
|
||||||
|
use obikseq::Sequence;
|
||||||
|
|
||||||
const LENGTHS: &[usize] = &[1, 4, 8, 16, 40, 64, 128, 255, 256];
|
const LENGTHS: &[usize] = &[1, 4, 8, 16, 40, 64, 128, 255, 256];
|
||||||
|
|
||||||
@@ -50,12 +52,12 @@ fn bench_write_ascii(c: &mut Criterion) {
|
|||||||
fn bench_revcomp(c: &mut Criterion) {
|
fn bench_revcomp(c: &mut Criterion) {
|
||||||
let mut group = c.benchmark_group("revcomp");
|
let mut group = c.benchmark_group("revcomp");
|
||||||
for &len in LENGTHS {
|
for &len in LENGTHS {
|
||||||
let sk = SuperKmer::from_ascii(&make_ascii(len));
|
let seq = PackedSeq::from_ascii(&make_ascii(len));
|
||||||
group.throughput(Throughput::Bytes(len as u64));
|
group.throughput(Throughput::Bytes(len as u64));
|
||||||
group.bench_with_input(BenchmarkId::from_parameter(len), &sk, |b, sk| {
|
group.bench_with_input(BenchmarkId::from_parameter(len), &seq, |b, seq| {
|
||||||
b.iter_batched(
|
b.iter_batched(
|
||||||
|| sk.clone(),
|
|| seq.clone(),
|
||||||
|mut s| { std::hint::black_box(&mut s).revcomp(); s },
|
|s| { let mut s = std::hint::black_box(s); s.revcomp_inplace(); s },
|
||||||
BatchSize::SmallInput,
|
BatchSize::SmallInput,
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -0,0 +1,16 @@
|
|||||||
|
[package]
|
||||||
|
name = "obilayeredmap"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
obikseq = { path = "../obikseq" }
|
||||||
|
obiskio = { path = "../obiskio" }
|
||||||
|
ph = "0.11"
|
||||||
|
memmap2 = "0.9"
|
||||||
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
serde_json = "1"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
tempfile = "3"
|
||||||
|
obikseq = { path = "../obikseq", features = ["test-utils"] }
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
// u32 per MPHF slot: raw occurrence count for the kmer at that slot.
|
||||||
|
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{BufWriter, Write};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use memmap2::Mmap;
|
||||||
|
|
||||||
|
use crate::error::{OLMError, OLMResult};
|
||||||
|
|
||||||
|
pub struct Counts {
|
||||||
|
mmap: Mmap,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Counts {
|
||||||
|
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||||
|
let f = File::open(path)?;
|
||||||
|
let mmap = unsafe { Mmap::map(&f)? };
|
||||||
|
Ok(Self { mmap })
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn get(&self, slot: usize) -> u32 {
|
||||||
|
let off = slot * 4;
|
||||||
|
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.mmap.len() / 4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct CountsWriter {
|
||||||
|
buf: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CountsWriter {
|
||||||
|
pub fn new(n_slots: usize) -> Self {
|
||||||
|
Self { buf: vec![0u32; n_slots] }
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn set(&mut self, slot: usize, count: u32) {
|
||||||
|
self.buf[slot] = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write(self, path: &Path) -> OLMResult<()> {
|
||||||
|
let mut f = BufWriter::new(File::create(path)?);
|
||||||
|
for v in self.buf {
|
||||||
|
f.write_all(&v.to_le_bytes()).map_err(OLMError::Io)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
use std::fmt;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum OLMError {
|
||||||
|
Io(io::Error),
|
||||||
|
Json(serde_json::Error),
|
||||||
|
Mphf(String),
|
||||||
|
InvalidLayer(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
pub type OLMResult<T> = Result<T, OLMError>;
|
||||||
|
|
||||||
|
impl fmt::Display for OLMError {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
match self {
|
||||||
|
OLMError::Io(e) => write!(f, "I/O error: {e}"),
|
||||||
|
OLMError::Json(e) => write!(f, "JSON error: {e}"),
|
||||||
|
OLMError::Mphf(s) => write!(f, "MPHF error: {s}"),
|
||||||
|
OLMError::InvalidLayer(s) => write!(f, "invalid layer: {s}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::error::Error for OLMError {
|
||||||
|
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||||
|
match self {
|
||||||
|
OLMError::Io(e) => Some(e),
|
||||||
|
OLMError::Json(e) => Some(e),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<io::Error> for OLMError {
|
||||||
|
fn from(e: io::Error) -> Self { OLMError::Io(e) }
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<serde_json::Error> for OLMError {
|
||||||
|
fn from(e: serde_json::Error) -> Self { OLMError::Json(e) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map an `obiskio` error onto this crate's error type.
///
/// An I/O failure keeps its original `io::Error`; every other variant is
/// reported as [`OLMError::InvalidLayer`] carrying the error's display text.
impl From<obiskio::SKError> for OLMError {
    fn from(e: obiskio::SKError) -> Self {
        match e {
            obiskio::SKError::Io(inner) => Self::Io(inner),
            non_io => Self::InvalidLayer(non_io.to_string()),
        }
    }
}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
// u32 per MPHF slot: bits [31:7] = chunk_id (25 bits), bits [6:0] = rank (7 bits).
|
||||||
|
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{BufWriter, Write};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use memmap2::Mmap;
|
||||||
|
|
||||||
|
use crate::error::{OLMError, OLMResult};
|
||||||
|
|
||||||
|
pub struct Evidence {
|
||||||
|
mmap: Mmap,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Evidence {
|
||||||
|
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||||
|
let f = File::open(path)?;
|
||||||
|
let mmap = unsafe { Mmap::map(&f)? };
|
||||||
|
Ok(Self { mmap })
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn decode(&self, slot: usize) -> (u32, u8) {
|
||||||
|
let off = slot * 4;
|
||||||
|
let raw = u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap());
|
||||||
|
(raw >> 7, (raw & 0x7F) as u8)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.mmap.len() / 4
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pack `chunk_id` and `rank` into a single `u32`.
///
/// The low 7 bits hold `rank` (any higher bit of `rank` is masked off); the
/// remaining 25 bits hold `chunk_id` shifted left by 7 (bits shifted past
/// bit 31 are lost).
#[inline]
pub fn encode(chunk_id: u32, rank: u8) -> u32 {
    const RANK_BITS: u32 = 7;
    const RANK_MASK: u32 = (1 << RANK_BITS) - 1;

    let high = chunk_id << RANK_BITS;
    let low = u32::from(rank) & RANK_MASK;
    high | low
}
|
||||||
|
|
||||||
|
pub struct EvidenceWriter {
|
||||||
|
buf: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EvidenceWriter {
|
||||||
|
pub fn new(n_slots: usize) -> Self {
|
||||||
|
Self { buf: vec![0u32; n_slots] }
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn set(&mut self, slot: usize, chunk_id: u32, rank: u8) {
|
||||||
|
self.buf[slot] = encode(chunk_id, rank);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write(self, path: &Path) -> OLMResult<()> {
|
||||||
|
let mut f = BufWriter::new(File::create(path)?);
|
||||||
|
for v in self.buf {
|
||||||
|
f.write_all(&v.to_le_bytes()).map_err(OLMError::Io)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,122 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs;
|
||||||
|
use std::io::BufWriter;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use obikseq::{CanonicalKmer, Kmer, Sequence};
|
||||||
|
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||||
|
use ph::fmph;
|
||||||
|
|
||||||
|
use crate::counts::{Counts, CountsWriter};
|
||||||
|
use crate::error::{OLMError, OLMResult};
|
||||||
|
use crate::evidence::{Evidence, EvidenceWriter};
|
||||||
|
|
||||||
|
// File names used inside a layer directory.

/// Serialised minimal perfect hash function.
const MPHF_FILE: &str = "mphf.bin";
/// Unitig sequences the layer's k-mers are drawn from.
const UNITIGS_FILE: &str = "unitigs.bin";
/// Per-slot packed `(chunk_id, rank)` entries.
const EVIDENCE_FILE: &str = "evidence.bin";
/// Per-slot occurrence counts.
const COUNTS_FILE: &str = "counts.bin";
|
||||||
|
|
||||||
|
pub struct Layer {
|
||||||
|
mphf: fmph::Function,
|
||||||
|
evidence: Evidence,
|
||||||
|
unitigs: UnitigFileReader,
|
||||||
|
counts: Counts,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result of a successful k-mer lookup in a layer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Hit {
    /// MPHF slot the k-mer was found at.
    pub slot: usize,
    /// Occurrence count stored for that slot.
    pub count: u32,
}
|
||||||
|
|
||||||
|
impl Layer {
|
||||||
|
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||||
|
let mphf = fmph::Function::read(
|
||||||
|
&mut fs::File::open(path.join(MPHF_FILE))?
|
||||||
|
).map_err(OLMError::Io)?;
|
||||||
|
|
||||||
|
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
|
||||||
|
let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
|
||||||
|
let counts = Counts::open(&path.join(COUNTS_FILE))?;
|
||||||
|
|
||||||
|
Ok(Self { mphf, evidence, unitigs, counts })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
|
||||||
|
let slot = self.mphf.get(&kmer.raw())? as usize;
|
||||||
|
let (chunk_id, rank) = self.evidence.decode(slot);
|
||||||
|
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
||||||
|
Some(Hit { slot, count: self.counts.get(slot) })
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a layer from unitigs already written to `out_dir/unitigs.bin`.
|
||||||
|
///
|
||||||
|
/// `count_of` maps each canonical kmer to its occurrence count.
|
||||||
|
/// Returns the number of kmers indexed.
|
||||||
|
pub fn build(
|
||||||
|
out_dir: &Path,
|
||||||
|
count_of: impl Fn(CanonicalKmer) -> u32,
|
||||||
|
) -> OLMResult<usize> {
|
||||||
|
let k = obikseq::params::k();
|
||||||
|
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||||
|
|
||||||
|
let mut entries: Vec<(u64, u32, u8)> = Vec::new();
|
||||||
|
for chunk_id in 0..unitigs.len() {
|
||||||
|
let n_kmers = unitigs.seql(chunk_id) - k + 1;
|
||||||
|
for rank in 0..n_kmers {
|
||||||
|
let raw = unitigs.raw_kmer(chunk_id, rank);
|
||||||
|
let canonical: CanonicalKmer = Kmer::from_raw(raw).canonical();
|
||||||
|
entries.push((canonical.raw(), chunk_id as u32, rank as u8));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let n = entries.len();
|
||||||
|
if n == 0 {
|
||||||
|
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
|
||||||
|
fs::File::create(out_dir.join(COUNTS_FILE))?;
|
||||||
|
let mphf = fmph::Function::new(Vec::<u64>::new());
|
||||||
|
mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?;
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
let keys: Vec<u64> = entries.iter().map(|(k, _, _)| *k).collect();
|
||||||
|
let mphf = fmph::Function::new(keys);
|
||||||
|
mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?;
|
||||||
|
|
||||||
|
let mut ev = EvidenceWriter::new(n);
|
||||||
|
let mut cnt = CountsWriter::new(n);
|
||||||
|
|
||||||
|
for (key, chunk_id, rank) in &entries {
|
||||||
|
let slot = mphf.get(key).unwrap() as usize;
|
||||||
|
ev.set(slot, *chunk_id, *rank);
|
||||||
|
let kmer = CanonicalKmer::from_raw_unchecked(*key);
|
||||||
|
cnt.set(slot, count_of(kmer));
|
||||||
|
}
|
||||||
|
|
||||||
|
ev.write(&out_dir.join(EVIDENCE_FILE))?;
|
||||||
|
cnt.write(&out_dir.join(COUNTS_FILE))?;
|
||||||
|
|
||||||
|
Ok(n)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convenience variant of `build` that accepts a `HashMap`.
|
||||||
|
pub fn build_from_map(
|
||||||
|
out_dir: &Path,
|
||||||
|
counts: &HashMap<CanonicalKmer, u32>,
|
||||||
|
) -> OLMResult<usize> {
|
||||||
|
Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a `UnitigFileWriter` targeting this layer's `unitigs.bin`.
|
||||||
|
/// The caller writes unitigs, then calls `Layer::build` to finish the layer.
|
||||||
|
pub fn unitig_writer(out_dir: &Path) -> OLMResult<UnitigFileWriter> {
|
||||||
|
fs::create_dir_all(out_dir)?;
|
||||||
|
Ok(UnitigFileWriter::create(&out_dir.join(UNITIGS_FILE))?)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
#[path = "tests/layer.rs"]
|
||||||
|
mod tests;
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
pub mod counts;
|
||||||
|
pub mod error;
|
||||||
|
pub mod evidence;
|
||||||
|
pub mod layer;
|
||||||
|
pub mod map;
|
||||||
|
pub mod meta;
|
||||||
|
|
||||||
|
pub use error::{OLMError, OLMResult};
|
||||||
|
pub use layer::{Hit, Layer};
|
||||||
|
pub use map::LayeredMap;
|
||||||
@@ -0,0 +1,94 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
use obikseq::CanonicalKmer;
|
||||||
|
use obiskio::UnitigFileWriter;
|
||||||
|
|
||||||
|
use crate::error::OLMResult;
|
||||||
|
use crate::layer::{Hit, Layer};
|
||||||
|
use crate::meta::PartitionMeta;
|
||||||
|
|
||||||
|
/// Layered kmer index for a single partition.
|
||||||
|
///
|
||||||
|
/// Each layer covers a disjoint kmer set. Queries probe layers in order;
|
||||||
|
/// the first match wins. Adding a dataset appends a new layer without
|
||||||
|
/// rebuilding existing ones.
|
||||||
|
pub struct LayeredMap {
|
||||||
|
root: PathBuf,
|
||||||
|
meta: PartitionMeta,
|
||||||
|
layers: Vec<Layer>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LayeredMap {
|
||||||
|
/// Open an existing layered index at `root`.
|
||||||
|
pub fn open(root: &Path) -> OLMResult<Self> {
|
||||||
|
let meta = PartitionMeta::load(root)?;
|
||||||
|
let layers = (0..meta.n_layers)
|
||||||
|
.map(|i| Layer::open(&layer_dir(root, i)))
|
||||||
|
.collect::<OLMResult<Vec<_>>>()?;
|
||||||
|
Ok(Self { root: root.to_owned(), meta, layers })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new, empty layered index at `root`.
|
||||||
|
pub fn create(root: &Path) -> OLMResult<Self> {
|
||||||
|
fs::create_dir_all(root)?;
|
||||||
|
let meta = PartitionMeta::new();
|
||||||
|
meta.save(root)?;
|
||||||
|
Ok(Self { root: root.to_owned(), meta, layers: Vec::new() })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn n_layers(&self) -> usize {
|
||||||
|
self.layers.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn layer(&self, i: usize) -> &Layer {
|
||||||
|
&self.layers[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match.
|
||||||
|
pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit)> {
|
||||||
|
self.layers.iter().enumerate().find_map(|(i, layer)| {
|
||||||
|
layer.query(kmer).map(|hit| (i, hit))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a `UnitigFileWriter` for the next layer to be built.
|
||||||
|
/// The caller writes unitigs, calls `.close()` on the writer,
|
||||||
|
/// then calls `push_layer` to finish.
|
||||||
|
pub fn next_layer_writer(&self) -> OLMResult<UnitigFileWriter> {
|
||||||
|
let dir = layer_dir(&self.root, self.layers.len());
|
||||||
|
Layer::unitig_writer(&dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build and append the next layer from a count closure.
|
||||||
|
/// Unitigs must already have been written via `next_layer_writer`.
|
||||||
|
pub fn push_layer(
|
||||||
|
&mut self,
|
||||||
|
count_of: impl Fn(CanonicalKmer) -> u32,
|
||||||
|
) -> OLMResult<usize> {
|
||||||
|
let i = self.layers.len();
|
||||||
|
let dir = layer_dir(&self.root, i);
|
||||||
|
Layer::build(&dir, count_of)?;
|
||||||
|
self.layers.push(Layer::open(&dir)?);
|
||||||
|
self.meta.n_layers = self.layers.len();
|
||||||
|
self.meta.save(&self.root)?;
|
||||||
|
Ok(i)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convenience variant of `push_layer` that accepts a `HashMap`.
|
||||||
|
pub fn push_layer_from_map(
|
||||||
|
&mut self,
|
||||||
|
counts: &HashMap<CanonicalKmer, u32>,
|
||||||
|
) -> OLMResult<usize> {
|
||||||
|
self.push_layer(|kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Path of the directory holding layer `i` under `root`: `<root>/layer_<i>`.
fn layer_dir(root: &Path, i: usize) -> PathBuf {
    let dir_name = format!("layer_{i}");
    root.join(dir_name)
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
#[path = "tests/map.rs"]
|
||||||
|
mod tests;
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::error::OLMResult;
|
||||||
|
|
||||||
|
const META_FILE: &str = "meta.json";
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct PartitionMeta {
|
||||||
|
pub n_layers: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartitionMeta {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self { n_layers: 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn load(dir: &Path) -> OLMResult<Self> {
|
||||||
|
let f = File::open(dir.join(META_FILE))?;
|
||||||
|
Ok(serde_json::from_reader(f)?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn save(&self, dir: &Path) -> OLMResult<()> {
|
||||||
|
let f = File::create(dir.join(META_FILE))?;
|
||||||
|
serde_json::to_writer_pretty(f, self)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for PartitionMeta {
|
||||||
|
fn default() -> Self { Self::new() }
|
||||||
|
}
|
||||||
@@ -0,0 +1,77 @@
|
|||||||
|
use super::*;
|
||||||
|
use obikseq::{set_k, Unitig};
|
||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
||||||
|
let mut w = UnitigFileWriter::create(&dir.join(UNITIGS_FILE)).unwrap();
|
||||||
|
for s in seqs {
|
||||||
|
w.write(&Unitig::from_ascii(s)).unwrap();
|
||||||
|
}
|
||||||
|
w.close().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn all_canonical_kmers(dir: &Path, k: usize) -> Vec<CanonicalKmer> {
|
||||||
|
let r = UnitigFileReader::open(&dir.join(UNITIGS_FILE)).unwrap();
|
||||||
|
let mut out = Vec::new();
|
||||||
|
for ci in 0..r.len() {
|
||||||
|
let n = r.seql(ci) - k + 1;
|
||||||
|
for rank in 0..n {
|
||||||
|
out.push(Kmer::from_raw(r.raw_kmer(ci, rank)).canonical());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Every k-mer present in the unitigs must be found after build + open.
#[test]
fn build_and_query_all_kmers_found() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    write_unitigs(root, &[b"AAAACGT"]);
    let expected = all_canonical_kmers(root, 4);

    Layer::build(root, |_| 1).unwrap();
    let layer = Layer::open(root).unwrap();

    for kmer in expected {
        assert!(layer.query(kmer).is_some(), "kmer should be present");
    }
}
|
||||||
|
|
||||||
|
/// Counts supplied at build time must be returned unchanged by `query`.
#[test]
fn counts_are_stored_and_retrieved() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    write_unitigs(root, &[b"AAAACGT"]);
    let kmers = all_canonical_kmers(root, 4);

    // Give the i-th k-mer the count i + 1 so every count is distinct.
    let mut count_map: HashMap<CanonicalKmer, u32> = HashMap::new();
    for (i, &kmer) in kmers.iter().enumerate() {
        count_map.insert(kmer, i as u32 + 1);
    }

    Layer::build(root, |kmer| count_map.get(&kmer).copied().unwrap_or(0)).unwrap();
    let layer = Layer::open(root).unwrap();

    for kmer in &kmers {
        let hit = layer.query(*kmer).expect("kmer must be present");
        assert_eq!(hit.count, count_map[kmer]);
    }
}
|
||||||
|
|
||||||
|
/// A k-mer that is not in the unitigs must not be reported.
#[test]
fn query_absent_returns_none() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    write_unitigs(root, &[b"AAAACGT"]);
    Layer::build(root, |_| 1).unwrap();

    let layer = Layer::open(root).unwrap();
    let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
    assert!(layer.query(absent).is_none());
}
|
||||||
|
|
||||||
|
/// `build` reports the number of indexed k-mers and a reopened layer serves them.
#[test]
fn open_after_build_is_consistent() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let root = tmp.path();
    // "AAAACGT": 7 nucl → 4 kmers, all with distinct canonical forms
    write_unitigs(root, &[b"AAAACGT"]);

    let indexed = Layer::build(root, |_| 7).unwrap();
    assert_eq!(indexed, 4);

    let layer = Layer::open(root).unwrap();
    let probe = Kmer::from_ascii(b"AAAA").unwrap().canonical();
    let hit = layer.query(probe).expect("AAAA must be present");
    assert_eq!(hit.count, 7);
}
|
||||||
@@ -0,0 +1,100 @@
|
|||||||
|
use super::*;
|
||||||
|
use obikseq::{set_k, Sequence as _, Unitig};
|
||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
fn push_unitigs_and_layer(
|
||||||
|
map: &mut LayeredMap,
|
||||||
|
seqs: &[&[u8]],
|
||||||
|
count: u32,
|
||||||
|
) {
|
||||||
|
let mut w = map.next_layer_writer().unwrap();
|
||||||
|
for s in seqs {
|
||||||
|
w.write(&Unitig::from_ascii(s)).unwrap();
|
||||||
|
}
|
||||||
|
w.close().unwrap();
|
||||||
|
map.push_layer(|_| count).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn canonical(ascii: &[u8]) -> CanonicalKmer {
|
||||||
|
obikseq::Kmer::from_ascii(ascii).unwrap().canonical()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A freshly created map has no layer.
#[test]
fn create_empty_map() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let created = LayeredMap::create(tmp.path()).unwrap();
    assert_eq!(created.n_layers(), 0);
}
|
||||||
|
|
||||||
|
/// The layer count written by `push_layer` is picked up again by `open`.
#[test]
fn open_reloads_layer_count() {
    set_k(4);
    let tmp = tempdir().unwrap();

    let mut writer_side = LayeredMap::create(tmp.path()).unwrap();
    push_unitigs_and_layer(&mut writer_side, &[b"AAAACGT"], 1);
    // Release the first handle before reopening from disk.
    drop(writer_side);

    let reopened = LayeredMap::open(tmp.path()).unwrap();
    assert_eq!(reopened.n_layers(), 1);
}
|
||||||
|
|
||||||
|
/// With a single layer, a hit is reported from layer 0 with its stored count.
#[test]
fn query_finds_kmer_in_layer_zero() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let mut map = LayeredMap::create(tmp.path()).unwrap();
    push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);

    let probe = canonical(b"AAAC");
    let (found_in, hit) = map.query(probe).expect("kmer must be found");
    assert_eq!(found_in, 0);
    assert_eq!(hit.count, 3);
}
|
||||||
|
|
||||||
|
/// With two disjoint layers, each k-mer is reported from the layer that holds it.
#[test]
fn query_finds_kmer_in_correct_layer() {
    set_k(4);
    let dir = tempdir().unwrap();
    let mut map = LayeredMap::create(dir.path()).unwrap();

    // Layer 0: AAAACGT → AAAA, AAAC, AACG, ACGT
    push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
    // Layer 1: GGGACTC → GGGA, GGAC, GACT, ACTC. None of these, nor their
    // reverse complements (TCCC, GTCC, AGTC, GAGT), occurs in layer 0.
    // The former "GGGACGT" shared the palindromic kmer ACGT with layer 0.
    push_unitigs_and_layer(&mut map, &[b"GGGACTC"], 2);
    assert_eq!(map.n_layers(), 2);

    // AAAA is in layer 0
    let (li, hit) = map.query(canonical(b"AAAA")).expect("AAAA must be found");
    assert_eq!(li, 0);
    assert_eq!(hit.count, 1);

    // GGGA is in layer 1
    let (li, hit) = map.query(canonical(b"GGGA")).expect("GGGA must be found");
    assert_eq!(li, 1);
    assert_eq!(hit.count, 2);
}
|
||||||
|
|
||||||
|
/// A k-mer held by no layer yields `None`.
#[test]
fn query_absent_returns_none() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let mut map = LayeredMap::create(tmp.path()).unwrap();
    push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);

    let missing = canonical(b"CCCC");
    assert!(map.query(missing).is_none());
}
|
||||||
|
|
||||||
|
/// `push_layer_from_map` stores the counts found in the supplied map.
#[test]
fn push_layer_from_map_convenience() {
    set_k(4);
    let tmp = tempdir().unwrap();
    let mut map = LayeredMap::create(tmp.path()).unwrap();

    let mut writer = map.next_layer_writer().unwrap();
    writer.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
    writer.close().unwrap();

    let mut counts: HashMap<CanonicalKmer, u32> = HashMap::new();
    counts.insert(canonical(b"AAAA"), 10u32);
    map.push_layer_from_map(&counts).unwrap();

    let (_, hit) = map.query(canonical(b"AAAA")).unwrap();
    assert_eq!(hit.count, 10);
}
|
||||||
Reference in New Issue
Block a user