Files
obikmer/src/obilayeredmap/src/layer.rs
T
Eric Coissac 9c41891cc8 feat: add obilayeredmap crate for disk-backed k-mer indexing
Introduces the `obilayeredmap` crate (v0.1.0), implementing an append-only, disk-backed k-mer index using a minimal perfect hash function (MPHF). The module features memory-mapped reads, buffered writes, custom error handling, partition metadata persistence, and comprehensive unit tests. Also adds a reverse complement benchmark for `obikseq` and updates `Cargo.lock` with the new dependencies.
2026-05-12 15:26:39 +08:00

123 lines
4.0 KiB
Rust

use std::collections::HashMap;
use std::fs;
use std::io::BufWriter;
use std::path::Path;
use obikseq::{CanonicalKmer, Kmer, Sequence};
use obiskio::{UnitigFileReader, UnitigFileWriter};
use ph::fmph;
use crate::counts::{Counts, CountsWriter};
use crate::error::{OLMError, OLMResult};
use crate::evidence::{Evidence, EvidenceWriter};
// File names of the artifacts that make up one layer directory. Each name is
// joined onto the layer's directory path by `Layer::open` and `Layer::build`.

/// Serialized minimal perfect hash function.
const MPHF_FILE: &str = "mphf.bin";
/// Unitig sequences; must already exist when `Layer::build` runs.
const UNITIGS_FILE: &str = "unitigs.bin";
/// Per-slot `(chunk_id, rank)` records produced by `EvidenceWriter`.
const EVIDENCE_FILE: &str = "evidence.bin";
/// Per-slot occurrence counts produced by `CountsWriter`.
const COUNTS_FILE: &str = "counts.bin";
/// Read-side handle on one layer directory, assembled by `Layer::open`.
///
/// A lookup goes: canonical k-mer -> MPHF slot -> evidence `(chunk_id, rank)`
/// -> verification against the unitig sequence -> count stored for the slot.
pub struct Layer {
/// Hash function mapping a canonical k-mer's raw value to a slot index.
mphf: fmph::Function,
/// Decodes a slot into the `(chunk_id, rank)` position of its k-mer.
evidence: Evidence,
/// Unitig storage used to confirm that a slot really holds the queried k-mer.
unitigs: UnitigFileReader,
/// Count stored for each slot.
counts: Counts,
}
/// Result of a successful `Layer::query`.
///
/// Plain data: it is cheap to copy and can be compared, hashed and
/// debug-printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Hit {
    /// Slot assigned to the k-mer by the layer's hash function.
    pub slot: usize,
    /// Count stored in the layer for that slot.
    pub count: u32,
}
impl Layer {
    /// Open the layer stored in directory `path`.
    ///
    /// Loads the MPHF from `mphf.bin` and opens `unitigs.bin`, `evidence.bin`
    /// and `counts.bin` from the same directory.
    ///
    /// # Errors
    ///
    /// Fails if any of the four files cannot be opened, or if the MPHF cannot
    /// be deserialized.
    pub fn open(path: &Path) -> OLMResult<Self> {
        // Buffer the file so the deserializer's individual reads do not each
        // go straight to the underlying `File`.
        let mut mphf_input = std::io::BufReader::new(fs::File::open(path.join(MPHF_FILE))?);
        let mphf = fmph::Function::read(&mut mphf_input).map_err(OLMError::Io)?;
        let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
        let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
        let counts = Counts::open(&path.join(COUNTS_FILE))?;
        Ok(Self { mphf, evidence, unitigs, counts })
    }

    /// Look up `kmer` in this layer.
    ///
    /// Returns `None` when the hash function yields no slot for the k-mer, or
    /// when the k-mer stored at the slot's recorded unitig position is not
    /// `kmer`. Otherwise returns the slot together with its stored count.
    pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
        let slot = self.mphf.get(&kmer.raw())? as usize;
        let (chunk_id, rank) = self.evidence.decode(slot);
        // A slot alone is not proof of membership, so the candidate is checked
        // against the actual sequence before the count is read.
        self.unitigs
            .verify_canonical_kmer(chunk_id as usize, rank as usize, kmer)
            .then(|| Hit { slot, count: self.counts.get(slot) })
    }

    /// Build a layer from unitigs already written to `out_dir/unitigs.bin`.
    ///
    /// Every k-mer of every unitig chunk is canonicalized and indexed.
    /// `count_of` maps each canonical kmer to its occurrence count.
    /// Writes `mphf.bin`, `evidence.bin` and `counts.bin` into `out_dir` and
    /// returns the number of kmers indexed. When there is no k-mer at all,
    /// empty evidence/count files and an MPHF over the empty key set are
    /// written and `0` is returned.
    ///
    /// # Errors
    ///
    /// Fails on any I/O error, and with an `InvalidData` I/O error when a
    /// chunk id does not fit in 32 bits or a k-mer rank does not fit in 8 bits
    /// (the widths of an evidence record).
    ///
    /// # Panics
    ///
    /// Panics if the freshly built MPHF cannot resolve one of its own keys.
    pub fn build(
        out_dir: &Path,
        count_of: impl Fn(CanonicalKmer) -> u32,
    ) -> OLMResult<usize> {
        let k = obikseq::params::k();
        let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;

        // (raw canonical k-mer, chunk id, rank of the k-mer inside the chunk)
        // NOTE(review): assumes each canonical k-mer occurs only once across
        // all unitigs; duplicates are not detected here — confirm upstream.
        let mut entries: Vec<(u64, u32, u8)> = Vec::new();
        for chunk_id in 0..unitigs.len() {
            let chunk_tag = u32::try_from(chunk_id).map_err(|_| {
                Self::invalid_data(format!("chunk id {chunk_id} does not fit in 32 bits"))
            })?;
            // A chunk shorter than k holds no k-mer; saturate instead of
            // underflowing.
            let n_kmers = (unitigs.seql(chunk_id) + 1).saturating_sub(k);
            for rank in 0..n_kmers {
                // Refuse to truncate: a wrapped rank would point the evidence
                // at the wrong position and make the k-mer unfindable.
                // NOTE(review): presumably the unitig writer keeps chunks
                // within 256 k-mers — confirm.
                let rank_tag = u8::try_from(rank).map_err(|_| {
                    Self::invalid_data(format!(
                        "k-mer rank {rank} in chunk {chunk_id} does not fit in 8 bits"
                    ))
                })?;
                let raw = unitigs.raw_kmer(chunk_id, rank);
                let canonical: CanonicalKmer = Kmer::from_raw(raw).canonical();
                entries.push((canonical.raw(), chunk_tag, rank_tag));
            }
        }

        let n = entries.len();
        if n == 0 {
            fs::File::create(out_dir.join(EVIDENCE_FILE))?;
            fs::File::create(out_dir.join(COUNTS_FILE))?;
            let mphf = fmph::Function::new(Vec::<u64>::new());
            Self::write_mphf(&mphf, out_dir)?;
            return Ok(0);
        }

        let keys: Vec<u64> = entries.iter().map(|&(key, _, _)| key).collect();
        let mphf = fmph::Function::new(keys);
        Self::write_mphf(&mphf, out_dir)?;

        let mut ev = EvidenceWriter::new(n);
        let mut cnt = CountsWriter::new(n);
        for &(key, chunk_id, rank) in &entries {
            let slot = mphf
                .get(&key)
                .expect("MPHF must resolve every key it was built from") as usize;
            ev.set(slot, chunk_id, rank);
            cnt.set(slot, count_of(CanonicalKmer::from_raw_unchecked(key)));
        }
        ev.write(&out_dir.join(EVIDENCE_FILE))?;
        cnt.write(&out_dir.join(COUNTS_FILE))?;
        Ok(n)
    }

    /// Convenience variant of `build` that accepts a `HashMap`.
    ///
    /// K-mers absent from `counts` are indexed with a count of 0.
    pub fn build_from_map(
        out_dir: &Path,
        counts: &HashMap<CanonicalKmer, u32>,
    ) -> OLMResult<usize> {
        Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
    }

    /// Return a `UnitigFileWriter` targeting this layer's `unitigs.bin`.
    ///
    /// Creates `out_dir` (and missing parents) first.
    /// The caller writes unitigs, then calls `Layer::build` to finish the layer.
    pub fn unitig_writer(out_dir: &Path) -> OLMResult<UnitigFileWriter> {
        fs::create_dir_all(out_dir)?;
        Ok(UnitigFileWriter::create(&out_dir.join(UNITIGS_FILE))?)
    }

    /// Serialize `mphf` to `out_dir/mphf.bin`, reporting flush failures.
    fn write_mphf(mphf: &fmph::Function, out_dir: &Path) -> OLMResult<()> {
        let mut writer = BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?);
        mphf.write(&mut writer)?;
        // Dropping a `BufWriter` discards any error from its final flush, so
        // flush explicitly to surface a truncated write.
        std::io::Write::flush(&mut writer)?;
        Ok(())
    }

    /// Wrap `msg` in an `InvalidData` I/O error.
    fn invalid_data(msg: String) -> OLMError {
        OLMError::Io(std::io::Error::new(std::io::ErrorKind::InvalidData, msg))
    }
}
// Unit tests are compiled only for test builds and are loaded from
// `tests/layer.rs`, resolved relative to this file's directory.
#[cfg(test)]
#[path = "tests/layer.rs"]
mod tests;