9c41891cc8
Introduces the `obilayeredmap` crate (v0.1.0), implementing an append-only, disk-backed k-mer index using a minimal perfect hash function (MPHF). The module features memory-mapped reads, buffered writes, custom error handling, partition metadata persistence, and comprehensive unit tests. Also adds a reverse complement benchmark for `obikseq` and updates `Cargo.lock` with the new dependencies.
123 lines
4.0 KiB
Rust
123 lines
4.0 KiB
Rust
use std::collections::HashMap;
|
|
use std::fs;
|
|
use std::io::BufWriter;
|
|
use std::path::Path;
|
|
|
|
use obikseq::{CanonicalKmer, Kmer, Sequence};
|
|
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
|
use ph::fmph;
|
|
|
|
use crate::counts::{Counts, CountsWriter};
|
|
use crate::error::{OLMError, OLMResult};
|
|
use crate::evidence::{Evidence, EvidenceWriter};
|
|
|
|
/// File, inside a layer directory, holding the serialized `fmph::Function`.
const MPHF_FILE: &str = "mphf.bin";
/// File, inside a layer directory, holding the unitigs read by `UnitigFileReader`.
const UNITIGS_FILE: &str = "unitigs.bin";
/// File, inside a layer directory, holding the per-slot evidence table.
const EVIDENCE_FILE: &str = "evidence.bin";
/// File, inside a layer directory, holding the per-slot counts.
const COUNTS_FILE: &str = "counts.bin";
|
|
|
|
pub struct Layer {
|
|
mphf: fmph::Function,
|
|
evidence: Evidence,
|
|
unitigs: UnitigFileReader,
|
|
counts: Counts,
|
|
}
|
|
|
|
/// Result of a successful `Layer::query`.
///
/// Plain data, so it is cheap to copy and can be compared and printed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Hit {
    /// Slot assigned to the k-mer by the layer's hash function.
    pub slot: usize,
    /// Count stored for that slot.
    pub count: u32,
}
|
|
|
|
impl Layer {
|
|
pub fn open(path: &Path) -> OLMResult<Self> {
|
|
let mphf = fmph::Function::read(
|
|
&mut fs::File::open(path.join(MPHF_FILE))?
|
|
).map_err(OLMError::Io)?;
|
|
|
|
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
|
|
let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
|
|
let counts = Counts::open(&path.join(COUNTS_FILE))?;
|
|
|
|
Ok(Self { mphf, evidence, unitigs, counts })
|
|
}
|
|
|
|
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
|
|
let slot = self.mphf.get(&kmer.raw())? as usize;
|
|
let (chunk_id, rank) = self.evidence.decode(slot);
|
|
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
|
Some(Hit { slot, count: self.counts.get(slot) })
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Build a layer from unitigs already written to `out_dir/unitigs.bin`.
|
|
///
|
|
/// `count_of` maps each canonical kmer to its occurrence count.
|
|
/// Returns the number of kmers indexed.
|
|
pub fn build(
|
|
out_dir: &Path,
|
|
count_of: impl Fn(CanonicalKmer) -> u32,
|
|
) -> OLMResult<usize> {
|
|
let k = obikseq::params::k();
|
|
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
|
|
|
let mut entries: Vec<(u64, u32, u8)> = Vec::new();
|
|
for chunk_id in 0..unitigs.len() {
|
|
let n_kmers = unitigs.seql(chunk_id) - k + 1;
|
|
for rank in 0..n_kmers {
|
|
let raw = unitigs.raw_kmer(chunk_id, rank);
|
|
let canonical: CanonicalKmer = Kmer::from_raw(raw).canonical();
|
|
entries.push((canonical.raw(), chunk_id as u32, rank as u8));
|
|
}
|
|
}
|
|
|
|
let n = entries.len();
|
|
if n == 0 {
|
|
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
|
|
fs::File::create(out_dir.join(COUNTS_FILE))?;
|
|
let mphf = fmph::Function::new(Vec::<u64>::new());
|
|
mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?;
|
|
return Ok(0);
|
|
}
|
|
|
|
let keys: Vec<u64> = entries.iter().map(|(k, _, _)| *k).collect();
|
|
let mphf = fmph::Function::new(keys);
|
|
mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?;
|
|
|
|
let mut ev = EvidenceWriter::new(n);
|
|
let mut cnt = CountsWriter::new(n);
|
|
|
|
for (key, chunk_id, rank) in &entries {
|
|
let slot = mphf.get(key).unwrap() as usize;
|
|
ev.set(slot, *chunk_id, *rank);
|
|
let kmer = CanonicalKmer::from_raw_unchecked(*key);
|
|
cnt.set(slot, count_of(kmer));
|
|
}
|
|
|
|
ev.write(&out_dir.join(EVIDENCE_FILE))?;
|
|
cnt.write(&out_dir.join(COUNTS_FILE))?;
|
|
|
|
Ok(n)
|
|
}
|
|
|
|
/// Convenience variant of `build` that accepts a `HashMap`.
|
|
pub fn build_from_map(
|
|
out_dir: &Path,
|
|
counts: &HashMap<CanonicalKmer, u32>,
|
|
) -> OLMResult<usize> {
|
|
Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
|
}
|
|
|
|
/// Return a `UnitigFileWriter` targeting this layer's `unitigs.bin`.
|
|
/// The caller writes unitigs, then calls `Layer::build` to finish the layer.
|
|
pub fn unitig_writer(out_dir: &Path) -> OLMResult<UnitigFileWriter> {
|
|
fs::create_dir_all(out_dir)?;
|
|
Ok(UnitigFileWriter::create(&out_dir.join(UNITIGS_FILE))?)
|
|
}
|
|
}
|
|
|
|
// Unit tests are kept out of line in `tests/layer.rs` (resolved relative to
// this file's directory) and are only compiled for test builds.
#[cfg(test)]
#[path = "tests/layer.rs"]
mod tests;
|