feat: add kmer iterators and optimize layered map performance
Replace `ph` with `ptr_hash` and introduce `epserde` and `rayon` dependencies. Refactor MPHF construction to leverage parallel iteration, eliminating intermediate `Vec<u64>` allocations and reducing memory footprint. Add an `n_kmers` field to track and serialize the total kmer count, alongside three zero-allocation iterators for efficient chunk traversal. Include comprehensive unit tests for the new iterators and update CLAUDE.md to enforce explicit dependency validation policies.
This commit is contained in:
@@ -3,7 +3,7 @@ use std::io::{BufWriter, Write as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::Mmap;
|
||||
use obikseq::{CanonicalKmer, Unitig};
|
||||
use obikseq::{CanonicalKmer, Kmer, Unitig};
|
||||
|
||||
pub use obikseq::MAX_KMERS_PER_CHUNK;
|
||||
|
||||
@@ -13,6 +13,7 @@ use crate::error::{SKError, SKResult};
|
||||
//
|
||||
// magic: [u8; 4] = b"UIDX"
|
||||
// n_unitigs: u32 LE
|
||||
// n_kmers: u64 LE total kmer count across all chunks
|
||||
// seqls: [u8; n_unitigs] max kmer index per chunk (= that chunk's kmer count − 1)
|
||||
// packed_offsets: [u32; n_unitigs + 1] byte offsets to packed bytes in the
|
||||
// sequence file; last entry is sentinel
|
||||
@@ -44,6 +45,7 @@ pub struct UnitigFileWriter {
|
||||
seqls: Vec<u8>,
|
||||
packed_offsets: Vec<u32>,
|
||||
next_offset: u32,
|
||||
n_kmers: usize,
|
||||
k: usize,
|
||||
}
|
||||
|
||||
@@ -56,6 +58,7 @@ impl UnitigFileWriter {
|
||||
seqls: Vec::new(),
|
||||
packed_offsets: Vec::new(),
|
||||
next_offset: 0,
|
||||
n_kmers: 0,
|
||||
k: obikseq::params::k(),
|
||||
})
|
||||
}
|
||||
@@ -98,6 +101,7 @@ impl UnitigFileWriter {
|
||||
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
|
||||
self.packed_offsets.push(self.next_offset + 1);
|
||||
self.seqls.push((seql - self.k) as u8);
|
||||
self.n_kmers += seql - self.k + 1;
|
||||
|
||||
unitig
|
||||
.write_to_binary(&mut self.file)
|
||||
@@ -122,7 +126,7 @@ impl UnitigFileWriter {
|
||||
};
|
||||
self.packed_offsets.push(sentinel);
|
||||
|
||||
write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets)
|
||||
write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets, self.n_kmers)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
@@ -134,10 +138,11 @@ impl UnitigFileWriter {
|
||||
}
|
||||
}
|
||||
|
||||
fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32]) -> SKResult<()> {
|
||||
fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32], n_kmers: usize) -> SKResult<()> {
|
||||
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
|
||||
w.write_all(&MAGIC).map_err(SKError::Io)?;
|
||||
w.write_all(&(seqls.len() as u32).to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(&(n_kmers as u64).to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(seqls).map_err(SKError::Io)?;
|
||||
for &off in packed_offsets {
|
||||
w.write_all(&off.to_le_bytes()).map_err(SKError::Io)?;
|
||||
@@ -155,6 +160,7 @@ pub struct UnitigFileReader {
|
||||
mmap: Mmap,
|
||||
seqls: Vec<u8>,
|
||||
packed_offsets: Vec<u32>,
|
||||
n_kmers: usize,
|
||||
k: usize,
|
||||
}
|
||||
|
||||
@@ -162,9 +168,9 @@ impl UnitigFileReader {
|
||||
pub fn open(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
let (seqls, packed_offsets) = read_idx(&idx_path(path))?;
|
||||
let (seqls, packed_offsets, n_kmers) = read_idx(&idx_path(path))?;
|
||||
let k = obikseq::params::k();
|
||||
Ok(Self { mmap, seqls, packed_offsets, k })
|
||||
Ok(Self { mmap, seqls, packed_offsets, n_kmers, k })
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
@@ -175,6 +181,11 @@ impl UnitigFileReader {
|
||||
self.seqls.is_empty()
|
||||
}
|
||||
|
||||
/// Total number of kmers across all chunks, as read from the index file header by `open`.
|
||||
pub fn n_kmers(&self) -> usize {
|
||||
self.n_kmers
|
||||
}
|
||||
|
||||
/// Return the nucleotide length of chunk `i`.
|
||||
#[inline]
|
||||
pub fn seql(&self, i: usize) -> usize {
|
||||
@@ -206,9 +217,42 @@ impl UnitigFileReader {
|
||||
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
|
||||
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
|
||||
}
|
||||
|
||||
/// Lazily iterate over every [`Kmer`] in file order (all positions of chunk 0, then chunk 1, …).
|
||||
///
|
||||
/// Each chunk is copied from the mmap once; iteration within the chunk is
|
||||
/// zero-allocation (sliding-window via [`OwnedPackedSeqKmerIter`]). The returned iterator borrows `self`.
|
||||
pub fn iter_kmers(&self) -> impl Iterator<Item = Kmer> + '_ {
|
||||
(0..self.len()).flat_map(move |i| self.unitig(i).into_kmers())
|
||||
}
|
||||
|
||||
/// Lazily iterate over every [`CanonicalKmer`] in file order (chunk 0 first, then chunk 1, …).
|
||||
///
|
||||
/// Equivalent to `iter_kmers().map(|km| km.canonical())` but uses the
|
||||
/// built-in canonical iterator on each chunk, which avoids a separate
|
||||
/// canonicalization pass. The returned iterator borrows `self`.
|
||||
pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
(0..self.len()).flat_map(move |i| self.unitig(i).into_canonical_kmers())
|
||||
}
|
||||
|
||||
/// Lazily iterate over `(kmer, chunk_id, rank)` for every canonical kmer in the file.
|
||||
///
|
||||
/// `chunk_id` is the index of the chunk within this file; `rank` is the
|
||||
/// 0-based position of the kmer within that chunk (it restarts at 0 for each chunk). Used to build the
|
||||
/// evidence table in `obilayeredmap`.
|
||||
pub fn iter_indexed_canonical_kmers(
|
||||
&self,
|
||||
) -> impl Iterator<Item = (CanonicalKmer, usize, usize)> + '_ {
|
||||
(0..self.len()).flat_map(move |chunk_id| {
|
||||
self.unitig(chunk_id)
|
||||
.into_canonical_kmers()
|
||||
.enumerate()
|
||||
.map(move |(rank, kmer)| (kmer, chunk_id, rank))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
|
||||
fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>, usize)> {
|
||||
let data = std::fs::read(path).map_err(SKError::Io)?;
|
||||
let mut pos = 0;
|
||||
|
||||
@@ -227,6 +271,11 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
|
||||
let n = u32::from_le_bytes(n_bytes.try_into().unwrap()) as usize;
|
||||
pos += 4;
|
||||
|
||||
let nk_bytes = data.get(pos..pos + 8)
|
||||
.ok_or(SKError::Truncated { context: "unitig index: n_kmers" })?;
|
||||
let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
|
||||
pos += 8;
|
||||
|
||||
let seqls = data.get(pos..pos + n)
|
||||
.ok_or(SKError::Truncated { context: "unitig index: seqls" })?
|
||||
.to_vec();
|
||||
@@ -240,7 +289,7 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
|
||||
pos += 4;
|
||||
}
|
||||
|
||||
Ok((seqls, packed_offsets))
|
||||
Ok((seqls, packed_offsets, n_kmers))
|
||||
}
|
||||
|
||||
// ── Kmer utilities ────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user