refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer). Refactor the FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst). Revise the SuperKmer header to store n_kmers instead of seql (avoiding the u8 wrap-around when the sequence length reaches 256). Update the documentation to reflect minimizer-based theory, the two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
@@ -16,12 +16,12 @@
|
||||
//! | super-kmer length = 256| k |
|
||||
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
use crate::rolling_stat::RollingStat;
|
||||
use crate::scratch::SuperKmerScratch;
|
||||
|
||||
/// Iterator over `(minimizer_hash, SuperKmer)` pairs.
|
||||
/// Iterator over [`RoutableSuperKmer`] values.
|
||||
pub struct SuperKmerIter<'a> {
|
||||
cursor: ForwardCursor<'a>,
|
||||
k: usize,
|
||||
@@ -60,26 +60,19 @@ impl<'a> SuperKmerIter<'a> {
|
||||
self.prev_min_pos = 0;
|
||||
}
|
||||
|
||||
fn try_emit(&mut self) -> Option<SuperKmer> {
|
||||
fn try_emit(&mut self) -> Option<RoutableSuperKmer> {
|
||||
if self.scratch.len() < self.k {
|
||||
return None;
|
||||
}
|
||||
let min = self.prev_min?;
|
||||
let mut sk = self.scratch.emit();
|
||||
let min_pos = if sk.canonical() {
|
||||
self.prev_min_pos
|
||||
} else {
|
||||
sk.seql() - self.m - self.prev_min_pos
|
||||
};
|
||||
sk.set_minimizer_pos(min_pos as u8);
|
||||
Some(sk)
|
||||
self.prev_min?;
|
||||
Some(self.scratch.emit(self.prev_min_pos, self.m))
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for SuperKmerIter<'_> {
|
||||
type Item = SuperKmer;
|
||||
type Item = RoutableSuperKmer;
|
||||
|
||||
fn next(&mut self) -> Option<SuperKmer> {
|
||||
fn next(&mut self) -> Option<RoutableSuperKmer> {
|
||||
loop {
|
||||
let byte = match self.cursor.read_next().ok() {
|
||||
None => {
|
||||
@@ -164,7 +157,7 @@ mod tests {
|
||||
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
|
||||
let rope = make_rope(data);
|
||||
SuperKmerIter::new(&rope, k, m, 1, 0.0)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -205,7 +198,7 @@ mod tests {
|
||||
|
||||
let rope = make_rope(b"AAAAAAAAAAAAAAAAAAAA\x00");
|
||||
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 6, 0.9)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect();
|
||||
assert!(out_reject.is_empty());
|
||||
}
|
||||
@@ -218,7 +211,7 @@ mod tests {
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 1, 0.0)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect();
|
||||
assert!(!out.is_empty());
|
||||
}
|
||||
@@ -226,7 +219,7 @@ mod tests {
|
||||
#[test]
|
||||
fn yields_minimizer_value() {
|
||||
let rope = make_rope(b"ACGTACGTACGTACGTACGT\x00");
|
||||
let results: Vec<SuperKmer> = SuperKmerIter::new(&rope, K, M, 1, 0.0).collect();
|
||||
let results: Vec<RoutableSuperKmer> = SuperKmerIter::new(&rope, K, M, 1, 0.0).collect();
|
||||
assert!(!results.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,9 +16,9 @@ pub use iter::SuperKmerIter;
|
||||
pub use scratch::SuperKmerScratch;
|
||||
|
||||
use obikrope::Rope;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
/// Collect all super-kmers from a normalised rope chunk.
|
||||
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<SuperKmer> {
|
||||
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<RoutableSuperKmer> {
|
||||
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Stack-allocated scratch buffer for building a SuperKmer before heap emission.
|
||||
|
||||
use crate::encoding::{BYTE_LEN_MAX, encode_nuc};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
/// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256).
|
||||
pub const MAX_SUPERKMER_LEN: usize = 256;
|
||||
@@ -56,16 +56,15 @@ impl SuperKmerScratch {
|
||||
///
|
||||
/// The heap allocation (`Box<[u8]>`) is exactly sized to the sequence.
|
||||
/// Resets the buffer to empty afterward.
|
||||
pub fn emit(&mut self) -> SuperKmer {
|
||||
pub fn emit(&mut self, min_pos: usize, m: usize) -> RoutableSuperKmer {
|
||||
let seql = self.len;
|
||||
debug_assert!(seql >= 1 && seql <= MAX_SUPERKMER_LEN);
|
||||
let n = (seql + 3) / 4;
|
||||
let seq: Box<[u8]> = self.buf[..n].into();
|
||||
self.buf[..n].fill(0);
|
||||
self.len = 0;
|
||||
SuperKmer::new(seql as u8, seq)
|
||||
RoutableSuperKmer::build(min_pos, m, seql as u8, seq)
|
||||
}
|
||||
|
||||
/// Discard all accumulated nucleotides without producing a [`SuperKmer`].
|
||||
pub fn reset(&mut self) {
|
||||
let n = (self.len + 3) / 4;
|
||||
|
||||
Reference in New Issue
Block a user