refactor: implement RoutableSuperKmer and update k-mer indexing pipeline

Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
Eric Coissac
2026-04-29 22:52:42 +02:00
parent 4e26e3bd40
commit 27f5e88a7b
72 changed files with 10093 additions and 1626 deletions
+11 -18
View File
@@ -16,12 +16,12 @@
//! | super-kmer length = 256| k |
use obikrope::{ForwardCursor, Rope, RopeCursor};
use obikseq::superkmer::SuperKmer;
use obikseq::RoutableSuperKmer;
use crate::rolling_stat::RollingStat;
use crate::scratch::SuperKmerScratch;
/// Iterator over `(minimizer_hash, SuperKmer)` pairs.
/// Iterator over [`RoutableSuperKmer`] values.
pub struct SuperKmerIter<'a> {
cursor: ForwardCursor<'a>,
k: usize,
@@ -60,26 +60,19 @@ impl<'a> SuperKmerIter<'a> {
self.prev_min_pos = 0;
}
// Diff view: the previous (`Option<SuperKmer>`) and the new
// (`Option<RoutableSuperKmer>`) versions of `try_emit` are interleaved below.
//
// In both versions the function yields `None` when the scratch buffer holds
// fewer than `k` nucleotides, or when `self.prev_min` is `None` (the `?`
// operator propagates it); otherwise it emits the content of the scratch buffer.
fn try_emit(&mut self) -> Option<SuperKmer> {
fn try_emit(&mut self) -> Option<RoutableSuperKmer> {
// Fewer than `k` nucleotides accumulated: nothing to emit.
if self.scratch.len() < self.k {
return None;
}
// Previous version: emit first, then compute the minimizer position here.
let min = self.prev_min?;
let mut sk = self.scratch.emit();
// When `sk.canonical()` is true the recorded position is used as is;
// otherwise it is replaced by `seql - m - prev_min_pos`.
// NOTE(review): presumably this mirrors the position onto the
// reverse-complemented sequence — confirm against `SuperKmer`.
let min_pos = if sk.canonical() {
self.prev_min_pos
} else {
sk.seql() - self.m - self.prev_min_pos
};
// The position is stored after a truncating cast to `u8`.
sk.set_minimizer_pos(min_pos as u8);
Some(sk)
// New version: only check that a minimizer is present, then pass
// `prev_min_pos` and `m` to `self.scratch.emit`, which returns the
// `RoutableSuperKmer` directly.
self.prev_min?;
Some(self.scratch.emit(self.prev_min_pos, self.m))
}
}
impl Iterator for SuperKmerIter<'_> {
type Item = SuperKmer;
type Item = RoutableSuperKmer;
fn next(&mut self) -> Option<SuperKmer> {
fn next(&mut self) -> Option<RoutableSuperKmer> {
loop {
let byte = match self.cursor.read_next().ok() {
None => {
@@ -164,7 +157,7 @@ mod tests {
// Test helper: wraps `data` in a rope via `make_rope`, iterates it with
// `SuperKmerIter::new(&rope, k, m, 1, 0.0)` (i.e. `level_max = 1`,
// `theta = 0.0`) and returns the ASCII form of every item yielded.
// NOTE(review): the name suggests these two values disable the filter —
// confirm against `SuperKmerIter::new`.
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
let rope = make_rope(data);
SuperKmerIter::new(&rope, k, m, 1, 0.0)
// Previous version: the iterator item exposed `to_ascii()` directly.
.map(|sk| sk.to_ascii())
// New version: the item is a `RoutableSuperKmer`; the ASCII form is
// obtained through its `superkmer()` accessor.
.map(|rsk| rsk.superkmer().to_ascii())
.collect()
}
@@ -205,7 +198,7 @@ mod tests {
let rope = make_rope(b"AAAAAAAAAAAAAAAAAAAA\x00");
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 6, 0.9)
.map(|sk| sk.to_ascii())
.map(|rsk| rsk.superkmer().to_ascii())
.collect();
assert!(out_reject.is_empty());
}
@@ -218,7 +211,7 @@ mod tests {
rope.push(data[..mid].to_vec());
rope.push(data[mid..].to_vec());
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 1, 0.0)
.map(|sk| sk.to_ascii())
.map(|rsk| rsk.superkmer().to_ascii())
.collect();
assert!(!out.is_empty());
}
@@ -226,7 +219,7 @@ mod tests {
// Checks that iterating a 20-nucleotide rope yields at least one item.
// NOTE(review): despite its name, this test never inspects a minimizer value;
// it only asserts that the collected vector is non-empty.
#[test]
fn yields_minimizer_value() {
let rope = make_rope(b"ACGTACGTACGTACGTACGT\x00");
// Previous version collected `SuperKmer` items; the new one collects
// `RoutableSuperKmer` items.
let results: Vec<SuperKmer> = SuperKmerIter::new(&rope, K, M, 1, 0.0).collect();
let results: Vec<RoutableSuperKmer> = SuperKmerIter::new(&rope, K, M, 1, 0.0).collect();
assert!(!results.is_empty());
}
}
+2 -2
View File
@@ -16,9 +16,9 @@ pub use iter::SuperKmerIter;
pub use scratch::SuperKmerScratch;
use obikrope::Rope;
use obikseq::superkmer::SuperKmer;
use obikseq::RoutableSuperKmer;
/// Collect all super-kmers from a normalised rope chunk.
///
/// Builds a [`SuperKmerIter`] over `rope`, forwarding `k`, `m`, `level_max`
/// and `theta` unchanged to `SuperKmerIter::new`, and collects every item the
/// iterator yields into a `Vec`. The rope is taken by value and only borrowed
/// by the iterator for the duration of the call.
// Diff view: the previous signature returned `Vec<SuperKmer>`, the new one
// returns `Vec<RoutableSuperKmer>`; both lines are shown below.
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<SuperKmer> {
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<RoutableSuperKmer> {
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
}
+3 -4
View File
@@ -1,7 +1,7 @@
//! Stack-allocated scratch buffer for building a SuperKmer before heap emission.
use crate::encoding::{BYTE_LEN_MAX, encode_nuc};
use obikseq::superkmer::SuperKmer;
use obikseq::RoutableSuperKmer;
/// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256).
pub const MAX_SUPERKMER_LEN: usize = 256;
@@ -56,16 +56,15 @@ impl SuperKmerScratch {
///
/// The heap allocation (`Box<[u8]>`) is exactly sized to the sequence.
/// Resets the buffer to empty afterward.
// Diff view: the previous signature (`-> SuperKmer`) and the new one
// (`min_pos`, `m` -> `RoutableSuperKmer`) are interleaved below, as are the
// two return expressions.
pub fn emit(&mut self) -> SuperKmer {
pub fn emit(&mut self, min_pos: usize, m: usize) -> RoutableSuperKmer {
// Number of nucleotides currently accumulated in the buffer.
let seql = self.len;
debug_assert!(seql >= 1 && seql <= MAX_SUPERKMER_LEN);
// Bytes holding the sequence: ceil(seql / 4), i.e. four nucleotides per byte.
let n = (seql + 3) / 4;
// Copy the used bytes into an exactly-sized heap allocation.
let seq: Box<[u8]> = self.buf[..n].into();
// Zero the used bytes and reset the length so the buffer can be reused.
self.buf[..n].fill(0);
self.len = 0;
SuperKmer::new(seql as u8, seq)
// `min_pos` and `m` are forwarded unchanged to `RoutableSuperKmer::build`.
// NOTE(review): the debug_assert above admits `seql == MAX_SUPERKMER_LEN`
// (256), and `256 as u8` wraps to 0 — confirm that
// `RoutableSuperKmer::build` expects or handles this wrapped value.
RoutableSuperKmer::build(min_pos, m, seql as u8, seq)
}
/// Discard all accumulated nucleotides without producing a [`SuperKmer`].
pub fn reset(&mut self) {
let n = (self.len + 3) / 4;