refactor: implement RoutableSuperKmer and update k-mer indexing pipeline

Replace raw SuperkMer routing with a new RoutableSuperKimer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
Eric Coissac
2026-04-29 22:52:42 +02:00
parent 4e26e3bd40
commit 27f5e88a7b
72 changed files with 10093 additions and 1626 deletions
+33 -15
View File
@@ -2,17 +2,25 @@ use obikseq::superkmer::SuperKmer;
use std::io::{self, Read, Write};
/// Serialise one SuperKmer into `w` (uncompressed; caller must wrap with a compressor).
///
/// Bits [7:0] of the header store `n_kmers = seql - k + 1` (kmer units, 1255),
/// not the raw nucleotide length. This removes the 0=256 wrapping convention.
#[inline]
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer) -> io::Result<()> {
w.write_all(&sk.header_bits().to_le_bytes())?;
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer, k: usize) -> io::Result<()> {
let n_kmers = sk.len() - k + 1;
let new_bits = (sk.header_bits() & !0xFF) | (n_kmers as u32);
w.write_all(&new_bits.to_le_bytes())?;
w.write_all(sk.seq_bytes())
}
/// Deserialise one SuperKmer from `r`. Returns `None` on clean EOF.
/// `seq_buf` is a reusable scratch buffer to avoid per-record allocation.
/// Bits [7:0] of the on-disk header contain `n_kmers`; nucleotide length is
/// reconstructed as `n_kmers + k - 1`.
pub(crate) fn read_superkmer<R: Read>(
r: &mut R,
seq_buf: &mut Vec<u8>,
k: usize,
) -> io::Result<Option<SuperKmer>> {
let mut hdr = [0u8; 4];
match r.read_exact(&mut hdr) {
@@ -21,12 +29,18 @@ pub(crate) fn read_superkmer<R: Read>(
Err(e) => return Err(e),
}
let bits = u32::from_le_bytes(hdr);
let seql_byte = (bits & 0xFF) as u8;
let nt_len: usize = if seql_byte == 0 { 256 } else { seql_byte as usize };
let n_kmers = (bits & 0xFF) as usize;
let nt_len = n_kmers + k - 1;
let byte_len = (nt_len + 3) / 4;
seq_buf.resize(byte_len, 0);
r.read_exact(seq_buf)?;
Ok(Some(SuperKmer::from_header_bits(bits, seq_buf.as_slice().into())))
// Reconstruct the in-memory seql byte (0 encodes 256, 1-255 direct).
let seql_byte = if nt_len == 256 { 0u8 } else { nt_len as u8 };
let mem_bits = (bits & !0xFF) | (seql_byte as u32);
Ok(Some(SuperKmer::from_header_bits(
mem_bits,
seq_buf.as_slice().into(),
)))
}
#[cfg(test)]
@@ -40,28 +54,31 @@ mod tests {
#[test]
fn roundtrip_single() {
let k = 4;
let sk = make_sk(b"ACGTACGT");
let mut buf = Vec::new();
write_superkmer(&mut buf, &sk).unwrap();
write_superkmer(&mut buf, &sk, k).unwrap();
let mut cur = Cursor::new(&buf);
let mut seq_buf = Vec::new();
let got = read_superkmer(&mut cur, &mut seq_buf).unwrap().unwrap();
let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
assert_eq!(sk.to_ascii(), got.to_ascii());
assert_eq!(sk.seql(), got.seql());
assert_eq!(sk.len(), got.len());
}
#[test]
fn roundtrip_all_lengths() {
let bases: Vec<u8> = (0..256).map(|i| b"ACGT"[i % 4]).collect();
for len in (1..=9).chain([255, 256]) {
// k=11 is the project minimum; test seql from k to 256.
let k = 11;
for len in (k..=k + 8).chain([255, 256]) {
let sk = make_sk(&bases[..len]);
let mut buf = Vec::new();
write_superkmer(&mut buf, &sk).unwrap();
write_superkmer(&mut buf, &sk, k).unwrap();
let mut cur = Cursor::new(&buf);
let mut seq_buf = Vec::new();
let got = read_superkmer(&mut cur, &mut seq_buf).unwrap().unwrap();
let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
assert_eq!(sk.to_ascii(), got.to_ascii(), "len={len}");
}
}
@@ -71,24 +88,25 @@ mod tests {
let buf: Vec<u8> = vec![];
let mut cur = Cursor::new(&buf);
let mut seq_buf = Vec::new();
assert!(read_superkmer(&mut cur, &mut seq_buf).unwrap().is_none());
assert!(read_superkmer(&mut cur, &mut seq_buf, 4).unwrap().is_none());
}
#[test]
fn multiple_records() {
let k = 4;
let seqs: &[&[u8]] = &[b"AAAA", b"CCCC", b"GGGG", b"TTTT"];
let mut buf = Vec::new();
for s in seqs {
write_superkmer(&mut buf, &make_sk(s)).unwrap();
write_superkmer(&mut buf, &make_sk(s), k).unwrap();
}
let mut cur = Cursor::new(&buf);
let mut seq_buf = Vec::new();
for s in seqs {
let got = read_superkmer(&mut cur, &mut seq_buf).unwrap().unwrap();
let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
let expected = make_sk(s);
assert_eq!(expected.to_ascii(), got.to_ascii());
}
assert!(read_superkmer(&mut cur, &mut seq_buf).unwrap().is_none());
assert!(read_superkmer(&mut cur, &mut seq_buf, k).unwrap().is_none());
}
}
+63 -35
View File
@@ -3,8 +3,8 @@ use crate::error::SKResult;
use crate::limits::max_concurrent_files;
use crate::meta::SKFileMeta;
use lru::LruCache;
use niffler::send::compression::Format;
use niffler::Level;
use niffler::send::compression::Format;
use obikseq::superkmer::SuperKmer;
use std::fs::{File, OpenOptions};
use std::io::{BufWriter, Write};
@@ -79,7 +79,11 @@ impl SKFilePool {
/// Create a pool allowing at most `max_open` simultaneously open fds.
pub fn new(max_open: usize) -> Self {
let cap = NonZeroUsize::new(max_open.max(1)).unwrap();
Self { max_open, entries: Vec::new(), open: LruCache::new(cap) }
Self {
max_open,
entries: Vec::new(),
open: LruCache::new(cap),
}
}
/// Derive pool size from the OS fd limit (75 %, clamped to `[16, MAX_POOL_SIZE]`).
@@ -218,6 +222,7 @@ pub struct SKFileWriter {
id: usize,
pool: Arc<Mutex<SKFilePool>>,
path: PathBuf,
k: usize,
pending: Vec<u8>,
flush_threshold: usize,
logically_closed: bool,
@@ -225,14 +230,15 @@ pub struct SKFileWriter {
}
/// Create a `SKFileWriter` for a new file (Zstd, level 3).
pub fn create_token(pool: &SharedPool, path: PathBuf) -> SKResult<SKFileWriter> {
create_token_with(pool, path, Format::Zstd, Level::Three)
pub fn create_token(pool: &SharedPool, path: PathBuf, k: usize) -> SKResult<SKFileWriter> {
create_token_with(pool, path, k, Format::Zstd, Level::Three)
}
/// Create a `SKFileWriter` for a new file with explicit format and level.
pub fn create_token_with(
pool: &SharedPool,
path: PathBuf,
k: usize,
format: Format,
level: Level,
) -> SKResult<SKFileWriter> {
@@ -241,6 +247,7 @@ pub fn create_token_with(
id,
pool: Arc::clone(pool),
path,
k,
pending: Vec::with_capacity(DEFAULT_FLUSH_THRESHOLD + 128),
flush_threshold: DEFAULT_FLUSH_THRESHOLD,
logically_closed: false,
@@ -251,13 +258,18 @@ pub fn create_token_with(
impl SKFileWriter {
/// Create a standalone file writer (Zstd, level 3).
/// The pool is created internally and is not accessible to the caller.
pub fn create<P: AsRef<Path>>(path: P) -> SKResult<Self> {
Self::create_with(path, Format::Zstd, Level::Three)
pub fn create<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
Self::create_with(path, k, Format::Zstd, Level::Three)
}
/// Create a standalone file writer with explicit format and level.
pub fn create_with<P: AsRef<Path>>(path: P, format: Format, level: Level) -> SKResult<Self> {
create_token_with(global_pool(), path.as_ref().to_owned(), format, level)
pub fn create_with<P: AsRef<Path>>(
path: P,
k: usize,
format: Format,
level: Level,
) -> SKResult<Self> {
create_token_with(global_pool(), path.as_ref().to_owned(), k, format, level)
}
/// `true` if the underlying fd is currently open in the pool.
@@ -268,10 +280,10 @@ impl SKFileWriter {
/// Accumulate one SuperKmer. Drains to fd when `pending ≥ flush_threshold`.
pub fn write(&mut self, sk: &SuperKmer) -> SKResult<()> {
self.check_not_closed()?;
write_superkmer(&mut self.pending, sk)?;
write_superkmer(&mut self.pending, sk, self.k)?;
self.meta.instances += 1;
self.meta.count_sum += sk.count() as u64;
self.meta.length_sum += sk.seql() as u64;
self.meta.length_sum += sk.len() as u64;
if self.pending.len() >= self.flush_threshold {
self.drain()?;
}
@@ -282,10 +294,10 @@ impl SKFileWriter {
pub fn write_batch(&mut self, sks: &[SuperKmer]) -> SKResult<()> {
self.check_not_closed()?;
for sk in sks {
write_superkmer(&mut self.pending, sk)?;
write_superkmer(&mut self.pending, sk, self.k)?;
self.meta.instances += 1;
self.meta.count_sum += sk.count() as u64;
self.meta.length_sum += sk.seql() as u64;
self.meta.length_sum += sk.len() as u64;
if self.pending.len() >= self.flush_threshold {
self.drain()?;
}
@@ -339,7 +351,10 @@ impl SKFileWriter {
}
if !self.pending.is_empty() {
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
fd_guard
.as_mut()
.expect("fd open after ensure_open")
.write_all(&self.pending)?;
self.pending.clear();
}
if let Some(mut w) = fd_guard.take() {
@@ -400,7 +415,10 @@ impl SKFileWriter {
fd_guard = fd_arc.lock().unwrap(); // acquire fd lock under pool lock
// pool drops here → pool lock released, fd lock still held
}
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
fd_guard
.as_mut()
.expect("fd open after ensure_open")
.write_all(&self.pending)?;
// fd_guard drops → entry fd lock released
self.pending.clear();
Ok(())
@@ -424,6 +442,8 @@ mod tests {
use obikseq::superkmer::SuperKmer;
use tempfile::{NamedTempFile, TempDir};
const TEST_K: usize = 4;
fn make_sk(seed: usize) -> SuperKmer {
let bases: Vec<u8> = (0..8).map(|j| b"ACGT"[(seed + j) % 4]).collect();
SuperKmer::from_ascii(&bases)
@@ -443,7 +463,7 @@ mod tests {
let dir = TempDir::new().unwrap();
let p = pool(3);
for i in 0..10 {
create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap();
create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap();
}
assert_eq!(p.lock().unwrap().open_count(), 0);
}
@@ -455,14 +475,18 @@ mod tests {
let sk = make_sk(0);
let mut tokens: Vec<SKFileWriter> = (0..6)
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap())
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap())
.collect();
for t in tokens.iter_mut() {
open_token(t, &sk);
}
assert!(p.lock().unwrap().open_count() <= 3, "open={}", p.lock().unwrap().open_count());
assert!(
p.lock().unwrap().open_count() <= 3,
"open={}",
p.lock().unwrap().open_count()
);
}
#[test]
@@ -471,8 +495,8 @@ mod tests {
let p = pool(1);
let sk = make_sk(0);
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
open_token(&mut t0, &sk); // t0 fd open, pool full
open_token(&mut t1, &sk); // evicts t0, t1 fd open
@@ -487,8 +511,8 @@ mod tests {
let p = pool(1);
let sk = make_sk(0);
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
t0.set_flush_threshold(1);
t0.write(&sk).unwrap(); // t0 fd open, pool full
@@ -504,7 +528,7 @@ mod tests {
p.lock().unwrap().close_all().unwrap();
for name in &["a.zst", "b.zst"] {
let mut r = SKFileReader::open(dir.path().join(name)).unwrap();
let mut r = SKFileReader::open(dir.path().join(name), TEST_K).unwrap();
let got = r.read_batch(10).unwrap();
assert_eq!(got.len(), 1, "{name}: expected 1 record");
}
@@ -516,9 +540,9 @@ mod tests {
let p = pool(2);
let sk = make_sk(0);
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
let mut t2 = create_token(&p, dir.path().join("c.zst")).unwrap();
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
let mut t2 = create_token(&p, dir.path().join("c.zst"), TEST_K).unwrap();
open_token(&mut t0, &sk); // t0 open
open_token(&mut t1, &sk); // t1 open, t0 LRU
@@ -538,10 +562,14 @@ mod tests {
fn close_all_produces_readable_files() {
let dir = TempDir::new().unwrap();
let p = pool(8);
let paths: Vec<_> = (0..4).map(|i| dir.path().join(format!("{i}.zst"))).collect();
let paths: Vec<_> = (0..4)
.map(|i| dir.path().join(format!("{i}.zst")))
.collect();
let mut tokens: Vec<SKFileWriter> =
paths.iter().map(|path| create_token(&p, path.clone()).unwrap()).collect();
let mut tokens: Vec<SKFileWriter> = paths
.iter()
.map(|path| create_token(&p, path.clone(), TEST_K).unwrap())
.collect();
for (i, t) in tokens.iter_mut().enumerate() {
t.write(&make_sk(i)).unwrap();
@@ -553,7 +581,7 @@ mod tests {
p.lock().unwrap().close_all().unwrap();
for path in &paths {
let mut r = SKFileReader::open(path).unwrap();
let mut r = SKFileReader::open(path, TEST_K).unwrap();
let got = r.read_batch(10).unwrap();
assert_eq!(got.len(), 1);
}
@@ -566,11 +594,11 @@ mod tests {
let sks: Vec<_> = (0..50).map(make_sk).collect();
let path = dir.path().join("batch.zst");
let mut t = create_token(&p, path.clone()).unwrap();
let mut t = create_token(&p, path.clone(), TEST_K).unwrap();
t.write_batch(&sks).unwrap();
t.close().unwrap();
let mut r = SKFileReader::open(&path).unwrap();
let mut r = SKFileReader::open(&path, TEST_K).unwrap();
let got = r.read_batch(100).unwrap();
assert_eq!(got.len(), 50);
for (a, b) in sks.iter().zip(got.iter()) {
@@ -590,11 +618,11 @@ mod tests {
let tmp = NamedTempFile::new().unwrap();
let sks: Vec<_> = (0..100).map(make_sk).collect();
{
let mut w = SKFileWriter::create(tmp.path()).unwrap();
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
w.write_batch(&sks).unwrap();
w.close().unwrap();
}
let mut r = SKFileReader::open(tmp.path()).unwrap();
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
let got = r.read_batch(200).unwrap();
assert_eq!(got.len(), 100);
for (a, b) in sks.iter().zip(got.iter()) {
@@ -605,7 +633,7 @@ mod tests {
#[test]
fn standalone_close_prevents_write() {
let tmp = NamedTempFile::new().unwrap();
let mut w = SKFileWriter::create(tmp.path()).unwrap();
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
w.close().unwrap();
assert!(!w.is_open());
assert!(w.write(&make_sk(0)).is_err());
@@ -614,7 +642,7 @@ mod tests {
#[test]
fn standalone_is_physically_open() {
let tmp = NamedTempFile::new().unwrap();
let mut w = SKFileWriter::create(tmp.path()).unwrap();
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
assert!(!w.is_physically_open()); // fd deferred until first drain
w.set_flush_threshold(1);
w.write(&make_sk(0)).unwrap(); // triggers drain → fd opened
+12 -7
View File
@@ -15,6 +15,7 @@ use std::path::{Path, PathBuf};
/// that it can fast-forward on next open.
pub struct SKFileReader {
path: PathBuf,
k: usize,
reader: Option<Box<dyn std::io::Read + Send>>,
/// Reusable scratch buffer for the `seq` bytes of each record.
seq_buf: Vec<u8>,
@@ -24,11 +25,13 @@ pub struct SKFileReader {
impl SKFileReader {
/// Open a file for reading. Format is auto-detected from magic bytes.
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
/// `k` is the kmer size of the partition; required to decode the on-disk n_kmers field.
pub fn open<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
let path = path.as_ref().to_owned();
let (reader, _fmt) = niffler::send::get_reader(Box::new(BufReader::new(File::open(&path)?)))?;
Ok(Self {
path,
k,
reader: Some(reader),
seq_buf: Vec::with_capacity(64),
consumed: 0,
@@ -43,7 +46,7 @@ impl SKFileReader {
"read from physically closed SKFileReader",
)
})?;
let result = read_superkmer(r, &mut self.seq_buf)?;
let result = read_superkmer(r, &mut self.seq_buf, self.k)?;
if result.is_some() {
self.consumed += 1;
}
@@ -100,7 +103,7 @@ impl SKFileReader {
let target = self.consumed;
self.consumed = 0;
for _ in 0..target {
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf)? {
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf, self.k)? {
Some(_) => self.consumed += 1,
None => break,
}
@@ -147,6 +150,8 @@ mod tests {
use crate::pool::SKFileWriter;
use tempfile::NamedTempFile;
const TEST_K: usize = 4; // test sequences are 8 bases; k=4 gives n_kmers=5
fn make_sks(n: usize) -> Vec<SuperKmer> {
(0..n)
.map(|i| {
@@ -162,11 +167,11 @@ mod tests {
let sks = make_sks(50);
{
let mut w = SKFileWriter::create(tmp.path()).unwrap();
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
w.write_batch(&sks).unwrap();
}
let mut r = SKFileReader::open(tmp.path()).unwrap();
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
let got: Vec<_> = r.iter().collect();
assert_eq!(got.len(), 50);
for (a, b) in sks.iter().zip(got.iter()) {
@@ -180,11 +185,11 @@ mod tests {
let sks = make_sks(20);
{
let mut w = SKFileWriter::create(tmp.path()).unwrap();
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
w.write_batch(&sks).unwrap();
}
let mut r = SKFileReader::open(tmp.path()).unwrap();
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
// Read 10, then simulate pool eviction + re-access
let first = r.read_batch(10).unwrap();
r.close();