refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperkMer routing with a new RoutableSuperKimer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
+33
-15
@@ -2,17 +2,25 @@ use obikseq::superkmer::SuperKmer;
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// Serialise one SuperKmer into `w` (uncompressed; caller must wrap with a compressor).
|
||||
///
|
||||
/// Bits [7:0] of the header store `n_kmers = seql - k + 1` (kmer units, 1–255),
|
||||
/// not the raw nucleotide length. This removes the 0=256 wrapping convention.
|
||||
#[inline]
|
||||
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer) -> io::Result<()> {
|
||||
w.write_all(&sk.header_bits().to_le_bytes())?;
|
||||
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer, k: usize) -> io::Result<()> {
|
||||
let n_kmers = sk.len() - k + 1;
|
||||
let new_bits = (sk.header_bits() & !0xFF) | (n_kmers as u32);
|
||||
w.write_all(&new_bits.to_le_bytes())?;
|
||||
w.write_all(sk.seq_bytes())
|
||||
}
|
||||
|
||||
/// Deserialise one SuperKmer from `r`. Returns `None` on clean EOF.
|
||||
/// `seq_buf` is a reusable scratch buffer to avoid per-record allocation.
|
||||
/// Bits [7:0] of the on-disk header contain `n_kmers`; nucleotide length is
|
||||
/// reconstructed as `n_kmers + k - 1`.
|
||||
pub(crate) fn read_superkmer<R: Read>(
|
||||
r: &mut R,
|
||||
seq_buf: &mut Vec<u8>,
|
||||
k: usize,
|
||||
) -> io::Result<Option<SuperKmer>> {
|
||||
let mut hdr = [0u8; 4];
|
||||
match r.read_exact(&mut hdr) {
|
||||
@@ -21,12 +29,18 @@ pub(crate) fn read_superkmer<R: Read>(
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
let bits = u32::from_le_bytes(hdr);
|
||||
let seql_byte = (bits & 0xFF) as u8;
|
||||
let nt_len: usize = if seql_byte == 0 { 256 } else { seql_byte as usize };
|
||||
let n_kmers = (bits & 0xFF) as usize;
|
||||
let nt_len = n_kmers + k - 1;
|
||||
let byte_len = (nt_len + 3) / 4;
|
||||
seq_buf.resize(byte_len, 0);
|
||||
r.read_exact(seq_buf)?;
|
||||
Ok(Some(SuperKmer::from_header_bits(bits, seq_buf.as_slice().into())))
|
||||
// Reconstruct the in-memory seql byte (0 encodes 256, 1-255 direct).
|
||||
let seql_byte = if nt_len == 256 { 0u8 } else { nt_len as u8 };
|
||||
let mem_bits = (bits & !0xFF) | (seql_byte as u32);
|
||||
Ok(Some(SuperKmer::from_header_bits(
|
||||
mem_bits,
|
||||
seq_buf.as_slice().into(),
|
||||
)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -40,28 +54,31 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn roundtrip_single() {
|
||||
let k = 4;
|
||||
let sk = make_sk(b"ACGTACGT");
|
||||
let mut buf = Vec::new();
|
||||
write_superkmer(&mut buf, &sk).unwrap();
|
||||
write_superkmer(&mut buf, &sk, k).unwrap();
|
||||
|
||||
let mut cur = Cursor::new(&buf);
|
||||
let mut seq_buf = Vec::new();
|
||||
let got = read_superkmer(&mut cur, &mut seq_buf).unwrap().unwrap();
|
||||
let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
|
||||
assert_eq!(sk.to_ascii(), got.to_ascii());
|
||||
assert_eq!(sk.seql(), got.seql());
|
||||
assert_eq!(sk.len(), got.len());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn roundtrip_all_lengths() {
|
||||
let bases: Vec<u8> = (0..256).map(|i| b"ACGT"[i % 4]).collect();
|
||||
for len in (1..=9).chain([255, 256]) {
|
||||
// k=11 is the project minimum; test seql from k to 256.
|
||||
let k = 11;
|
||||
for len in (k..=k + 8).chain([255, 256]) {
|
||||
let sk = make_sk(&bases[..len]);
|
||||
let mut buf = Vec::new();
|
||||
write_superkmer(&mut buf, &sk).unwrap();
|
||||
write_superkmer(&mut buf, &sk, k).unwrap();
|
||||
|
||||
let mut cur = Cursor::new(&buf);
|
||||
let mut seq_buf = Vec::new();
|
||||
let got = read_superkmer(&mut cur, &mut seq_buf).unwrap().unwrap();
|
||||
let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
|
||||
assert_eq!(sk.to_ascii(), got.to_ascii(), "len={len}");
|
||||
}
|
||||
}
|
||||
@@ -71,24 +88,25 @@ mod tests {
|
||||
let buf: Vec<u8> = vec![];
|
||||
let mut cur = Cursor::new(&buf);
|
||||
let mut seq_buf = Vec::new();
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf).unwrap().is_none());
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf, 4).unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiple_records() {
|
||||
let k = 4;
|
||||
let seqs: &[&[u8]] = &[b"AAAA", b"CCCC", b"GGGG", b"TTTT"];
|
||||
let mut buf = Vec::new();
|
||||
for s in seqs {
|
||||
write_superkmer(&mut buf, &make_sk(s)).unwrap();
|
||||
write_superkmer(&mut buf, &make_sk(s), k).unwrap();
|
||||
}
|
||||
|
||||
let mut cur = Cursor::new(&buf);
|
||||
let mut seq_buf = Vec::new();
|
||||
for s in seqs {
|
||||
let got = read_superkmer(&mut cur, &mut seq_buf).unwrap().unwrap();
|
||||
let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
|
||||
let expected = make_sk(s);
|
||||
assert_eq!(expected.to_ascii(), got.to_ascii());
|
||||
}
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf).unwrap().is_none());
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf, k).unwrap().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
+63
-35
@@ -3,8 +3,8 @@ use crate::error::SKResult;
|
||||
use crate::limits::max_concurrent_files;
|
||||
use crate::meta::SKFileMeta;
|
||||
use lru::LruCache;
|
||||
use niffler::send::compression::Format;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{BufWriter, Write};
|
||||
@@ -79,7 +79,11 @@ impl SKFilePool {
|
||||
/// Create a pool allowing at most `max_open` simultaneously open fds.
|
||||
pub fn new(max_open: usize) -> Self {
|
||||
let cap = NonZeroUsize::new(max_open.max(1)).unwrap();
|
||||
Self { max_open, entries: Vec::new(), open: LruCache::new(cap) }
|
||||
Self {
|
||||
max_open,
|
||||
entries: Vec::new(),
|
||||
open: LruCache::new(cap),
|
||||
}
|
||||
}
|
||||
|
||||
/// Derive pool size from the OS fd limit (75 %, clamped to `[16, MAX_POOL_SIZE]`).
|
||||
@@ -218,6 +222,7 @@ pub struct SKFileWriter {
|
||||
id: usize,
|
||||
pool: Arc<Mutex<SKFilePool>>,
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
pending: Vec<u8>,
|
||||
flush_threshold: usize,
|
||||
logically_closed: bool,
|
||||
@@ -225,14 +230,15 @@ pub struct SKFileWriter {
|
||||
}
|
||||
|
||||
/// Create a `SKFileWriter` for a new file (Zstd, level 3).
|
||||
pub fn create_token(pool: &SharedPool, path: PathBuf) -> SKResult<SKFileWriter> {
|
||||
create_token_with(pool, path, Format::Zstd, Level::Three)
|
||||
pub fn create_token(pool: &SharedPool, path: PathBuf, k: usize) -> SKResult<SKFileWriter> {
|
||||
create_token_with(pool, path, k, Format::Zstd, Level::Three)
|
||||
}
|
||||
|
||||
/// Create a `SKFileWriter` for a new file with explicit format and level.
|
||||
pub fn create_token_with(
|
||||
pool: &SharedPool,
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
) -> SKResult<SKFileWriter> {
|
||||
@@ -241,6 +247,7 @@ pub fn create_token_with(
|
||||
id,
|
||||
pool: Arc::clone(pool),
|
||||
path,
|
||||
k,
|
||||
pending: Vec::with_capacity(DEFAULT_FLUSH_THRESHOLD + 128),
|
||||
flush_threshold: DEFAULT_FLUSH_THRESHOLD,
|
||||
logically_closed: false,
|
||||
@@ -251,13 +258,18 @@ pub fn create_token_with(
|
||||
impl SKFileWriter {
|
||||
/// Create a standalone file writer (Zstd, level 3).
|
||||
/// The pool is created internally and is not accessible to the caller.
|
||||
pub fn create<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
||||
Self::create_with(path, Format::Zstd, Level::Three)
|
||||
pub fn create<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
|
||||
Self::create_with(path, k, Format::Zstd, Level::Three)
|
||||
}
|
||||
|
||||
/// Create a standalone file writer with explicit format and level.
|
||||
pub fn create_with<P: AsRef<Path>>(path: P, format: Format, level: Level) -> SKResult<Self> {
|
||||
create_token_with(global_pool(), path.as_ref().to_owned(), format, level)
|
||||
pub fn create_with<P: AsRef<Path>>(
|
||||
path: P,
|
||||
k: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
) -> SKResult<Self> {
|
||||
create_token_with(global_pool(), path.as_ref().to_owned(), k, format, level)
|
||||
}
|
||||
|
||||
/// `true` if the underlying fd is currently open in the pool.
|
||||
@@ -268,10 +280,10 @@ impl SKFileWriter {
|
||||
/// Accumulate one SuperKmer. Drains to fd when `pending ≥ flush_threshold`.
|
||||
pub fn write(&mut self, sk: &SuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
write_superkmer(&mut self.pending, sk)?;
|
||||
write_superkmer(&mut self.pending, sk, self.k)?;
|
||||
self.meta.instances += 1;
|
||||
self.meta.count_sum += sk.count() as u64;
|
||||
self.meta.length_sum += sk.seql() as u64;
|
||||
self.meta.length_sum += sk.len() as u64;
|
||||
if self.pending.len() >= self.flush_threshold {
|
||||
self.drain()?;
|
||||
}
|
||||
@@ -282,10 +294,10 @@ impl SKFileWriter {
|
||||
pub fn write_batch(&mut self, sks: &[SuperKmer]) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for sk in sks {
|
||||
write_superkmer(&mut self.pending, sk)?;
|
||||
write_superkmer(&mut self.pending, sk, self.k)?;
|
||||
self.meta.instances += 1;
|
||||
self.meta.count_sum += sk.count() as u64;
|
||||
self.meta.length_sum += sk.seql() as u64;
|
||||
self.meta.length_sum += sk.len() as u64;
|
||||
if self.pending.len() >= self.flush_threshold {
|
||||
self.drain()?;
|
||||
}
|
||||
@@ -339,7 +351,10 @@ impl SKFileWriter {
|
||||
}
|
||||
|
||||
if !self.pending.is_empty() {
|
||||
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
|
||||
fd_guard
|
||||
.as_mut()
|
||||
.expect("fd open after ensure_open")
|
||||
.write_all(&self.pending)?;
|
||||
self.pending.clear();
|
||||
}
|
||||
if let Some(mut w) = fd_guard.take() {
|
||||
@@ -400,7 +415,10 @@ impl SKFileWriter {
|
||||
fd_guard = fd_arc.lock().unwrap(); // acquire fd lock under pool lock
|
||||
// pool drops here → pool lock released, fd lock still held
|
||||
}
|
||||
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
|
||||
fd_guard
|
||||
.as_mut()
|
||||
.expect("fd open after ensure_open")
|
||||
.write_all(&self.pending)?;
|
||||
// fd_guard drops → entry fd lock released
|
||||
self.pending.clear();
|
||||
Ok(())
|
||||
@@ -424,6 +442,8 @@ mod tests {
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use tempfile::{NamedTempFile, TempDir};
|
||||
|
||||
const TEST_K: usize = 4;
|
||||
|
||||
fn make_sk(seed: usize) -> SuperKmer {
|
||||
let bases: Vec<u8> = (0..8).map(|j| b"ACGT"[(seed + j) % 4]).collect();
|
||||
SuperKmer::from_ascii(&bases)
|
||||
@@ -443,7 +463,7 @@ mod tests {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let p = pool(3);
|
||||
for i in 0..10 {
|
||||
create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap();
|
||||
create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap();
|
||||
}
|
||||
assert_eq!(p.lock().unwrap().open_count(), 0);
|
||||
}
|
||||
@@ -455,14 +475,18 @@ mod tests {
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut tokens: Vec<SKFileWriter> = (0..6)
|
||||
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap())
|
||||
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap())
|
||||
.collect();
|
||||
|
||||
for t in tokens.iter_mut() {
|
||||
open_token(t, &sk);
|
||||
}
|
||||
|
||||
assert!(p.lock().unwrap().open_count() <= 3, "open={}", p.lock().unwrap().open_count());
|
||||
assert!(
|
||||
p.lock().unwrap().open_count() <= 3,
|
||||
"open={}",
|
||||
p.lock().unwrap().open_count()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -471,8 +495,8 @@ mod tests {
|
||||
let p = pool(1);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
|
||||
open_token(&mut t0, &sk); // t0 fd open, pool full
|
||||
open_token(&mut t1, &sk); // evicts t0, t1 fd open
|
||||
@@ -487,8 +511,8 @@ mod tests {
|
||||
let p = pool(1);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
|
||||
t0.set_flush_threshold(1);
|
||||
t0.write(&sk).unwrap(); // t0 fd open, pool full
|
||||
@@ -504,7 +528,7 @@ mod tests {
|
||||
p.lock().unwrap().close_all().unwrap();
|
||||
|
||||
for name in &["a.zst", "b.zst"] {
|
||||
let mut r = SKFileReader::open(dir.path().join(name)).unwrap();
|
||||
let mut r = SKFileReader::open(dir.path().join(name), TEST_K).unwrap();
|
||||
let got = r.read_batch(10).unwrap();
|
||||
assert_eq!(got.len(), 1, "{name}: expected 1 record");
|
||||
}
|
||||
@@ -516,9 +540,9 @@ mod tests {
|
||||
let p = pool(2);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t2 = create_token(&p, dir.path().join("c.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
let mut t2 = create_token(&p, dir.path().join("c.zst"), TEST_K).unwrap();
|
||||
|
||||
open_token(&mut t0, &sk); // t0 open
|
||||
open_token(&mut t1, &sk); // t1 open, t0 LRU
|
||||
@@ -538,10 +562,14 @@ mod tests {
|
||||
fn close_all_produces_readable_files() {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let p = pool(8);
|
||||
let paths: Vec<_> = (0..4).map(|i| dir.path().join(format!("{i}.zst"))).collect();
|
||||
let paths: Vec<_> = (0..4)
|
||||
.map(|i| dir.path().join(format!("{i}.zst")))
|
||||
.collect();
|
||||
|
||||
let mut tokens: Vec<SKFileWriter> =
|
||||
paths.iter().map(|path| create_token(&p, path.clone()).unwrap()).collect();
|
||||
let mut tokens: Vec<SKFileWriter> = paths
|
||||
.iter()
|
||||
.map(|path| create_token(&p, path.clone(), TEST_K).unwrap())
|
||||
.collect();
|
||||
|
||||
for (i, t) in tokens.iter_mut().enumerate() {
|
||||
t.write(&make_sk(i)).unwrap();
|
||||
@@ -553,7 +581,7 @@ mod tests {
|
||||
p.lock().unwrap().close_all().unwrap();
|
||||
|
||||
for path in &paths {
|
||||
let mut r = SKFileReader::open(path).unwrap();
|
||||
let mut r = SKFileReader::open(path, TEST_K).unwrap();
|
||||
let got = r.read_batch(10).unwrap();
|
||||
assert_eq!(got.len(), 1);
|
||||
}
|
||||
@@ -566,11 +594,11 @@ mod tests {
|
||||
let sks: Vec<_> = (0..50).map(make_sk).collect();
|
||||
let path = dir.path().join("batch.zst");
|
||||
|
||||
let mut t = create_token(&p, path.clone()).unwrap();
|
||||
let mut t = create_token(&p, path.clone(), TEST_K).unwrap();
|
||||
t.write_batch(&sks).unwrap();
|
||||
t.close().unwrap();
|
||||
|
||||
let mut r = SKFileReader::open(&path).unwrap();
|
||||
let mut r = SKFileReader::open(&path, TEST_K).unwrap();
|
||||
let got = r.read_batch(100).unwrap();
|
||||
assert_eq!(got.len(), 50);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -590,11 +618,11 @@ mod tests {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let sks: Vec<_> = (0..100).map(make_sk).collect();
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
w.close().unwrap();
|
||||
}
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
let got = r.read_batch(200).unwrap();
|
||||
assert_eq!(got.len(), 100);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -605,7 +633,7 @@ mod tests {
|
||||
#[test]
|
||||
fn standalone_close_prevents_write() {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.close().unwrap();
|
||||
assert!(!w.is_open());
|
||||
assert!(w.write(&make_sk(0)).is_err());
|
||||
@@ -614,7 +642,7 @@ mod tests {
|
||||
#[test]
|
||||
fn standalone_is_physically_open() {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
assert!(!w.is_physically_open()); // fd deferred until first drain
|
||||
w.set_flush_threshold(1);
|
||||
w.write(&make_sk(0)).unwrap(); // triggers drain → fd opened
|
||||
|
||||
@@ -15,6 +15,7 @@ use std::path::{Path, PathBuf};
|
||||
/// that it can fast-forward on next open.
|
||||
pub struct SKFileReader {
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
reader: Option<Box<dyn std::io::Read + Send>>,
|
||||
/// Reusable scratch buffer for the `seq` bytes of each record.
|
||||
seq_buf: Vec<u8>,
|
||||
@@ -24,11 +25,13 @@ pub struct SKFileReader {
|
||||
|
||||
impl SKFileReader {
|
||||
/// Open a file for reading. Format is auto-detected from magic bytes.
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
||||
/// `k` is the kmer size of the partition; required to decode the on-disk n_kmers field.
|
||||
pub fn open<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
|
||||
let path = path.as_ref().to_owned();
|
||||
let (reader, _fmt) = niffler::send::get_reader(Box::new(BufReader::new(File::open(&path)?)))?;
|
||||
Ok(Self {
|
||||
path,
|
||||
k,
|
||||
reader: Some(reader),
|
||||
seq_buf: Vec::with_capacity(64),
|
||||
consumed: 0,
|
||||
@@ -43,7 +46,7 @@ impl SKFileReader {
|
||||
"read from physically closed SKFileReader",
|
||||
)
|
||||
})?;
|
||||
let result = read_superkmer(r, &mut self.seq_buf)?;
|
||||
let result = read_superkmer(r, &mut self.seq_buf, self.k)?;
|
||||
if result.is_some() {
|
||||
self.consumed += 1;
|
||||
}
|
||||
@@ -100,7 +103,7 @@ impl SKFileReader {
|
||||
let target = self.consumed;
|
||||
self.consumed = 0;
|
||||
for _ in 0..target {
|
||||
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf)? {
|
||||
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf, self.k)? {
|
||||
Some(_) => self.consumed += 1,
|
||||
None => break,
|
||||
}
|
||||
@@ -147,6 +150,8 @@ mod tests {
|
||||
use crate::pool::SKFileWriter;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
const TEST_K: usize = 4; // test sequences are 8 bases; k=4 gives n_kmers=5
|
||||
|
||||
fn make_sks(n: usize) -> Vec<SuperKmer> {
|
||||
(0..n)
|
||||
.map(|i| {
|
||||
@@ -162,11 +167,11 @@ mod tests {
|
||||
let sks = make_sks(50);
|
||||
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
}
|
||||
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
let got: Vec<_> = r.iter().collect();
|
||||
assert_eq!(got.len(), 50);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -180,11 +185,11 @@ mod tests {
|
||||
let sks = make_sks(20);
|
||||
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
}
|
||||
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
// Read 10, then simulate pool eviction + re-access
|
||||
let first = r.read_batch(10).unwrap();
|
||||
r.close();
|
||||
|
||||
Reference in New Issue
Block a user