Add consuming k-mer iterators to PackedSeq and Superkmer

Adds consuming `into_kmers()` and `into_canonical_kmers()` methods to `PackedSeq` and `SuperKmer`, enabling zero-allocation sliding-window k-mer extraction via bitwise operations. These complement the existing borrow-based iterators by letting callers hand over ownership of the sequence directly. Also includes minor documentation updates, whitespace fixes, and new unit tests that verify canonical k-mer iteration counts and output sequences.
This commit is contained in:
Eric Coissac
2026-05-11 10:21:35 +08:00
parent 92cda13ae4
commit 6687911d60
5 changed files with 135 additions and 8 deletions
+1 -1
View File
@@ -6,7 +6,7 @@ pub mod unitig_index;
pub use error::{SKError, SKResult};
pub use meta::SKFileMeta;
pub use pool::{create_token, create_token_with, SKFilePool, SharedPool, SKFileWriter};
pub use pool::{SKFilePool, SKFileWriter, SharedPool, create_token, create_token_with};
pub use reader::{SKFileIter, SKFileReader};
pub use unitig_index::{UnitigFileReader, UnitigFileWriter};
+28 -7
View File
@@ -1,13 +1,15 @@
use crate::error::{SKError, SKResult};
use obikseq::kmer::CanonicalKmer;
use obikseq::superkmer::SuperKmer;
use std::fs::File;
use std::io::{self, BufReader, Read};
use std::io::{self, Read};
use std::path::{Path, PathBuf};
/// Binary reader for SuperKmers, with transparent decompression via niffler.
///
/// Access is strictly sequential. Call [`iter`](SKFileReader::iter) to get an
/// [`Iterator`] over the SuperKmers.
/// [`Iterator`] over the SuperKmers, or [`iter_canonical_kmers`](SKFileReader::iter_canonical_kmers)
/// to iterate directly over the canonical kmers they contain.
pub struct SKFileReader {
path: PathBuf,
reader: Box<dyn std::io::Read + Send>,
@@ -18,7 +20,7 @@ impl SKFileReader {
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
let path = path.as_ref().to_owned();
// The file handle is boxed and handed to niffler; the second element of the
// returned tuple is discarded.
// NOTE(review): the variant without `BufReader` passes the raw `File` to
// niffler. For uncompressed input this may leave reads unbuffered (one
// syscall per small read) — confirm niffler buffers internally before
// dropping the `BufReader`.
let (reader, _fmt) =
niffler::send::get_reader(Box::new(BufReader::new(File::open(&path)?)))?;
niffler::send::get_reader(Box::new(File::open(&path)?))?;
Ok(Self { path, reader })
}
@@ -44,15 +46,26 @@ impl SKFileReader {
&self.path
}
/// Return an iterator over this reader.
/// Return an iterator over the SuperKmers in this file.
pub fn iter(&mut self) -> SKFileIter<'_> {
SKFileIter { reader: self, error: None }
SKFileIter {
reader: self,
// No error has been recorded yet when the iterator is created.
error: None,
}
}
/// Iterate over every canonical kmer stored in this file.
///
/// The SuperKmers are visited in file order and each one is expanded into its
/// canonical kmers before moving on to the next, so the output is the
/// concatenation of the per-SuperKmer kmer sequences. Repeated kmers are
/// yielded as often as they occur; nothing is deduplicated.
///
/// NOTE(review): the intermediate [`SKFileIter`] is moved into the returned
/// adapter, so an error it records during iteration cannot be retrieved with
/// `take_error` from here — confirm callers do not rely on it.
pub fn iter_canonical_kmers(&mut self) -> impl Iterator<Item = CanonicalKmer> + '_ {
    let superkmers = self.iter();
    superkmers.flat_map(|superkmer| superkmer.into_canonical_kmers())
}
}
// ── Iterator ─────────────────────────────────────────────────────────────────
// ── SKFileIter ────────────────────────────────────────────────────────────────
/// Iterator adapter for [`SKFileReader`].
/// Iterator over [`SuperKmer`]s in a file.
///
/// Errors during iteration are stored and accessible via [`take_error`](SKFileIter::take_error).
pub struct SKFileIter<'a> {
@@ -61,6 +74,14 @@ pub struct SKFileIter<'a> {
}
impl<'a> SKFileIter<'a> {
/// Read the next SuperKmer from the underlying reader.
///
/// Returns `Ok(Some(sk))` when a SuperKmer was read and `Ok(None)` at EOF.
///
/// # Errors
///
/// Any error reported by the underlying reader is propagated to the caller.
pub fn read(&mut self) -> SKResult<Option<SuperKmer>> {
    // Forward the reader's result as-is; `?` keeps the conversion into this
    // function's error type.
    Ok(self.reader.read()?)
}
/// Returns the first I/O error encountered during iteration, if any.
pub fn take_error(&mut self) -> Option<SKError> {
self.error.take()
+43
View File
@@ -1,5 +1,6 @@
use super::*;
use crate::pool::SKFileWriter;
use obikseq::kmer::CanonicalKmer;
use std::io::Cursor;
use tempfile::NamedTempFile;
@@ -42,6 +43,48 @@ fn iter_all() {
}
}
/// The file-level canonical kmer iterator must yield every kmer of every
/// SuperKmer written to the file.
#[test]
fn iter_canonical_kmers_count() {
    setup();
    let tmp = NamedTempFile::new().unwrap();
    let sks = make_sks(10); // each superkmer: seql=8, k=4 → 5 kmers
    {
        // Keep the writer in its own scope so it is gone before the file is
        // reopened for reading.
        let mut w = SKFileWriter::create(tmp.path()).unwrap();
        w.write_batch(&sks).unwrap();
    }

    let mut r = SKFileReader::open(tmp.path()).unwrap();
    let count = r.iter_canonical_kmers().count();
    assert_eq!(count, 10 * 5);
}
/// Reading canonical kmers straight from the file must give the same sequence
/// as expanding each in-memory SuperKmer one after the other.
#[test]
fn iter_canonical_kmers_matches_per_sk() {
    setup();
    let scratch = NamedTempFile::new().unwrap();
    let superkmers = make_sks(4);
    {
        let mut writer = SKFileWriter::create(scratch.path()).unwrap();
        writer.write_batch(&superkmers).unwrap();
    }

    // Reference sequence, built from the in-memory SuperKmers one at a time.
    let mut expected: Vec<CanonicalKmer> = Vec::new();
    for superkmer in superkmers.iter() {
        expected.extend(superkmer.iter_canonical_kmers());
    }

    // Sequence read back from the file.
    let mut reader = SKFileReader::open(scratch.path()).unwrap();
    let got: Vec<CanonicalKmer> = reader.iter_canonical_kmers().collect();

    assert_eq!(expected, got);
}
// ── serialisation round-trips (formerly codec.rs tests) ──────────────────────
#[test]