Add consuming k-mer iterators to PackedSeq and SuperKmer
Introduces the consuming methods `into_kmers()` and `into_canonical_kmers()` on `PackedSeq`, and `into_canonical_kmers()` on `SuperKmer`, enabling zero-allocation sliding-window k-mer extraction via bitwise operations. These complement the existing borrow-based iterators by allowing direct ownership transfer. Also includes minor documentation updates, whitespace fixes, and new unit tests that verify canonical k-mer iteration counts and output sequences.
This commit is contained in:
@@ -232,6 +232,18 @@ impl PackedSeq {
|
|||||||
self.iter_kmers().map(|km| km.canonical())
|
self.iter_kmers().map(|km| km.canonical())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Consuming iterator over all k-mers. Moves `self` into the iterator; zero allocation.
|
||||||
|
#[inline]
|
||||||
|
pub fn into_kmers(self) -> OwnedPackedSeqKmerIter {
|
||||||
|
OwnedPackedSeqKmerIter::new(self)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consuming iterator over all canonical k-mers. Moves `self` into the iterator; zero allocation.
|
||||||
|
#[inline]
|
||||||
|
pub fn into_canonical_kmers(self) -> impl Iterator<Item = CanonicalKmer> {
|
||||||
|
self.into_kmers().map(|km| km.canonical())
|
||||||
|
}
|
||||||
|
|
||||||
/// Extract nucleotides `[start, end)` as a new [`PackedSeq`]. Allocates.
|
/// Extract nucleotides `[start, end)` as a new [`PackedSeq`]. Allocates.
|
||||||
pub fn sub(&self, start: usize, end: usize) -> Self {
|
pub fn sub(&self, start: usize, end: usize) -> Self {
|
||||||
debug_assert!(end > start && end <= self.seql());
|
debug_assert!(end > start && end <= self.seql());
|
||||||
@@ -317,6 +329,51 @@ impl Iterator for PackedSeqKmerIter<'_> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── OwnedPackedSeqKmerIter ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Sliding-window kmer iterator that owns its [`PackedSeq`]. Zero allocation.
|
||||||
|
pub struct OwnedPackedSeqKmerIter {
|
||||||
|
seq: PackedSeq,
|
||||||
|
mask: u64,
|
||||||
|
lshift: usize,
|
||||||
|
current: u64,
|
||||||
|
pos: usize,
|
||||||
|
max_pos: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl OwnedPackedSeqKmerIter {
|
||||||
|
fn new(seq: PackedSeq) -> Self {
|
||||||
|
let seql = seq.seql();
|
||||||
|
let klen = k();
|
||||||
|
let lshift = 64 - klen * 2;
|
||||||
|
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||||
|
let current = if seql >= klen {
|
||||||
|
seq.extract::<KLen>(0).map(|km| km.raw()).unwrap_or(0)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
Self { seq, mask, lshift, current, pos: klen, max_pos: seql }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for OwnedPackedSeqKmerIter {
|
||||||
|
type Item = Kmer;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Kmer> {
|
||||||
|
if self.pos > self.max_pos {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let result = Kmer::from_raw(self.current);
|
||||||
|
if self.pos < self.max_pos {
|
||||||
|
let inner_shift = 6 - 2 * (self.pos & 3);
|
||||||
|
let nuc = ((self.seq.seq[self.pos / 4] >> inner_shift) & 3) as u64;
|
||||||
|
self.current = ((self.current << 2) & self.mask) | (nuc << self.lshift);
|
||||||
|
}
|
||||||
|
self.pos += 1;
|
||||||
|
Some(result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── varint (LEB128) ───────────────────────────────────────────────────────────
|
// ── varint (LEB128) ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub(crate) fn write_varint<W: Write>(w: &mut W, mut val: u64) -> io::Result<()> {
|
pub(crate) fn write_varint<W: Write>(w: &mut W, mut val: u64) -> io::Result<()> {
|
||||||
|
|||||||
@@ -207,6 +207,12 @@ impl SuperKmer {
|
|||||||
pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||||
self.inner.iter_canonical_kmers()
|
self.inner.iter_canonical_kmers()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Consuming iterator over all canonical k-mers. Moves `self`; zero allocation.
|
||||||
|
#[inline]
|
||||||
|
pub fn into_canonical_kmers(self) -> impl Iterator<Item = CanonicalKmer> {
|
||||||
|
self.inner.into_canonical_kmers()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ pub mod unitig_index;
|
|||||||
|
|
||||||
pub use error::{SKError, SKResult};
|
pub use error::{SKError, SKResult};
|
||||||
pub use meta::SKFileMeta;
|
pub use meta::SKFileMeta;
|
||||||
pub use pool::{create_token, create_token_with, SKFilePool, SharedPool, SKFileWriter};
|
pub use pool::{SKFilePool, SKFileWriter, SharedPool, create_token, create_token_with};
|
||||||
pub use reader::{SKFileIter, SKFileReader};
|
pub use reader::{SKFileIter, SKFileReader};
|
||||||
pub use unitig_index::{UnitigFileReader, UnitigFileWriter};
|
pub use unitig_index::{UnitigFileReader, UnitigFileWriter};
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,15 @@
|
|||||||
use crate::error::{SKError, SKResult};
|
use crate::error::{SKError, SKResult};
|
||||||
|
use obikseq::kmer::CanonicalKmer;
|
||||||
use obikseq::superkmer::SuperKmer;
|
use obikseq::superkmer::SuperKmer;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader, Read};
|
use std::io::{self, Read};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
/// Binary reader for SuperKmers, with transparent decompression via niffler.
|
/// Binary reader for SuperKmers, with transparent decompression via niffler.
|
||||||
///
|
///
|
||||||
/// Access is strictly sequential. Call [`iter`](SKFileReader::iter) to get an
|
/// Access is strictly sequential. Call [`iter`](SKFileReader::iter) to get an
|
||||||
/// [`Iterator`] over the SuperKmers.
|
/// [`Iterator`] over the SuperKmers, or [`iter_canonical_kmers`](SKFileReader::iter_canonical_kmers)
|
||||||
|
/// to iterate directly over the canonical kmers they contain.
|
||||||
pub struct SKFileReader {
|
pub struct SKFileReader {
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
reader: Box<dyn std::io::Read + Send>,
|
reader: Box<dyn std::io::Read + Send>,
|
||||||
@@ -18,7 +20,7 @@ impl SKFileReader {
|
|||||||
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
||||||
let path = path.as_ref().to_owned();
|
let path = path.as_ref().to_owned();
|
||||||
let (reader, _fmt) =
|
let (reader, _fmt) =
|
||||||
niffler::send::get_reader(Box::new(BufReader::new(File::open(&path)?)))?;
|
niffler::send::get_reader(Box::new(File::open(&path)?))?;
|
||||||
Ok(Self { path, reader })
|
Ok(Self { path, reader })
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -44,15 +46,26 @@ impl SKFileReader {
|
|||||||
&self.path
|
&self.path
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return an iterator over this reader.
|
/// Return an iterator over the SuperKmers in this file.
|
||||||
pub fn iter(&mut self) -> SKFileIter<'_> {
|
pub fn iter(&mut self) -> SKFileIter<'_> {
|
||||||
SKFileIter { reader: self, error: None }
|
SKFileIter {
|
||||||
|
reader: self,
|
||||||
|
error: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Iterator ─────────────────────────────────────────────────────────────────
|
/// Return an iterator over all canonical kmers across all SuperKmers in this file.
|
||||||
|
///
|
||||||
|
/// Kmers are yielded in file order: all kmers of the first SuperKmer, then
|
||||||
|
/// all kmers of the second, and so on. No deduplication is performed.
|
||||||
|
pub fn iter_canonical_kmers(&mut self) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||||
|
self.iter().flat_map(|sk| sk.into_canonical_kmers())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Iterator adapter for [`SKFileReader`].
|
// ── SKFileIter ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Iterator over [`SuperKmer`]s in a file.
|
||||||
///
|
///
|
||||||
/// Errors during iteration are stored and accessible via [`take_error`](SKFileIter::take_error).
|
/// Errors during iteration are stored and accessible via [`take_error`](SKFileIter::take_error).
|
||||||
pub struct SKFileIter<'a> {
|
pub struct SKFileIter<'a> {
|
||||||
@@ -61,6 +74,14 @@ pub struct SKFileIter<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SKFileIter<'a> {
|
impl<'a> SKFileIter<'a> {
|
||||||
|
|
||||||
|
/// Read the next SuperKmer, or `None` at EOF.
|
||||||
|
pub fn read(&mut self) -> SKResult<Option<SuperKmer>> {
|
||||||
|
match self.reader.read()? {
|
||||||
|
Some(sk) => Ok(Some(sk)),
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
/// Returns the first I/O error encountered during iteration, if any.
|
/// Returns the first I/O error encountered during iteration, if any.
|
||||||
pub fn take_error(&mut self) -> Option<SKError> {
|
pub fn take_error(&mut self) -> Option<SKError> {
|
||||||
self.error.take()
|
self.error.take()
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
use crate::pool::SKFileWriter;
|
use crate::pool::SKFileWriter;
|
||||||
|
use obikseq::kmer::CanonicalKmer;
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
use tempfile::NamedTempFile;
|
use tempfile::NamedTempFile;
|
||||||
|
|
||||||
@@ -42,6 +43,48 @@ fn iter_all() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The number of canonical k-mers read back from the file equals the number
/// of SuperKmers written times the k-mers each one contains.
#[test]
fn iter_canonical_kmers_count() {
    setup();
    let tmp = NamedTempFile::new().unwrap();
    let sks = make_sks(10); // each superkmer: seql=8, k=4 → 5 kmers

    // Scope the writer so it is dropped before the file is reopened for reading.
    {
        let mut w = SKFileWriter::create(tmp.path()).unwrap();
        w.write_batch(&sks).unwrap();
    }

    let mut r = SKFileReader::open(tmp.path()).unwrap();
    let count = r.iter_canonical_kmers().count();
    assert_eq!(count, 10 * 5);
}
|
||||||
|
|
||||||
|
/// Reading canonical k-mers straight from the file yields the same sequence
/// as iterating each in-memory SuperKmer in order.
#[test]
fn iter_canonical_kmers_matches_per_sk() {
    setup();
    let file = NamedTempFile::new().unwrap();
    let superkmers = make_sks(4);

    // Scope the writer so it is dropped before the file is reopened for reading.
    {
        let mut writer = SKFileWriter::create(file.path()).unwrap();
        writer.write_batch(&superkmers).unwrap();
    }

    // Reference: gather the k-mers one SuperKmer at a time.
    let mut expected: Vec<CanonicalKmer> = Vec::new();
    for sk in superkmers.iter() {
        expected.extend(sk.iter_canonical_kmers());
    }

    let mut reader = SKFileReader::open(file.path()).unwrap();
    let got: Vec<CanonicalKmer> = reader.iter_canonical_kmers().collect();

    assert_eq!(expected, got);
}
|
||||||
|
|
||||||
// ── serialisation round-trips (formerly codec.rs tests) ──────────────────────
|
// ── serialisation round-trips (formerly codec.rs tests) ──────────────────────
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user