diff --git a/src/obikseq/src/packed_seq.rs b/src/obikseq/src/packed_seq.rs index 82ee5f6..fe428f6 100644 --- a/src/obikseq/src/packed_seq.rs +++ b/src/obikseq/src/packed_seq.rs @@ -232,6 +232,18 @@ impl PackedSeq { self.iter_kmers().map(|km| km.canonical()) } + /// Consuming iterator over all k-mers. Moves `self` into the iterator; zero allocation. + #[inline] + pub fn into_kmers(self) -> OwnedPackedSeqKmerIter { + OwnedPackedSeqKmerIter::new(self) + } + + /// Consuming iterator over all canonical k-mers. Moves `self` into the iterator; zero allocation. + #[inline] + pub fn into_canonical_kmers(self) -> impl Iterator { + self.into_kmers().map(|km| km.canonical()) + } + /// Extract nucleotides `[start, end)` as a new [`PackedSeq`]. Allocates. pub fn sub(&self, start: usize, end: usize) -> Self { debug_assert!(end > start && end <= self.seql()); @@ -317,6 +329,51 @@ impl Iterator for PackedSeqKmerIter<'_> { } } +// ── OwnedPackedSeqKmerIter ──────────────────────────────────────────────────── + +/// Sliding-window kmer iterator that owns its [`PackedSeq`]. Zero allocation. +pub struct OwnedPackedSeqKmerIter { + seq: PackedSeq, + mask: u64, + lshift: usize, + current: u64, + pos: usize, + max_pos: usize, +} + +impl OwnedPackedSeqKmerIter { + fn new(seq: PackedSeq) -> Self { + let seql = seq.seql(); + let klen = k(); + let lshift = 64 - klen * 2; + let mask = ((!0u128) << (lshift + 2)) as u64; + let current = if seql >= klen { + seq.extract::(0).map(|km| km.raw()).unwrap_or(0) + } else { + 0 + }; + Self { seq, mask, lshift, current, pos: klen, max_pos: seql } + } +} + +impl Iterator for OwnedPackedSeqKmerIter { + type Item = Kmer; + + fn next(&mut self) -> Option { + if self.pos > self.max_pos { + return None; + } + let result = Kmer::from_raw(self.current); + if self.pos < self.max_pos { + let inner_shift = 6 - 2 * (self.pos & 3); + let nuc = ((self.seq.seq[self.pos / 4] >> inner_shift) & 3) as u64; + self.current = ((self.current << 2) & self.mask) | (nuc << self.lshift); + } + self.pos += 1; + Some(result) + } +} + // ── varint (LEB128) ─────────────────────────────────────────────────────────── pub(crate) fn write_varint(w: &mut W, mut val: u64) -> io::Result<()> { diff --git a/src/obikseq/src/superkmer.rs b/src/obikseq/src/superkmer.rs index 1e8c9b5..40e01d3 100644 --- a/src/obikseq/src/superkmer.rs +++ b/src/obikseq/src/superkmer.rs @@ -207,6 +207,12 @@ impl SuperKmer { pub fn iter_canonical_kmers(&self) -> impl Iterator + '_ { self.inner.iter_canonical_kmers() } + + /// Consuming iterator over all canonical k-mers. Moves `self`; zero allocation. + #[inline] + pub fn into_canonical_kmers(self) -> impl Iterator { + self.inner.into_canonical_kmers() + } } #[cfg(test)] diff --git a/src/obiskio/src/lib.rs b/src/obiskio/src/lib.rs index 1c01539..8f32ca2 100644 --- a/src/obiskio/src/lib.rs +++ b/src/obiskio/src/lib.rs @@ -6,7 +6,7 @@ pub mod unitig_index; pub use error::{SKError, SKResult}; pub use meta::SKFileMeta; -pub use pool::{create_token, create_token_with, SKFilePool, SharedPool, SKFileWriter}; +pub use pool::{SKFilePool, SKFileWriter, SharedPool, create_token, create_token_with}; pub use reader::{SKFileIter, SKFileReader}; pub use unitig_index::{UnitigFileReader, UnitigFileWriter}; diff --git a/src/obiskio/src/reader.rs b/src/obiskio/src/reader.rs index 3b9ed33..9694f18 100644 --- a/src/obiskio/src/reader.rs +++ b/src/obiskio/src/reader.rs @@ -1,13 +1,15 @@ use crate::error::{SKError, SKResult}; +use obikseq::kmer::CanonicalKmer; use obikseq::superkmer::SuperKmer; use std::fs::File; -use std::io::{self, BufReader, Read}; +use std::io::{self, Read}; use std::path::{Path, PathBuf}; /// Binary reader for SuperKmers, with transparent decompression via niffler. /// /// Access is strictly sequential. Call [`iter`](SKFileReader::iter) to get an -/// [`Iterator`] over the SuperKmers. +/// [`Iterator`] over the SuperKmers, or [`iter_canonical_kmers`](SKFileReader::iter_canonical_kmers) +/// to iterate directly over the canonical kmers they contain. pub struct SKFileReader { path: PathBuf, reader: Box, @@ -18,7 +20,7 @@ impl SKFileReader { pub fn open>(path: P) -> SKResult { let path = path.as_ref().to_owned(); let (reader, _fmt) = - niffler::send::get_reader(Box::new(BufReader::new(File::open(&path)?)))?; + niffler::send::get_reader(Box::new(File::open(&path)?))?; Ok(Self { path, reader }) } @@ -44,15 +46,26 @@ impl SKFileReader { &self.path } - /// Return an iterator over this reader. + /// Return an iterator over the SuperKmers in this file. pub fn iter(&mut self) -> SKFileIter<'_> { - SKFileIter { reader: self, error: None } + SKFileIter { + reader: self, + error: None, + } + } + + /// Return an iterator over all canonical kmers across all SuperKmers in this file. + /// + /// Kmers are yielded in file order: all kmers of the first SuperKmer, then + /// all kmers of the second, and so on. No deduplication is performed. + pub fn iter_canonical_kmers(&mut self) -> impl Iterator + '_ { + self.iter().flat_map(|sk| sk.into_canonical_kmers()) } } -// ── Iterator ───────────────────────────────────────────────────────────────── +// ── SKFileIter ──────────────────────────────────────────────────────────────── -/// Iterator adapter for [`SKFileReader`]. +/// Iterator over [`SuperKmer`]s in a file. /// /// Errors during iteration are stored and accessible via [`take_error`](SKFileIter::take_error). pub struct SKFileIter<'a> { @@ -61,6 +74,14 @@ pub struct SKFileIter<'a> { } impl<'a> SKFileIter<'a> { + + /// Read the next SuperKmer, or `None` at EOF. + pub fn read(&mut self) -> SKResult> { + match self.reader.read()? { + Some(sk) => Ok(Some(sk)), + None => Ok(None), + } + } /// Returns the first I/O error encountered during iteration, if any. pub fn take_error(&mut self) -> Option { self.error.take() diff --git a/src/obiskio/src/tests/reader.rs b/src/obiskio/src/tests/reader.rs index 57378c5..cc1a238 100644 --- a/src/obiskio/src/tests/reader.rs +++ b/src/obiskio/src/tests/reader.rs @@ -1,5 +1,6 @@ use super::*; use crate::pool::SKFileWriter; +use obikseq::kmer::CanonicalKmer; use std::io::Cursor; use tempfile::NamedTempFile; @@ -42,6 +43,48 @@ fn iter_all() { } } +#[test] +fn iter_canonical_kmers_count() { + setup(); + let tmp = NamedTempFile::new().unwrap(); + let sks = make_sks(10); // each superkmer: seql=8, k=4 → 5 kmers + + { + let mut w = SKFileWriter::create(tmp.path()).unwrap(); + w.write_batch(&sks).unwrap(); + } + + use std::fs; + eprintln!("DEBUG: about to open"); + let sz = fs::metadata(tmp.path()).unwrap().len(); + eprintln!("DEBUG file size: {sz} bytes"); + let mut r = SKFileReader::open(tmp.path()).unwrap(); + eprintln!("DEBUG: reader opened, starting count"); + let count = r.iter_canonical_kmers().count(); + eprintln!("DEBUG: count = {count}"); + assert_eq!(count, 10 * 5); +} + +#[test] +fn iter_canonical_kmers_matches_per_sk() { + setup(); + let tmp = NamedTempFile::new().unwrap(); + let sks = make_sks(4); + + { + let mut w = SKFileWriter::create(tmp.path()).unwrap(); + w.write_batch(&sks).unwrap(); + } + + // Reference: collect kmers superkmer by superkmer + let expected: Vec = sks.iter().flat_map(|sk| sk.iter_canonical_kmers()).collect(); + + let mut r = SKFileReader::open(tmp.path()).unwrap(); + let got: Vec = r.iter_canonical_kmers().collect(); + + assert_eq!(expected, got); +} + // ── serialisation round-trips (formerly codec.rs tests) ────────────────────── #[test]