feat: add streaming sequence reader and superkmer iterator

Introduce the `obiread` crate with a streaming byte normalizer that processes FASTA, FASTQ, and GenBank files using a 64 KiB ring buffer for O(1) memory usage. Integrate this crate into `obiskbuilder` to provide `SuperKmerStreamIter`, enabling memory-efficient superkmer traversal with rolling entropy and minimizer-based cut conditions.
This commit is contained in:
Eric Coissac
2026-05-27 00:09:12 +02:00
parent 0d9be53d1f
commit a4b57a96de
8 changed files with 671 additions and 1 deletions
+1
View File
@@ -6,6 +6,7 @@ edition = "2024"
[dependencies]
obikseq = { path = "../obikseq" }
obikrope = { path = "../obikrope" }
obiread = { path = "../obiread" }
lazy_static = "1.5.0"
[dev-dependencies]
+2
View File
@@ -6,6 +6,7 @@
#![deny(missing_docs)]
pub mod iter;
pub mod stream_iter;
mod scratch;
pub(crate) mod encoding;
@@ -14,6 +15,7 @@ pub(crate) mod rolling_stat;
pub use iter::SuperKmerIter;
pub use scratch::SuperKmerScratch;
pub use stream_iter::SuperKmerStreamIter;
use obikrope::Rope;
use obikseq::RoutableSuperKmer;
+146
View File
@@ -0,0 +1,146 @@
//! Streaming superkmer iterator that does not require a fully-buffered record.
//!
//! [`SuperKmerStreamIter`] wraps a [`NormalizedByteStream`] and yields
//! [`RoutableSuperKmer`] values one by one, exactly as [`SuperKmerIter`] does
//! over a [`Rope`], but without accumulating the whole input in memory first.
//!
//! This makes it suitable for large GBFF chromosomes (250 MiB ORIGIN sections)
//! or any other source where buffering the full record would exhaust memory.
//!
//! The cut conditions and superkmer semantics are identical to [`SuperKmerIter`]:
//!
//! | Condition | stream rewind |
//! |------------------------|---------------|
//! | entropy(kmer) ≤ θ      | k − 1         |
//! | minimizer changed | k |
//! | super-kmer length = 256| k |
//!
//! [`SuperKmerIter`]: crate::iter::SuperKmerIter
//! [`Rope`]: obikrope::Rope
use std::io::Read;
use obiread::stream::NormalizedByteStream;
use obikseq::RoutableSuperKmer;
use obikseq::kmer::Minimizer;
use crate::rolling_stat::RollingStat;
use crate::scratch::SuperKmerScratch;
/// Streaming iterator over [`RoutableSuperKmer`] values.
///
/// Bytes are pulled one at a time from a [`NormalizedByteStream`]; the
/// iterator keeps only the state of the super-kmer currently being built, so
/// the input never has to be buffered as a whole by this type.
pub struct SuperKmerStreamIter<R: Read> {
// Byte source; read with `next_byte()` and moved backwards with `rewind()`
// whenever a cut condition ends the current super-kmer.
stream: NormalizedByteStream<R>,
// k-mer size: minimum scratch length required to emit, and the basis of
// the rewind distance (`k` or `k - 1`) after a cut.
k: usize,
// Entropy threshold compared against the rolling normalized entropy.
theta: f64,
// Accumulates the bytes of the super-kmer under construction.
scratch: SuperKmerScratch,
// Rolling statistic fed with every sequence byte; provides readiness,
// normalized entropy, canonical minimizer and minimizer position.
stat: RollingStat,
// Minimizer of the last accepted k-mer; `None` until a first k-mer has
// been accepted since the last reset.
prev_min: Option<Minimizer>,
// Minimizer position recorded with `prev_min`; handed to `scratch.emit`.
prev_min_pos: usize,
}
impl<R: Read> SuperKmerStreamIter<R> {
    /// Create a streaming superkmer iterator on top of any [`Read`] source.
    ///
    /// # Parameters
    ///
    /// - `reader`: raw input bytes (FASTA, FASTQ, or GBFF; format auto-detected)
    /// - `k`: k-mer size (must be odd, 11 ≤ k ≤ 31)
    /// - `level_max`: maximum sub-word size for entropy, forwarded to the
    ///   rolling statistic (documented as "(16)" — NOTE(review): possibly a
    ///   garbled range such as `1–6`; confirm)
    /// - `theta`: entropy threshold below which a k-mer is rejected
    ///
    /// # Errors
    ///
    /// Propagates any error returned while setting up the normalized byte
    /// stream over `reader`.
    pub fn new(reader: R, k: usize, level_max: usize, theta: f64) -> std::io::Result<Self> {
        let stream = NormalizedByteStream::new(reader)?;
        let scratch = SuperKmerScratch::new();
        let stat = RollingStat::new(level_max);
        Ok(Self {
            stream,
            k,
            theta,
            scratch,
            stat,
            prev_min: None,
            prev_min_pos: 0,
        })
    }

    /// Forget everything about the super-kmer in progress: rolling statistic,
    /// accumulated bytes and the remembered minimizer.
    fn reset(&mut self) {
        self.stat.reset();
        self.scratch.reset();
        self.prev_min_pos = 0;
        self.prev_min = None;
    }

    /// Emit the super-kmer held in the scratch buffer, if there is one.
    ///
    /// Returns `None` when fewer than `k` bytes have been accumulated or when
    /// no minimizer has been recorded yet.
    fn try_emit(&mut self) -> Option<RoutableSuperKmer> {
        let holds_full_kmer = self.scratch.len() >= self.k;
        if holds_full_kmer && self.prev_min.is_some() {
            Some(self.scratch.emit(self.prev_min_pos))
        } else {
            None
        }
    }
}
impl<R: Read> Iterator for SuperKmerStreamIter<R> {
    type Item = RoutableSuperKmer;

    /// Advance the stream until the next super-kmer is complete.
    ///
    /// Bytes are consumed one by one. A super-kmer is emitted when one of the
    /// cut conditions is met (low entropy, minimizer change, maximum length),
    /// when a `0x00` byte is read, or when the input ends. After a cut the
    /// stream is rewound so that the following super-kmer starts from the
    /// appropriate k-mer.
    ///
    /// Returns `None` once the input is exhausted and nothing is left to emit.
    ///
    /// # Panics
    ///
    /// Panics if the rolling statistic reports itself ready but provides no
    /// canonical minimizer.
    fn next(&mut self) -> Option<RoutableSuperKmer> {
        // Length at which a super-kmer is cut regardless of its minimizer.
        const MAX_SUPERKMER_LEN: usize = 256;

        loop {
            let byte = match self.stream.next_byte() {
                None => {
                    // End of input: flush what is pending, then clear the
                    // state so that further calls return `None` instead of
                    // possibly emitting the same super-kmer again.
                    let last = self.try_emit();
                    self.reset();
                    return last;
                }
                Some(0x00) => {
                    // A zero byte acts as a boundary (presumably the record
                    // separator of the normalized stream — confirm): flush
                    // and restart without rewinding.
                    let pending = self.try_emit();
                    self.reset();
                    if pending.is_some() {
                        return pending;
                    }
                    continue;
                }
                Some(b) => b,
            };

            self.stat.push(byte);
            if !self.stat.ready() {
                // Not enough bytes for a first k-mer yet: just accumulate.
                self.scratch.push(byte);
                continue;
            }

            // Decide whether the k-mer ending at `byte` cuts the current
            // super-kmer, and if so how far the stream must be rewound.
            //
            // ── 1. Entropy check ─────────────────────────────────────────────
            // A missing entropy value is treated as 1.0.
            // NOTE(review): the documentation describes this cut as
            // `entropy ≤ θ`, while the comparison here is strict; confirm
            // which one is intended.
            let low_entropy = self.stat.normalized_entropy().unwrap_or(1.0) < self.theta;
            let rewind_by = if low_entropy {
                // Rewinding k − 1 bytes makes the next k-mer start one
                // position after the rejected one.
                Some(self.k - 1)
            } else {
                let min = self
                    .stat
                    .canonical_minimizer()
                    .expect("a ready rolling statistic must provide a canonical minimizer");
                let min_pos = self.stat.minimizer_position().unwrap_or(0);

                // ── 2. Minimizer change check ─────────────────────────────────
                let minimizer_changed = matches!(self.prev_min, Some(prev) if min != prev);

                // ── 3. Super-kmer length check ────────────────────────────────
                if minimizer_changed || self.scratch.len() == MAX_SUPERKMER_LEN {
                    // Rewinding k bytes replays the whole current k-mer, which
                    // becomes the first k-mer of the next super-kmer.
                    Some(self.k)
                } else {
                    // No cut: the k-mer extends the current super-kmer.
                    self.prev_min = Some(min);
                    self.prev_min_pos = min_pos;
                    self.scratch.push(byte);
                    None
                }
            };

            if let Some(distance) = rewind_by {
                let finished = self.try_emit();
                self.stream.rewind(distance);
                self.reset();
                if finished.is_some() {
                    return finished;
                }
            }
        }
    }
}