feat: add streaming sequence reader and superkmer iterator
Introduce the `obiread` crate with a streaming byte normalizer that processes FASTA, FASTQ, and GenBank files using a 64 KiB ring buffer for O(1) memory usage. Integrate this crate into `obiskbuilder` to provide `SuperKmerStreamIter`, enabling memory-efficient superkmer traversal with rolling entropy and minimizer-based cut conditions.
This commit is contained in:
@@ -6,6 +6,7 @@ edition = "2024"
|
||||
[dependencies]
|
||||
obikseq = { path = "../obikseq" }
|
||||
obikrope = { path = "../obikrope" }
|
||||
obiread = { path = "../obiread" }
|
||||
lazy_static = "1.5.0"
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#![deny(missing_docs)]
|
||||
|
||||
pub mod iter;
|
||||
pub mod stream_iter;
|
||||
mod scratch;
|
||||
|
||||
pub(crate) mod encoding;
|
||||
@@ -14,6 +15,7 @@ pub(crate) mod rolling_stat;
|
||||
|
||||
pub use iter::SuperKmerIter;
|
||||
pub use scratch::SuperKmerScratch;
|
||||
pub use stream_iter::SuperKmerStreamIter;
|
||||
|
||||
use obikrope::Rope;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
//! Streaming superkmer iterator that does not require a fully-buffered record.
|
||||
//!
|
||||
//! [`SuperKmerStreamIter`] wraps a [`NormalizedByteStream`] and yields
|
||||
//! [`RoutableSuperKmer`] values one by one, exactly as [`SuperKmerIter`] does
|
||||
//! over a [`Rope`], but without accumulating the whole input in memory first.
|
||||
//!
|
||||
//! This makes it suitable for large GBFF chromosomes (250 MiB ORIGIN sections)
|
||||
//! or any other source where buffering the full record would exhaust memory.
|
||||
//!
|
||||
//! The cut conditions and superkmer semantics are identical to [`SuperKmerIter`]:
|
||||
//!
|
||||
//! | Condition | stream rewind |
|
||||
//! |------------------------|---------------|
|
||||
//! | entropy(kmer) ≤ θ | k−1 |
|
||||
//! | minimizer changed | k |
|
||||
//! | super-kmer length = 256| k |
|
||||
//!
|
||||
//! [`SuperKmerIter`]: crate::iter::SuperKmerIter
|
||||
//! [`Rope`]: obikrope::Rope
|
||||
|
||||
use std::io::Read;
|
||||
|
||||
use obiread::stream::NormalizedByteStream;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use obikseq::kmer::Minimizer;
|
||||
|
||||
use crate::rolling_stat::RollingStat;
|
||||
use crate::scratch::SuperKmerScratch;
|
||||
|
||||
/// Streaming iterator over [`RoutableSuperKmer`] values.
|
||||
pub struct SuperKmerStreamIter<R: Read> {
|
||||
stream: NormalizedByteStream<R>,
|
||||
k: usize,
|
||||
theta: f64,
|
||||
scratch: SuperKmerScratch,
|
||||
stat: RollingStat,
|
||||
prev_min: Option<Minimizer>,
|
||||
prev_min_pos: usize,
|
||||
}
|
||||
|
||||
impl<R: Read> SuperKmerStreamIter<R> {
|
||||
/// Build a streaming superkmer iterator from any `Read` source.
|
||||
///
|
||||
/// - `reader`: raw bytes (FASTA, FASTQ, or GBFF; format auto-detected)
|
||||
/// - `k`: k-mer size (must be odd, 11 ≤ k ≤ 31)
|
||||
/// - `level_max`: maximum sub-word size for entropy (1–6)
|
||||
/// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected
|
||||
pub fn new(reader: R, k: usize, level_max: usize, theta: f64) -> std::io::Result<Self> {
|
||||
Ok(Self {
|
||||
stream: NormalizedByteStream::new(reader)?,
|
||||
k,
|
||||
theta,
|
||||
scratch: SuperKmerScratch::new(),
|
||||
stat: RollingStat::new(level_max),
|
||||
prev_min: None,
|
||||
prev_min_pos: 0,
|
||||
})
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.stat.reset();
|
||||
self.scratch.reset();
|
||||
self.prev_min = None;
|
||||
self.prev_min_pos = 0;
|
||||
}
|
||||
|
||||
fn try_emit(&mut self) -> Option<RoutableSuperKmer> {
|
||||
if self.scratch.len() < self.k {
|
||||
return None;
|
||||
}
|
||||
self.prev_min?;
|
||||
Some(self.scratch.emit(self.prev_min_pos))
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Iterator for SuperKmerStreamIter<R> {
|
||||
type Item = RoutableSuperKmer;
|
||||
|
||||
fn next(&mut self) -> Option<RoutableSuperKmer> {
|
||||
loop {
|
||||
let byte = match self.stream.next_byte() {
|
||||
None => {
|
||||
return self.try_emit();
|
||||
}
|
||||
Some(0x00) => {
|
||||
let result = self.try_emit();
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Some(b) => b,
|
||||
};
|
||||
|
||||
self.stat.push(byte);
|
||||
|
||||
if !self.stat.ready() {
|
||||
self.scratch.push(byte);
|
||||
continue;
|
||||
}
|
||||
|
||||
// ── 1. Entropy check ─────────────────────────────────────────────
|
||||
if self.stat.normalized_entropy().unwrap_or(1.0) < self.theta {
|
||||
let result = self.try_emit();
|
||||
self.stream.rewind(self.k - 1);
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let min = self.stat.canonical_minimizer().unwrap();
|
||||
let min_pos = self.stat.minimizer_position().unwrap_or(0);
|
||||
|
||||
// ── 2. Minimizer change check ─────────────────────────────────────
|
||||
if let Some(prev) = self.prev_min {
|
||||
if min != prev {
|
||||
let result = self.try_emit();
|
||||
self.stream.rewind(self.k);
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// ── 3. Super-kmer length check ────────────────────────────────────
|
||||
if self.scratch.len() == 256 {
|
||||
let result = self.try_emit();
|
||||
self.stream.rewind(self.k);
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
self.prev_min = Some(min);
|
||||
self.prev_min_pos = min_pos;
|
||||
self.scratch.push(byte);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user