refactor: migrate pipeline to NucPage-based stream processing
Replace the existing chunk and Rope-based processing pipeline with a fixed-size NucPage architecture. Introduce a new nucstream module featuring buffer-pooled, in-place parsing that auto-detects and decompresses FASTA/FASTQ/GenBank inputs into normalized ACGT streams with k-mer overlap preservation. Update obikmer scatter and superkmer stages to consume NucPage iterators and cursor-based navigation, eliminating std::io::Read dependencies and optimizing memory management. Add a configurable max_open_files CLI argument and update implementation documentation to reflect the new record vs. stream reading paths.
This commit is contained in:
@@ -17,9 +17,21 @@ pub use iter::SuperKmerIter;
|
||||
pub use scratch::SuperKmerScratch;
|
||||
pub use stream_iter::SuperKmerStreamIter;
|
||||
|
||||
use obiread::NucPage;
|
||||
use obikrope::Rope;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
/// Collect all super-kmers from a normalised [`NucPage`].
///
/// Takes `page` by value, obtains a cursor over it with `page.cursor()`, and
/// drains a [`SuperKmerStreamIter`] built on that cursor into a `Vec`.
///
/// - `page`: the page to scan; consumed by this call
/// - `k`, `level_max`, `theta`: forwarded unchanged to [`SuperKmerStreamIter::new`]
///   (presumably k-mer size, entropy level and entropy threshold; confirm there)
///
/// Returns every [`RoutableSuperKmer`] the iterator yields, in iteration order.
|
||||
pub fn build_superkmers_page(
|
||||
page: NucPage,
|
||||
k: usize,
|
||||
level_max: usize,
|
||||
theta: f64,
|
||||
) -> Vec<RoutableSuperKmer> {
|
||||
// The cursor is created from `page`, which stays alive until the function returns.
let cursor = page.cursor();
|
||||
SuperKmerStreamIter::new(cursor, k, level_max, theta).collect()
|
||||
}
|
||||
|
||||
/// Collect all super-kmers from a normalised rope chunk.
|
||||
pub fn build_superkmers(
|
||||
rope: Rope,
|
||||
|
||||
@@ -1,35 +1,23 @@
|
||||
//! Streaming superkmer iterator that does not require a fully-buffered record.
|
||||
//!
|
||||
//! [`SuperKmerStreamIter`] wraps a [`NormalizedByteStream`] and yields
|
||||
//! [`RoutableSuperKmer`] values one by one, exactly as [`SuperKmerIter`] does
|
||||
//! over a [`Rope`], but without accumulating the whole input in memory first.
|
||||
//!
|
||||
//! This makes it suitable for large GBFF chromosomes (250 MiB ORIGIN sections)
|
||||
//! or any other source where buffering the full record would exhaust memory.
|
||||
//!
|
||||
//! The cut conditions and superkmer semantics are identical to [`SuperKmerIter`]:
|
||||
//!
|
||||
//! | Condition | stream rewind |
|
||||
//! |------------------------|---------------|
|
||||
//! | entropy(kmer) ≤ θ | k−1 |
|
||||
//! | minimizer changed | k |
|
||||
//! | super-kmer length = 256| k |
|
||||
//!
|
||||
//! [`SuperKmerIter`]: crate::iter::SuperKmerIter
|
||||
//! [`Rope`]: obikrope::Rope
|
||||
//! Streaming superkmer iterator over a [`NucPageCursor`].
|
||||
|
||||
use std::io::Read;
|
||||
|
||||
use obiread::stream::NormalizedByteStream;
|
||||
use obiread::NucPageCursor;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use obikseq::kmer::Minimizer;
|
||||
|
||||
use crate::rolling_stat::RollingStat;
|
||||
use crate::scratch::SuperKmerScratch;
|
||||
|
||||
/// Streaming iterator over [`RoutableSuperKmer`] values.
|
||||
pub struct SuperKmerStreamIter<R: Read> {
|
||||
stream: NormalizedByteStream<R>,
|
||||
/// Streaming iterator over [`RoutableSuperKmer`] values from a [`NucPageCursor`].
|
||||
///
|
||||
/// Cut conditions (checked in order per nucleotide, once the k-mer window is full):
|
||||
///
|
||||
/// | Condition | cursor rewind |
|
||||
/// |------------------------|---------------|
|
||||
/// | entropy(kmer) < θ | k−1 |
|
||||
/// | minimizer changed | k |
|
||||
/// | super-kmer length = 256| k |
|
||||
pub struct SuperKmerStreamIter<'a> {
|
||||
cursor: NucPageCursor<'a>,
|
||||
k: usize,
|
||||
theta: f64,
|
||||
scratch: SuperKmerScratch,
|
||||
@@ -38,23 +26,18 @@ pub struct SuperKmerStreamIter<R: Read> {
|
||||
prev_min_pos: usize,
|
||||
}
|
||||
|
||||
impl<R: Read> SuperKmerStreamIter<R> {
|
||||
/// Build a streaming superkmer iterator from any `Read` source.
|
||||
///
|
||||
/// - `reader`: raw bytes (FASTA, FASTQ, or GBFF; format auto-detected)
|
||||
/// - `k`: k-mer size (must be odd, 11 ≤ k ≤ 31)
|
||||
/// - `level_max`: maximum sub-word size for entropy (1–6)
|
||||
/// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected
|
||||
pub fn new(reader: R, k: usize, level_max: usize, theta: f64) -> std::io::Result<Self> {
|
||||
Ok(Self {
|
||||
stream: NormalizedByteStream::new(reader)?,
|
||||
impl<'a> SuperKmerStreamIter<'a> {
|
||||
/// Build an iterator from a [`NucPageCursor`] over normalised sequence data.
|
||||
pub fn new(cursor: NucPageCursor<'a>, k: usize, level_max: usize, theta: f64) -> Self {
|
||||
Self {
|
||||
cursor,
|
||||
k,
|
||||
theta,
|
||||
scratch: SuperKmerScratch::new(),
|
||||
stat: RollingStat::new(level_max),
|
||||
prev_min: None,
|
||||
prev_min_pos: 0,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
@@ -73,12 +56,12 @@ impl<R: Read> SuperKmerStreamIter<R> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Iterator for SuperKmerStreamIter<R> {
|
||||
impl Iterator for SuperKmerStreamIter<'_> {
|
||||
type Item = RoutableSuperKmer;
|
||||
|
||||
fn next(&mut self) -> Option<RoutableSuperKmer> {
|
||||
loop {
|
||||
let byte = match self.stream.next_byte() {
|
||||
let byte = match self.cursor.next_byte() {
|
||||
None => {
|
||||
return self.try_emit();
|
||||
}
|
||||
@@ -103,7 +86,7 @@ impl<R: Read> Iterator for SuperKmerStreamIter<R> {
|
||||
// ── 1. Entropy check ─────────────────────────────────────────────
|
||||
if self.stat.normalized_entropy().unwrap_or(1.0) < self.theta {
|
||||
let result = self.try_emit();
|
||||
self.stream.rewind(self.k - 1);
|
||||
self.cursor.rewind(self.k - 1);
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
@@ -114,11 +97,11 @@ impl<R: Read> Iterator for SuperKmerStreamIter<R> {
|
||||
let min = self.stat.canonical_minimizer().unwrap();
|
||||
let min_pos = self.stat.minimizer_position().unwrap_or(0);
|
||||
|
||||
// ── 2. Minimizer change check ─────────────────────────────────────
|
||||
// ── 2. Minimizer change ───────────────────────────────────────────
|
||||
if let Some(prev) = self.prev_min {
|
||||
if min != prev {
|
||||
let result = self.try_emit();
|
||||
self.stream.rewind(self.k);
|
||||
self.cursor.rewind(self.k);
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
@@ -127,10 +110,10 @@ impl<R: Read> Iterator for SuperKmerStreamIter<R> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── 3. Super-kmer length check ────────────────────────────────────
|
||||
// ── 3. Super-kmer length cap ──────────────────────────────────────
|
||||
if self.scratch.len() == 256 {
|
||||
let result = self.try_emit();
|
||||
self.stream.rewind(self.k);
|
||||
self.cursor.rewind(self.k);
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
|
||||
Reference in New Issue
Block a user