From 0d9be53d1fcef566e08f9cbd54fdbbcb68ca0b36 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 22:55:05 +0200 Subject: [PATCH 1/6] feat: enforce runtime validation for kmer and minimizer parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces `CommonArgs::validate()` to enforce strict constraints on `--kmer-size` (odd, 11–31), `--minimizer-size` (odd, 3–k−1), and `z` (strictly less than k). This validation is applied at the entry point of the `superkmer` and `index` commands to prevent invalid configurations, avoid palindromes, prevent u64 overflow, and ensure positive effective indexing sizes. Documentation is updated to reflect these runtime checks and immediate termination on invalid input. --- docmd/index.md | 12 ++++++++++++ docmd/kmers.md | 4 +++- src/obikmer/src/cli.rs | 23 +++++++++++++++++++++++ src/obikmer/src/cmd/index.rs | 11 +++++++++++ src/obikmer/src/cmd/superkmer.rs | 2 ++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/docmd/index.md b/docmd/index.md index a5a0934..1b73fa7 100644 --- a/docmd/index.md +++ b/docmd/index.md @@ -27,6 +27,18 @@ - Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half - Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths) +## Parameter constraints (enforced at CLI) + +All constraints below are checked by `CommonArgs::validate()` at the start of `superkmer` and `index`. Invalid values exit immediately with an error. + +| Parameter | Constraint | Reason | +|-----------|-----------|--------| +| k (`--kmer-size`) | odd | even k allows palindromic k-mers: kmer == revcomp(kmer), breaking the canonical form invariant | +| k (`--kmer-size`) | k ∈ [11, 31] | k > 31 overflows u64 at 2 bits/base; k < 11 gives insufficient specificity | +| m (`--minimizer-size`) | odd | same palindrome argument as k | +| m (`--minimizer-size`) | 3 ≤ m ≤ k−1 | minimizer must be strictly shorter than the kmer | +| z (`-z`, Findere, `index --approx` only) | z ≤ k−1 | effective indexed kmer size is k−z+1; z ≥ k would make it ≤ 0 | + ## Genome label constraints Genome labels are arbitrary Unicode strings with the following restrictions: diff --git a/docmd/kmers.md b/docmd/kmers.md index 2b0f04e..f447195 100644 --- a/docmd/kmers.md +++ b/docmd/kmers.md @@ -4,9 +4,11 @@ A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the choice of k: -- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word. +- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word (u64 at 2 bits/base requires k ≤ 32; k < 11 yields insufficient specificity). - **k is odd**: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form `min(kmer, revcomp(kmer))` is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting. +Both constraints are **enforced at CLI entry** by `CommonArgs::validate()` in `superkmer` and `index`. Passing an invalid k exits immediately with an error message. + ## Super-kmers A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides, sharing the same **canonical minimizer**. The **canonical minimizer** of a kmer is the m-mer (m < k) whose canonical hash `hash_kmer(min(m-mer, revcomp(m-mer)))` is smallest over all m-mers in the kmer window. The hash function is a `mix64`-based bijection; selection is purely hash-ordered with no degeneracy filter. A super-kmer is capped at 256 nucleotides; a longer run is split at that boundary. diff --git a/src/obikmer/src/cli.rs b/src/obikmer/src/cli.rs index 42c7600..82c3a38 100644 --- a/src/obikmer/src/cli.rs +++ b/src/obikmer/src/cli.rs @@ -63,6 +63,29 @@ pub fn block_size_to_bits(n: usize) -> u8 { } impl CommonArgs { + /// Validate k and m constraints. Exits on error. + pub fn validate(&self) { + let k = self.kmer_size; + let m = self.minimizer_size; + + if k < 11 || k > 31 { + eprintln!("error: --kmer-size must be in [11, 31] (got {k})"); + std::process::exit(1); + } + if k % 2 == 0 { + eprintln!("error: --kmer-size must be odd (got {k}); even k allows palindromic k-mers"); + std::process::exit(1); + } + if m < 3 || m >= k { + eprintln!("error: --minimizer-size must be in [3, k−1] = [3, {}] (got {m})", k - 1); + std::process::exit(1); + } + if m % 2 == 0 { + eprintln!("error: --minimizer-size must be odd (got {m})"); + std::process::exit(1); + } + } + pub fn seqfile_paths(&self) -> obiread::PathIter { let paths: Vec = if self.inputs.is_empty() { vec![PathBuf::from("-")] diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 8a7d8ea..13268d8 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -152,12 +152,23 @@ pub(crate) fn resolve_approx_params( } pub fn run(args: IndexArgs) { + args.common.validate(); + let output = args.output.clone(); let mut rep = Reporter::new(); // ── Resolve evidence kind ──────────────────────────────────────────────── let evidence = if args.approx { let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp); + let k = args.common.kmer_size; + if z as usize >= k { + eprintln!( + "error: Findere z={z} must be < kmer-size={k} \ + (effective kmer size k−z+1 = {} ≤ 0)", + k as isize - z as isize + 1 + ); + std::process::exit(1); + } info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}"); IndexMode::Approx { b, z } } else { diff --git a/src/obikmer/src/cmd/superkmer.rs b/src/obikmer/src/cmd/superkmer.rs index 7d4f91a..a1c6398 100644 --- a/src/obikmer/src/cmd/superkmer.rs +++ b/src/obikmer/src/cmd/superkmer.rs @@ -33,6 +33,8 @@ fn write_batch( // ── Entry point ─────────────────────────────────────────────────────────────── pub fn run(args: SuperkmerArgs) { + args.common.validate(); + let k = args.common.kmer_size; let m = args.common.minimizer_size; let theta = args.common.theta; -- 2.52.0 From a4b57a96dec2a1ec3b8cefca100dbe9f07965974 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 27 May 2026 00:09:12 +0200 Subject: [PATCH 2/6] feat: add streaming sequence reader and superkmer iterator Introduce the `obiread` crate with a streaming byte normalizer that processes FASTA, FASTQ, and GenBank files using a 64 KiB ring buffer for O(1) memory usage. Integrate this crate into `obiskbuilder` to provide `SuperKmerStreamIter`, enabling memory-efficient superkmer traversal with rolling entropy and minimizer-based cut conditions. --- src/Cargo.lock | 1 + src/obiread/src/lib.rs | 2 + src/obiread/src/mimetype.rs | 5 + src/obiread/src/path_iterator.rs | 8 +- src/obiread/src/stream.rs | 507 ++++++++++++++++++++++++++++ src/obiskbuilder/Cargo.toml | 1 + src/obiskbuilder/src/lib.rs | 2 + src/obiskbuilder/src/stream_iter.rs | 146 ++++++++ 8 files changed, 671 insertions(+), 1 deletion(-) create mode 100644 src/obiread/src/stream.rs create mode 100644 src/obiskbuilder/src/stream_iter.rs diff --git a/src/Cargo.lock b/src/Cargo.lock index dd85960..4cfce7b 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1637,6 +1637,7 @@ dependencies = [ "lazy_static", "obikrope", "obikseq", + "obiread", ] [[package]] diff --git a/src/obiread/src/lib.rs b/src/obiread/src/lib.rs index 771d839..94f0b8e 100644 --- a/src/obiread/src/lib.rs +++ b/src/obiread/src/lib.rs @@ -13,6 +13,7 @@ pub mod normalize; mod path_iterator; pub mod peakreader; pub mod record; +pub mod stream; pub mod xopen; pub use chunk::{SeqChunkIter, fasta_chunks, fastq_chunks, @@ -21,6 +22,7 @@ pub use normalize::{normalize_fasta_chunk, normalize_fastq_chunk, normalize_sequ pub use mimetype::MimeTypeGuesser; pub use path_iterator::{PathIter, path_iter}; pub use peakreader::PeekReader; +pub use stream::NormalizedByteStream; pub use xopen::xopen; /// Default read block size: 1 MiB. diff --git a/src/obiread/src/mimetype.rs b/src/obiread/src/mimetype.rs index 906b627..ab7ea5c 100644 --- a/src/obiread/src/mimetype.rs +++ b/src/obiread/src/mimetype.rs @@ -9,6 +9,10 @@ use crate::peakreader::PeekReader; const BUF_SIZE: usize = 4096; +fn is_gbff(buf: &[u8]) -> bool { + buf.starts_with(b"LOCUS ") +} + static RE_FASTA: LazyLock = LazyLock::new(|| Regex::new(r"^>[^ ]").unwrap()); fn is_fasta(buf: &[u8]) -> bool { std::str::from_utf8(buf).map_or(false, |s| RE_FASTA.is_match(s)) @@ -30,6 +34,7 @@ fn is_text(buf: &[u8]) -> bool { // Most specific formats (fastq, fasta) come before the generic text/plain fallback. static INFER: LazyLock = LazyLock::new(|| { let mut infer = Infer::new(); + infer.add("text/gbff", "gbff", is_gbff); infer.add("text/fastq", "fastq", is_fastq); infer.add("text/fasta", "fasta", is_fasta); infer.add("text/plain", "txt", is_text); diff --git a/src/obiread/src/path_iterator.rs b/src/obiread/src/path_iterator.rs index 9e73144..815f22c 100644 --- a/src/obiread/src/path_iterator.rs +++ b/src/obiread/src/path_iterator.rs @@ -71,7 +71,7 @@ pub fn path_iter(paths: &[String]) -> PathIter { PathIter::new(path_bufs) } -/// Returns true if the path ends with a fasta or fastq file extension. +/// Returns true if the path ends with a recognised sequence file extension. fn is_fasta_or_fastq(path: &Path) -> bool { let name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); name.ends_with(".fasta") @@ -82,4 +82,10 @@ fn is_fasta_or_fastq(path: &Path) -> bool { || name.ends_with(".fa.gz") || name.ends_with(".fastq.gz") || name.ends_with(".fq.gz") + || name.ends_with(".gbff") + || name.ends_with(".gbk") + || name.ends_with(".gb") + || name.ends_with(".gbff.gz") + || name.ends_with(".gbk.gz") + || name.ends_with(".gb.gz") } diff --git a/src/obiread/src/stream.rs b/src/obiread/src/stream.rs new file mode 100644 index 0000000..e797ee2 --- /dev/null +++ b/src/obiread/src/stream.rs @@ -0,0 +1,507 @@ +//! Streaming byte normaliser for FASTA, FASTQ, and GBFF sequence files. +//! +//! [`NormalizedByteStream`] wraps any `Read` source in a 64 KiB circular ring +//! buffer and emits a normalised byte stream: +//! +//! - Uppercase `ACGT` bytes for in-sequence positions. +//! - `0x00` as a separator between independent sequence segments (record +//! boundary, non-ACGT base, or end of the GBFF `//` record). +//! - `None` at true EOF. +//! +//! Unlike the Rope-based pipeline, the stream never accumulates a full +//! record: it is safe for chromosomes of arbitrary size (e.g. GBFF files +//! with 250 MiB ORIGIN sections). +//! +//! # Rewind invariant +//! +//! `rewind(n)` subtracts `n` from the read head. It is only called by +//! [`obiskbuilder::SuperKmerStreamIter`] when the last `n` bytes were all +//! ACGT (no format overhead in between), so the ring buffer always contains +//! those bytes at positions `[head-n .. head]`. The maximum rewind is +//! `k ≤ 31`, well within the 32 KiB kept below the refill watermark. + +use std::io::{self, Read}; + +// ── ring-buffer constants ───────────────────────────────────────────────────── + +const CAP: usize = 65536; // must be a power of two +const MASK: usize = CAP - 1; +const REFILL_THRESHOLD: usize = 32768; // refill when available drops below this +const REFILL_CHUNK: usize = 32768; // target bytes per refill call + +// ── format detection ────────────────────────────────────────────────────────── + +#[derive(Clone, Copy, PartialEq, Eq)] +enum Format { Fasta, Fastq, Gbff } + +fn detect_format(buf: &[u8]) -> Format { + if buf.starts_with(b"LOCUS ") { + Format::Gbff + } else if buf.first() == Some(&b'>') { + Format::Fasta + } else { + Format::Fastq + } +} + +// ── format state machines ───────────────────────────────────────────────────── + +#[derive(Clone, Copy)] +enum FaState { + OutSeq, // reading header / between records + InSeq, // reading sequence bases +} + +#[derive(Clone, Copy)] +enum FqState { + Header, // reading @id line + Seq, // reading sequence bases + Plus, // reading +[id] separator line + Qual(usize), // skipping quality bytes; payload = remaining count +} + +const ORIGIN: &[u8] = b"ORIGIN"; + +#[derive(Clone, Copy)] +enum GbState { + Pre(u8), // pre-ORIGIN: n chars of "ORIGIN" matched at current line start + PreSkip, // pre-ORIGIN: skip rest of non-ORIGIN line then → Pre(0) + LineStart, // ORIGIN data: at column 0 of a numbered data line + Num, // ORIGIN data: skipping spaces + line-number digits + Seq, // ORIGIN data: reading lowercase sequence bases + Slash, // ORIGIN data: saw the first '/' of a potential '//' end marker +} + +#[derive(Clone, Copy)] +enum State { + Fa(FaState), + Fq(FqState), + Gb(GbState), +} + +// ── NormalizedByteStream ────────────────────────────────────────────────────── + +/// A streaming normaliser over any `Read` source. +/// +/// Call [`next_byte`](NormalizedByteStream::next_byte) to consume one normalised +/// byte at a time. Call [`rewind`](NormalizedByteStream::rewind) to step back +/// up to 31 bytes (used by the superkmer builder when a minimizer or length +/// boundary is crossed). +pub struct NormalizedByteStream { + reader: R, + buf: Box<[u8; CAP]>, + head: usize, // absolute read position; index = head & MASK + write: usize, // absolute write position; index = write & MASK + eof: bool, + state: State, + seq_len: usize, // FASTQ only: non-newline chars seen in current sequence line +} + +impl NormalizedByteStream { + /// Wrap `reader` and detect its format from the first bytes. + pub fn new(mut reader: R) -> io::Result { + let mut buf = Box::new([0u8; CAP]); + + // Initial read — enough to detect the format. + let n = reader.read(&mut buf[..REFILL_CHUNK])?; + let fmt = detect_format(&buf[..n]); + let state = match fmt { + Format::Fasta => State::Fa(FaState::OutSeq), + Format::Fastq => State::Fq(FqState::Header), + Format::Gbff => State::Gb(GbState::Pre(0)), + }; + + Ok(Self { + reader, + buf, + head: 0, + write: n, + eof: n == 0, + state, + seq_len: 0, + }) + } + + /// Step the read head back by `n` bytes. + /// + /// # Safety + /// The caller guarantees that the last `n` bytes returned by + /// [`next_byte`] were all uppercase ACGT (no `0x00` separators), so those + /// bytes are still present in the ring buffer. + #[inline] + pub fn rewind(&mut self, n: usize) { + self.head -= n; + } + + #[inline] + fn available(&self) -> usize { + self.write - self.head + } + + fn try_refill(&mut self) -> io::Result<()> { + if self.eof || self.available() >= REFILL_THRESHOLD { + return Ok(()); + } + + // First segment: write_idx → end of buffer (contiguous, no wrap). + let write_idx = self.write & MASK; + let free = CAP - self.available(); + let to_end = CAP - write_idx; + let chunk1 = free.min(to_end).min(REFILL_CHUNK); + + if chunk1 > 0 { + let n = self.reader.read(&mut self.buf[write_idx..write_idx + chunk1])?; + self.write += n; + if n == 0 { + self.eof = true; + return Ok(()); + } + } + + // Second segment: if still below threshold, read into the wrapped region. + if self.available() < REFILL_THRESHOLD && !self.eof { + let write_idx2 = self.write & MASK; + let free2 = CAP - self.available(); + let to_end2 = CAP - write_idx2; + let chunk2 = free2.min(to_end2).min(REFILL_CHUNK); + if chunk2 > 0 { + let n = self.reader.read(&mut self.buf[write_idx2..write_idx2 + chunk2])?; + self.write += n; + if n == 0 { + self.eof = true; + } + } + } + + Ok(()) + } + + /// Return the next normalised byte, or `None` at EOF. + /// + /// Emits uppercase `A`, `C`, `G`, `T`, or `0x00` (segment separator). + pub fn next_byte(&mut self) -> Option { + loop { + // Proactive refill: keep ≥ REFILL_THRESHOLD bytes in the buffer + // so rewinds are always safe. + if !self.eof && self.available() < REFILL_THRESHOLD { + let _ = self.try_refill(); + } + + if self.available() == 0 { + return None; + } + + let b = self.buf[self.head & MASK]; + self.head += 1; + + // Copy state to avoid borrow conflicts while mutating self.state. + let state = self.state; + if let Some(out) = self.process(state, b) { + return Some(out); + } + } + } + + // ── per-format byte processors ──────────────────────────────────────────── + + #[inline] + fn process(&mut self, state: State, b: u8) -> Option { + match state { + State::Fa(fa) => self.fasta(fa, b), + State::Fq(fq) => self.fastq(fq, b), + State::Gb(gb) => self.gbff(gb, b), + } + } + + fn fasta(&mut self, fa: FaState, b: u8) -> Option { + match fa { + FaState::OutSeq => { + if b == b'\n' { + self.state = State::Fa(FaState::InSeq); + } + None // skip header bytes + } + FaState::InSeq => { + if b == b'>' { + self.state = State::Fa(FaState::OutSeq); + Some(0x00) // record boundary + } else if b == b'\n' || b == b'\r' { + None + } else { + Some(normalize_nuc(b)) + } + } + } + } + + fn fastq(&mut self, fq: FqState, b: u8) -> Option { + match fq { + FqState::Header => { + if b == b'\n' { + self.state = State::Fq(FqState::Seq); + self.seq_len = 0; + } + None + } + FqState::Seq => { + if b == b'\n' { + self.state = State::Fq(FqState::Plus); + None + } else if b == b'\r' { + None + } else { + self.seq_len += 1; // count all chars, including non-ACGT + Some(normalize_nuc(b)) + } + } + FqState::Plus => { + if b == b'\n' { + self.state = State::Fq(FqState::Qual(self.seq_len)); + } + None + } + FqState::Qual(rem) => { + if b == b'\n' { + if rem == 0 { + // Quality line ended after all bases consumed → new record. + self.state = State::Fq(FqState::Header); + Some(0x00) // record boundary + } else { + // Newline inside multi-line quality — keep counting. + None + } + } else if b == b'\r' { + None + } else if rem > 0 { + self.state = State::Fq(FqState::Qual(rem - 1)); + None + } else { + // rem == 0 but non-newline: shouldn't happen in valid FASTQ. + None + } + } + } + } + + fn gbff(&mut self, gb: GbState, b: u8) -> Option { + match gb { + GbState::Pre(n) => { + let n = n as usize; + if n < 6 { + if b == ORIGIN[n] { + self.state = State::Gb(GbState::Pre((n + 1) as u8)); + } else if b == b'\n' { + self.state = State::Gb(GbState::Pre(0)); + } else { + self.state = State::Gb(GbState::PreSkip); + } + } else { + // All 6 chars of "ORIGIN" matched; skip the rest of the header line. + if b == b'\n' { + self.state = State::Gb(GbState::LineStart); + } + // Non-'\n': stay in Pre(6) implicitly (state unchanged). + } + None + } + GbState::PreSkip => { + if b == b'\n' { + self.state = State::Gb(GbState::Pre(0)); + } + None + } + GbState::LineStart => { + if b == b'/' { + self.state = State::Gb(GbState::Slash); + } else if b != b'\n' { + // Space or digit: start of a numbered sequence line. + self.state = State::Gb(GbState::Num); + } + None + } + GbState::Num => { + if b == b'\n' { + self.state = State::Gb(GbState::LineStart); + None + } else if b.is_ascii_digit() || b == b' ' { + None // still in the number prefix + } else { + // First letter: transition to Seq and process this byte. + self.state = State::Gb(GbState::Seq); + Some(normalize_nuc(b)) + } + } + GbState::Seq => { + if b == b'\n' { + self.state = State::Gb(GbState::LineStart); + None + } else if b == b' ' { + None // inter-group space + } else { + Some(normalize_nuc(b)) + } + } + GbState::Slash => { + if b == b'/' { + // End of GBFF record ("//"). + self.state = State::Gb(GbState::Pre(0)); // ready for next record + Some(0x00) + } else { + // Stray '/' — shouldn't happen in valid GBFF; recover. + self.state = State::Gb(GbState::LineStart); + None + } + } + } + } +} + +// ── nucleotide normalisation ────────────────────────────────────────────────── + +#[inline] +fn normalize_nuc(b: u8) -> u8 { + match b | 0x20 { + b'a' => b'A', + b'c' => b'C', + b'g' => b'G', + b't' => b'T', + _ => 0x00, + } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + fn stream(data: &[u8]) -> Vec { + let mut s = NormalizedByteStream::new(data).unwrap(); + let mut out = Vec::new(); + while let Some(b) = s.next_byte() { + out.push(b); + } + out + } + + // ── FASTA ───────────────────────────────────────────────────────────────── + + #[test] + fn fasta_single_record() { + assert_eq!(stream(b">s1\nACGTACGT\n"), b"ACGTACGT"); + } + + #[test] + fn fasta_two_records_separated() { + let out = stream(b">s1\nACGT\n>s2\nTTTT\n"); + assert_eq!(out, b"ACGT\x00TTTT"); + } + + #[test] + fn fasta_multiline_concatenated() { + assert_eq!(stream(b">s1\nACGT\nACGT\n"), b"ACGTACGT"); + } + + #[test] + fn fasta_lowercase_uppercased() { + assert_eq!(stream(b">s1\nacgt\n"), b"ACGT"); + } + + #[test] + fn fasta_non_acgt_becomes_null() { + let out = stream(b">s1\nACGTNACGT\n"); + assert_eq!(out, b"ACGT\x00ACGT"); + } + + // ── FASTQ ───────────────────────────────────────────────────────────────── + + fn make_fastq(records: &[&[u8]]) -> Vec { + let mut buf = Vec::new(); + for seq in records { + buf.extend_from_slice(b"@hdr\n"); + buf.extend_from_slice(seq); + buf.push(b'\n'); + buf.extend_from_slice(b"+\n"); + buf.extend_from_slice(&vec![b'I'; seq.len()]); + buf.push(b'\n'); + } + buf + } + + #[test] + fn fastq_single_record() { + // Trailing \x00 is emitted on the newline that ends the quality line. + assert_eq!(stream(&make_fastq(&[b"ACGTACGT"])), b"ACGTACGT\x00"); + } + + #[test] + fn fastq_two_records_separated() { + let out = stream(&make_fastq(&[b"ACGT", b"TTTT"])); + assert_eq!(out, b"ACGT\x00TTTT\x00"); + } + + #[test] + fn fastq_lowercase_uppercased() { + assert_eq!(stream(&make_fastq(&[b"acgt"])), b"ACGT\x00"); + } + + #[test] + fn fastq_non_acgt_becomes_null() { + let out = stream(&make_fastq(&[b"ACGTNACGT"])); + assert_eq!(out, b"ACGT\x00ACGT\x00"); + } + + // ── GBFF ────────────────────────────────────────────────────────────────── + + fn make_gbff(seqs: &[&[u8]]) -> Vec { + let mut buf = Vec::new(); + for seq in seqs { + buf.extend_from_slice(b"LOCUS NC_000001\nFEATURES\nORIGIN\n"); + // write numbered lines of 60 bases each + let mut pos = 1usize; + for chunk in seq.chunks(60) { + let groups: Vec<&[u8]> = chunk.chunks(10).collect(); + let line = groups.join(&b' '); + buf.extend_from_slice(format!("{:9} ", pos).as_bytes()); + buf.extend_from_slice(&line); + buf.push(b'\n'); + pos += chunk.len(); + } + buf.extend_from_slice(b"//\n"); + } + buf + } + + #[test] + fn gbff_single_record() { + let seq = b"acgtacgtacgt"; + let out = stream(&make_gbff(&[seq])); + assert_eq!(out, b"ACGTACGTACGT\x00"); + } + + #[test] + fn gbff_two_records_separated() { + let out = stream(&make_gbff(&[b"acgtacgt", b"tttttttt"])); + assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00"); + } + + #[test] + fn gbff_non_acgt_becomes_null() { + let out = stream(&make_gbff(&[b"acgtnacgt"])); + assert_eq!(out, b"ACGT\x00ACGT\x00"); + } + + // ── rewind ──────────────────────────────────────────────────────────────── + + #[test] + fn rewind_replays_bytes() { + let mut s = NormalizedByteStream::new(b">s\nACGT\n" as &[u8]).unwrap(); + assert_eq!(s.next_byte(), Some(b'A')); + assert_eq!(s.next_byte(), Some(b'C')); + s.rewind(1); + assert_eq!(s.next_byte(), Some(b'C')); + assert_eq!(s.next_byte(), Some(b'G')); + assert_eq!(s.next_byte(), Some(b'T')); + assert_eq!(s.next_byte(), None); + } +} diff --git a/src/obiskbuilder/Cargo.toml b/src/obiskbuilder/Cargo.toml index 0b68fb8..5e19801 100644 --- a/src/obiskbuilder/Cargo.toml +++ b/src/obiskbuilder/Cargo.toml @@ -6,6 +6,7 @@ edition = "2024" [dependencies] obikseq = { path = "../obikseq" } obikrope = { path = "../obikrope" } +obiread = { path = "../obiread" } lazy_static = "1.5.0" [dev-dependencies] diff --git a/src/obiskbuilder/src/lib.rs b/src/obiskbuilder/src/lib.rs index 89ea349..a5409e1 100644 --- a/src/obiskbuilder/src/lib.rs +++ b/src/obiskbuilder/src/lib.rs @@ -6,6 +6,7 @@ #![deny(missing_docs)] pub mod iter; +pub mod stream_iter; mod scratch; pub(crate) mod encoding; @@ -14,6 +15,7 @@ pub(crate) mod rolling_stat; pub use iter::SuperKmerIter; pub use scratch::SuperKmerScratch; +pub use stream_iter::SuperKmerStreamIter; use obikrope::Rope; use obikseq::RoutableSuperKmer; diff --git a/src/obiskbuilder/src/stream_iter.rs b/src/obiskbuilder/src/stream_iter.rs new file mode 100644 index 0000000..25f41c5 --- /dev/null +++ b/src/obiskbuilder/src/stream_iter.rs @@ -0,0 +1,146 @@ +//! Streaming superkmer iterator that does not require a fully-buffered record. +//! +//! [`SuperKmerStreamIter`] wraps a [`NormalizedByteStream`] and yields +//! [`RoutableSuperKmer`] values one by one, exactly as [`SuperKmerIter`] does +//! over a [`Rope`], but without accumulating the whole input in memory first. +//! +//! This makes it suitable for large GBFF chromosomes (250 MiB ORIGIN sections) +//! or any other source where buffering the full record would exhaust memory. +//! +//! The cut conditions and superkmer semantics are identical to [`SuperKmerIter`]: +//! +//! | Condition | stream rewind | +//! |------------------------|---------------| +//! | entropy(kmer) ≤ θ | k−1 | +//! | minimizer changed | k | +//! | super-kmer length = 256| k | +//! +//! [`SuperKmerIter`]: crate::iter::SuperKmerIter +//! [`Rope`]: obikrope::Rope + +use std::io::Read; + +use obiread::stream::NormalizedByteStream; +use obikseq::RoutableSuperKmer; +use obikseq::kmer::Minimizer; + +use crate::rolling_stat::RollingStat; +use crate::scratch::SuperKmerScratch; + +/// Streaming iterator over [`RoutableSuperKmer`] values. +pub struct SuperKmerStreamIter { + stream: NormalizedByteStream, + k: usize, + theta: f64, + scratch: SuperKmerScratch, + stat: RollingStat, + prev_min: Option, + prev_min_pos: usize, +} + +impl SuperKmerStreamIter { + /// Build a streaming superkmer iterator from any `Read` source. + /// + /// - `reader`: raw bytes (FASTA, FASTQ, or GBFF; format auto-detected) + /// - `k`: k-mer size (must be odd, 11 ≤ k ≤ 31) + /// - `level_max`: maximum sub-word size for entropy (1–6) + /// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected + pub fn new(reader: R, k: usize, level_max: usize, theta: f64) -> std::io::Result { + Ok(Self { + stream: NormalizedByteStream::new(reader)?, + k, + theta, + scratch: SuperKmerScratch::new(), + stat: RollingStat::new(level_max), + prev_min: None, + prev_min_pos: 0, + }) + } + + fn reset(&mut self) { + self.stat.reset(); + self.scratch.reset(); + self.prev_min = None; + self.prev_min_pos = 0; + } + + fn try_emit(&mut self) -> Option { + if self.scratch.len() < self.k { + return None; + } + self.prev_min?; + Some(self.scratch.emit(self.prev_min_pos)) + } +} + +impl Iterator for SuperKmerStreamIter { + type Item = RoutableSuperKmer; + + fn next(&mut self) -> Option { + loop { + let byte = match self.stream.next_byte() { + None => { + return self.try_emit(); + } + Some(0x00) => { + let result = self.try_emit(); + self.reset(); + if result.is_some() { + return result; + } + continue; + } + Some(b) => b, + }; + + self.stat.push(byte); + + if !self.stat.ready() { + self.scratch.push(byte); + continue; + } + + // ── 1. Entropy check ───────────────────────────────────────────── + if self.stat.normalized_entropy().unwrap_or(1.0) < self.theta { + let result = self.try_emit(); + self.stream.rewind(self.k - 1); + self.reset(); + if result.is_some() { + return result; + } + continue; + } + + let min = self.stat.canonical_minimizer().unwrap(); + let min_pos = self.stat.minimizer_position().unwrap_or(0); + + // ── 2. Minimizer change check ───────────────────────────────────── + if let Some(prev) = self.prev_min { + if min != prev { + let result = self.try_emit(); + self.stream.rewind(self.k); + self.reset(); + if result.is_some() { + return result; + } + continue; + } + } + + // ── 3. Super-kmer length check ──────────────────────────────────── + if self.scratch.len() == 256 { + let result = self.try_emit(); + self.stream.rewind(self.k); + self.reset(); + if result.is_some() { + return result; + } + continue; + } + + self.prev_min = Some(min); + self.prev_min_pos = min_pos; + self.scratch.push(byte); + } + } +} -- 2.52.0 From cfadf63bbc4022db1be4a4f0966ca8a629930024 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 28 May 2026 22:16:31 +0200 Subject: [PATCH 3/6] refactor: migrate pipeline to NucPage-based stream processing Replace the existing chunk and Rope-based processing pipeline with a fixed-size NucPage architecture. Introduce a new nucstream module featuring buffer-pooled, in-place parsing that auto-detects and decompresses FASTA/FASTQ/GenBank inputs into normalized ACGT streams with k-mer overlap preservation. Update obikmer scatter and superkmer stages to consume NucPage iterators and cursor-based navigation, eliminating std::io::Read dependencies and optimizing memory management. Add a configurable max_open_files CLI argument and update implementation documentation to reflect the new record vs. stream reading paths. --- docmd/implementation/chunkreader.md | 20 +- docmd/implementation/pipeline.md | 6 +- src/obikmer/src/cli.rs | 37 +- src/obikmer/src/cmd/index.rs | 2 +- src/obikmer/src/cmd/superkmer.rs | 15 +- src/obikmer/src/steps/scatter.rs | 56 ++- src/obiread/src/lib.rs | 12 +- src/obiread/src/nucstream.rs | 732 ++++++++++++++++++++++++++++ src/obiread/src/stream.rs | 507 ------------------- src/obiread/src/xopen.rs | 17 +- src/obiskbuilder/src/lib.rs | 12 + src/obiskbuilder/src/stream_iter.rs | 69 +-- 12 files changed, 876 insertions(+), 609 deletions(-) create mode 100644 src/obiread/src/nucstream.rs delete mode 100644 src/obiread/src/stream.rs diff --git a/docmd/implementation/chunkreader.md b/docmd/implementation/chunkreader.md index 4021c57..06184d1 100644 --- a/docmd/implementation/chunkreader.md +++ b/docmd/implementation/chunkreader.md @@ -1,6 +1,24 @@ # Chunk reader — implementation -The `obiread` crate provides a streaming iterator that reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. Chunks are consumed in parallel by downstream workers. +`obiread` exposes two distinct sequence reading paths, each optimised for a different use case. + +## Two reading paths + +| Path | API | Output unit | Per-record identity | Use case | +|------|-----|-------------|---------------------|----------| +| **Record path** | `read_sequence_chunks` → `parse_chunk` | `SeqRecord` (id + raw sequence + normalised rope) | yes | `query` — must read complete records | +| **Stream path** | `open_nuc_stream` | `NucPage` (flat normalised byte buffer) | no | `index`, `superkmer` — bulk throughput | + +The record path uses `Rope`-backed chunks and is described in detail below. +The stream path (`NucStream` / `NucPage`) is described in the scatter section of [pipeline](pipeline.md). + +--- + +## Record path: chunk reader + +The chunk reader reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. `parse_chunk` then converts each chunk into a `Vec`, where each record carries its identifier, raw sequence bytes, and a normalised rope ready for superkmer building. + +This path is mandatory for `query`, where superkmers must be tracked back to their originating sequence (id, kmer offset) for output annotation. ## Output type: Rope diff --git a/docmd/implementation/pipeline.md b/docmd/implementation/pipeline.md index 2cfac4b..b95a075 100644 --- a/docmd/implementation/pipeline.md +++ b/docmd/implementation/pipeline.md @@ -19,7 +19,11 @@ The histogram gives: ## Phase 1 — Scatter -Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. For each read: +Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. + +Input files are read via `open_nuc_stream`, which opens and decompresses the file, auto-detects the format (FASTA / FASTQ / GenBank), and yields a sequence of `NucPage` buffers. Each `NucPage` is a flat 64 KB buffer of normalised bytes (`ACGT` + `\x00` separators), carrying a k−1 byte overlap from the preceding page so that no k-mer is lost at page boundaries. Per-record identity (sequence id, raw bytes) is not preserved; this is intentional — the scatter phase only needs normalised bases to produce superkmers. + +For each read fragment within a page: 1. **Ambiguous base filter**: cut at any non-ACGT base; discard fragments shorter than k. 2. **Entropy filter**: scan each fragment with a sliding window of size k. When the kmer $K_i = S[i \mathinner{..} i+k-1]$ ended by nucleotide $S[j]$ (with $j = i+k-1$) has entropy below threshold $\theta$, emit the current segment and start a new one (see algorithm below). $K_i$ belongs to neither segment, and no valid kmer is lost. diff --git a/src/obikmer/src/cli.rs b/src/obikmer/src/cli.rs index 82c3a38..8a84c03 100644 --- a/src/obikmer/src/cli.rs +++ b/src/obikmer/src/cli.rs @@ -1,9 +1,8 @@ -use std::io; use std::path::PathBuf; use std::sync::{Arc, Condvar, Mutex}; use clap::Args; -use obikrope::Rope; +use obiread::NucPage; use obikseq::RoutableSuperKmer; // ── Shared arguments ────────────────────────────────────────────────────────── @@ -45,9 +44,11 @@ pub struct CommonArgs { )] pub threads: usize, - /// Maximum number of input files open simultaneously - #[arg(long, default_value_t = 20)] - pub max_open_files: usize, + /// Maximum number of input files open simultaneously. + /// Defaults to threads/4 (minimum 1). Keep below the number of workers + /// to ensure CPU workers are always available for the transform stage. + #[arg(long)] + pub max_open_files: Option, } /// Smallest `b` such that `2^b >= n` (i.e. `n.next_power_of_two().ilog2()`). @@ -86,6 +87,12 @@ impl CommonArgs { } } + pub fn effective_max_open(&self) -> usize { + self.max_open_files + .unwrap_or_else(|| (self.threads / 4).max(1)) + .max(1) + } + pub fn seqfile_paths(&self) -> obiread::PathIter { let paths: Vec = if self.inputs.is_empty() { vec![PathBuf::from("-")] @@ -144,13 +151,10 @@ pub struct PathWithSlot { pub enum PipelineData { Path(PathWithSlot), - RawChunk(Rope), - NormChunk(Rope), + NucPage(NucPage), Batch(Vec), } -// SAFETY: Rope contains Cell which is !Sync, but pipeline ownership transfers -// exclusively through channels — no item is ever shared across threads. unsafe impl Send for PipelineData {} unsafe impl Sync for PipelineData {} @@ -171,18 +175,3 @@ pub fn throttle_paths( }) } -// ── I/O plumbing ────────────────────────────────────────────────────────────── - -pub fn open_chunks(path: PathBuf) -> io::Result> { - let path_str = path - .to_str() - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "non-UTF-8 path"))?; - let iter = obiread::read_sequence_chunks(path_str)?; - Ok(iter.filter_map(|r| match r { - Ok(rope) => Some(rope), - Err(e) => { - eprintln!("chunk read error: {e}"); - None - } - })) -} diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 13268d8..1a796f9 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -231,7 +231,7 @@ pub fn run(args: IndexArgs) { let theta = args.common.theta; let n_workers = args.common.threads.max(1); - let max_open = args.common.max_open_files.max(1); + let max_open = args.common.effective_max_open(); scatter(idx.partition_mut(), args.common.seqfile_paths(), k, level_max, theta, n_workers, max_open, &mut rep); idx.mark_scattered().unwrap_or_else(|e| { diff --git a/src/obikmer/src/cmd/superkmer.rs b/src/obikmer/src/cmd/superkmer.rs index a1c6398..1b0d7cd 100644 --- a/src/obikmer/src/cmd/superkmer.rs +++ b/src/obikmer/src/cmd/superkmer.rs @@ -4,7 +4,7 @@ use clap::Args; use obifastwrite::write_scatter; use obikseq::{RoutableSuperKmer, set_k, set_m}; -use crate::cli::{CommonArgs, PipelineData, PathWithSlot, open_chunks, partitions_to_bits, throttle_paths}; +use crate::cli::{CommonArgs, PipelineData, PathWithSlot, partitions_to_bits, throttle_paths}; #[derive(Args)] pub struct SuperkmerArgs { @@ -41,7 +41,7 @@ pub fn run(args: SuperkmerArgs) { let level_max = args.common.level_max; let partition_bits = partitions_to_bits(args.common.partitions); let n_workers = args.common.threads.max(1); - let max_open = args.common.max_open_files.max(1); + let max_open = args.common.effective_max_open(); set_k(k); set_m(m); @@ -50,9 +50,14 @@ pub fn run(args: SuperkmerArgs) { let pipe = obipipeline::make_pipe! { PipelineData : PathWithSlot => Vec, - ||? { |pw: PathWithSlot| open_chunks(pw.path) } : Path => RawChunk, - |? { move |rope| obiread::normalize_sequence_chunk(rope, k) } : RawChunk => NormChunk, - | { move |rope| obiskbuilder::build_superkmers(rope, k, level_max, theta) } : NormChunk => Batch, + ||? { + let k = k; + move |pw: PathWithSlot| { + let path_str = pw.path.to_str().unwrap_or("").to_owned(); + obiread::open_nuc_stream(&path_str, k) + } + } : Path => NucPage, + | { move |page| obiskbuilder::build_superkmers_page(page, k, level_max, theta) } : NucPage => Batch, }; let mut out = BufWriter::new(io::stdout()); diff --git a/src/obikmer/src/steps/scatter.rs b/src/obikmer/src/steps/scatter.rs index 777e92b..80d789e 100644 --- a/src/obikmer/src/steps/scatter.rs +++ b/src/obikmer/src/steps/scatter.rs @@ -1,30 +1,37 @@ use std::path::PathBuf; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::time::{Duration, Instant}; use indicatif::{ProgressBar, ProgressStyle}; +use obiread::NucPage; use obikpartitionner::KmerPartition; -use obikrope::Rope; use obisys::{Reporter, Stage}; use tracing::info; -use crate::cli::{PipelineData, PathWithSlot, open_chunks, throttle_paths}; +use crate::cli::{PipelineData, PathWithSlot, throttle_paths}; // ── Iterator that keeps the slot guard alive until the file is exhausted ────── -struct GuardedIter { - inner: I, - _guard: Box, +struct GuardedIter { + inner: Box + Send>, + _guard: Box, + flat_active: Arc, } -impl> Iterator for GuardedIter { - type Item = Rope; - fn next(&mut self) -> Option { +impl Iterator for GuardedIter { + type Item = NucPage; + fn next(&mut self) -> Option { self.inner.next() } } +impl Drop for GuardedIter { + fn drop(&mut self) { + self.flat_active.fetch_sub(1, Ordering::Relaxed); + } +} + // ── scatter ─────────────────────────────────────────────────────────────────── /// Run scatter: normalise → build superkmers → route to partition → close. @@ -44,23 +51,36 @@ pub fn scatter( // Throttle in the source thread — never in a worker — to prevent deadlock. let throttled = throttle_paths(path_source, max_open); - let file_count = Arc::new(AtomicU64::new(0)); + let file_count = Arc::new(AtomicU64::new(0)); + let flat_active = Arc::new(AtomicU32::new(0)); + let transform_active = Arc::new(AtomicU32::new(0)); let t = Stage::start("scatter"); let pipe = obipipeline::make_pipe! { PipelineData : PathWithSlot => Vec, ||? { - let file_count = Arc::clone(&file_count); + let file_count = Arc::clone(&file_count); + let flat_active = Arc::clone(&flat_active); + let k = k; move |pw: PathWithSlot| { let PathWithSlot { path, _guard } = pw; let n = file_count.fetch_add(1, Ordering::Relaxed) + 1; info!("indexing [{}]: {}", n, path.display()); - // _guard travels into GuardedIter; released when all chunks are read - open_chunks(path).map(|iter| GuardedIter { inner: iter, _guard }) + let path_str = path.to_str().unwrap_or("").to_owned(); + flat_active.fetch_add(1, Ordering::Relaxed); + obiread::open_nuc_stream(&path_str, k) + .map(|iter| GuardedIter { inner: iter, _guard, flat_active: Arc::clone(&flat_active) }) } - } : Path => RawChunk, - |? { move |rope| obiread::normalize_sequence_chunk(rope, k) } : RawChunk => NormChunk, - | { move |rope| obiskbuilder::build_superkmers(rope, k, level_max, theta) } : NormChunk => Batch, + } : Path => NucPage, + | { + let transform_active = Arc::clone(&transform_active); + move |page| { + transform_active.fetch_add(1, Ordering::Relaxed); + let result = obiskbuilder::build_superkmers_page(page, k, level_max, theta); + transform_active.fetch_sub(1, Ordering::Relaxed); + result + } + } : NucPage => Batch, }; let pb = ProgressBar::new_spinner(); @@ -93,7 +113,9 @@ pub fn scatter( (format!("{:.0} Mbp", bp / 1e6), format!("{:.0} Mbp/s", ema_rate / 1e6)) }; let n_files = file_count.load(Ordering::Relaxed); - pb.set_message(format!("{count_str} {rate_str} {n_files} files")); + let r = flat_active.load(Ordering::Relaxed); + let c = transform_active.load(Ordering::Relaxed); + pb.set_message(format!("{count_str} {rate_str} {n_files} files [R:{r} C:{c}]")); } kp.write_batch(batch).unwrap_or_else(|e| { eprintln!("error: {e}"); diff --git a/src/obiread/src/lib.rs b/src/obiread/src/lib.rs index 94f0b8e..2370bb4 100644 --- a/src/obiread/src/lib.rs +++ b/src/obiread/src/lib.rs @@ -10,19 +10,21 @@ mod fasta; mod fastq; mod mimetype; pub mod normalize; +mod nucstream; mod path_iterator; pub mod peakreader; pub mod record; -pub mod stream; pub mod xopen; -pub use chunk::{SeqChunkIter, fasta_chunks, fastq_chunks, - read_fasta_chunks, read_fastq_chunks, read_sequence_chunks}; -pub use normalize::{normalize_fasta_chunk, normalize_fastq_chunk, normalize_sequence_chunk}; +pub use chunk::{ + SeqChunkIter, fasta_chunks, fastq_chunks, read_fasta_chunks, read_fastq_chunks, + read_sequence_chunks, +}; pub use mimetype::MimeTypeGuesser; +pub use normalize::{normalize_fasta_chunk, normalize_fastq_chunk, normalize_sequence_chunk}; +pub use nucstream::{NucPage, NucPageCursor, open_nuc_stream}; pub use path_iterator::{PathIter, path_iter}; pub use peakreader::PeekReader; -pub use stream::NormalizedByteStream; pub use xopen::xopen; /// Default read block size: 1 MiB. diff --git a/src/obiread/src/nucstream.rs b/src/obiread/src/nucstream.rs new file mode 100644 index 0000000..eb252d6 --- /dev/null +++ b/src/obiread/src/nucstream.rs @@ -0,0 +1,732 @@ +use std::io::{self, Read}; +use std::mem::ManuallyDrop; +use std::sync::{Arc, Mutex}; + +use crate::mimetype::MimeTypeGuesser; +use crate::xopen::open_raw; + +pub const MAX_K: usize = 31; +const PAGE_SIZE: usize = 65536; +// overlap (MAX_K - 1) + page data (PAGE_SIZE) + 1 byte for the end-of-page terminating 0 +const BUF_SIZE: usize = MAX_K + PAGE_SIZE; + +// ─── OverlapState ───────────────────────────────────────────────────────────── + +pub(crate) struct OverlapState { + data: [u8; MAX_K], + len: usize, + k: usize, +} + +impl OverlapState { + pub(crate) fn new(k: usize) -> Self { + assert!(k > 0 && k <= MAX_K); + Self { + data: [0u8; MAX_K], + len: 0, + k, + } + } +} + +// ─── NucParser trait ────────────────────────────────────────────────────────── + +// Transforms a raw page into a compacted nucleotide stream in-place. +// +// Buffer layout on each call: +// buf[0..overlap_len()] — overlap bytes copied by write_overlap() +// buf[overlap_len()..overlap_len()+n] — raw bytes just read from the source +// +// Returns the number of output bytes in buf[0..returned]. +pub(crate) trait NucParser { + // required: format-specific + fn new(k: usize) -> Self + where + Self: Sized; + fn overlap_state(&self) -> &OverlapState; + fn overlap_state_mut(&mut self) -> &mut OverlapState; + fn is_in_seq(&self) -> bool; + fn parse_inplace(&mut self, buf: &mut [u8], n: usize) -> usize; + + // provided: format-independent overlap management + fn overlap_len(&self) -> usize { + self.overlap_state().len + } + + fn write_overlap(&self, buf: &mut [u8]) { + let ol = &self.overlap_state(); + buf[..ol.len].copy_from_slice(&ol.data[..ol.len]); + } + + // Called at end of parse_inplace: saves overlap state and returns adjusted j. + // seq_start is the j-position where the last sequence started in this call's output. + fn save_overlap(&mut self, buf: &mut [u8], j: usize, seq_start: usize) -> usize { + if !self.is_in_seq() { + self.overlap_state_mut().len = 0; + return j; + } + let seq_len = j - seq_start; + let k = self.overlap_state().k; + if seq_len >= k { + // Sequence long enough: save last k-1 nucleotides, terminate with 0. + let ol = k - 1; + self.overlap_state_mut().data[..ol].copy_from_slice(&buf[j - ol..j]); + self.overlap_state_mut().len = ol; + // SAFETY: j <= total - 1 < BUF_SIZE = buf.len() + // (total = overlap_len + n <= (MAX_K-1) + PAGE_SIZE = BUF_SIZE - 1) + unsafe { + *buf.get_unchecked_mut(j) = 0; + } + j + 1 + } else if seq_len > 0 { + // Short sequence (< k): save whole fragment, strip from output. + self.overlap_state_mut().data[..seq_len].copy_from_slice(&buf[seq_start..j]); + self.overlap_state_mut().len = seq_len; + seq_start + } else { + self.overlap_state_mut().len = 0; + j + } + } +} + +// ─── FASTA parser ───────────────────────────────────────────────────────────── + +#[derive(Clone, Copy)] +enum FastaState { + OutSeq, + InTitle, + InSeq, + InAmbiguous, +} + +pub(crate) struct FastaParser { + state: FastaState, + overlap: OverlapState, +} + +impl NucParser for FastaParser { + fn new(k: usize) -> Self { + Self { + state: FastaState::OutSeq, + overlap: OverlapState::new(k), + } + } + + #[inline] + fn overlap_state(&self) -> &OverlapState { + &self.overlap + } + + #[inline] + fn overlap_state_mut(&mut self) -> &mut OverlapState { + &mut self.overlap + } + + #[inline] + fn is_in_seq(&self) -> bool { + matches!(self.state, FastaState::InSeq) + } + + fn parse_inplace(&mut self, buf: &mut [u8], n: usize) -> usize { + let total = self.overlap.len + n; + let mut i = 0; // read index + let mut j = 0; // write index (invariant: j <= i always) + // j-position where the current sequence started in this call's output; + // meaningful only when state is InSeq. + let mut seq_start: usize = 0; + + while i < total { + // SAFETY: i < total <= BUF_SIZE = buf.len() + let byte = unsafe { *buf.get_unchecked(i) }; + + match self.state { + FastaState::OutSeq => { + if byte == b'>' { + self.state = FastaState::InTitle; + } + i += 1; + } + FastaState::InTitle => { + if byte == b'\n' || byte == b'\r' { + self.state = FastaState::InSeq; + seq_start = j; + } + i += 1; + } + FastaState::InSeq => { + if byte == b'\n' || byte == b'\r' { + i += 1; + continue; + } + let nuc = byte & 0xDF; // to uppercase + if nuc == b'A' || nuc == b'C' || nuc == b'G' || nuc == b'T' { + // SAFETY: j <= i < total <= BUF_SIZE = buf.len() + unsafe { + *buf.get_unchecked_mut(j) = nuc; + } + j += 1; + i += 1; + } else if byte == b'>' { + if j > seq_start { + unsafe { + *buf.get_unchecked_mut(j) = 0; + } + j += 1; + } + self.state = FastaState::InTitle; + i += 1; + } else { + // first ambiguous base: end current sequence if non-empty + if j > seq_start { + unsafe { + *buf.get_unchecked_mut(j) = 0; + } + j += 1; + } + self.state = FastaState::InAmbiguous; + i += 1; + } + } + FastaState::InAmbiguous => { + if byte == b'\n' || byte == b'\r' { + i += 1; + continue; + } + if byte == b'>' { + self.state = FastaState::InTitle; + i += 1; + continue; + } + let nuc = byte & 0xDF; + if nuc == b'A' || nuc == b'C' || nuc == b'G' || nuc == b'T' { + seq_start = j; + // SAFETY: j <= i < total <= BUF_SIZE = buf.len() + unsafe { + *buf.get_unchecked_mut(j) = nuc; + } + j += 1; + self.state = FastaState::InSeq; + } + i += 1; + } + } + } + + self.save_overlap(buf, j, seq_start) + } +} + +// ─── FASTQ parser ───────────────────────────────────────────────────────────── + +#[derive(Clone, Copy)] +enum FastqState { + OutSeq, + InTitle, + InSeq, + InAmbiguous, + InQualTitle, + InQual, +} + +pub(crate) struct FastqParser { + state: FastqState, + overlap: OverlapState, +} + +impl NucParser for FastqParser { + fn new(k: usize) -> Self { + Self { + state: FastqState::OutSeq, + overlap: OverlapState::new(k), + } + } + + #[inline] + fn overlap_state(&self) -> &OverlapState { + &self.overlap + } + + #[inline] + fn overlap_state_mut(&mut self) -> &mut OverlapState { + &mut self.overlap + } + + #[inline] + fn is_in_seq(&self) -> bool { + matches!(self.state, FastqState::InSeq) + } + + fn parse_inplace(&mut self, buf: &mut [u8], n: usize) -> usize { + let total = self.overlap.len + n; + let mut i = 0; + let mut j = 0; + let mut seq_start: usize = 0; + + while i < total { + // SAFETY: i < total <= BUF_SIZE = buf.len() + let byte = unsafe { *buf.get_unchecked(i) }; + + match self.state { + FastqState::OutSeq => { + if byte == b'@' { + self.state = FastqState::InTitle; + } + i += 1; + } + FastqState::InTitle => { + if byte == b'\n' || byte == b'\r' { + self.state = FastqState::InSeq; + seq_start = j; + } + i += 1; + } + FastqState::InSeq => { + if byte == b'\n' || byte == b'\r' { + if j > seq_start { + unsafe { + *buf.get_unchecked_mut(j) = 0; + } + j += 1; + } + self.state = FastqState::InQualTitle; + i += 1; + continue; + } + let nuc = byte & 0xDF; + if nuc == b'A' || nuc == b'C' || nuc == b'G' || nuc == b'T' { + // SAFETY: j <= i < total <= BUF_SIZE = buf.len() + unsafe { + *buf.get_unchecked_mut(j) = nuc; + } + j += 1; + } else { + if j > seq_start { + unsafe { + *buf.get_unchecked_mut(j) = 0; + } + j += 1; + } + self.state = FastqState::InAmbiguous; + } + i += 1; + } + FastqState::InAmbiguous => { + if byte == b'\n' || byte == b'\r' { + self.state = FastqState::InQualTitle; + i += 1; + continue; + } + let nuc = byte & 0xDF; + if nuc == b'A' || nuc == b'C' || nuc == b'G' || nuc == b'T' { + seq_start = j; + // SAFETY: j <= i < total <= BUF_SIZE = buf.len() + unsafe { + *buf.get_unchecked_mut(j) = nuc; + } + j += 1; + self.state = FastqState::InSeq; + } + i += 1; + } + FastqState::InQualTitle => { + if byte == b'\n' || byte == b'\r' { + self.state = FastqState::InQual; + } + i += 1; + } + FastqState::InQual => { + if byte == b'\n' || byte == b'\r' { + self.state = FastqState::OutSeq; + } + i += 1; + } + } + } + + self.save_overlap(buf, j, seq_start) + } +} + +// ─── GenBank parser ─────────────────────────────────────────────────────────── + +const ORIGIN_TAIL: &[u8] = b"RIGIN"; + +#[derive(Clone, Copy)] +enum GenbankState { + OutSeq, + MatchOrigin, + SkipOriginLine, + InSeq, + InSlash, + InAmbiguous, +} + +pub(crate) struct GenbankParser { + state: GenbankState, + overlap: OverlapState, + keyword_pos: usize, + at_line_start: bool, +} + +impl NucParser for GenbankParser { + fn new(k: usize) -> Self { + Self { + state: GenbankState::OutSeq, + overlap: OverlapState::new(k), + keyword_pos: 0, + at_line_start: true, + } + } + + #[inline] + fn overlap_state(&self) -> &OverlapState { + &self.overlap + } + + #[inline] + fn overlap_state_mut(&mut self) -> &mut OverlapState { + &mut self.overlap + } + + #[inline] + fn is_in_seq(&self) -> bool { + matches!(self.state, GenbankState::InSeq) + } + + fn parse_inplace(&mut self, buf: &mut [u8], n: usize) -> usize { + let total = self.overlap.len + n; + let mut i = 0; + let mut j = 0; + let mut seq_start: usize = 0; + + while i < total { + // SAFETY: i < total <= BUF_SIZE = buf.len() + let byte = unsafe { *buf.get_unchecked(i) }; + + match self.state { + GenbankState::OutSeq => { + if byte == b'\n' || byte == b'\r' { + self.at_line_start = true; + } else if self.at_line_start && byte == b'O' { + self.state = GenbankState::MatchOrigin; + self.keyword_pos = 1; + self.at_line_start = false; + } else { + self.at_line_start = false; + } + i += 1; + } + GenbankState::MatchOrigin => { + if byte == b'\n' || byte == b'\r' { + self.state = GenbankState::OutSeq; + self.at_line_start = true; + } else if byte == ORIGIN_TAIL[self.keyword_pos - 1] { + self.keyword_pos += 1; + if self.keyword_pos == 6 { + self.state = GenbankState::SkipOriginLine; + } + } else { + self.state = GenbankState::OutSeq; + self.at_line_start = false; + } + i += 1; + } + GenbankState::SkipOriginLine => { + if byte == b'\n' || byte == b'\r' { + self.state = GenbankState::InSeq; + seq_start = j; + } + i += 1; + } + GenbankState::InSeq => { + if byte == b'\n' || byte == b'\r' { + self.at_line_start = true; + i += 1; + continue; + } + if self.at_line_start && byte == b'/' { + self.state = GenbankState::InSlash; + self.at_line_start = false; + i += 1; + continue; + } + self.at_line_start = false; + let nuc = byte & 0xDF; + if nuc == b'A' || nuc == b'C' || nuc == b'G' || nuc == b'T' { + // SAFETY: j <= i < total <= BUF_SIZE = buf.len() + unsafe { + *buf.get_unchecked_mut(j) = nuc; + } + j += 1; + } else if byte.is_ascii_digit() || byte == b' ' { + // position numbers and spacing between groups: skip + } else { + // ambiguous base: end current sequence if non-empty + if j > seq_start { + unsafe { + *buf.get_unchecked_mut(j) = 0; + } + j += 1; + } + self.state = GenbankState::InAmbiguous; + } + i += 1; + } + GenbankState::InSlash => { + if byte == b'/' { + // confirmed "//": end of sequence record + if j > seq_start { + unsafe { + *buf.get_unchecked_mut(j) = 0; + } + j += 1; + } + self.state = GenbankState::OutSeq; + self.at_line_start = false; + } else if byte == b'\n' || byte == b'\r' { + // single '/' line: back to sequence + self.state = GenbankState::InSeq; + self.at_line_start = true; + } else { + // false positive: single '/' mid-line, resume sequence + self.state = GenbankState::InSeq; + self.at_line_start = false; + } + i += 1; + } + GenbankState::InAmbiguous => { + if byte == b'\n' || byte == b'\r' { + self.at_line_start = true; + i += 1; + continue; + } + if self.at_line_start && byte == b'/' { + self.state = GenbankState::InSlash; + self.at_line_start = false; + i += 1; + continue; + } + self.at_line_start = false; + let nuc = byte & 0xDF; + if nuc == b'A' || nuc == b'C' || nuc == b'G' || nuc == b'T' { + seq_start = j; + // SAFETY: j <= i < total <= BUF_SIZE = buf.len() + unsafe { + *buf.get_unchecked_mut(j) = nuc; + } + j += 1; + self.state = GenbankState::InSeq; + } + // digits, spaces, other ambiguous codes: skip + i += 1; + } + } + } + + self.save_overlap(buf, j, seq_start) + } +} + +// ─── NucPage ────────────────────────────────────────────────────────────────── + +/// Owned page of compacted nucleotides: uppercase A/C/G/T bytes separated by `0` +/// at sequence boundaries. Automatically returns its buffer to the pool on drop. +pub struct NucPage { + data: ManuallyDrop>, + len: usize, + pool: Arc>>>, +} + +impl std::ops::Deref for NucPage { + type Target = [u8]; + fn deref(&self) -> &[u8] { + &self.data[..self.len] + } +} + +impl Drop for NucPage { + fn drop(&mut self) { + // SAFETY: data is never accessed after this point + let buf = unsafe { ManuallyDrop::take(&mut self.data) }; + self.pool.lock().unwrap().push(buf); + } +} + +// ─── NucPageCursor ──────────────────────────────────────────────────────────── + +/// A forward cursor over the normalised bytes of a [`NucPage`]. +/// +/// Provides the `next_byte` / `rewind` interface consumed by +/// [`obiskbuilder::SuperKmerStreamIter`]. +pub struct NucPageCursor<'a> { + data: &'a [u8], + pos: usize, +} + +impl NucPageCursor<'_> { + /// Returns the next byte in the page, or `None` at end. + #[inline] + pub fn next_byte(&mut self) -> Option { + if self.pos < self.data.len() { + let b = self.data[self.pos]; + self.pos += 1; + Some(b) + } else { + None + } + } + + /// Steps the cursor back by `n` bytes. + /// + /// The caller guarantees that the last `n` bytes were all `ACGT` + /// (no `0x00` separators), so they are still in the page buffer. + #[inline] + pub fn rewind(&mut self, n: usize) { + self.pos -= n; + } + + /// Total number of bytes in the underlying page. + #[inline] + pub fn len(&self) -> usize { + self.data.len() + } + + /// Returns `true` if the page contains no bytes. + #[inline] + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } +} + +impl NucPage { + /// Creates a forward cursor positioned at the start of this page. + pub fn cursor(&self) -> NucPageCursor<'_> { + NucPageCursor { data: self, pos: 0 } + } +} + +// ─── NucStream ──────────────────────────────────────────────────────────────── + +pub(crate) struct NucStream { + reader: R, + parser: P, + pool: Arc>>>, + eof: bool, +} + +impl NucStream { + pub(crate) fn new(reader: R, k: usize) -> Self { + Self { + reader, + parser: P::new(k), + pool: Arc::new(Mutex::new(Vec::new())), + eof: false, + } + } + + pub(crate) fn read_page(&mut self) -> Option { + loop { + if self.eof { + return None; + } + // take a buffer from the pool, or allocate fresh if all are in-flight + let mut buf = self + .pool + .lock() + .unwrap() + .pop() + .unwrap_or_else(|| vec![0u8; BUF_SIZE]); + + let ol = self.parser.overlap_len(); + self.parser.write_overlap(&mut buf[..ol]); + let n = self.reader.read(&mut buf[ol..ol + PAGE_SIZE]).unwrap_or(0); + if n == 0 { + self.eof = true; + if ol == 0 { + self.pool.lock().unwrap().push(buf); + return None; + } + } + let out_len = self.parser.parse_inplace(&mut buf, n); + if out_len > 0 { + return Some(NucPage { + data: ManuallyDrop::new(buf), + len: out_len, + pool: Arc::clone(&self.pool), + }); + } + // empty page (all headers/ambiguous): return buf to pool and loop + self.pool.lock().unwrap().push(buf); + } + } +} + +impl Iterator for NucStream { + type Item = NucPage; + fn next(&mut self) -> Option { + self.read_page() + } +} + +// ─── FastaNucStream ─────────────────────────────────────────────────────────── + +pub(crate) type FastaNucStream = NucStream; +pub(crate) type FastqNucStream = NucStream; +pub(crate) type GenbankNucStream = NucStream; + +// ─── AnyNucStream ───────────────────────────────────────────────────────────── + +pub(crate) enum AnyNucStream { + Fasta(FastaNucStream), + Fastq(FastqNucStream), + Genbank(GenbankNucStream), +} + +impl Iterator for AnyNucStream { + type Item = NucPage; + fn next(&mut self) -> Option { + match self { + AnyNucStream::Fasta(s) => s.next(), + AnyNucStream::Fastq(s) => s.next(), + AnyNucStream::Genbank(s) => s.next(), + } + } +} + +fn dispatch( + mut guesser: MimeTypeGuesser, + k: usize, +) -> Option>> { + match guesser.mime_type() { + Some("text/fasta") => Some(AnyNucStream::Fasta(NucStream::new(guesser, k))), + Some("text/fastq") => Some(AnyNucStream::Fastq(NucStream::new(guesser, k))), + Some("text/gbff") => Some(AnyNucStream::Genbank(NucStream::new(guesser, k))), + _ => None, + } +} + +/// Wraps an already-open reader in a nucleotide stream, detecting its format. +/// Returns `None` if the format is not recognised. +pub(crate) fn nuc_stream( + reader: R, + k: usize, +) -> Option>> { + dispatch(MimeTypeGuesser::new(reader), k) +} + +/// Opens a nucleotide stream from any source (file path, URL, or `-` for stdin), +/// with transparent decompression and automatic format detection. +/// +/// # Errors +/// Returns an `io::Error` if the source cannot be opened, decompression fails, +/// or the format is not recognised. +pub fn open_nuc_stream( + source: &str, + k: usize, +) -> io::Result + Send>> { + let reader = open_raw(source)?; + dispatch(MimeTypeGuesser::new(reader), k) + .map(|s| Box::new(s) as Box + Send>) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "unknown sequence format")) +} diff --git a/src/obiread/src/stream.rs b/src/obiread/src/stream.rs deleted file mode 100644 index e797ee2..0000000 --- a/src/obiread/src/stream.rs +++ /dev/null @@ -1,507 +0,0 @@ -//! Streaming byte normaliser for FASTA, FASTQ, and GBFF sequence files. -//! -//! [`NormalizedByteStream`] wraps any `Read` source in a 64 KiB circular ring -//! buffer and emits a normalised byte stream: -//! -//! - Uppercase `ACGT` bytes for in-sequence positions. -//! - `0x00` as a separator between independent sequence segments (record -//! boundary, non-ACGT base, or end of the GBFF `//` record). -//! - `None` at true EOF. -//! -//! Unlike the Rope-based pipeline, the stream never accumulates a full -//! record: it is safe for chromosomes of arbitrary size (e.g. GBFF files -//! with 250 MiB ORIGIN sections). -//! -//! # Rewind invariant -//! -//! `rewind(n)` subtracts `n` from the read head. It is only called by -//! [`obiskbuilder::SuperKmerStreamIter`] when the last `n` bytes were all -//! ACGT (no format overhead in between), so the ring buffer always contains -//! those bytes at positions `[head-n .. head]`. The maximum rewind is -//! `k ≤ 31`, well within the 32 KiB kept below the refill watermark. - -use std::io::{self, Read}; - -// ── ring-buffer constants ───────────────────────────────────────────────────── - -const CAP: usize = 65536; // must be a power of two -const MASK: usize = CAP - 1; -const REFILL_THRESHOLD: usize = 32768; // refill when available drops below this -const REFILL_CHUNK: usize = 32768; // target bytes per refill call - -// ── format detection ────────────────────────────────────────────────────────── - -#[derive(Clone, Copy, PartialEq, Eq)] -enum Format { Fasta, Fastq, Gbff } - -fn detect_format(buf: &[u8]) -> Format { - if buf.starts_with(b"LOCUS ") { - Format::Gbff - } else if buf.first() == Some(&b'>') { - Format::Fasta - } else { - Format::Fastq - } -} - -// ── format state machines ───────────────────────────────────────────────────── - -#[derive(Clone, Copy)] -enum FaState { - OutSeq, // reading header / between records - InSeq, // reading sequence bases -} - -#[derive(Clone, Copy)] -enum FqState { - Header, // reading @id line - Seq, // reading sequence bases - Plus, // reading +[id] separator line - Qual(usize), // skipping quality bytes; payload = remaining count -} - -const ORIGIN: &[u8] = b"ORIGIN"; - -#[derive(Clone, Copy)] -enum GbState { - Pre(u8), // pre-ORIGIN: n chars of "ORIGIN" matched at current line start - PreSkip, // pre-ORIGIN: skip rest of non-ORIGIN line then → Pre(0) - LineStart, // ORIGIN data: at column 0 of a numbered data line - Num, // ORIGIN data: skipping spaces + line-number digits - Seq, // ORIGIN data: reading lowercase sequence bases - Slash, // ORIGIN data: saw the first '/' of a potential '//' end marker -} - -#[derive(Clone, Copy)] -enum State { - Fa(FaState), - Fq(FqState), - Gb(GbState), -} - -// ── NormalizedByteStream ────────────────────────────────────────────────────── - -/// A streaming normaliser over any `Read` source. -/// -/// Call [`next_byte`](NormalizedByteStream::next_byte) to consume one normalised -/// byte at a time. Call [`rewind`](NormalizedByteStream::rewind) to step back -/// up to 31 bytes (used by the superkmer builder when a minimizer or length -/// boundary is crossed). -pub struct NormalizedByteStream { - reader: R, - buf: Box<[u8; CAP]>, - head: usize, // absolute read position; index = head & MASK - write: usize, // absolute write position; index = write & MASK - eof: bool, - state: State, - seq_len: usize, // FASTQ only: non-newline chars seen in current sequence line -} - -impl NormalizedByteStream { - /// Wrap `reader` and detect its format from the first bytes. - pub fn new(mut reader: R) -> io::Result { - let mut buf = Box::new([0u8; CAP]); - - // Initial read — enough to detect the format. - let n = reader.read(&mut buf[..REFILL_CHUNK])?; - let fmt = detect_format(&buf[..n]); - let state = match fmt { - Format::Fasta => State::Fa(FaState::OutSeq), - Format::Fastq => State::Fq(FqState::Header), - Format::Gbff => State::Gb(GbState::Pre(0)), - }; - - Ok(Self { - reader, - buf, - head: 0, - write: n, - eof: n == 0, - state, - seq_len: 0, - }) - } - - /// Step the read head back by `n` bytes. - /// - /// # Safety - /// The caller guarantees that the last `n` bytes returned by - /// [`next_byte`] were all uppercase ACGT (no `0x00` separators), so those - /// bytes are still present in the ring buffer. - #[inline] - pub fn rewind(&mut self, n: usize) { - self.head -= n; - } - - #[inline] - fn available(&self) -> usize { - self.write - self.head - } - - fn try_refill(&mut self) -> io::Result<()> { - if self.eof || self.available() >= REFILL_THRESHOLD { - return Ok(()); - } - - // First segment: write_idx → end of buffer (contiguous, no wrap). - let write_idx = self.write & MASK; - let free = CAP - self.available(); - let to_end = CAP - write_idx; - let chunk1 = free.min(to_end).min(REFILL_CHUNK); - - if chunk1 > 0 { - let n = self.reader.read(&mut self.buf[write_idx..write_idx + chunk1])?; - self.write += n; - if n == 0 { - self.eof = true; - return Ok(()); - } - } - - // Second segment: if still below threshold, read into the wrapped region. - if self.available() < REFILL_THRESHOLD && !self.eof { - let write_idx2 = self.write & MASK; - let free2 = CAP - self.available(); - let to_end2 = CAP - write_idx2; - let chunk2 = free2.min(to_end2).min(REFILL_CHUNK); - if chunk2 > 0 { - let n = self.reader.read(&mut self.buf[write_idx2..write_idx2 + chunk2])?; - self.write += n; - if n == 0 { - self.eof = true; - } - } - } - - Ok(()) - } - - /// Return the next normalised byte, or `None` at EOF. - /// - /// Emits uppercase `A`, `C`, `G`, `T`, or `0x00` (segment separator). - pub fn next_byte(&mut self) -> Option { - loop { - // Proactive refill: keep ≥ REFILL_THRESHOLD bytes in the buffer - // so rewinds are always safe. - if !self.eof && self.available() < REFILL_THRESHOLD { - let _ = self.try_refill(); - } - - if self.available() == 0 { - return None; - } - - let b = self.buf[self.head & MASK]; - self.head += 1; - - // Copy state to avoid borrow conflicts while mutating self.state. - let state = self.state; - if let Some(out) = self.process(state, b) { - return Some(out); - } - } - } - - // ── per-format byte processors ──────────────────────────────────────────── - - #[inline] - fn process(&mut self, state: State, b: u8) -> Option { - match state { - State::Fa(fa) => self.fasta(fa, b), - State::Fq(fq) => self.fastq(fq, b), - State::Gb(gb) => self.gbff(gb, b), - } - } - - fn fasta(&mut self, fa: FaState, b: u8) -> Option { - match fa { - FaState::OutSeq => { - if b == b'\n' { - self.state = State::Fa(FaState::InSeq); - } - None // skip header bytes - } - FaState::InSeq => { - if b == b'>' { - self.state = State::Fa(FaState::OutSeq); - Some(0x00) // record boundary - } else if b == b'\n' || b == b'\r' { - None - } else { - Some(normalize_nuc(b)) - } - } - } - } - - fn fastq(&mut self, fq: FqState, b: u8) -> Option { - match fq { - FqState::Header => { - if b == b'\n' { - self.state = State::Fq(FqState::Seq); - self.seq_len = 0; - } - None - } - FqState::Seq => { - if b == b'\n' { - self.state = State::Fq(FqState::Plus); - None - } else if b == b'\r' { - None - } else { - self.seq_len += 1; // count all chars, including non-ACGT - Some(normalize_nuc(b)) - } - } - FqState::Plus => { - if b == b'\n' { - self.state = State::Fq(FqState::Qual(self.seq_len)); - } - None - } - FqState::Qual(rem) => { - if b == b'\n' { - if rem == 0 { - // Quality line ended after all bases consumed → new record. - self.state = State::Fq(FqState::Header); - Some(0x00) // record boundary - } else { - // Newline inside multi-line quality — keep counting. - None - } - } else if b == b'\r' { - None - } else if rem > 0 { - self.state = State::Fq(FqState::Qual(rem - 1)); - None - } else { - // rem == 0 but non-newline: shouldn't happen in valid FASTQ. - None - } - } - } - } - - fn gbff(&mut self, gb: GbState, b: u8) -> Option { - match gb { - GbState::Pre(n) => { - let n = n as usize; - if n < 6 { - if b == ORIGIN[n] { - self.state = State::Gb(GbState::Pre((n + 1) as u8)); - } else if b == b'\n' { - self.state = State::Gb(GbState::Pre(0)); - } else { - self.state = State::Gb(GbState::PreSkip); - } - } else { - // All 6 chars of "ORIGIN" matched; skip the rest of the header line. - if b == b'\n' { - self.state = State::Gb(GbState::LineStart); - } - // Non-'\n': stay in Pre(6) implicitly (state unchanged). - } - None - } - GbState::PreSkip => { - if b == b'\n' { - self.state = State::Gb(GbState::Pre(0)); - } - None - } - GbState::LineStart => { - if b == b'/' { - self.state = State::Gb(GbState::Slash); - } else if b != b'\n' { - // Space or digit: start of a numbered sequence line. - self.state = State::Gb(GbState::Num); - } - None - } - GbState::Num => { - if b == b'\n' { - self.state = State::Gb(GbState::LineStart); - None - } else if b.is_ascii_digit() || b == b' ' { - None // still in the number prefix - } else { - // First letter: transition to Seq and process this byte. - self.state = State::Gb(GbState::Seq); - Some(normalize_nuc(b)) - } - } - GbState::Seq => { - if b == b'\n' { - self.state = State::Gb(GbState::LineStart); - None - } else if b == b' ' { - None // inter-group space - } else { - Some(normalize_nuc(b)) - } - } - GbState::Slash => { - if b == b'/' { - // End of GBFF record ("//"). - self.state = State::Gb(GbState::Pre(0)); // ready for next record - Some(0x00) - } else { - // Stray '/' — shouldn't happen in valid GBFF; recover. - self.state = State::Gb(GbState::LineStart); - None - } - } - } - } -} - -// ── nucleotide normalisation ────────────────────────────────────────────────── - -#[inline] -fn normalize_nuc(b: u8) -> u8 { - match b | 0x20 { - b'a' => b'A', - b'c' => b'C', - b'g' => b'G', - b't' => b'T', - _ => 0x00, - } -} - -// ── tests ───────────────────────────────────────────────────────────────────── - -#[cfg(test)] -mod tests { - use super::*; - - fn stream(data: &[u8]) -> Vec { - let mut s = NormalizedByteStream::new(data).unwrap(); - let mut out = Vec::new(); - while let Some(b) = s.next_byte() { - out.push(b); - } - out - } - - // ── FASTA ───────────────────────────────────────────────────────────────── - - #[test] - fn fasta_single_record() { - assert_eq!(stream(b">s1\nACGTACGT\n"), b"ACGTACGT"); - } - - #[test] - fn fasta_two_records_separated() { - let out = stream(b">s1\nACGT\n>s2\nTTTT\n"); - assert_eq!(out, b"ACGT\x00TTTT"); - } - - #[test] - fn fasta_multiline_concatenated() { - assert_eq!(stream(b">s1\nACGT\nACGT\n"), b"ACGTACGT"); - } - - #[test] - fn fasta_lowercase_uppercased() { - assert_eq!(stream(b">s1\nacgt\n"), b"ACGT"); - } - - #[test] - fn fasta_non_acgt_becomes_null() { - let out = stream(b">s1\nACGTNACGT\n"); - assert_eq!(out, b"ACGT\x00ACGT"); - } - - // ── FASTQ ───────────────────────────────────────────────────────────────── - - fn make_fastq(records: &[&[u8]]) -> Vec { - let mut buf = Vec::new(); - for seq in records { - buf.extend_from_slice(b"@hdr\n"); - buf.extend_from_slice(seq); - buf.push(b'\n'); - buf.extend_from_slice(b"+\n"); - buf.extend_from_slice(&vec![b'I'; seq.len()]); - buf.push(b'\n'); - } - buf - } - - #[test] - fn fastq_single_record() { - // Trailing \x00 is emitted on the newline that ends the quality line. - assert_eq!(stream(&make_fastq(&[b"ACGTACGT"])), b"ACGTACGT\x00"); - } - - #[test] - fn fastq_two_records_separated() { - let out = stream(&make_fastq(&[b"ACGT", b"TTTT"])); - assert_eq!(out, b"ACGT\x00TTTT\x00"); - } - - #[test] - fn fastq_lowercase_uppercased() { - assert_eq!(stream(&make_fastq(&[b"acgt"])), b"ACGT\x00"); - } - - #[test] - fn fastq_non_acgt_becomes_null() { - let out = stream(&make_fastq(&[b"ACGTNACGT"])); - assert_eq!(out, b"ACGT\x00ACGT\x00"); - } - - // ── GBFF ────────────────────────────────────────────────────────────────── - - fn make_gbff(seqs: &[&[u8]]) -> Vec { - let mut buf = Vec::new(); - for seq in seqs { - buf.extend_from_slice(b"LOCUS NC_000001\nFEATURES\nORIGIN\n"); - // write numbered lines of 60 bases each - let mut pos = 1usize; - for chunk in seq.chunks(60) { - let groups: Vec<&[u8]> = chunk.chunks(10).collect(); - let line = groups.join(&b' '); - buf.extend_from_slice(format!("{:9} ", pos).as_bytes()); - buf.extend_from_slice(&line); - buf.push(b'\n'); - pos += chunk.len(); - } - buf.extend_from_slice(b"//\n"); - } - buf - } - - #[test] - fn gbff_single_record() { - let seq = b"acgtacgtacgt"; - let out = stream(&make_gbff(&[seq])); - assert_eq!(out, b"ACGTACGTACGT\x00"); - } - - #[test] - fn gbff_two_records_separated() { - let out = stream(&make_gbff(&[b"acgtacgt", b"tttttttt"])); - assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00"); - } - - #[test] - fn gbff_non_acgt_becomes_null() { - let out = stream(&make_gbff(&[b"acgtnacgt"])); - assert_eq!(out, b"ACGT\x00ACGT\x00"); - } - - // ── rewind ──────────────────────────────────────────────────────────────── - - #[test] - fn rewind_replays_bytes() { - let mut s = NormalizedByteStream::new(b">s\nACGT\n" as &[u8]).unwrap(); - assert_eq!(s.next_byte(), Some(b'A')); - assert_eq!(s.next_byte(), Some(b'C')); - s.rewind(1); - assert_eq!(s.next_byte(), Some(b'C')); - assert_eq!(s.next_byte(), Some(b'G')); - assert_eq!(s.next_byte(), Some(b'T')); - assert_eq!(s.next_byte(), None); - } -} diff --git a/src/obiread/src/xopen.rs b/src/obiread/src/xopen.rs index f5dbbf0..f86585c 100644 --- a/src/obiread/src/xopen.rs +++ b/src/obiread/src/xopen.rs @@ -20,13 +20,13 @@ use std::io::{self, Read}; /// Open any source for reading, with transparent decompression. /// -/// Returns a `Box` that yields uncompressed bytes regardless of -/// whether the underlying source is plain text, gzip, bzip2, xz or zstd. +/// Returns a `Box` that yields uncompressed bytes regardless +/// of whether the underlying source is plain text, gzip, bzip2, xz or zstd. /// /// # Errors /// Returns an `io::Error` if the file cannot be opened, the URL cannot be /// fetched, or the compression header is malformed. -pub fn xopen(source: &str) -> io::Result>> { +pub(crate) fn open_raw(source: &str) -> io::Result> { let raw: Box = match source { "-" => Box::new(io::stdin()), s if s.starts_with("http://") || s.starts_with("https://") => http_reader(s)?, @@ -35,8 +35,15 @@ pub fn xopen(source: &str) -> io::Result>> Box::new(File::open(expanded.as_ref())?) } }; - let decompressed = decompress(raw)?; - Ok(MimeTypeGuesser::new(decompressed)) + decompress(raw) +} + +/// Open any source for reading, with transparent decompression and MIME detection. +/// +/// Wraps [`open_raw`] in a [`MimeTypeGuesser`] so callers can inspect the +/// format before consuming the stream. +pub fn xopen(source: &str) -> io::Result>> { + Ok(MimeTypeGuesser::new(open_raw(source)?)) } // ── internal helpers ────────────────────────────────────────────────────────── diff --git a/src/obiskbuilder/src/lib.rs b/src/obiskbuilder/src/lib.rs index a5409e1..6dc2de1 100644 --- a/src/obiskbuilder/src/lib.rs +++ b/src/obiskbuilder/src/lib.rs @@ -17,9 +17,21 @@ pub use iter::SuperKmerIter; pub use scratch::SuperKmerScratch; pub use stream_iter::SuperKmerStreamIter; +use obiread::NucPage; use obikrope::Rope; use obikseq::RoutableSuperKmer; +/// Collect all super-kmers from a normalised [`NucPage`]. +pub fn build_superkmers_page( + page: NucPage, + k: usize, + level_max: usize, + theta: f64, +) -> Vec { + let cursor = page.cursor(); + SuperKmerStreamIter::new(cursor, k, level_max, theta).collect() +} + /// Collect all super-kmers from a normalised rope chunk. pub fn build_superkmers( rope: Rope, diff --git a/src/obiskbuilder/src/stream_iter.rs b/src/obiskbuilder/src/stream_iter.rs index 25f41c5..dabb72b 100644 --- a/src/obiskbuilder/src/stream_iter.rs +++ b/src/obiskbuilder/src/stream_iter.rs @@ -1,35 +1,23 @@ -//! Streaming superkmer iterator that does not require a fully-buffered record. -//! -//! [`SuperKmerStreamIter`] wraps a [`NormalizedByteStream`] and yields -//! [`RoutableSuperKmer`] values one by one, exactly as [`SuperKmerIter`] does -//! over a [`Rope`], but without accumulating the whole input in memory first. -//! -//! This makes it suitable for large GBFF chromosomes (250 MiB ORIGIN sections) -//! or any other source where buffering the full record would exhaust memory. -//! -//! The cut conditions and superkmer semantics are identical to [`SuperKmerIter`]: -//! -//! | Condition | stream rewind | -//! |------------------------|---------------| -//! | entropy(kmer) ≤ θ | k−1 | -//! | minimizer changed | k | -//! | super-kmer length = 256| k | -//! -//! [`SuperKmerIter`]: crate::iter::SuperKmerIter -//! [`Rope`]: obikrope::Rope +//! Streaming superkmer iterator over a [`NucPageCursor`]. -use std::io::Read; - -use obiread::stream::NormalizedByteStream; +use obiread::NucPageCursor; use obikseq::RoutableSuperKmer; use obikseq::kmer::Minimizer; use crate::rolling_stat::RollingStat; use crate::scratch::SuperKmerScratch; -/// Streaming iterator over [`RoutableSuperKmer`] values. -pub struct SuperKmerStreamIter { - stream: NormalizedByteStream, +/// Streaming iterator over [`RoutableSuperKmer`] values from a [`NucPageCursor`]. +/// +/// Cut conditions (checked in order per nucleotide, once the k-mer window is full): +/// +/// | Condition | cursor rewind | +/// |------------------------|---------------| +/// | entropy(kmer) ≤ θ | k−1 | +/// | minimizer changed | k | +/// | super-kmer length = 256| k | +pub struct SuperKmerStreamIter<'a> { + cursor: NucPageCursor<'a>, k: usize, theta: f64, scratch: SuperKmerScratch, @@ -38,23 +26,18 @@ pub struct SuperKmerStreamIter { prev_min_pos: usize, } -impl SuperKmerStreamIter { - /// Build a streaming superkmer iterator from any `Read` source. - /// - /// - `reader`: raw bytes (FASTA, FASTQ, or GBFF; format auto-detected) - /// - `k`: k-mer size (must be odd, 11 ≤ k ≤ 31) - /// - `level_max`: maximum sub-word size for entropy (1–6) - /// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected - pub fn new(reader: R, k: usize, level_max: usize, theta: f64) -> std::io::Result { - Ok(Self { - stream: NormalizedByteStream::new(reader)?, +impl<'a> SuperKmerStreamIter<'a> { + /// Build an iterator from a [`NucPageCursor`] over normalised sequence data. + pub fn new(cursor: NucPageCursor<'a>, k: usize, level_max: usize, theta: f64) -> Self { + Self { + cursor, k, theta, scratch: SuperKmerScratch::new(), stat: RollingStat::new(level_max), prev_min: None, prev_min_pos: 0, - }) + } } fn reset(&mut self) { @@ -73,12 +56,12 @@ impl SuperKmerStreamIter { } } -impl Iterator for SuperKmerStreamIter { +impl Iterator for SuperKmerStreamIter<'_> { type Item = RoutableSuperKmer; fn next(&mut self) -> Option { loop { - let byte = match self.stream.next_byte() { + let byte = match self.cursor.next_byte() { None => { return self.try_emit(); } @@ -103,7 +86,7 @@ impl Iterator for SuperKmerStreamIter { // ── 1. Entropy check ───────────────────────────────────────────── if self.stat.normalized_entropy().unwrap_or(1.0) < self.theta { let result = self.try_emit(); - self.stream.rewind(self.k - 1); + self.cursor.rewind(self.k - 1); self.reset(); if result.is_some() { return result; @@ -114,11 +97,11 @@ impl Iterator for SuperKmerStreamIter { let min = self.stat.canonical_minimizer().unwrap(); let min_pos = self.stat.minimizer_position().unwrap_or(0); - // ── 2. Minimizer change check ───────────────────────────────────── + // ── 2. Minimizer change ─────────────────────────────────────────── if let Some(prev) = self.prev_min { if min != prev { let result = self.try_emit(); - self.stream.rewind(self.k); + self.cursor.rewind(self.k); self.reset(); if result.is_some() { return result; @@ -127,10 +110,10 @@ impl Iterator for SuperKmerStreamIter { } } - // ── 3. Super-kmer length check ──────────────────────────────────── + // ── 3. Super-kmer length cap ────────────────────────────────────── if self.scratch.len() == 256 { let result = self.try_emit(); - self.stream.rewind(self.k); + self.cursor.rewind(self.k); self.reset(); if result.is_some() { return result; -- 2.52.0 From eaa52eaab55af0a9f871b33c5ed5f8e63904cb7f Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 28 May 2026 22:45:47 +0200 Subject: [PATCH 4/6] feat: introduce nucstream abstraction and comprehensive test suite Introduces a unified NucStream abstraction with NucPageCursor for byte-offset tracking and MIME-type dispatch to instantiate format-specific parsers. Exposes nuc_stream and open_nuc_stream APIs that return boxed, Send-compatible iterators. Additionally, adds a comprehensive test suite covering chunk boundary alignment, FASTA/FASTQ record parsing, sequence normalization, and edge cases such as CRLF line endings, @ in quality strings, and multi-slice rope processing. --- src/obiread/src/chunk.rs | 110 +----------- src/obiread/src/fasta.rs | 70 +------- src/obiread/src/fastq.rs | 77 +-------- src/obiread/src/normalize.rs | 238 +------------------------ src/obiread/src/nucstream.rs | 21 +-- src/obiread/src/tests/chunk.rs | 106 ++++++++++++ src/obiread/src/tests/fasta.rs | 66 +++++++ src/obiread/src/tests/fastq.rs | 73 ++++++++ src/obiread/src/tests/normalize.rs | 234 +++++++++++++++++++++++++ src/obiread/src/tests/nucstream.rs | 267 +++++++++++++++++++++++++++++ 10 files changed, 765 insertions(+), 497 deletions(-) create mode 100644 src/obiread/src/tests/chunk.rs create mode 100644 src/obiread/src/tests/fasta.rs create mode 100644 src/obiread/src/tests/fastq.rs create mode 100644 src/obiread/src/tests/normalize.rs create mode 100644 src/obiread/src/tests/nucstream.rs diff --git a/src/obiread/src/chunk.rs b/src/obiread/src/chunk.rs index 691e6d9..1bd7f0b 100644 --- a/src/obiread/src/chunk.rs +++ b/src/obiread/src/chunk.rs @@ -191,111 +191,5 @@ pub fn fastq_chunks(source: R) -> SeqChunkIter { } #[cfg(test)] -mod tests { - use super::*; - use crate::fasta::end_of_last_fasta_entry; - use crate::fastq::end_of_last_fastq_entry; - - fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { - SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None) - } - - fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { - SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None) - } - - fn rope_to_vec(rope: &Rope) -> Vec { - rope.fw_cursor().collect() - } - - // ── FASTA ───────────────────────────────────────────────────────────────── - - #[test] - fn fasta_single_record_one_chunk() { - let data: &[u8] = b">s1\nACGT\n"; - let chunks: Vec<_> = fasta_iter(data, 64).collect::>().unwrap(); - assert_eq!(chunks.len(), 1); - assert_eq!(rope_to_vec(&chunks[0]), b">s1\nACGT\n"); - } - - #[test] - fn fasta_two_records_split_across_chunks() { - let data: &[u8] = b">s1\nACGT\n>s2\nTTTT\n"; - let chunks: Vec<_> = fasta_iter(data, 10).collect::>().unwrap(); - let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); - assert_eq!(all, b">s1\nACGT\n>s2\nTTTT\n"); - } - - #[test] - fn fasta_each_chunk_ends_on_complete_record() { - let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n"; - for block in [8, 12, 20, 100] { - let chunks: Vec<_> = fasta_iter(data, block).collect::>().unwrap(); - for rope in &chunks { - let flat = rope_to_vec(rope); - assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'"); - assert_eq!( - *flat.last().unwrap(), - b'\n', - "block={block}: chunk doesn't end with newline" - ); - } - } - } - - // ── FASTQ ───────────────────────────────────────────────────────────────── - - fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { - let mut buf = Vec::new(); - for (seq, qual) in records { - buf.extend_from_slice(b"@hdr\n"); - buf.extend_from_slice(seq); - buf.push(b'\n'); - buf.extend_from_slice(b"+\n"); - buf.extend_from_slice(qual); - buf.push(b'\n'); - } - buf - } - - #[test] - fn fastq_single_record_one_chunk() { - let data = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice()); - let chunks: Vec<_> = fastq_iter(data, 64).collect::>().unwrap(); - assert_eq!(chunks.len(), 1); - } - - #[test] - fn fastq_at_in_quality_handled() { - let data = Box::leak( - make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")]) - .into_boxed_slice(), - ); - let chunks: Vec<_> = fastq_iter(data, 16).collect::>().unwrap(); - let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); - assert_eq!(all, *data); - } - - #[test] - fn fastq_each_chunk_starts_with_at() { - let data = Box::leak( - make_fastq(&[ - (b"ACGT", b"IIII"), - (b"CCCC", b"JJJJ"), - (b"GGGG", b"KKKK"), - (b"TTTT", b"LLLL"), - ]) - .into_boxed_slice(), - ); - for block in [18, 30, 60] { - let chunks: Vec<_> = fastq_iter(data, block).collect::>().unwrap(); - for rope in &chunks { - let first_byte = rope_to_vec(rope)[0]; - assert_eq!( - first_byte, b'@', - "block={block}: chunk doesn't start with '@'" - ); - } - } - } -} +#[path = "tests/chunk.rs"] +mod tests; diff --git a/src/obiread/src/fasta.rs b/src/obiread/src/fasta.rs index b426ed6..13f6380 100644 --- a/src/obiread/src/fasta.rs +++ b/src/obiread/src/fasta.rs @@ -35,71 +35,5 @@ pub fn end_of_last_fasta_entry(rope: &Rope) -> Option { } #[cfg(test)] -mod tests { - use super::*; - - fn rope(data: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(data.to_vec()); - r - } - - fn rope2(a: &[u8], b: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(a.to_vec()); - r.push(b.to_vec()); - r - } - - fn flat(r: &Rope) -> Vec { - r.fw_cursor().collect() - } - - #[test] - fn single_entry_no_boundary() { - assert_eq!(end_of_last_fasta_entry(&rope(b">seq1\nACGT\n")), None); - } - - #[test] - fn two_entries_cuts_at_second_header() { - let data = b">seq1\nACGT\n>seq2\nTTTT\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">seq2\nTTTT\n"); - assert_eq!(&flat(&r)[..pos], b">seq1\nACGT\n"); - } - - #[test] - fn three_entries_cuts_at_last_header() { - let data = b">s1\nAA\n>s2\nCC\n>s3\nGG\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">s3\nGG\n"); - } - - #[test] - fn multiline_sequence() { - let data = b">s1\nACGT\nACGT\n>s2\nTTTT\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">s2\nTTTT\n"); - } - - #[test] - fn crlf_line_endings() { - let data = b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">s2\r\nTTTT\r\n"); - } - - #[test] - fn boundary_spans_two_blocks() { - let a = b">s1\nACGT\n"; - let b = b">s2\nTTTT\n"; - let r = rope2(a, b); - let all: Vec = flat(&r); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&all[pos..], b">s2\nTTTT\n"); - } -} +#[path = "tests/fasta.rs"] +mod tests; diff --git a/src/obiread/src/fastq.rs b/src/obiread/src/fastq.rs index 39d3665..46da1b5 100644 --- a/src/obiread/src/fastq.rs +++ b/src/obiread/src/fastq.rs @@ -107,78 +107,5 @@ pub fn end_of_last_fastq_entry(rope: &Rope) -> Option { } #[cfg(test)] -mod tests { - use super::*; - - fn rope(data: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(data.to_vec()); - r - } - - fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { - let mut buf = Vec::new(); - for (seq, qual) in records { - buf.extend_from_slice(b"@header\n"); - buf.extend_from_slice(seq); - buf.push(b'\n'); - buf.extend_from_slice(b"+\n"); - buf.extend_from_slice(qual); - buf.push(b'\n'); - } - buf - } - - fn flat(r: &Rope) -> Vec { - r.fw_cursor().collect() - } - - #[test] - fn single_record_no_boundary() { - let buf = make_fastq(&[(b"ACGT", b"IIII")]); - assert_eq!(end_of_last_fastq_entry(&rope(&buf)), None); - } - - #[test] - fn two_records_cuts_at_second() { - let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]); - let r = rope(&buf); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!(flat(&r)[pos], b'@'); - assert_eq!( - &flat(&r)[pos..], - make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() - ); - } - - #[test] - fn three_records_cuts_at_last() { - let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]); - let r = rope(&buf); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!( - &flat(&r)[pos..], - make_fastq(&[(b"GGGG", b"KKKK")]).as_slice() - ); - } - - #[test] - fn at_sign_in_quality_does_not_confuse() { - let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]); - let r = rope(&buf); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!( - &flat(&r)[pos..], - make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() - ); - } - - #[test] - fn crlf_line_endings() { - let data = b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n"; - let r = rope(data); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!(flat(&r)[pos], b'@'); - assert_eq!(&flat(&r)[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n"); - } -} +#[path = "tests/fastq.rs"] +mod tests; diff --git a/src/obiread/src/normalize.rs b/src/obiread/src/normalize.rs index b92eb41..3a5d43b 100644 --- a/src/obiread/src/normalize.rs +++ b/src/obiread/src/normalize.rs @@ -215,239 +215,5 @@ fn is_acgt(upper: u8) -> bool { // ── tests ───────────────────────────────────────────────────────────────────── #[cfg(test)] -mod tests { - use super::*; - - fn make_rope(data: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(data.to_vec()); - r - } - - fn flat(r: Rope) -> Vec { - r.fw_cursor().collect() - } - - fn run_fastq(data: &[u8], k: usize) -> Vec { - flat(normalize_fastq_chunk(make_rope(data), k)) - } - - fn run_fasta(data: &[u8], k: usize) -> Vec { - flat(normalize_fasta_chunk(make_rope(data), k)) - } - - fn make_fastq(records: &[&[u8]]) -> Vec { - let mut buf = Vec::new(); - for seq in records { - buf.extend_from_slice(b"@hdr\n"); - buf.extend_from_slice(seq); - buf.push(b'\n'); - buf.extend_from_slice(b"+\n"); - buf.extend_from_slice(&vec![b'I'; seq.len()]); - buf.push(b'\n'); - } - buf - } - - fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec { - let mut buf = Vec::new(); - for (id, seq) in records { - buf.push(b'>'); - buf.extend_from_slice(id); - buf.push(b'\n'); - buf.extend_from_slice(seq); - buf.push(b'\n'); - } - buf - } - - // ── FASTQ basic ────────────────────────────────────────────────────────── - - #[test] - fn single_record_produces_seq_then_null() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGT"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn two_records_concatenated() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]), 4), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn lowercase_input_uppercased() { - assert_eq!(run_fastq(&make_fastq(&[b"acgtacgt"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn mixed_case_uppercased() { - assert_eq!(run_fastq(&make_fastq(&[b"AcGtAcGt"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn sequence_shorter_than_k_discarded() { - assert_eq!(run_fastq(&make_fastq(&[b"ACG"]), 4), b""); - } - - #[test] - fn sequence_exactly_k_kept() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGT"]), 4), b"ACGT\x00"); - } - - #[test] - fn short_record_among_valid_ones_discarded() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]), 4), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn ambiguous_splits_into_two_segments() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), - b"ACGT\x00ACGT\x00" - ); - } - - #[test] - fn segment_after_ambiguous_too_short_discarded() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), - b"ACGTACGT\x00" - ); - } - - #[test] - fn consecutive_ambiguous_produce_no_empty_segment() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), - b"ACGT\x00ACGT\x00" - ); - } - - #[test] - fn ambiguous_at_start_skipped() { - assert_eq!(run_fastq(&make_fastq(&[b"NNACGTACGT"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn ambiguous_at_end_produces_no_trailing_empty() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNN"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn crlf_handled() { - let data = b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n"; - assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00"); - } - - #[test] - fn multi_slice_rope() { - let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]); - let mid = data.len() / 2; - let mut rope = Rope::new(None); - rope.push(data[..mid].to_vec()); - rope.push(data[mid..].to_vec()); - assert_eq!( - flat(normalize_fastq_chunk(rope, 4)), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - // ── FASTA ───────────────────────────────────────────────────────────────── - - #[test] - fn fasta_single_record() { - assert_eq!( - run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), - b"ACGTACGT\x00" - ); - } - - #[test] - fn fasta_two_records() { - assert_eq!( - run_fasta( - &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), - 4 - ), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn fasta_multiline_sequence_concatenated() { - assert_eq!( - run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), - b"ACGTACGTACGT\x00" - ); - } - - #[test] - fn fasta_lowercase_uppercased() { - assert_eq!( - run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), - b"ACGTACGT\x00" - ); - } - - #[test] - fn fasta_short_record_discarded() { - assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACG")]), 4), b""); - } - - #[test] - fn fasta_short_among_valid_discarded() { - assert_eq!( - run_fasta( - &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), - 4 - ), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn fasta_ambiguous_splits_segments() { - assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00"); - } - - #[test] - fn fasta_ambiguous_across_line_boundary() { - assert_eq!(run_fasta(b">s1\nACGT\nNACGT\n", 4), b"ACGT\x00ACGT\x00"); - } - - #[test] - fn fasta_ambiguous_short_segment_discarded() { - assert_eq!(run_fasta(b">s1\nACGTACGTNAC\n", 4), b"ACGTACGT\x00"); - } - - #[test] - fn fasta_no_trailing_newline() { - assert_eq!(run_fasta(b">s1\nACGTACGT", 4), b"ACGTACGT\x00"); - } - - #[test] - fn fasta_crlf_line_endings() { - assert_eq!( - run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), - b"ACGTACGT\x00TTTT\x00" - ); - } - - #[test] - fn fasta_multi_slice_rope() { - let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]); - let mid = data.len() / 2; - let mut rope = Rope::new(None); - rope.push(data[..mid].to_vec()); - rope.push(data[mid..].to_vec()); - assert_eq!( - flat(normalize_fasta_chunk(rope, 4)), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } -} +#[path = "tests/normalize.rs"] +mod tests; diff --git a/src/obiread/src/nucstream.rs b/src/obiread/src/nucstream.rs index eb252d6..e6bc431 100644 --- a/src/obiread/src/nucstream.rs +++ b/src/obiread/src/nucstream.rs @@ -561,7 +561,7 @@ impl Drop for NucPage { /// [`obiskbuilder::SuperKmerStreamIter`]. pub struct NucPageCursor<'a> { data: &'a [u8], - pos: usize, + pos: usize, } impl NucPageCursor<'_> { @@ -687,8 +687,8 @@ impl Iterator for AnyNucStream { type Item = NucPage; fn next(&mut self) -> Option { match self { - AnyNucStream::Fasta(s) => s.next(), - AnyNucStream::Fastq(s) => s.next(), + AnyNucStream::Fasta(s) => s.next(), + AnyNucStream::Fastq(s) => s.next(), AnyNucStream::Genbank(s) => s.next(), } } @@ -701,17 +701,14 @@ fn dispatch( match guesser.mime_type() { Some("text/fasta") => Some(AnyNucStream::Fasta(NucStream::new(guesser, k))), Some("text/fastq") => Some(AnyNucStream::Fastq(NucStream::new(guesser, k))), - Some("text/gbff") => Some(AnyNucStream::Genbank(NucStream::new(guesser, k))), - _ => None, + Some("text/gbff") => Some(AnyNucStream::Genbank(NucStream::new(guesser, k))), + _ => None, } } /// Wraps an already-open reader in a nucleotide stream, detecting its format. /// Returns `None` if the format is not recognised. -pub(crate) fn nuc_stream( - reader: R, - k: usize, -) -> Option>> { +pub(crate) fn nuc_stream(reader: R, k: usize) -> Option>> { dispatch(MimeTypeGuesser::new(reader), k) } @@ -726,7 +723,11 @@ pub fn open_nuc_stream( k: usize, ) -> io::Result + Send>> { let reader = open_raw(source)?; - dispatch(MimeTypeGuesser::new(reader), k) + nuc_stream(reader, k) .map(|s| Box::new(s) as Box + Send>) .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "unknown sequence format")) } + +#[cfg(test)] +#[path = "tests/nucstream.rs"] +mod tests; diff --git a/src/obiread/src/tests/chunk.rs b/src/obiread/src/tests/chunk.rs new file mode 100644 index 0000000..ec5a51c --- /dev/null +++ b/src/obiread/src/tests/chunk.rs @@ -0,0 +1,106 @@ +use super::*; +use crate::fasta::end_of_last_fasta_entry; +use crate::fastq::end_of_last_fastq_entry; + +fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { + SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None) +} + +fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { + SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None) +} + +fn rope_to_vec(rope: &Rope) -> Vec { + rope.fw_cursor().collect() +} + +// ── FASTA ───────────────────────────────────────────────────────────────── + +#[test] +fn fasta_single_record_one_chunk() { + let data: &[u8] = b">s1\nACGT\n"; + let chunks: Vec<_> = fasta_iter(data, 64).collect::>().unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(rope_to_vec(&chunks[0]), b">s1\nACGT\n"); +} + +#[test] +fn fasta_two_records_split_across_chunks() { + let data: &[u8] = b">s1\nACGT\n>s2\nTTTT\n"; + let chunks: Vec<_> = fasta_iter(data, 10).collect::>().unwrap(); + let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); + assert_eq!(all, b">s1\nACGT\n>s2\nTTTT\n"); +} + +#[test] +fn fasta_each_chunk_ends_on_complete_record() { + let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n"; + for block in [8, 12, 20, 100] { + let chunks: Vec<_> = fasta_iter(data, block).collect::>().unwrap(); + for rope in &chunks { + let flat = rope_to_vec(rope); + assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'"); + assert_eq!( + *flat.last().unwrap(), + b'\n', + "block={block}: chunk doesn't end with newline" + ); + } + } +} + +// ── FASTQ ───────────────────────────────────────────────────────────────── + +fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { + let mut buf = Vec::new(); + for (seq, qual) in records { + buf.extend_from_slice(b"@hdr\n"); + buf.extend_from_slice(seq); + buf.push(b'\n'); + buf.extend_from_slice(b"+\n"); + buf.extend_from_slice(qual); + buf.push(b'\n'); + } + buf +} + +#[test] +fn fastq_single_record_one_chunk() { + let data = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice()); + let chunks: Vec<_> = fastq_iter(data, 64).collect::>().unwrap(); + assert_eq!(chunks.len(), 1); +} + +#[test] +fn fastq_at_in_quality_handled() { + let data = Box::leak( + make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")]) + .into_boxed_slice(), + ); + let chunks: Vec<_> = fastq_iter(data, 16).collect::>().unwrap(); + let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); + assert_eq!(all, *data); +} + +#[test] +fn fastq_each_chunk_starts_with_at() { + let data = Box::leak( + make_fastq(&[ + (b"ACGT", b"IIII"), + (b"CCCC", b"JJJJ"), + (b"GGGG", b"KKKK"), + (b"TTTT", b"LLLL"), + ]) + .into_boxed_slice(), + ); + for block in [18, 30, 60] { + let chunks: Vec<_> = fastq_iter(data, block).collect::>().unwrap(); + for rope in &chunks { + let first_byte = rope_to_vec(rope)[0]; + assert_eq!( + first_byte, b'@', + "block={block}: chunk doesn't start with '@'" + ); + } + } +} diff --git a/src/obiread/src/tests/fasta.rs b/src/obiread/src/tests/fasta.rs new file mode 100644 index 0000000..f52fd79 --- /dev/null +++ b/src/obiread/src/tests/fasta.rs @@ -0,0 +1,66 @@ +use super::*; + +fn rope(data: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(data.to_vec()); + r +} + +fn rope2(a: &[u8], b: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(a.to_vec()); + r.push(b.to_vec()); + r +} + +fn flat(r: &Rope) -> Vec { + r.fw_cursor().collect() +} + +#[test] +fn single_entry_no_boundary() { + assert_eq!(end_of_last_fasta_entry(&rope(b">seq1\nACGT\n")), None); +} + +#[test] +fn two_entries_cuts_at_second_header() { + let data = b">seq1\nACGT\n>seq2\nTTTT\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">seq2\nTTTT\n"); + assert_eq!(&flat(&r)[..pos], b">seq1\nACGT\n"); +} + +#[test] +fn three_entries_cuts_at_last_header() { + let data = b">s1\nAA\n>s2\nCC\n>s3\nGG\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">s3\nGG\n"); +} + +#[test] +fn multiline_sequence() { + let data = b">s1\nACGT\nACGT\n>s2\nTTTT\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">s2\nTTTT\n"); +} + +#[test] +fn crlf_line_endings() { + let data = b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">s2\r\nTTTT\r\n"); +} + +#[test] +fn boundary_spans_two_blocks() { + let a = b">s1\nACGT\n"; + let b = b">s2\nTTTT\n"; + let r = rope2(a, b); + let all: Vec = flat(&r); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&all[pos..], b">s2\nTTTT\n"); +} diff --git a/src/obiread/src/tests/fastq.rs b/src/obiread/src/tests/fastq.rs new file mode 100644 index 0000000..db78c37 --- /dev/null +++ b/src/obiread/src/tests/fastq.rs @@ -0,0 +1,73 @@ +use super::*; + +fn rope(data: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(data.to_vec()); + r +} + +fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { + let mut buf = Vec::new(); + for (seq, qual) in records { + buf.extend_from_slice(b"@header\n"); + buf.extend_from_slice(seq); + buf.push(b'\n'); + buf.extend_from_slice(b"+\n"); + buf.extend_from_slice(qual); + buf.push(b'\n'); + } + buf +} + +fn flat(r: &Rope) -> Vec { + r.fw_cursor().collect() +} + +#[test] +fn single_record_no_boundary() { + let buf = make_fastq(&[(b"ACGT", b"IIII")]); + assert_eq!(end_of_last_fastq_entry(&rope(&buf)), None); +} + +#[test] +fn two_records_cuts_at_second() { + let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]); + let r = rope(&buf); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!(flat(&r)[pos], b'@'); + assert_eq!( + &flat(&r)[pos..], + make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() + ); +} + +#[test] +fn three_records_cuts_at_last() { + let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]); + let r = rope(&buf); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!( + &flat(&r)[pos..], + make_fastq(&[(b"GGGG", b"KKKK")]).as_slice() + ); +} + +#[test] +fn at_sign_in_quality_does_not_confuse() { + let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]); + let r = rope(&buf); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!( + &flat(&r)[pos..], + make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() + ); +} + +#[test] +fn crlf_line_endings() { + let data = b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n"; + let r = rope(data); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!(flat(&r)[pos], b'@'); + assert_eq!(&flat(&r)[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n"); +} diff --git a/src/obiread/src/tests/normalize.rs b/src/obiread/src/tests/normalize.rs new file mode 100644 index 0000000..a9cc52b --- /dev/null +++ b/src/obiread/src/tests/normalize.rs @@ -0,0 +1,234 @@ +use super::*; + +fn make_rope(data: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(data.to_vec()); + r +} + +fn flat(r: Rope) -> Vec { + r.fw_cursor().collect() +} + +fn run_fastq(data: &[u8], k: usize) -> Vec { + flat(normalize_fastq_chunk(make_rope(data), k)) +} + +fn run_fasta(data: &[u8], k: usize) -> Vec { + flat(normalize_fasta_chunk(make_rope(data), k)) +} + +fn make_fastq(records: &[&[u8]]) -> Vec { + let mut buf = Vec::new(); + for seq in records { + buf.extend_from_slice(b"@hdr\n"); + buf.extend_from_slice(seq); + buf.push(b'\n'); + buf.extend_from_slice(b"+\n"); + buf.extend_from_slice(&vec![b'I'; seq.len()]); + buf.push(b'\n'); + } + buf +} + +fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec { + let mut buf = Vec::new(); + for (id, seq) in records { + buf.push(b'>'); + buf.extend_from_slice(id); + buf.push(b'\n'); + buf.extend_from_slice(seq); + buf.push(b'\n'); + } + buf +} + +// ── FASTQ basic ────────────────────────────────────────────────────────── + +#[test] +fn single_record_produces_seq_then_null() { + assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGT"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn two_records_concatenated() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]), 4), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn lowercase_input_uppercased() { + assert_eq!(run_fastq(&make_fastq(&[b"acgtacgt"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn mixed_case_uppercased() { + assert_eq!(run_fastq(&make_fastq(&[b"AcGtAcGt"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn sequence_shorter_than_k_discarded() { + assert_eq!(run_fastq(&make_fastq(&[b"ACG"]), 4), b""); +} + +#[test] +fn sequence_exactly_k_kept() { + assert_eq!(run_fastq(&make_fastq(&[b"ACGT"]), 4), b"ACGT\x00"); +} + +#[test] +fn short_record_among_valid_ones_discarded() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]), 4), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn ambiguous_splits_into_two_segments() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), + b"ACGT\x00ACGT\x00" + ); +} + +#[test] +fn segment_after_ambiguous_too_short_discarded() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn consecutive_ambiguous_produce_no_empty_segment() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), + b"ACGT\x00ACGT\x00" + ); +} + +#[test] +fn ambiguous_at_start_skipped() { + assert_eq!(run_fastq(&make_fastq(&[b"NNACGTACGT"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn ambiguous_at_end_produces_no_trailing_empty() { + assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNN"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn crlf_handled() { + let data = b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n"; + assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00"); +} + +#[test] +fn multi_slice_rope() { + let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]); + let mid = data.len() / 2; + let mut rope = Rope::new(None); + rope.push(data[..mid].to_vec()); + rope.push(data[mid..].to_vec()); + assert_eq!( + flat(normalize_fastq_chunk(rope, 4)), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +// ── FASTA ───────────────────────────────────────────────────────────────── + +#[test] +fn fasta_single_record() { + assert_eq!( + run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fasta_two_records() { + assert_eq!( + run_fasta( + &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), + 4 + ), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn fasta_multiline_sequence_concatenated() { + assert_eq!( + run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), + b"ACGTACGTACGT\x00" + ); +} + +#[test] +fn fasta_lowercase_uppercased() { + assert_eq!( + run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fasta_short_record_discarded() { + assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACG")]), 4), b""); +} + +#[test] +fn fasta_short_among_valid_discarded() { + assert_eq!( + run_fasta( + &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), + 4 + ), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn fasta_ambiguous_splits_segments() { + assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00"); +} + +#[test] +fn fasta_ambiguous_across_line_boundary() { + assert_eq!(run_fasta(b">s1\nACGT\nNACGT\n", 4), b"ACGT\x00ACGT\x00"); +} + +#[test] +fn fasta_ambiguous_short_segment_discarded() { + assert_eq!(run_fasta(b">s1\nACGTACGTNAC\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_no_trailing_newline() { + assert_eq!(run_fasta(b">s1\nACGTACGT", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_crlf_line_endings() { + assert_eq!( + run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), + b"ACGTACGT\x00TTTT\x00" + ); +} + +#[test] +fn fasta_multi_slice_rope() { + let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]); + let mid = data.len() / 2; + let mut rope = Rope::new(None); + rope.push(data[..mid].to_vec()); + rope.push(data[mid..].to_vec()); + assert_eq!( + flat(normalize_fasta_chunk(rope, 4)), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} diff --git a/src/obiread/src/tests/nucstream.rs b/src/obiread/src/tests/nucstream.rs new file mode 100644 index 0000000..bc135cb --- /dev/null +++ b/src/obiread/src/tests/nucstream.rs @@ -0,0 +1,267 @@ +use super::*; +use std::io::Cursor; +use std::ops::Deref; + +// ── helpers ─────────────────────────────────────────────────────────────── + +fn run_fasta(data: &[u8], k: usize) -> Vec { + NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k) + .flat_map(|p| p.deref().to_vec()) + .collect() +} + +fn run_fastq(data: &[u8], k: usize) -> Vec { + NucStream::<_, FastqParser>::new(Cursor::new(data.to_vec()), k) + .flat_map(|p| p.deref().to_vec()) + .collect() +} + +fn run_genbank(data: &[u8], k: usize) -> Vec { + NucStream::<_, GenbankParser>::new(Cursor::new(data.to_vec()), k) + .flat_map(|p| p.deref().to_vec()) + .collect() +} + +fn pages_fasta(data: &[u8], k: usize) -> Vec> { + NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k) + .map(|p| p.deref().to_vec()) + .collect() +} + +// ── FastaParser ─────────────────────────────────────────────────────────── + +#[test] +fn fasta_single_sequence() { + assert_eq!(run_fasta(b">s1\nACGTACGT\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_lowercase_uppercased() { + assert_eq!(run_fasta(b">s1\nacgtacgt\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_multiline_sequence_concatenated() { + assert_eq!(run_fasta(b">s1\nACGT\nACGT\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_two_sequences() { + let data = b">s1\nACGTACGT\n>s2\nTTTTTTTT\n"; + assert_eq!(run_fasta(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +#[test] +fn fasta_empty_input_yields_no_pages() { + assert_eq!(run_fasta(b"", 4), b""); +} + +#[test] +fn fasta_sequence_shorter_than_k_at_eof_discarded() { + // The 3-base fragment is saved as overlap and dropped at EOF (< k). + assert_eq!(run_fasta(b">s1\nACG\n", 4), b""); +} + +#[test] +fn fasta_ambiguous_splits_into_two_segments() { + assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00"); +} + +#[test] +fn fasta_short_segment_before_ambiguous_emitted() { + // "AC" (< k=4) before N is written with a separator — filtering by + // length is deferred to the superkmer builder, not done here. + assert_eq!(run_fasta(b">s1\nACNACGTACGT\n", 4), b"AC\x00ACGTACGT\x00"); +} + +#[test] +fn fasta_ambiguous_at_start_skipped() { + assert_eq!(run_fasta(b">s1\nNNNACGTACGT\n", 4), b"ACGTACGT\x00"); +} + +// ── FastqParser ─────────────────────────────────────────────────────────── + +#[test] +fn fastq_single_record() { + assert_eq!( + run_fastq(b"@r1\nACGTACGT\n+\nIIIIIIII\n", 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fastq_lowercase_uppercased() { + assert_eq!( + run_fastq(b"@r1\nacgtacgt\n+\nIIIIIIII\n", 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fastq_quality_bytes_not_in_output() { + // '@' (Phred 31 = ASCII 64) in quality must not appear in output. + assert_eq!( + run_fastq(b"@r1\nACGTACGT\n+\n@@@@@@@@\n", 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fastq_two_records() { + let data = b"@r1\nACGTACGT\n+\nIIIIIIII\n@r2\nTTTTTTTT\n+\nIIIIIIII\n"; + assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +#[test] +fn fastq_ambiguous_splits_sequence() { + assert_eq!( + run_fastq(b"@r1\nACGTNACGT\n+\nIIIIIIIII\n", 4), + b"ACGT\x00ACGT\x00" + ); +} + +#[test] +fn fastq_at_in_quality_line_not_a_record_start() { + // '@' in the quality line must not trigger a new record parse. + let data = b"@r1\nACGTACGT\n+\n@@@@@@@@\n@r2\nTTTTTTTT\n+\nIIIIIIII\n"; + assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +// ── GenbankParser ───────────────────────────────────────────────────────── + +#[test] +fn genbank_origin_to_slash() { + let data = b"LOCUS ...\nORIGIN\n 1 acgtacgt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGTACGT\x00"); +} + +#[test] +fn genbank_position_numbers_and_spaces_skipped() { + let data = b"ORIGIN\n 1 acgt acgt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGTACGT\x00"); +} + +#[test] +fn genbank_two_records() { + let data = b"ORIGIN\n 1 acgtacgt\n//\nLOCUS ...\nORIGIN\n 1 tttttttt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +#[test] +fn genbank_ambiguous_splits_sequence() { + let data = b"ORIGIN\n 1 acgtnacgt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGT\x00ACGT\x00"); +} + +// ── NucPage ─────────────────────────────────────────────────────────────── + +#[test] +fn nuc_page_deref_correct_bytes() { + let page = NucStream::<_, FastaParser>::new(Cursor::new(b">s1\nACGT\n".to_vec()), 4) + .next() + .expect("page"); + assert_eq!(page.deref(), b"ACGT\x00"); +} + +// ── NucPageCursor ───────────────────────────────────────────────────────── + +fn make_page(data: &[u8], k: usize) -> NucPage { + NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k) + .next() + .expect("at least one page") +} + +#[test] +fn cursor_reads_bytes_in_order() { + let page = make_page(b">s1\nACGTACGT\n", 4); + let mut cur = page.cursor(); + assert_eq!(cur.next_byte(), Some(b'A')); + assert_eq!(cur.next_byte(), Some(b'C')); + assert_eq!(cur.next_byte(), Some(b'G')); + assert_eq!(cur.next_byte(), Some(b'T')); +} + +#[test] +fn cursor_rewind_rereads_bytes() { + let page = make_page(b">s1\nACGTACGT\n", 4); + let mut cur = page.cursor(); + cur.next_byte(); // A + cur.next_byte(); // C + cur.rewind(1); + assert_eq!(cur.next_byte(), Some(b'C')); + cur.rewind(2); + assert_eq!(cur.next_byte(), Some(b'A')); +} + +#[test] +fn cursor_returns_none_at_end() { + // "ACGT\x00" = 5 bytes; consume all then expect None. + let page = make_page(b">s1\nACGT\n", 4); + let mut cur = page.cursor(); + for _ in 0..5 { + cur.next_byte(); + } + assert_eq!(cur.next_byte(), None); +} + +#[test] +fn cursor_len_matches_page_content() { + // "ACGTACGT\x00" = 9 bytes + let page = make_page(b">s1\nACGTACGT\n", 4); + let cur = page.cursor(); + assert_eq!(cur.len(), 9); + assert!(!cur.is_empty()); +} + +// ── Overlap at page boundary ────────────────────────────────────────────── + +#[test] +fn overlap_last_km1_bytes_prepended_to_next_page() { + const K: usize = 11; + // Sequence long enough to span two pages: PAGE_SIZE + K bytes. + // Pattern chosen so boundary bytes are unambiguous. + let seq: Vec = (0..PAGE_SIZE + K).map(|i| b"ACGT"[i % 4]).collect(); + let mut input = b">seq\n".to_vec(); + input.extend_from_slice(&seq); + input.push(b'\n'); + + let pages = pages_fasta(&input, K); + assert!(pages.len() >= 2, "need at least two pages"); + + let p1 = &pages[0]; + let p2 = &pages[1]; + + // page1 must end with a \x00 separator (written by save_overlap) + assert_eq!(*p1.last().unwrap(), 0x00, "page1 must end with separator"); + + // last K-1 ACGT bytes of page1 == first K-1 bytes of page2 + let ol = K - 1; + let p1_seq_end = &p1[p1.len() - 1 - ol..p1.len() - 1]; + let p2_start = &p2[..ol]; + assert_eq!( + p1_seq_end, p2_start, + "overlap bytes mismatch at page boundary" + ); +} + +// ── Pool ────────────────────────────────────────────────────────────────── + +#[test] +fn pool_buffer_reused_after_drop() { + // Drop page1 so its buffer returns to the pool, then verify page2 + // is produced correctly (no corruption, no panic). + const K: usize = 11; + let seq: Vec = vec![b'A'; PAGE_SIZE + K]; + let mut input = b">seq\n".to_vec(); + input.extend_from_slice(&seq); + input.push(b'\n'); + + let mut stream = NucStream::<_, FastaParser>::new(Cursor::new(input), K); + let page1 = stream.next().expect("page 1"); + assert!(!page1.deref().is_empty()); + drop(page1); // returns buffer to pool + let page2 = stream.next().expect("page 2"); + assert!(!page2.deref().is_empty()); + // page2 must still start with A's (overlap from page1) + assert_eq!(page2[0], b'A'); +} -- 2.52.0 From be0e8f10415ab8fa3d44f47617bf4f79776460d8 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 29 May 2026 07:24:48 +0200 Subject: [PATCH 5/6] refactor(query): parallelize query execution with obipipeline Extracts chunk processing into a dedicated function and introduces a QueryData enum with unsafe Send/Sync implementations to safely distribute Rope chunks across worker threads. Replaces nested iteration with a flat iterator and parallel block processing. Adds CLI argument parsing for presence, threshold, and detail flags to configure the pipeline. --- src/obikmer/src/cmd/query.rs | 256 ++++++++++++++++++++--------------- 1 file changed, 149 insertions(+), 107 deletions(-) diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index f86ec02..beb303e 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -1,16 +1,30 @@ use std::collections::HashMap; use std::io::{self, BufWriter, Write}; use std::path::PathBuf; +use std::sync::Arc; use clap::Args; use obikindex::KmerIndex; use obilayeredmap::IndexMode; -use obiread::record::{SeqRecord, parse_chunk}; use obiread::chunk::read_sequence_chunks; +use obiread::record::{SeqRecord, parse_chunk}; +use obikrope::Rope; use obikseq::{RoutableSuperKmer, set_k, set_m}; use obiskbuilder::SuperKmerIter; use tracing::info; +// ── Pipeline data ───────────────────────────────────────────────────────────── + +enum QueryData { + Chunk(Rope), + Output(Vec), +} + +// SAFETY: Rope contains Cell which is !Sync, but pipeline items are owned +// exclusively through channels — no item is ever shared across threads. +unsafe impl Send for QueryData {} +unsafe impl Sync for QueryData {} + // ── CLI ─────────────────────────────────────────────────────────────────────── #[derive(Args)] @@ -146,14 +160,6 @@ impl SeqAcc { // ── Findere z-window filter ─────────────────────────────────────────────────── /// Apply the Findere z-window filter to per-kmer query results for one superkmer. -/// -/// A k-mer at position i for genome g is confirmed only if it belongs to at least -/// one run of z consecutive positions where all k-mers are present for g. -/// Unconfirmed positions are zeroed; positions whose entire row becomes zero are -/// returned as `None`. -/// -/// When z <= 1 or the superkmer is shorter than z k-mers, results are returned -/// unchanged (short superkmers cannot satisfy the z-window constraint). fn apply_findere( results: &[Option>], z: usize, @@ -205,13 +211,106 @@ fn apply_findere( }).collect() } +// ── process_chunk ───────────────────────────────────────────────────────────── + +fn process_chunk( + idx: &KmerIndex, + rope: Rope, + k: usize, + n_genomes: usize, + n_partitions: usize, + with_counts: bool, + effective_z: usize, + detail: bool, + count_missing: bool, + force_presence: bool, + presence_threshold: u32, +) -> Vec { + let records = parse_chunk(&rope, k); + if records.is_empty() { + return Vec::new(); + } + + let batch = QueryBatch::from_records(records, k, 6, 0.7); + let n_seqs = batch.ids.len(); + + let mut accs: Vec = + (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); + + let mut cov: Vec>> = if detail { + batch.n_kmers.iter() + .map(|&n| vec![vec![0u32; n as usize]; n_genomes]) + .collect() + } else { + Vec::new() + }; + + let by_part = batch.split_by_partition(n_partitions); + + for (part_idx, part_sks) in by_part.iter().enumerate() { + if part_sks.is_empty() { + continue; + } + + let kmer_results = idx + .partition() + .query_partition(part_idx, part_sks, k, n_genomes, with_counts) + .unwrap_or_else(|e| { + eprintln!("query error on partition {part_idx}: {e}"); + std::process::exit(1); + }); + + let presence = force_presence || !with_counts; + let threshold = presence_threshold; + + for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) { + let filtered = apply_findere(sk_kmer_results, effective_z, n_genomes); + let descs = batch.map.get(*rsk).expect("rsk must be in map"); + + for desc in descs { + let acc = &mut accs[desc.seq_idx as usize]; + + for (local_pos, hit) in filtered.iter().enumerate() { + match hit { + None => { + if sk_kmer_results[local_pos].is_none() { + acc.kmer_missing += 1; + } + } + Some(row) => { + acc.kmer_count += 1; + for (g, &v) in row.iter().enumerate() { + if v == 0 { continue; } + let contribution = if presence { + u32::from(v >= threshold) + } else { + v + }; + acc.genome_totals[g] += contribution; + if detail { + let abs_pos = desc.kmer_offset as usize + local_pos; + cov[desc.seq_idx as usize][g][abs_pos] += contribution; + } + } + } + } + } + } + } + } + + let mut buf = Vec::new(); + emit_batch(&batch, &accs, idx.meta(), count_missing, detail, &cov, &mut buf); + buf +} + // ── Entry point ─────────────────────────────────────────────────────────────── pub fn run(args: QueryArgs) { - let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { + let idx = Arc::new(KmerIndex::open(&args.index).unwrap_or_else(|e| { eprintln!("error opening index: {e}"); std::process::exit(1); - }); + })); set_k(idx.kmer_size()); set_m(idx.minimizer_size()); @@ -220,6 +319,7 @@ pub fn run(args: QueryArgs) { let n_genomes = idx.meta().genomes.len(); let n_partitions = idx.n_partitions(); let with_counts = idx.meta().config.with_counts; + let n_workers = args.threads.max(1); let effective_z: usize = args.findere_z.unwrap_or_else(|| { match idx.meta().config.evidence { @@ -238,106 +338,48 @@ pub fn run(args: QueryArgs) { eprintln!("warning: --mismatch not yet implemented, ignored"); } + let detail = args.detail; + let count_missing = args.count_missing; + let force_presence = args.force_presence; + let presence_threshold = args.presence_threshold; + + // Flat iterator over all Rope chunks from all input files. + // I/O runs in the source thread; chunk processing is parallelised by the pipe. let paths: Vec = args.inputs.iter().map(PathBuf::from).collect(); + let all_chunks = paths.into_iter().flat_map(|path| { + let path_str = path.to_str().unwrap_or("").to_owned(); + match read_sequence_chunks(&path_str) { + Ok(iter) => Box::new(iter.filter_map(|r| match r { + Ok(rope) => Some(rope), + Err(e) => { eprintln!("read error: {e}"); None } + })) as Box + Send>, + Err(e) => { + eprintln!("error opening {path_str}: {e}"); + std::process::exit(1); + } + } + }); + + let pipe = obipipeline::make_pipe! { + QueryData : Rope => Vec, + | { + let idx = Arc::clone(&idx); + move |rope: Rope| { + process_chunk( + &idx, rope, k, n_genomes, n_partitions, with_counts, effective_z, + detail, count_missing, force_presence, presence_threshold, + ) + } + } : Chunk => Output, + }; + let mut out = BufWriter::new(io::stdout()); - - for path in &paths { - let chunks = read_sequence_chunks(path.to_str().unwrap_or("")) - .unwrap_or_else(|e| { - eprintln!("error opening {}: {e}", path.display()); - std::process::exit(1); - }); - - for chunk_result in chunks { - let chunk = chunk_result.unwrap_or_else(|e| { - eprintln!("read error: {e}"); - std::process::exit(1); - }); - - let records = parse_chunk(&chunk, k); - if records.is_empty() { - continue; - } - - let batch = QueryBatch::from_records(records, k, 6, 0.7); - let n_seqs = batch.ids.len(); - - let mut accs: Vec = - (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); - - // [seq_idx][genome_idx][kmer_position] — allocated only with --detail - let mut cov: Vec>> = if args.detail { - batch.n_kmers.iter() - .map(|&n| vec![vec![0u32; n as usize]; n_genomes]) - .collect() - } else { - Vec::new() - }; - - let by_part = batch.split_by_partition(n_partitions); - - for (part_idx, part_sks) in by_part.iter().enumerate() { - if part_sks.is_empty() { - continue; - } - - let kmer_results = idx - .partition() - .query_partition(part_idx, part_sks, k, n_genomes, with_counts) - .unwrap_or_else(|e| { - eprintln!("query error on partition {part_idx}: {e}"); - std::process::exit(1); - }); - - let presence = args.force_presence || !with_counts; - let threshold = args.presence_threshold; - - for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) { - let filtered = apply_findere(sk_kmer_results, effective_z, n_genomes); - let descs = batch.map.get(*rsk).expect("rsk must be in map"); - - for desc in descs { - let acc = &mut accs[desc.seq_idx as usize]; - - for (local_pos, hit) in filtered.iter().enumerate() { - match hit { - None => { - // Only truly missing if the index also had no entry. - if sk_kmer_results[local_pos].is_none() { - acc.kmer_missing += 1; - } - } - Some(row) => { - acc.kmer_count += 1; - for (g, &v) in row.iter().enumerate() { - if v == 0 { - continue; - } - let contribution = if presence { - u32::from(v >= threshold) - } else { - v - }; - acc.genome_totals[g] += contribution; - if args.detail { - let abs_pos = desc.kmer_offset as usize + local_pos; - cov[desc.seq_idx as usize][g][abs_pos] += contribution; - } - } - } - } - } - } - } - } - - emit_batch( - &batch, &accs, idx.meta(), - args.count_missing, args.detail, &cov, - &mut out, - ); + for block in pipe.apply(all_chunks, n_workers, 2) { + if !block.is_empty() { + out.write_all(&block).expect("write error"); } } + out.flush().expect("flush error"); } // ── Output ──────────────────────────────────────────────────────────────────── -- 2.52.0 From 86b88acb95c481fba3684e0d91f0af8af2e7afb4 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 29 May 2026 09:07:35 +0200 Subject: [PATCH 6/6] feat: implement approximate k-mer indexing and optimize query Enable approximate k-mer indexing via the `--approx` flag, computing an effective k-mer size of `k - z + 1` and configuring the appropriate indexing mode with validated probabilistic parameters. Refactor the Findere z-window filter in the query command to improve performance and correctness by replacing the precomputed vector with a lazy closure, optimizing cache locality, and fixing a variable naming bug. --- src/obikmer/src/cmd/index.rs | 11 +++---- src/obikmer/src/cmd/query.rs | 56 +++++++++++++++++------------------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 1a796f9..0f5f770 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -158,7 +158,7 @@ pub fn run(args: IndexArgs) { let mut rep = Reporter::new(); // ── Resolve evidence kind ──────────────────────────────────────────────── - let evidence = if args.approx { + let (evidence, effective_kmer_size) = if args.approx { let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp); let k = args.common.kmer_size; if z as usize >= k { @@ -169,10 +169,11 @@ pub fn run(args: IndexArgs) { ); std::process::exit(1); } - info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}"); - IndexMode::Approx { b, z } + let s = k - z as usize + 1; + info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}, indexed kmer size={s}"); + (IndexMode::Approx { b, z }, s) } else { - IndexMode::Exact + (IndexMode::Exact, args.common.kmer_size) }; // ── Open or create the index ───────────────────────────────────────────── @@ -197,7 +198,7 @@ pub fn run(args: IndexArgs) { } let block_bits = block_size_to_bits(args.block_size); let config = IndexConfig { - kmer_size: args.common.kmer_size, + kmer_size: effective_kmer_size, minimizer_size: args.common.minimizer_size, n_bits, with_counts: args.with_counts, diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index beb303e..2af272e 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -160,54 +160,50 @@ impl SeqAcc { // ── Findere z-window filter ─────────────────────────────────────────────────── /// Apply the Findere z-window filter to per-kmer query results for one superkmer. +/// Aggregate s-mer query results into k-mer answers using a Findere z-window. +/// +/// Input: N s-mer results (indexed kmer size s = k − z + 1). +/// Output: N − z + 1 k-mer results (user kmer size k). +/// +/// For each genome g independently: k-mer at position i is confirmed iff all z values +/// results[i..i+z][g] are nonzero (None counts as zero for all genomes). +/// Output values are taken from results[i]; genomes not confirmed are zeroed. fn apply_findere( results: &[Option>], z: usize, n_genomes: usize, ) -> Vec>> { let n = results.len(); - if z <= 1 || n < z { + if z <= 1 { return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect(); } + if n < z { + return Vec::new(); + } - let mut confirmed = vec![vec![false; n_genomes]; n]; + let out_n = n - z + 1; + let mut confirmed = vec![vec![false; n_genomes]; out_n]; for g in 0..n_genomes { - let present: Vec = results - .iter() - .map(|r| r.as_ref().map_or(false, |row| row[g] > 0)) - .collect(); + let hit = |i: usize| results[i].as_ref().map_or(false, |r| r[g] > 0); - let mut window_count = present[..z].iter().filter(|&&p| p).count(); - if window_count == z { - for c in confirmed[..z].iter_mut() { - c[g] = true; - } - } + let mut count: u32 = (0..z).filter(|&j| hit(j)).count() as u32; + if count == z as u32 { confirmed[0][g] = true; } - for j in 1..=(n - z) { - if present[j - 1] { window_count -= 1; } - if present[j + z - 1] { window_count += 1; } - if window_count == z { - for c in confirmed[j..j + z].iter_mut() { - c[g] = true; - } - } + for i in 1..out_n { + if hit(i - 1) { count -= 1; } + if hit(i + z - 1) { count += 1; } + if count == z as u32 { confirmed[i][g] = true; } } } - results.iter().enumerate().map(|(i, opt)| { - let row = opt.as_ref()?; - let mut new_row: Box<[u32]> = row.clone(); - let mut any = false; + (0..out_n).map(|i| { + let first = results[i].as_ref()?; + let mut row: Box<[u32]> = first.clone(); for g in 0..n_genomes { - if !confirmed[i][g] { - new_row[g] = 0; - } else { - any = true; - } + if !confirmed[i][g] { row[g] = 0; } } - if any { Some(new_row) } else { None } + if row.iter().any(|&v| v > 0) { Some(row) } else { None } }).collect() } -- 2.52.0