♻️ refactor rope implementation to use obikrope
- rename `obirope` → `obikrope`
- replace legacy rope with new in-place, Cell-based implementation
- add ForwardCursor/BackwardCursor & SeekMode support (no more BytesMut)
- update all dependents:
  - obiread: switch to Rope + cursors, remove tape.rs
    • chunk iterator yields `Rope` instead of `Vec<Bytes>`
  - obiskbuilder: use ForwardCursor over Rope
- remove bytes dependency from affected crates
This commit is contained in:
@@ -5,5 +5,4 @@ edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
obikseq = { path = "../obikseq" }
|
||||
obiread = { path = "../obiread" }
|
||||
bytes = "1"
|
||||
obikrope = { path = "../obikrope" }
|
||||
|
||||
@@ -15,8 +15,7 @@
|
||||
//! | minimizer changed | k |
|
||||
//! | super-kmer length = 256| k |
|
||||
|
||||
use bytes::Bytes;
|
||||
use obiread::tape::RopeCursor;
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
|
||||
use crate::encoding::encode_base;
|
||||
@@ -26,9 +25,8 @@ use crate::scratch::SuperKmerScratch;
|
||||
use crate::window::KmerWindow;
|
||||
|
||||
/// Iterator over `(minimizer, SuperKmer)` pairs.
|
||||
pub struct SuperKmerIter {
|
||||
rope: Vec<Bytes>,
|
||||
cursor: RopeCursor,
|
||||
pub struct SuperKmerIter<'a> {
|
||||
cursor: ForwardCursor<'a>,
|
||||
k: usize,
|
||||
scratch: SuperKmerScratch,
|
||||
window: KmerWindow,
|
||||
@@ -38,23 +36,16 @@ pub struct SuperKmerIter {
|
||||
prev_minimizer_pos: u8,
|
||||
}
|
||||
|
||||
impl SuperKmerIter {
|
||||
impl<'a> SuperKmerIter<'a> {
|
||||
/// Build an iterator from a normalised rope chunk.
|
||||
///
|
||||
/// - `k`: k-mer size (1–31)
|
||||
/// - `m`: minimizer size (1 < m < k)
|
||||
/// - `level_max`: maximum sub-word size for entropy (typically 6)
|
||||
/// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected
|
||||
pub fn new(
|
||||
chunks: Vec<Bytes>,
|
||||
k: usize,
|
||||
m: usize,
|
||||
level_max: usize,
|
||||
theta: f64,
|
||||
) -> Self {
|
||||
pub fn new(rope: &'a Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Self {
|
||||
Self {
|
||||
rope: chunks,
|
||||
cursor: RopeCursor::new(),
|
||||
cursor: rope.fw_cursor(),
|
||||
k,
|
||||
scratch: SuperKmerScratch::new(),
|
||||
window: KmerWindow::new(k),
|
||||
@@ -65,13 +56,6 @@ impl SuperKmerIter {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_byte(&mut self) -> Option<u8> {
|
||||
let b = self.cursor.peek(&self.rope)?;
|
||||
self.cursor.advance(&self.rope);
|
||||
Some(b)
|
||||
}
|
||||
|
||||
fn reset_state(&mut self) {
|
||||
self.window.reset();
|
||||
self.minimizer.reset();
|
||||
@@ -80,8 +64,6 @@ impl SuperKmerIter {
|
||||
self.prev_minimizer_pos = 0;
|
||||
}
|
||||
|
||||
/// Emit the current scratch as a super-kmer if it holds at least k nucleotides.
|
||||
/// Returns `None` and silently discards short buffers (< k nt, no complete kmer).
|
||||
fn try_emit(&mut self, min: u64) -> Option<(u64, SuperKmer)> {
|
||||
if self.scratch.len() < self.k {
|
||||
self.scratch.reset();
|
||||
@@ -93,19 +75,17 @@ impl SuperKmerIter {
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for SuperKmerIter {
|
||||
impl<'a> Iterator for SuperKmerIter<'a> {
|
||||
type Item = (u64, SuperKmer);
|
||||
|
||||
fn next(&mut self) -> Option<(u64, SuperKmer)> {
|
||||
loop {
|
||||
let byte = match self.read_byte() {
|
||||
let byte = match self.cursor.read_next().ok() {
|
||||
None => {
|
||||
// EOF: flush whatever we have.
|
||||
let min = self.prev_minimizer.unwrap_or(0);
|
||||
return self.try_emit(min);
|
||||
}
|
||||
Some(0x00) => {
|
||||
// Segment boundary: flush current super-kmer, then reset.
|
||||
let min = self.prev_minimizer.unwrap_or(0);
|
||||
let result = self.try_emit(min);
|
||||
self.reset_state();
|
||||
@@ -118,25 +98,22 @@ impl Iterator for SuperKmerIter {
|
||||
};
|
||||
|
||||
let base2 = encode_base(byte);
|
||||
|
||||
// Update sliding windows; kmer_ready is true once we have k bases.
|
||||
let kmer_ready = self.window.push(base2);
|
||||
let current_min = self.minimizer.push(base2);
|
||||
|
||||
if !kmer_ready {
|
||||
// Warm-up phase: just accumulate.
|
||||
self.scratch.push(byte);
|
||||
continue;
|
||||
}
|
||||
|
||||
let kmer = self.window.kmer_u64();
|
||||
let min = current_min.unwrap(); // guaranteed when kmer_ready
|
||||
let min = current_min.unwrap();
|
||||
|
||||
// ── 1. Entropy check ─────────────────────────────────────────────
|
||||
if !self.entropy.accept(kmer) {
|
||||
let prev_min = self.prev_minimizer.unwrap_or(0);
|
||||
let result = self.try_emit(prev_min);
|
||||
self.cursor.retreat(self.k - 1, &self.rope);
|
||||
self.cursor.rewind(self.k - 1).ok();
|
||||
self.reset_state();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
@@ -148,7 +125,7 @@ impl Iterator for SuperKmerIter {
|
||||
if let Some(prev) = self.prev_minimizer {
|
||||
if min != prev {
|
||||
let result = self.try_emit(prev);
|
||||
self.cursor.retreat(self.k, &self.rope);
|
||||
self.cursor.rewind(self.k).ok();
|
||||
self.reset_state();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
@@ -158,10 +135,9 @@ impl Iterator for SuperKmerIter {
|
||||
}
|
||||
|
||||
// ── 3. Super-kmer length check ────────────────────────────────────
|
||||
// S[j] would be the 257th nucleotide → don't add it.
|
||||
if self.scratch.len() == 256 {
|
||||
let result = self.try_emit(min);
|
||||
self.cursor.retreat(self.k, &self.rope);
|
||||
self.cursor.rewind(self.k).ok();
|
||||
self.reset_state();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
@@ -169,7 +145,6 @@ impl Iterator for SuperKmerIter {
|
||||
continue;
|
||||
}
|
||||
|
||||
// All checks passed: accept S[j].
|
||||
self.prev_minimizer = Some(min);
|
||||
self.prev_minimizer_pos = self.minimizer.minimizer_pos();
|
||||
self.scratch.push(byte);
|
||||
@@ -182,29 +157,24 @@ impl Iterator for SuperKmerIter {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use bytes::BytesMut;
|
||||
|
||||
fn chunks(data: &[u8]) -> Vec<Bytes> {
|
||||
vec![BytesMut::from(data).freeze()]
|
||||
fn make_rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new();
|
||||
r.push(data.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
/// theta=0: accept everything; level_max=1 (minimal entropy computation).
|
||||
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
|
||||
SuperKmerIter::new(chunks(data), k, m, 1, 0.0)
|
||||
let rope = make_rope(data);
|
||||
SuperKmerIter::new(&rope, k, m, 1, 0.0)
|
||||
.map(|(_, sk)| sk.to_ascii())
|
||||
.collect()
|
||||
}
|
||||
|
||||
// ── basic segmentation (no entropy / minimizer cut) ───────────────────────
|
||||
|
||||
#[test]
|
||||
fn single_segment_one_superkmer() {
|
||||
// k=4, m=2; one segment "ACGTACGT\x00"
|
||||
// All consecutive 4-mers share the same minimizer (or not — but we
|
||||
// just check that we get the full sequence back with theta=0).
|
||||
let out = run_nofilter(b"ACGTACGT\x00", 4, 2);
|
||||
assert!(!out.is_empty());
|
||||
// The concatenation of all emitted superkmers covers the segment.
|
||||
let total: Vec<u8> = out.into_iter().flatten().collect();
|
||||
assert!(total.len() >= 4);
|
||||
}
|
||||
@@ -227,54 +197,37 @@ mod tests {
|
||||
assert!(!out.is_empty());
|
||||
}
|
||||
|
||||
// ── entropy cut ───────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn low_complexity_kmer_is_rejected() {
|
||||
// "AAAAAAAAACGT\x00": the run of A's is low-complexity;
|
||||
// with a strict threshold the A-only kmers get cut.
|
||||
// With theta=0 everything passes → we get one superkmer.
|
||||
let out_pass = run_nofilter(b"AAAAAAAAACGT\x00", 4, 2);
|
||||
assert!(!out_pass.is_empty());
|
||||
|
||||
// With a high threshold polyA should be rejected.
|
||||
let out_reject: Vec<Vec<u8>> =
|
||||
SuperKmerIter::new(chunks(b"AAAAAAAA\x00"), 4, 2, 6, 0.9)
|
||||
.map(|(_, sk)| sk.to_ascii())
|
||||
.collect();
|
||||
// The sequence is pure polyA: all kmers should fail entropy → no emission.
|
||||
let rope = make_rope(b"AAAAAAAA\x00");
|
||||
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 6, 0.9)
|
||||
.map(|(_, sk)| sk.to_ascii())
|
||||
.collect();
|
||||
assert!(out_reject.is_empty());
|
||||
}
|
||||
|
||||
// ── multi-slice rope ──────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn multi_slice_rope() {
|
||||
let data = b"ACGTACGTACGT\x00";
|
||||
let mid = data.len() / 2;
|
||||
let rope = vec![
|
||||
BytesMut::from(&data[..mid]).freeze(),
|
||||
BytesMut::from(&data[mid..]).freeze(),
|
||||
];
|
||||
let out: Vec<Vec<u8>> = SuperKmerIter::new(rope, 4, 2, 1, 0.0)
|
||||
let mut rope = Rope::new();
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
|
||||
.map(|(_, sk)| sk.to_ascii())
|
||||
.collect();
|
||||
assert!(!out.is_empty());
|
||||
}
|
||||
|
||||
// ── minimizer is returned ─────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn yields_minimizer_value() {
|
||||
let data = b"ACGTACGT\x00";
|
||||
let results: Vec<(u64, Vec<u8>)> =
|
||||
SuperKmerIter::new(chunks(data), 4, 2, 1, 0.0)
|
||||
.map(|(min, sk)| (min, sk.to_ascii()))
|
||||
.collect();
|
||||
let rope = make_rope(b"ACGTACGT\x00");
|
||||
let results: Vec<(u64, Vec<u8>)> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
|
||||
.map(|(min, sk)| (min, sk.to_ascii()))
|
||||
.collect();
|
||||
assert!(!results.is_empty());
|
||||
// The minimizer is a non-trivial u64 (just check it's been computed).
|
||||
for (min, _) in &results {
|
||||
let _ = min; // value is format-specific; just verify no panic
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user