♻️ refactor rope implementation to use obikrope

- rename `obirope` → `obikrope`
- replace legacy rope with new in-place, Cell-based implementation
  - add ForwardCursor/BackwardCursor & SeekMode support (no more BytesMut)
- update all dependents:
  - obiread: switch to Rope + cursors, remove tape.rs
    • chunk iterator yields `Rope` instead of Vec<Bytes>
  - obiskbuilder: use ForwardCursor over Rope
- remove bytes dependency from affected crates
This commit is contained in:
Eric Coissac
2026-04-19 21:22:10 +02:00
parent 5fab59f92c
commit 0dcb5dd6c2
19 changed files with 790 additions and 1140 deletions
+1 -2
View File
@@ -5,5 +5,4 @@ edition = "2024"
[dependencies]
obikseq = { path = "../obikseq" }
obiread = { path = "../obiread" }
bytes = "1"
obikrope = { path = "../obikrope" }
+30 -77
View File
@@ -15,8 +15,7 @@
//! | minimizer changed | k |
//! | super-kmer length = 256| k |
use bytes::Bytes;
use obiread::tape::RopeCursor;
use obikrope::{ForwardCursor, Rope, RopeCursor};
use obikseq::superkmer::SuperKmer;
use crate::encoding::encode_base;
@@ -26,9 +25,8 @@ use crate::scratch::SuperKmerScratch;
use crate::window::KmerWindow;
/// Iterator over `(minimizer, SuperKmer)` pairs.
pub struct SuperKmerIter {
rope: Vec<Bytes>,
cursor: RopeCursor,
pub struct SuperKmerIter<'a> {
cursor: ForwardCursor<'a>,
k: usize,
scratch: SuperKmerScratch,
window: KmerWindow,
@@ -38,23 +36,16 @@ pub struct SuperKmerIter {
prev_minimizer_pos: u8,
}
impl SuperKmerIter {
impl<'a> SuperKmerIter<'a> {
/// Build an iterator from a normalised rope chunk.
///
/// - `k`: k-mer size (131)
/// - `m`: minimizer size (1 < m < k)
/// - `level_max`: maximum sub-word size for entropy (typically 6)
/// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected
pub fn new(
chunks: Vec<Bytes>,
k: usize,
m: usize,
level_max: usize,
theta: f64,
) -> Self {
pub fn new(rope: &'a Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Self {
Self {
rope: chunks,
cursor: RopeCursor::new(),
cursor: rope.fw_cursor(),
k,
scratch: SuperKmerScratch::new(),
window: KmerWindow::new(k),
@@ -65,13 +56,6 @@ impl SuperKmerIter {
}
}
#[inline]
fn read_byte(&mut self) -> Option<u8> {
let b = self.cursor.peek(&self.rope)?;
self.cursor.advance(&self.rope);
Some(b)
}
fn reset_state(&mut self) {
self.window.reset();
self.minimizer.reset();
@@ -80,8 +64,6 @@ impl SuperKmerIter {
self.prev_minimizer_pos = 0;
}
/// Emit the current scratch as a super-kmer if it holds at least k nucleotides.
/// Returns `None` and silently discards short buffers (< k nt, no complete kmer).
fn try_emit(&mut self, min: u64) -> Option<(u64, SuperKmer)> {
if self.scratch.len() < self.k {
self.scratch.reset();
@@ -93,19 +75,17 @@ impl SuperKmerIter {
}
}
impl Iterator for SuperKmerIter {
impl<'a> Iterator for SuperKmerIter<'a> {
type Item = (u64, SuperKmer);
fn next(&mut self) -> Option<(u64, SuperKmer)> {
loop {
let byte = match self.read_byte() {
let byte = match self.cursor.read_next().ok() {
None => {
// EOF: flush whatever we have.
let min = self.prev_minimizer.unwrap_or(0);
return self.try_emit(min);
}
Some(0x00) => {
// Segment boundary: flush current super-kmer, then reset.
let min = self.prev_minimizer.unwrap_or(0);
let result = self.try_emit(min);
self.reset_state();
@@ -118,25 +98,22 @@ impl Iterator for SuperKmerIter {
};
let base2 = encode_base(byte);
// Update sliding windows; kmer_ready is true once we have k bases.
let kmer_ready = self.window.push(base2);
let current_min = self.minimizer.push(base2);
if !kmer_ready {
// Warm-up phase: just accumulate.
self.scratch.push(byte);
continue;
}
let kmer = self.window.kmer_u64();
let min = current_min.unwrap(); // guaranteed when kmer_ready
let min = current_min.unwrap();
// ── 1. Entropy check ─────────────────────────────────────────────
if !self.entropy.accept(kmer) {
let prev_min = self.prev_minimizer.unwrap_or(0);
let result = self.try_emit(prev_min);
self.cursor.retreat(self.k - 1, &self.rope);
self.cursor.rewind(self.k - 1).ok();
self.reset_state();
if result.is_some() {
return result;
@@ -148,7 +125,7 @@ impl Iterator for SuperKmerIter {
if let Some(prev) = self.prev_minimizer {
if min != prev {
let result = self.try_emit(prev);
self.cursor.retreat(self.k, &self.rope);
self.cursor.rewind(self.k).ok();
self.reset_state();
if result.is_some() {
return result;
@@ -158,10 +135,9 @@ impl Iterator for SuperKmerIter {
}
// ── 3. Super-kmer length check ────────────────────────────────────
// S[j] would be the 257th nucleotide → don't add it.
if self.scratch.len() == 256 {
let result = self.try_emit(min);
self.cursor.retreat(self.k, &self.rope);
self.cursor.rewind(self.k).ok();
self.reset_state();
if result.is_some() {
return result;
@@ -169,7 +145,6 @@ impl Iterator for SuperKmerIter {
continue;
}
// All checks passed: accept S[j].
self.prev_minimizer = Some(min);
self.prev_minimizer_pos = self.minimizer.minimizer_pos();
self.scratch.push(byte);
@@ -182,29 +157,24 @@ impl Iterator for SuperKmerIter {
#[cfg(test)]
mod tests {
use super::*;
use bytes::BytesMut;
fn chunks(data: &[u8]) -> Vec<Bytes> {
vec![BytesMut::from(data).freeze()]
fn make_rope(data: &[u8]) -> Rope {
let mut r = Rope::new();
r.push(data.to_vec());
r
}
/// theta=0: accept everything; level_max=1 (minimal entropy computation).
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
SuperKmerIter::new(chunks(data), k, m, 1, 0.0)
let rope = make_rope(data);
SuperKmerIter::new(&rope, k, m, 1, 0.0)
.map(|(_, sk)| sk.to_ascii())
.collect()
}
// ── basic segmentation (no entropy / minimizer cut) ───────────────────────
#[test]
fn single_segment_one_superkmer() {
// k=4, m=2; one segment "ACGTACGT\x00"
// All consecutive 4-mers share the same minimizer (or not — but we
// just check that we get the full sequence back with theta=0).
let out = run_nofilter(b"ACGTACGT\x00", 4, 2);
assert!(!out.is_empty());
// The concatenation of all emitted superkmers covers the segment.
let total: Vec<u8> = out.into_iter().flatten().collect();
assert!(total.len() >= 4);
}
@@ -227,54 +197,37 @@ mod tests {
assert!(!out.is_empty());
}
// ── entropy cut ───────────────────────────────────────────────────────────
#[test]
fn low_complexity_kmer_is_rejected() {
// "AAAAAAAAACGT\x00": the run of A's is low-complexity;
// with a strict threshold the A-only kmers get cut.
// With theta=0 everything passes → we get one superkmer.
let out_pass = run_nofilter(b"AAAAAAAAACGT\x00", 4, 2);
assert!(!out_pass.is_empty());
// With a high threshold polyA should be rejected.
let out_reject: Vec<Vec<u8>> =
SuperKmerIter::new(chunks(b"AAAAAAAA\x00"), 4, 2, 6, 0.9)
.map(|(_, sk)| sk.to_ascii())
.collect();
// The sequence is pure polyA: all kmers should fail entropy → no emission.
let rope = make_rope(b"AAAAAAAA\x00");
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 6, 0.9)
.map(|(_, sk)| sk.to_ascii())
.collect();
assert!(out_reject.is_empty());
}
// ── multi-slice rope ──────────────────────────────────────────────────────
#[test]
fn multi_slice_rope() {
let data = b"ACGTACGTACGT\x00";
let mid = data.len() / 2;
let rope = vec![
BytesMut::from(&data[..mid]).freeze(),
BytesMut::from(&data[mid..]).freeze(),
];
let out: Vec<Vec<u8>> = SuperKmerIter::new(rope, 4, 2, 1, 0.0)
let mut rope = Rope::new();
rope.push(data[..mid].to_vec());
rope.push(data[mid..].to_vec());
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
.map(|(_, sk)| sk.to_ascii())
.collect();
assert!(!out.is_empty());
}
// ── minimizer is returned ─────────────────────────────────────────────────
#[test]
fn yields_minimizer_value() {
let data = b"ACGTACGT\x00";
let results: Vec<(u64, Vec<u8>)> =
SuperKmerIter::new(chunks(data), 4, 2, 1, 0.0)
.map(|(min, sk)| (min, sk.to_ascii()))
.collect();
let rope = make_rope(b"ACGTACGT\x00");
let results: Vec<(u64, Vec<u8>)> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
.map(|(min, sk)| (min, sk.to_ascii()))
.collect();
assert!(!results.is_empty());
// The minimizer is a non-trivial u64 (just check it's been computed).
for (min, _) in &results {
let _ = min; // value is format-specific; just verify no panic
}
}
}