diff --git a/src/Cargo.lock b/src/Cargo.lock index cec3b4f..139b17b 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -586,10 +586,10 @@ dependencies = [ name = "obikmer" version = "0.1.0" dependencies = [ - "bytes", "clap", "crossbeam-channel", "obifastwrite", + "obikrope", "obikseq", "obiread", "obiskbuilder", diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index 7bba1cd..9b6e4ba 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -14,4 +14,4 @@ obiskbuilder = { path = "../obiskbuilder" } obifastwrite = { path = "../obifastwrite" } clap = { version = "4", features = ["derive"] } crossbeam-channel = "0.5" -bytes = "1" +obikrope = { path = "../obikrope" } diff --git a/src/obikmer/src/cmd/superkmer.rs b/src/obikmer/src/cmd/superkmer.rs index d2bd0cb..434ae21 100644 --- a/src/obikmer/src/cmd/superkmer.rs +++ b/src/obikmer/src/cmd/superkmer.rs @@ -1,10 +1,10 @@ use std::io::{self, BufWriter, Write}; use std::thread; -use bytes::Bytes; use clap::Args; use crossbeam_channel::bounded; use obifastwrite::write_scatter; +use obikrope::Rope; use obikseq::superkmer::SuperKmer; use obiskbuilder::SuperKmerIter; @@ -84,7 +84,7 @@ pub fn run(args: SuperkmerArgs) { let n_workers = args.threads.max(1); // raw chunks (reader → workers) - let (raw_tx, raw_rx) = bounded::>(n_workers * 2); + let (raw_tx, raw_rx) = bounded::(n_workers * 2); // superkmer batches (workers → output) let (sk_tx, sk_rx) = bounded::>(n_workers * 2); @@ -123,7 +123,7 @@ pub fn run(args: SuperkmerArgs) { }; const BATCH_SIZE: usize = 10_000; let mut batch = Vec::with_capacity(BATCH_SIZE); - for sk in SuperKmerIter::new(norm, k, m, level_max, theta) { + for sk in SuperKmerIter::new(&norm, k, m, level_max, theta) { batch.push(sk); if batch.len() == BATCH_SIZE { sk_tx.send(std::mem::replace( diff --git a/src/obikrope/src/cursor.rs b/src/obikrope/src/cursor.rs index 7a83201..301fcc2 100644 --- a/src/obikrope/src/cursor.rs +++ b/src/obikrope/src/cursor.rs @@ -1,35 +1,83 @@ +//! Cursors for sequential and random access over a [`Rope`]. +//! +//! # Design +//! +//! A cursor borrows a `&'a Rope` and keeps a small block cache so that +//! consecutive accesses within the same block cost O(1). The first access to a +//! new block costs O(log n) (binary search in [`Rope::lookup`]); subsequent +//! accesses within that block are free. +//! +//! All mutable state (current position, cache) is stored in [`Cell`] fields, +//! so every cursor method takes `&self` rather than `&mut self`. This means: +//! +//! - Two cursors can coexist on the same rope without lifetime conflicts. +//! - The `iter()` method returns a lightweight wrapper that holds `&Cursor`, +//! allowing `cursor.tell()` or `cursor.seek()` to be called **inside a `for` +//! loop** over the same cursor. +//! +//! # Cursors +//! +//! | Type | Direction | First `read_next` | `seek(Relative, +n)` | +//! |------|-----------|-------------------|----------------------| +//! | [`ForwardCursor`] | start → end | index 0 | advances (+n) | +//! | [`BackwardCursor`] | end → start | index `len-1` | retreats (+n) | +//! +//! # Example +//! +//! ``` +//! use obikrope::{Rope, RopeCursor}; +//! +//! let mut rope = Rope::new(); +//! rope.push(b"ACGT".to_vec()); +//! +//! let cursor = rope.fw_cursor(); +//! for byte in cursor.iter() { +//! // cursor.tell() is valid here — iter() holds &cursor, not &mut cursor +//! let _ = cursor.tell(); +//! } +//! ``` + use std::cell::Cell; use crate::{Rope, RopeError}; +/// Controls how the `pos` argument of [`RopeCursor::seek`] is interpreted. #[derive(Clone, Copy)] pub enum SeekMode { + /// `pos` is an absolute byte index from the start of the rope. Absolute, + /// `pos` is relative to the current position. + /// Positive = forward for [`ForwardCursor`], backward for [`BackwardCursor`]. Relative, + /// `pos` is counted back from the end: target = `len - pos`. RelativeToEnd, } // ── shared state ────────────────────────────────────────────────────────────── +/// Per-cursor cache of the last accessed block plus the current position. +/// +/// All fields are [`Cell`]-wrapped so they can be mutated through a shared +/// reference, enabling `&self` methods on cursors. #[derive(Clone)] pub struct CursorState<'a> { - block_idx: Cell, + block_idx: Cell, block_start: Cell, - block_end: Cell, - block: Cell<&'a [Cell]>, + block_end: Cell, + block: Cell<&'a [Cell]>, initialized: Cell, - current: Cell>, + current: Cell>, } impl<'a> CursorState<'a> { fn new() -> Self { Self { - block_idx: Cell::new(0), + block_idx: Cell::new(0), block_start: Cell::new(0), - block_end: Cell::new(0), - block: Cell::new(&[]), + block_end: Cell::new(0), + block: Cell::new(&[]), initialized: Cell::new(false), - current: Cell::new(None), + current: Cell::new(None), } } @@ -55,11 +103,10 @@ impl<'a> CursorState<'a> { self.block_idx.set(bi); self.block_start.set(bs); self.block_end.set(be); - self.block - .set(rope.get_block(bi).ok_or(RopeError::BlockNotFound(format!( - "Cannot find block for index {}", - i - )))?); + self.block.set(rope.get_block(bi).ok_or(RopeError::BlockNotFound(format!( + "Cannot find block for index {}", + i + )))?); self.initialized.set(true); } self.block.get()[i - self.block_start.get()].set(value); @@ -69,39 +116,75 @@ impl<'a> CursorState<'a> { // ── trait ───────────────────────────────────────────────────────────────────── +/// Common interface for all rope cursors. +/// +/// # Required methods +/// +/// Implementors must provide [`rope`](RopeCursor::rope), +/// [`state`](RopeCursor::state), [`read_next`](RopeCursor::read_next) and +/// [`seek`](RopeCursor::seek). Everything else has a default implementation. +/// +/// The direction of `read_next` and the sign convention for +/// [`SeekMode::Relative`] differ between [`ForwardCursor`] and +/// [`BackwardCursor`]; all other methods are identical. pub trait RopeCursor<'a> { + /// The rope this cursor is bound to. fn rope(&self) -> &'a Rope; + /// Internal cache state — implementation detail exposed for default methods. fn state(&self) -> &CursorState<'a>; - // Required: differ between Forward and Backward + /// Read the next byte in cursor direction and advance the position. + /// Returns `Err` at the exhausted end. fn read_next(&self) -> Result; + + /// Move the cursor to an absolute or relative position. + /// + /// For [`ForwardCursor`], `Relative +n` advances toward the end. + /// For [`BackwardCursor`], `Relative +n` retreats toward the start + /// (i.e. subtracts from the current index). fn seek(&self, pos: isize, mode: SeekMode) -> Result; - // Defaults: identical for all cursors + // ── default methods ─────────────────────────────────────────────────────── + + /// Read the byte at absolute index `i` without moving the position. fn get(&self, i: usize) -> Option { self.state().get(self.rope(), i) } + + /// Write `value` at absolute index `i` without moving the position. fn set(&self, i: usize, value: u8) -> Result<(), RopeError> { self.state().set(self.rope(), i, value) } + + /// Current position, or `None` if the cursor has not moved yet. fn tell(&self) -> Option { self.state().current.get() } + + /// Total number of bytes in the rope. fn len(&self) -> usize { self.rope().len() } + + /// Read the byte at the current position without advancing. fn peek(&self) -> Option { self.state().get(self.rope(), self.state().current.get()?) } + + /// Write `value` at the current position without advancing. fn poke(&self, value: u8) -> Result<(), RopeError> { let pos = self.state().current.get().ok_or(RopeError::CurrentNotSet)?; self.state().set(self.rope(), pos, value) } + + /// Move backward by `go_back_of` steps (toward lower indices for + /// [`ForwardCursor`], toward higher indices for [`BackwardCursor`]). fn rewind(&self, go_back_of: usize) -> Result<(), RopeError> { self.seek(-(go_back_of as isize), SeekMode::Relative)?; Ok(()) } + /// Move forward by `ahead` steps (opposite of [`rewind`](RopeCursor::rewind)). fn forward(&self, ahead: usize) -> Result<(), RopeError> { self.seek(ahead as isize, SeekMode::Relative)?; Ok(()) @@ -110,32 +193,40 @@ pub trait RopeCursor<'a> { // ── ForwardCursor ───────────────────────────────────────────────────────────── +/// A cursor that reads from the start toward the end of the rope. +/// +/// - `read_next`: first call reads index 0, then 1, 2, … +/// - `seek(Relative, +n)`: advances by n. +/// - `rewind(n)`: steps back by n. +/// +/// Extra methods not in the trait: [`read_ahead`](ForwardCursor::read_ahead), +/// [`write`](ForwardCursor::write), [`iter`](ForwardCursor::iter). #[derive(Clone)] pub struct ForwardCursor<'a> { - rope: &'a Rope, + rope: &'a Rope, state: CursorState<'a>, } impl<'a> ForwardCursor<'a> { + /// Create a new forward cursor positioned before the first byte. pub fn new(rope: &'a Rope) -> Self { - Self { - rope, - state: CursorState::new(), - } + Self { rope, state: CursorState::new() } } + /// Read the byte at `current + ahead` without moving the position. pub fn read_ahead(&self, ahead: usize) -> Result { let pos = self.state.current.get().ok_or(RopeError::CurrentNotSet)?; self.state .get(self.rope, pos + ahead) .ok_or(RopeError::OutOfBounds(format!( "index out of bounds: i={} + {} > {}", - pos, - ahead, - self.rope.len() + pos, ahead, self.rope.len() ))) } + /// Write `value` at the current position and advance by one. + /// + /// If the cursor has not moved yet, writes at index 0. pub fn write(&self, value: u8) -> Result<(), RopeError> { let pos = self.state.current.get().unwrap_or(0); self.state.set(self.rope, pos, value)?; @@ -143,31 +234,30 @@ impl<'a> ForwardCursor<'a> { Ok(()) } + /// Return a shared-borrow iterator that yields bytes forward. + /// + /// Because the iterator holds `&self` rather than `&mut self`, methods + /// such as [`tell`](RopeCursor::tell) and [`seek`](RopeCursor::seek) can + /// be called on the cursor inside the loop body. pub fn iter(&self) -> ForwardIter<'a, '_> { ForwardIter { cursor: self } } } impl<'a> RopeCursor<'a> for ForwardCursor<'a> { - fn rope(&self) -> &'a Rope { - self.rope - } - fn state(&self) -> &CursorState<'a> { - &self.state - } + fn rope(&self) -> &'a Rope { self.rope } + fn state(&self) -> &CursorState<'a> { &self.state } fn read_next(&self) -> Result { let next_pos = match self.state.current.get() { Some(i) => i + 1, - None => 0, + None => 0, }; - let value = self - .state + let value = self.state .get(self.rope, next_pos) .ok_or(RopeError::OutOfBounds(format!( "index out of bounds: i={} > {}", - next_pos, - self.rope.len() + next_pos, self.rope.len() )))?; self.state.current.set(Some(next_pos)); Ok(value) @@ -175,17 +265,12 @@ impl<'a> RopeCursor<'a> for ForwardCursor<'a> { fn seek(&self, pos: isize, mode: SeekMode) -> Result { let pos = match mode { - SeekMode::Absolute => pos, - SeekMode::Relative => { - self.state.current.get().ok_or(RopeError::CurrentNotSet)? as isize + pos - } + SeekMode::Absolute => pos, + SeekMode::Relative => self.state.current.get().ok_or(RopeError::CurrentNotSet)? as isize + pos, SeekMode::RelativeToEnd => self.rope.len() as isize - pos, }; if pos < 0 { - return Err(RopeError::OutOfBounds(format!( - "index out of bounds: i={} < 0", - pos - ))); + return Err(RopeError::OutOfBounds(format!("index out of bounds: i={} < 0", pos))); } self.state.current.set(Some(pos as usize)); Ok(pos as usize) @@ -194,38 +279,42 @@ impl<'a> RopeCursor<'a> for ForwardCursor<'a> { impl Iterator for ForwardCursor<'_> { type Item = u8; - fn next(&mut self) -> Option { - self.read_next().ok() - } + fn next(&mut self) -> Option { self.read_next().ok() } } +/// Shared-borrow iterator returned by [`ForwardCursor::iter`]. pub struct ForwardIter<'a, 'b> { cursor: &'b ForwardCursor<'a>, } impl Iterator for ForwardIter<'_, '_> { type Item = u8; - fn next(&mut self) -> Option { - self.cursor.read_next().ok() - } + fn next(&mut self) -> Option { self.cursor.read_next().ok() } } // ── BackwardCursor ──────────────────────────────────────────────────────────── +/// A cursor that reads from the end toward the start of the rope. +/// +/// - `read_next`: first call reads index `len-1`, then `len-2`, … +/// - `seek(Relative, +n)`: retreats by n (subtracts n from the index). +/// - `rewind(n)`: advances toward the end by n. +/// +/// Extra methods not in the trait: [`read_behind`](BackwardCursor::read_behind), +/// [`iter`](BackwardCursor::iter). #[derive(Clone)] pub struct BackwardCursor<'a> { - rope: &'a Rope, + rope: &'a Rope, state: CursorState<'a>, } impl<'a> BackwardCursor<'a> { + /// Create a new backward cursor positioned past the last byte. pub fn new(rope: &'a Rope) -> Self { - Self { - rope, - state: CursorState::new(), - } + Self { rope, state: CursorState::new() } } + /// Read the byte at `current + behind` (toward higher indices) without moving. pub fn read_behind(&self, behind: usize) -> Result { let pos = self.state.current.get().ok_or(RopeError::CurrentNotSet)?; let target = pos @@ -233,51 +322,41 @@ impl<'a> BackwardCursor<'a> { .filter(|&t| t < self.rope.len()) .ok_or(RopeError::OutOfBounds(format!( "index out of bounds: i={} + {} > {}", - pos, - behind, - self.rope.len() + pos, behind, self.rope.len() )))?; self.state .get(self.rope, target) .ok_or(RopeError::OutOfBounds(format!( "index out of bounds: i={} + {} > {}", - pos, - behind, - self.rope.len() + pos, behind, self.rope.len() ))) } + /// Return a shared-borrow iterator that yields bytes backward. + /// + /// Because the iterator holds `&self` rather than `&mut self`, methods + /// such as [`tell`](RopeCursor::tell) and [`seek`](RopeCursor::seek) can + /// be called on the cursor inside the loop body. pub fn iter(&self) -> BackwardIter<'a, '_> { BackwardIter { cursor: self } } } impl<'a> RopeCursor<'a> for BackwardCursor<'a> { - fn rope(&self) -> &'a Rope { - self.rope - } - fn state(&self) -> &CursorState<'a> { - &self.state - } + fn rope(&self) -> &'a Rope { self.rope } + fn state(&self) -> &CursorState<'a> { &self.state } fn read_next(&self) -> Result { let next_pos = match self.state.current.get() { - None => self - .rope - .len() - .checked_sub(1) - .ok_or(RopeError::OutOfBounds( - "BackwardCursor: rope is empty".to_string(), - ))?, - Some(0) => { - return Err(RopeError::OutOfBounds( - "BackwardCursor: already at beginning".to_string(), - )); - } + None => self.rope.len().checked_sub(1).ok_or(RopeError::OutOfBounds( + "BackwardCursor: rope is empty".to_string(), + ))?, + Some(0) => return Err(RopeError::OutOfBounds( + "BackwardCursor: already at beginning".to_string(), + )), Some(i) => i - 1, }; - let value = self - .state + let value = self.state .get(self.rope, next_pos) .ok_or(RopeError::OutOfBounds(format!( "BackwardCursor: index out of bounds at i={}", @@ -289,17 +368,12 @@ impl<'a> RopeCursor<'a> for BackwardCursor<'a> { fn seek(&self, pos: isize, mode: SeekMode) -> Result { let pos = match mode { - SeekMode::Absolute => pos, - SeekMode::Relative => { - self.state.current.get().ok_or(RopeError::CurrentNotSet)? as isize - pos - } + SeekMode::Absolute => pos, + SeekMode::Relative => self.state.current.get().ok_or(RopeError::CurrentNotSet)? as isize - pos, SeekMode::RelativeToEnd => self.rope.len() as isize - pos, }; if pos < 0 { - return Err(RopeError::OutOfBounds(format!( - "index out of bounds: i={} < 0", - pos - ))); + return Err(RopeError::OutOfBounds(format!("index out of bounds: i={} < 0", pos))); } self.state.current.set(Some(pos as usize)); Ok(pos as usize) @@ -308,18 +382,211 @@ impl<'a> RopeCursor<'a> for BackwardCursor<'a> { impl Iterator for BackwardCursor<'_> { type Item = u8; - fn next(&mut self) -> Option { - self.read_next().ok() - } + fn next(&mut self) -> Option { self.read_next().ok() } } +/// Shared-borrow iterator returned by [`BackwardCursor::iter`]. pub struct BackwardIter<'a, 'b> { cursor: &'b BackwardCursor<'a>, } impl Iterator for BackwardIter<'_, '_> { type Item = u8; - fn next(&mut self) -> Option { - self.cursor.read_next().ok() + fn next(&mut self) -> Option { self.cursor.read_next().ok() } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + use crate::Rope; + + fn rope(data: &[u8]) -> Rope { + let mut r = Rope::new(); + r.push(data.to_vec()); + r + } + + fn rope2(a: &[u8], b: &[u8]) -> Rope { + let mut r = Rope::new(); + r.push(a.to_vec()); + r.push(b.to_vec()); + r + } + + // ── ForwardCursor ───────────────────────────────────────────────────────── + + #[test] + fn forward_reads_all_bytes() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + let out: Vec = c.collect(); + assert_eq!(out, b"ACGT"); + } + + #[test] + fn forward_tell_tracks_position() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + assert_eq!(c.tell(), None); + c.read_next().unwrap(); + assert_eq!(c.tell(), Some(0)); + c.read_next().unwrap(); + assert_eq!(c.tell(), Some(1)); + } + + #[test] + fn forward_iter_with_tell_inside_loop() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + let mut positions = Vec::new(); + for _ in c.iter() { + positions.push(c.tell()); + } + assert_eq!(positions, vec![Some(0), Some(1), Some(2), Some(3)]); + } + + #[test] + fn forward_read_ahead() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + c.read_next().unwrap(); // at 0 = 'A' + assert_eq!(c.read_ahead(1).unwrap(), b'C'); + assert_eq!(c.read_ahead(2).unwrap(), b'G'); + assert_eq!(c.tell(), Some(0)); // position unchanged + } + + #[test] + fn forward_write_and_read_back() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + c.write(b'X').unwrap(); + c.write(b'Y').unwrap(); + let c2 = r.fw_cursor(); + assert_eq!(c2.read_next().unwrap(), b'X'); + assert_eq!(c2.read_next().unwrap(), b'Y'); + assert_eq!(c2.read_next().unwrap(), b'G'); + } + + #[test] + fn forward_rewind_and_reread() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + c.read_next().unwrap(); // A → current = Some(0) + c.read_next().unwrap(); // C → current = Some(1) + c.read_next().unwrap(); // G → current = Some(2) + c.rewind(1).unwrap(); // current = Some(1) → next read = index 2 + assert_eq!(c.read_next().unwrap(), b'G'); + } + + #[test] + fn forward_seek_absolute() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + c.seek(2, SeekMode::Absolute).unwrap(); + assert_eq!(c.read_next().unwrap(), b'T'); + } + + #[test] + fn forward_seek_relative_to_end() { + let r = rope(b"ACGT"); + // seek(1, RelativeToEnd): current = len-1 = 3; peek() reads index 3 = T. + let c = r.fw_cursor(); + c.seek(1, SeekMode::RelativeToEnd).unwrap(); + assert_eq!(c.peek().unwrap(), b'T'); + // seek(2, RelativeToEnd): current = len-2 = 2; read_next reads index 3 = T. + let c2 = r.fw_cursor(); + c2.seek(2, SeekMode::RelativeToEnd).unwrap(); + assert_eq!(c2.read_next().unwrap(), b'T'); + } + + #[test] + fn forward_get_random_access() { + let r = rope(b"ACGT"); + let c = r.fw_cursor(); + assert_eq!(c.get(0), Some(b'A')); + assert_eq!(c.get(3), Some(b'T')); + assert_eq!(c.get(4), None); + } + + #[test] + fn forward_crosses_block_boundary() { + let r = rope2(b"AC", b"GT"); + let c = r.fw_cursor(); + let out: Vec = c.collect(); + assert_eq!(out, b"ACGT"); + } + + // ── BackwardCursor ──────────────────────────────────────────────────────── + + #[test] + fn backward_reads_all_bytes_in_reverse() { + let r = rope(b"ACGT"); + let c = r.bw_cursor(); + let out: Vec = c.collect(); + assert_eq!(out, b"TGCA"); + } + + #[test] + fn backward_tell_tracks_position() { + let r = rope(b"ACGT"); + let c = r.bw_cursor(); + assert_eq!(c.tell(), None); + c.read_next().unwrap(); // reads index 3 + assert_eq!(c.tell(), Some(3)); + c.read_next().unwrap(); // reads index 2 + assert_eq!(c.tell(), Some(2)); + } + + #[test] + fn backward_iter_with_tell_and_seek_inside_loop() { + let r = rope(b"ACGT"); + let c = r.bw_cursor(); + let mut restart: usize = 0; + for byte in c.iter() { + if byte == b'G' { + restart = c.tell().unwrap(); + } + if byte == b'A' { + // seek back to G and break + c.seek(restart as isize, SeekMode::Absolute).ok(); + break; + } + } + assert_eq!(c.tell(), Some(restart)); + } + + #[test] + fn backward_rewind_moves_toward_end() { + let r = rope(b"ACGT"); + let c = r.bw_cursor(); + c.read_next().unwrap(); // index 3 = T + c.read_next().unwrap(); // index 2 = G + c.rewind(1).unwrap(); // back to index 3 + assert_eq!(c.tell(), Some(3)); + assert_eq!(c.read_next().unwrap(), b'G'); // reads index 2 + } + + #[test] + fn backward_crosses_block_boundary() { + let r = rope2(b"AC", b"GT"); + let c = r.bw_cursor(); + let out: Vec = c.collect(); + assert_eq!(out, b"TGCA"); + } + + #[test] + fn backward_empty_rope_returns_error() { + let r = Rope::new(); + let c = r.bw_cursor(); + assert!(c.read_next().is_err()); + } + + #[test] + fn forward_empty_rope_returns_error() { + let r = Rope::new(); + let c = r.fw_cursor(); + assert!(c.read_next().is_err()); } } diff --git a/src/obikrope/src/rope.rs b/src/obikrope/src/rope.rs index 8b57e52..cec95e1 100644 --- a/src/obikrope/src/rope.rs +++ b/src/obikrope/src/rope.rs @@ -1,6 +1,32 @@ +//! The [`Rope`] type: a segmented, in-place-mutable byte sequence. +//! +//! A `Rope` is a sequence of byte blocks (slices) stored contiguously in a +//! `Vec>>`. Blocks are never merged or reallocated; bytes within +//! a block can be modified through a [`ForwardCursor`] while another cursor +//! reads ahead — the [`Cell`][std::cell::Cell] wrapper provides the +//! required interior mutability without `unsafe` at the call site. +//! +//! ## Core operations +//! +//! | Method | Description | +//! |---|---| +//! | [`push`][Rope::push] | Append a `Vec` block | +//! | [`split_off`][Rope::split_off] | Split the rope at a byte offset | +//! | [`fw_cursor`][Rope::fw_cursor] | Forward cursor (read/write left→right) | +//! | [`bw_cursor`][Rope::bw_cursor] | Backward cursor (read right→left) | +//! +//! ## Block indexing +//! +//! `start_block_idx[i]` holds the absolute byte offset of the first byte of +//! block `i`. [`lookup`][Rope::lookup] binary-searches this index to resolve +//! an absolute offset to `(block_idx, block_start, block_end)` in O(log n). + use crate::{BackwardCursor, ForwardCursor, RopeError}; use std::cell::Cell; +/// A segmented, in-place-mutable byte sequence. +/// +/// See the [module-level documentation][crate::rope] for a full overview. pub struct Rope { pub(crate) blocks: Vec>>, pub(crate) length: usize, @@ -8,6 +34,7 @@ pub struct Rope { } impl Rope { + /// Create an empty rope (no allocations). pub fn new() -> Self { Self { blocks: Vec::new(), @@ -16,10 +43,14 @@ impl Rope { } } + /// Append a block of bytes to the rope. + /// + /// The `Vec` is reinterpreted as `Vec>` in place (zero-copy) + /// using the guaranteed identical memory layout of `Cell` and `T`. pub fn push(&mut self, block: Vec) { let block_len = block.len(); self.start_block_idx.push(self.length); - // Safety: Cell has the same memory layout as u8 (guaranteed by the language) + // Cell has the same memory layout as u8 (language guarantee). let cell_block: Vec> = unsafe { let mut v = std::mem::ManuallyDrop::new(block); Vec::from_raw_parts(v.as_mut_ptr() as *mut Cell, v.len(), v.capacity()) @@ -28,18 +59,32 @@ impl Rope { self.length += block_len; } + /// Total number of blocks. pub fn n_blocks(&self) -> usize { self.blocks.len() } + /// Return the slice of `Cell` for block `block_idx`, or `None` if out + /// of range. pub(crate) fn get_block(&self, block_idx: usize) -> Option<&[Cell]> { self.blocks.get(block_idx).map(Vec::as_slice) } + /// Total byte length across all blocks. pub fn len(&self) -> usize { self.length } + /// `true` if the rope contains no bytes. + pub fn is_empty(&self) -> bool { + self.blocks.is_empty() + } + + /// Resolve absolute byte offset `i` to `(block_idx, block_start, block_end)`. + /// + /// Returns `None` when `i >= self.length` or the rope is empty. + /// `block_start` and `block_end` are absolute byte offsets of the first and + /// one-past-last byte of the block, respectively. pub(crate) fn lookup(&self, i: usize) -> Option<(usize, usize, usize)> { if i >= self.length || self.blocks.is_empty() { return None; @@ -54,6 +99,13 @@ impl Rope { Some((block_idx, from, to)) } + /// Split the rope at byte offset `pos`. + /// + /// `self` retains bytes `[0, pos)` and returns a new rope with bytes + /// `[pos, len)`. If `pos` falls inside a block, that block is split in + /// two. + /// + /// Returns `Err` if `pos > self.length`. pub fn split_off(&mut self, pos: usize) -> Result { if pos > self.length { return Err(RopeError::OutOfBounds(format!( @@ -62,7 +114,6 @@ impl Rope { ))); } - // pos == length: tail is empty. if pos == self.length { return Ok(Rope::new()); } @@ -72,7 +123,6 @@ impl Rope { })?; let cut_offset = pos - from; - // Keep block_idx in self temporarily, split it, move remainder to tail. let mut tail_blocks = self.blocks.split_off(block_idx + 1); self.start_block_idx.truncate(block_idx + 1); @@ -80,6 +130,11 @@ impl Rope { if !tail_part.is_empty() { tail_blocks.insert(0, tail_part); } + // If the cut was exactly at the start of this block, it is now empty — discard it. + if self.blocks[block_idx].is_empty() { + self.blocks.pop(); + self.start_block_idx.pop(); + } let mut tail_length = 0; let tail_starts: Vec = tail_blocks @@ -100,15 +155,196 @@ impl Rope { }) } - pub fn is_empty(&self) -> bool { - self.blocks.is_empty() - } - + /// Create a forward cursor positioned before the first byte. pub fn fw_cursor(&self) -> ForwardCursor<'_> { ForwardCursor::new(self) } + /// Create a backward cursor positioned after the last byte. pub fn bw_cursor(&self) -> BackwardCursor<'_> { BackwardCursor::new(self) } } + +// ── tests ───────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + fn flat(r: &Rope) -> Vec { + r.fw_cursor().collect() + } + + fn make(data: &[u8]) -> Rope { + let mut r = Rope::new(); + r.push(data.to_vec()); + r + } + + fn make2(a: &[u8], b: &[u8]) -> Rope { + let mut r = Rope::new(); + r.push(a.to_vec()); + r.push(b.to_vec()); + r + } + + // ── basic properties ────────────────────────────────────────────────────── + + #[test] + fn empty_rope_is_empty() { + let r = Rope::new(); + assert!(r.is_empty()); + assert_eq!(r.len(), 0); + assert_eq!(r.n_blocks(), 0); + } + + #[test] + fn single_push_len_and_n_blocks() { + let r = make(b"hello"); + assert!(!r.is_empty()); + assert_eq!(r.len(), 5); + assert_eq!(r.n_blocks(), 1); + } + + #[test] + fn two_pushes_len_accumulates() { + let r = make2(b"abc", b"de"); + assert_eq!(r.len(), 5); + assert_eq!(r.n_blocks(), 2); + } + + #[test] + fn flat_read_matches_input() { + assert_eq!(flat(&make(b"ACGT")), b"ACGT"); + } + + #[test] + fn flat_read_two_blocks_concatenated() { + assert_eq!(flat(&make2(b"ACG", b"T")), b"ACGT"); + } + + // ── lookup ──────────────────────────────────────────────────────────────── + + #[test] + fn lookup_first_byte() { + let r = make(b"ABCD"); + let (bi, from, to) = r.lookup(0).unwrap(); + assert_eq!(bi, 0); + assert_eq!(from, 0); + assert_eq!(to, 4); + } + + #[test] + fn lookup_last_byte() { + let r = make(b"ABCD"); + let (bi, from, to) = r.lookup(3).unwrap(); + assert_eq!(bi, 0); + assert_eq!(from, 0); + assert_eq!(to, 4); + } + + #[test] + fn lookup_out_of_bounds_returns_none() { + let r = make(b"AB"); + assert!(r.lookup(2).is_none()); + assert!(r.lookup(99).is_none()); + } + + #[test] + fn lookup_empty_rope_returns_none() { + assert!(Rope::new().lookup(0).is_none()); + } + + #[test] + fn lookup_second_block_first_byte() { + let r = make2(b"ABC", b"DE"); + let (bi, from, to) = r.lookup(3).unwrap(); + assert_eq!(bi, 1); + assert_eq!(from, 3); + assert_eq!(to, 5); + } + + #[test] + fn lookup_second_block_last_byte() { + let r = make2(b"ABC", b"DE"); + let (bi, from, to) = r.lookup(4).unwrap(); + assert_eq!(bi, 1); + assert_eq!(from, 3); + assert_eq!(to, 5); + } + + // ── get_block ───────────────────────────────────────────────────────────── + + #[test] + fn get_block_returns_correct_slice() { + let r = make2(b"ABC", b"DE"); + let b0: Vec = r.get_block(0).unwrap().iter().map(|c| c.get()).collect(); + let b1: Vec = r.get_block(1).unwrap().iter().map(|c| c.get()).collect(); + assert_eq!(b0, b"ABC"); + assert_eq!(b1, b"DE"); + } + + #[test] + fn get_block_out_of_range_returns_none() { + let r = make(b"X"); + assert!(r.get_block(1).is_none()); + } + + // ── split_off ───────────────────────────────────────────────────────────── + + #[test] + fn split_off_at_zero_head_empty_tail_all() { + let mut r = make(b"ABCDE"); + let tail = r.split_off(0).unwrap(); + assert_eq!(r.len(), 0); + assert_eq!(flat(&tail), b"ABCDE"); + } + + #[test] + fn split_off_at_len_tail_empty_head_all() { + let mut r = make(b"ABCDE"); + let tail = r.split_off(5).unwrap(); + assert_eq!(flat(&r), b"ABCDE"); + assert_eq!(tail.len(), 0); + assert!(tail.is_empty()); + } + + #[test] + fn split_off_in_middle_of_block() { + let mut r = make(b"ABCDE"); + let tail = r.split_off(2).unwrap(); + assert_eq!(flat(&r), b"AB"); + assert_eq!(flat(&tail), b"CDE"); + } + + #[test] + fn split_off_at_block_boundary() { + let mut r = make2(b"ABC", b"DE"); + let tail = r.split_off(3).unwrap(); + assert_eq!(flat(&r), b"ABC"); + assert_eq!(flat(&tail), b"DE"); + } + + #[test] + fn split_off_inside_second_block() { + let mut r = make2(b"ABC", b"DE"); + let tail = r.split_off(4).unwrap(); + assert_eq!(flat(&r), b"ABCD"); + assert_eq!(flat(&tail), b"E"); + } + + #[test] + fn split_off_out_of_bounds_returns_err() { + let mut r = make(b"AB"); + assert!(r.split_off(3).is_err()); + } + + #[test] + fn split_off_preserves_n_blocks_head() { + let mut r = make2(b"ABCDE", b"FGHIJ"); + r.split_off(5).unwrap(); + assert_eq!(r.n_blocks(), 1); + assert_eq!(flat(&r), b"ABCDE"); + } +} diff --git a/src/obiread/src/fastq.rs b/src/obiread/src/fastq.rs index e7f9154..6f45ea8 100644 --- a/src/obiread/src/fastq.rs +++ b/src/obiread/src/fastq.rs @@ -34,12 +34,12 @@ fn is_seq_char(c: u8) -> bool { /// `rope[offset..]` is the remainder for the next chunk. /// Returns `None` if no valid boundary is found (need more data). pub fn end_of_last_fastq_entry(rope: &Rope) -> Option { - let mut cursor = rope.bw_cursor(); + let cursor = rope.bw_cursor(); let mut state: u8 = 0; let mut restart: usize = 0; let mut cut: usize = rope.len(); - while let Some(c) = cursor.next() { + for c in cursor.iter() { match state { 0 => { if c == b'+' {