From 097f7f0695ca6e93d6d169691630f0f2ed199951 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 19 Apr 2026 22:47:26 +0200 Subject: [PATCH] :wrench: refactor cursor API and normalize chunking logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change `rope_tell()` return type from Option to usize, always returning cursor's absolute position (offset if unmoved). - Update all call sites to remove `.unwrap_or(...)` around `rope_tell()`. - Add new method `::truncate(pos)`, replacing `split_off(...).map(|_| ())`. - Refactor FASTA/FASTQ normalizers to use a single mutable write cursor (`wc`) and document the protocol. - Simplify `end_segment()` logic: commit segment with 0x00 if length ≥ k, else reset. - Improve documentation for write-cursor protocol and rope truncation semantics. --- src/obikrope/src/cursor.rs | 25 +++--- src/obikrope/src/rope.rs | 8 ++ src/obiread/src/normalize.rs | 148 +++++++++++++++++++++++------------ 3 files changed, 122 insertions(+), 59 deletions(-) diff --git a/src/obikrope/src/cursor.rs b/src/obikrope/src/cursor.rs index f8d73ad..480fc06 100644 --- a/src/obikrope/src/cursor.rs +++ b/src/obikrope/src/cursor.rs @@ -185,11 +185,16 @@ pub trait RopeCursor<'a> { Some(abs.saturating_sub(self.state().offset.get())) } - /// Current position as an absolute rope index, or `None` if the cursor - /// has not moved yet. Use this when you need to pass a value to - /// [`seek`](RopeCursor::seek) with [`SeekMode::Absolute`]. - fn rope_tell(&self) -> Option { - self.state().current.get() + /// Current position as an absolute rope index. + /// + /// Unlike [`tell`](RopeCursor::tell), this method **always** returns a + /// value: if the cursor has not moved yet, it returns the cursor's offset + /// (the rope index of local position 0). + /// + /// Use the returned value with [`SeekMode::Rope`] to restore a position, + /// or as a truncation point after a write pass. + fn rope_tell(&self) -> usize { + self.state().current.get().unwrap_or(self.state().offset.get()) } /// Number of bytes visible through this cursor (`rope.len() - offset`). @@ -295,7 +300,7 @@ impl<'a> ForwardCursor<'a> { /// an `offset` equal to `self.absolute_tell()`. If `self` has not moved /// yet, the new cursor starts at the same offset as `self`. pub fn cursor(&self) -> ForwardCursor<'a> { - let new_offset = self.rope_tell().unwrap_or(self.state.offset.get()); + let new_offset = self.rope_tell(); ForwardCursor { rope: self.rope, state: CursorState::with_offset(new_offset), @@ -431,7 +436,7 @@ impl<'a> BackwardCursor<'a> { /// position of `self`. If `self` has not moved yet, the new cursor has the /// same offset as `self` (no restriction). pub fn cursor(&self) -> BackwardCursor<'a> { - let new_offset = self.rope_tell().unwrap_or(self.state.offset.get()); + let new_offset = self.rope_tell(); BackwardCursor { rope: self.rope, state: CursorState::with_offset(new_offset), @@ -723,7 +728,7 @@ mod tests { let sub = c.cursor(); // offset=2 (absolute_tell=2) assert_eq!(sub.read_next().unwrap(), b'C'); // reads index 2 assert_eq!(sub.tell(), Some(0)); // relative: 2-2=0 - assert_eq!(sub.rope_tell(), Some(2)); + assert_eq!(sub.rope_tell(), 2); assert_eq!(sub.read_next().unwrap(), b'D'); assert_eq!(sub.tell(), Some(1)); // relative: 3-2=1 } @@ -810,11 +815,11 @@ mod tests { sub.read_next().unwrap(); // reads index 0, absolute_tell=0 sub.read_next().unwrap(); // reads index 1, absolute_tell=1 assert_eq!(sub.tell(), Some(1)); - assert_eq!(sub.rope_tell(), Some(1)); + assert_eq!(sub.rope_tell(), 1); // sub2 with offset=1 let sub2 = sub.cursor(); // offset=1 sub2.read_next().unwrap(); // reads index 1, absolute=1 assert_eq!(sub2.tell(), Some(0)); // relative: 1-1=0 - assert_eq!(sub2.rope_tell(), Some(1)); + assert_eq!(sub2.rope_tell(), 1); } } diff --git a/src/obikrope/src/rope.rs b/src/obikrope/src/rope.rs index cec95e1..0045a22 100644 --- a/src/obikrope/src/rope.rs +++ b/src/obikrope/src/rope.rs @@ -155,6 +155,14 @@ impl Rope { }) } + /// Discard all bytes at and after `pos`, keeping `[0, pos)`. + /// + /// Equivalent to `split_off(pos)` with the tail dropped. + /// Returns `Err` if `pos > self.length`. + pub fn truncate(&mut self, pos: usize) -> Result<(), RopeError> { + self.split_off(pos).map(|_| ()) + } + /// Create a forward cursor positioned before the first byte. pub fn fw_cursor(&self) -> ForwardCursor<'_> { ForwardCursor::new(self) diff --git a/src/obiread/src/normalize.rs b/src/obiread/src/normalize.rs index 89c329f..6f37639 100644 --- a/src/obiread/src/normalize.rs +++ b/src/obiread/src/normalize.rs @@ -4,6 +4,18 @@ //! and produce a compact byte stream: ACGT segments of length ≥ k separated //! by `0x00`, uppercased. Ambiguous bases terminate the current segment. //! Segments shorter than k are silently discarded. +//! +//! # Write-cursor protocol +//! +//! A single write cursor `wc` tracks the current segment. `wc.tell()` +//! always gives the **segment length so far** (local position 0 = segment +//! start, because `cursor()` and `reset()` both move the offset forward or +//! keep it in place): +//! +//! - Segment accepted (length ≥ k): write `0x00`, then `wc = wc.cursor()`. +//! - Segment rejected (length < k): `wc.reset()`. +//! +//! After the automaton returns, `wc.rope_start()` is the rope truncation point. use obikrope::{ForwardCursor, Rope, RopeCursor}; @@ -11,50 +23,48 @@ use obikrope::{ForwardCursor, Rope, RopeCursor}; /// Normalise a FASTA chunk into a compact ACGT\x00-separated rope. pub fn normalize_fasta_chunk(mut rope: Rope, k: usize) -> Rope { - let write_pos = { + let end = { let read = rope.fw_cursor(); - let write = rope.fw_cursor(); - normalize_fasta(&read, &write, k); - write.tell().unwrap_or(0) + let mut wc = rope.fw_cursor(); + normalize_fasta(&read, &mut wc, k); + wc.rope_tell() }; - let _ = rope.split_off(write_pos); + rope.truncate(end).ok(); rope } /// Normalise a FASTQ chunk into a compact ACGT\x00-separated rope. pub fn normalize_fastq_chunk(mut rope: Rope, k: usize) -> Rope { - let write_pos = { + let end = { let read = rope.fw_cursor(); - let write = rope.fw_cursor(); - normalize_fastq(&read, &write, k); - write.tell().unwrap_or(0) + let mut wc = rope.fw_cursor(); + normalize_fastq(&read, &mut wc, k); + wc.rope_tell() }; - let _ = rope.split_off(write_pos); + rope.truncate(end).ok(); rope } // ── FASTA automaton ─────────────────────────────────────────────────────────── -fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize) { +fn normalize_fasta(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) { skip_line(read); loop { - let mut seg_start = write.tell().unwrap_or(0); - 'seq: loop { let Some(c) = read.read_next().ok() else { - end_segment(write, &mut seg_start, k); + end_segment(wc, k); return; }; if is_newline(c) { match read.read_ahead(1).ok() { None => { - end_segment(write, &mut seg_start, k); + end_segment(wc, k); return; } Some(b'>') => { - end_segment(write, &mut seg_start, k); + end_segment(wc, k); skip_line(read); break 'seq; } @@ -64,11 +74,10 @@ fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize let upper = c & !0x20u8; if is_acgt(upper) { - write.write(upper).ok(); + wc.write(upper).ok(); } else { - end_segment(write, &mut seg_start, k); + end_segment(wc, k); skip_until_acgt_or_newline(read); - seg_start = write.tell().unwrap_or(0); } } } @@ -76,15 +85,13 @@ fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize // ── FASTQ automaton ─────────────────────────────────────────────────────────── -fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize) { +fn normalize_fastq(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) { loop { skip_line(read); // skip header - let mut seg_start = write.tell().unwrap_or(0); - 'seq: loop { let Some(c) = read.read_next().ok() else { - end_segment(write, &mut seg_start, k); + end_segment(wc, k); return; }; @@ -95,15 +102,14 @@ fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize let upper = c & !0x20u8; if is_acgt(upper) { - write.write(upper).ok(); + wc.write(upper).ok(); } else { - end_segment(write, &mut seg_start, k); + end_segment(wc, k); skip_until_acgt_or_newline(read); - seg_start = write.tell().unwrap_or(0); } } - end_segment(write, &mut seg_start, k); + end_segment(&mut wc, k); skip_line(read); // skip + line skip_line(read); // skip quality line @@ -116,6 +122,21 @@ fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize // ── shared helpers ──────────────────────────────────────────────────────────── +/// Commit or discard the current segment. +/// +/// `wc.tell()` is the segment length (local coords start at 0 = segment start). +/// - Long enough: write `0x00`, then `wc = wc.cursor()` to anchor the next segment. +/// - Too short: `wc.reset()` to stay at the same rope position. +fn end_segment(wc: &mut ForwardCursor<'_>, k: usize) { + if wc.tell().unwrap_or(0) >= k { + wc.write(0x00).ok(); + let next = wc.cursor(); + *wc = next; + } else { + wc.reset(); + } +} + fn skip_line(read: &ForwardCursor<'_>) { while let Some(c) = read.read_next().ok() { if is_newline(c) { @@ -140,18 +161,14 @@ fn skip_until_acgt_or_newline(read: &ForwardCursor<'_>) { } } -fn end_segment(write: &ForwardCursor<'_>, seg_start: &mut usize, k: usize) { - let seg_len = write.tell().unwrap_or(0) - *seg_start; - if seg_len >= k { - write.write(0x00).ok(); - } else if seg_len > 0 { - write.rewind(seg_len).ok(); - } - *seg_start = write.tell().unwrap_or(0); +#[inline] +fn is_newline(c: u8) -> bool { + c == b'\n' || c == b'\r' +} +#[inline] +fn is_acgt(upper: u8) -> bool { + matches!(upper, b'A' | b'C' | b'G' | b'T') } - -#[inline] fn is_newline(c: u8) -> bool { c == b'\n' || c == b'\r' } -#[inline] fn is_acgt(upper: u8) -> bool { matches!(upper, b'A' | b'C' | b'G' | b'T') } // ── tests ───────────────────────────────────────────────────────────────────── @@ -247,17 +264,26 @@ mod tests { #[test] fn ambiguous_splits_into_two_segments() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), b"ACGT\x00ACGT\x00"); + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), + b"ACGT\x00ACGT\x00" + ); } #[test] fn segment_after_ambiguous_too_short_discarded() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), b"ACGTACGT\x00"); + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), + b"ACGTACGT\x00" + ); } #[test] fn consecutive_ambiguous_produce_no_empty_segment() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), b"ACGT\x00ACGT\x00"); + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), + b"ACGT\x00ACGT\x00" + ); } #[test] @@ -283,32 +309,47 @@ mod tests { let mut rope = Rope::new(); rope.push(data[..mid].to_vec()); rope.push(data[mid..].to_vec()); - assert_eq!(flat(normalize_fastq_chunk(rope, 4)), b"ACGTACGT\x00TTTTTTTT\x00"); + assert_eq!( + flat(normalize_fastq_chunk(rope, 4)), + b"ACGTACGT\x00TTTTTTTT\x00" + ); } // ── FASTA ───────────────────────────────────────────────────────────────── #[test] fn fasta_single_record() { - assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), b"ACGTACGT\x00"); + assert_eq!( + run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), + b"ACGTACGT\x00" + ); } #[test] fn fasta_two_records() { assert_eq!( - run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), 4), + run_fasta( + &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), + 4 + ), b"ACGTACGT\x00TTTTTTTT\x00" ); } #[test] fn fasta_multiline_sequence_concatenated() { - assert_eq!(run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), b"ACGTACGTACGT\x00"); + assert_eq!( + run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), + b"ACGTACGTACGT\x00" + ); } #[test] fn fasta_lowercase_uppercased() { - assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), b"ACGTACGT\x00"); + assert_eq!( + run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), + b"ACGTACGT\x00" + ); } #[test] @@ -319,7 +360,10 @@ mod tests { #[test] fn fasta_short_among_valid_discarded() { assert_eq!( - run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), 4), + run_fasta( + &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), + 4 + ), b"ACGTACGT\x00TTTTTTTT\x00" ); } @@ -346,7 +390,10 @@ mod tests { #[test] fn fasta_crlf_line_endings() { - assert_eq!(run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), b"ACGTACGT\x00TTTT\x00"); + assert_eq!( + run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), + b"ACGTACGT\x00TTTT\x00" + ); } #[test] @@ -356,6 +403,9 @@ mod tests { let mut rope = Rope::new(); rope.push(data[..mid].to_vec()); rope.push(data[mid..].to_vec()); - assert_eq!(flat(normalize_fasta_chunk(rope, 4)), b"ACGTACGT\x00TTTTTTTT\x00"); + assert_eq!( + flat(normalize_fasta_chunk(rope, 4)), + b"ACGTACGT\x00TTTTTTTT\x00" + ); } }