🔧 refactor cursor API and normalize chunking logic

- Change `rope_tell()` return type from Option<usize> to usize, always returning cursor's absolute position (offset if unmoved).
- Update all call sites to remove `.unwrap_or(...)` around `rope_tell()`.
- Add new method `<Rope>::truncate(pos)`, replacing `split_off(...).map(|_| ())`.
- Refactor FASTA/FASTQ normalizers to use a single mutable write cursor (`wc`) and document the protocol.
- Simplify `end_segment()` logic: commit segment with 0x00 if length ≥ k, else reset.
- Improve documentation for write-cursor protocol and rope truncation semantics.
This commit is contained in:
Eric Coissac
2026-04-19 22:47:26 +02:00
parent 3716eeff7f
commit 097f7f0695
3 changed files with 122 additions and 59 deletions
+15 -10
View File
@@ -185,11 +185,16 @@ pub trait RopeCursor<'a> {
Some(abs.saturating_sub(self.state().offset.get()))
}
/// Current position as an absolute rope index, or `None` if the cursor
/// has not moved yet. Use this when you need to pass a value to
/// [`seek`](RopeCursor::seek) with [`SeekMode::Absolute`].
fn rope_tell(&self) -> Option<usize> {
self.state().current.get()
/// Current position as an absolute rope index.
///
/// Unlike [`tell`](RopeCursor::tell), this method **always** returns a
/// value: if the cursor has not moved yet, it returns the cursor's offset
/// (the rope index of local position 0).
///
/// Use the returned value with [`SeekMode::Rope`] to restore a position,
/// or as a truncation point after a write pass.
fn rope_tell(&self) -> usize {
self.state().current.get().unwrap_or(self.state().offset.get())
}
/// Number of bytes visible through this cursor (`rope.len() - offset`).
@@ -295,7 +300,7 @@ impl<'a> ForwardCursor<'a> {
/// an `offset` equal to `self.absolute_tell()`. If `self` has not moved
/// yet, the new cursor starts at the same offset as `self`.
pub fn cursor(&self) -> ForwardCursor<'a> {
let new_offset = self.rope_tell().unwrap_or(self.state.offset.get());
let new_offset = self.rope_tell();
ForwardCursor {
rope: self.rope,
state: CursorState::with_offset(new_offset),
@@ -431,7 +436,7 @@ impl<'a> BackwardCursor<'a> {
/// position of `self`. If `self` has not moved yet, the new cursor has the
/// same offset as `self` (no restriction).
pub fn cursor(&self) -> BackwardCursor<'a> {
let new_offset = self.rope_tell().unwrap_or(self.state.offset.get());
let new_offset = self.rope_tell();
BackwardCursor {
rope: self.rope,
state: CursorState::with_offset(new_offset),
@@ -723,7 +728,7 @@ mod tests {
let sub = c.cursor(); // offset=2 (absolute_tell=2)
assert_eq!(sub.read_next().unwrap(), b'C'); // reads index 2
assert_eq!(sub.tell(), Some(0)); // relative: 2-2=0
assert_eq!(sub.rope_tell(), Some(2));
assert_eq!(sub.rope_tell(), 2);
assert_eq!(sub.read_next().unwrap(), b'D');
assert_eq!(sub.tell(), Some(1)); // relative: 3-2=1
}
@@ -810,11 +815,11 @@ mod tests {
sub.read_next().unwrap(); // reads index 0, absolute_tell=0
sub.read_next().unwrap(); // reads index 1, absolute_tell=1
assert_eq!(sub.tell(), Some(1));
assert_eq!(sub.rope_tell(), Some(1));
assert_eq!(sub.rope_tell(), 1);
// sub2 with offset=1
let sub2 = sub.cursor(); // offset=1
sub2.read_next().unwrap(); // reads index 1, absolute=1
assert_eq!(sub2.tell(), Some(0)); // relative: 1-1=0
assert_eq!(sub2.rope_tell(), Some(1));
assert_eq!(sub2.rope_tell(), 1);
}
}
+8
View File
@@ -155,6 +155,14 @@ impl Rope {
})
}
/// Discard all bytes at and after `pos`, keeping `[0, pos)`.
///
/// Equivalent to `split_off(pos)` with the tail dropped.
/// Returns `Err` if `pos > self.length`.
pub fn truncate(&mut self, pos: usize) -> Result<(), RopeError> {
self.split_off(pos).map(|_| ())
}
/// Create a forward cursor positioned before the first byte.
pub fn fw_cursor(&self) -> ForwardCursor<'_> {
ForwardCursor::new(self)
+98 -48
View File
@@ -4,6 +4,18 @@
//! and produce a compact byte stream: ACGT segments of length ≥ k separated
//! by `0x00`, uppercased. Ambiguous bases terminate the current segment.
//! Segments shorter than k are silently discarded.
//!
//! # Write-cursor protocol
//!
//! A single write cursor `wc` tracks the current segment. `wc.tell()`
//! always gives the **segment length so far** (local position 0 = segment
//! start, because `cursor()` and `reset()` both move the offset forward or
//! keep it in place):
//!
//! - Segment accepted (length ≥ k): write `0x00`, then `wc = wc.cursor()`.
//! - Segment rejected (length < k): `wc.reset()`.
//!
//! After the automaton returns, `wc.rope_start()` is the rope truncation point.
use obikrope::{ForwardCursor, Rope, RopeCursor};
@@ -11,50 +23,48 @@ use obikrope::{ForwardCursor, Rope, RopeCursor};
/// Normalise a FASTA chunk into a compact ACGT\x00-separated rope.
pub fn normalize_fasta_chunk(mut rope: Rope, k: usize) -> Rope {
let write_pos = {
let end = {
let read = rope.fw_cursor();
let write = rope.fw_cursor();
normalize_fasta(&read, &write, k);
write.tell().unwrap_or(0)
let mut wc = rope.fw_cursor();
normalize_fasta(&read, &mut wc, k);
wc.rope_tell()
};
let _ = rope.split_off(write_pos);
rope.truncate(end).ok();
rope
}
/// Normalise a FASTQ chunk into a compact ACGT\x00-separated rope.
pub fn normalize_fastq_chunk(mut rope: Rope, k: usize) -> Rope {
let write_pos = {
let end = {
let read = rope.fw_cursor();
let write = rope.fw_cursor();
normalize_fastq(&read, &write, k);
write.tell().unwrap_or(0)
let mut wc = rope.fw_cursor();
normalize_fastq(&read, &mut wc, k);
wc.rope_tell()
};
let _ = rope.split_off(write_pos);
rope.truncate(end).ok();
rope
}
// ── FASTA automaton ───────────────────────────────────────────────────────────
fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize) {
fn normalize_fasta(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) {
skip_line(read);
loop {
let mut seg_start = write.tell().unwrap_or(0);
'seq: loop {
let Some(c) = read.read_next().ok() else {
end_segment(write, &mut seg_start, k);
end_segment(wc, k);
return;
};
if is_newline(c) {
match read.read_ahead(1).ok() {
None => {
end_segment(write, &mut seg_start, k);
end_segment(wc, k);
return;
}
Some(b'>') => {
end_segment(write, &mut seg_start, k);
end_segment(wc, k);
skip_line(read);
break 'seq;
}
@@ -64,11 +74,10 @@ fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
let upper = c & !0x20u8;
if is_acgt(upper) {
write.write(upper).ok();
wc.write(upper).ok();
} else {
end_segment(write, &mut seg_start, k);
end_segment(wc, k);
skip_until_acgt_or_newline(read);
seg_start = write.tell().unwrap_or(0);
}
}
}
@@ -76,15 +85,13 @@ fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
// ── FASTQ automaton ───────────────────────────────────────────────────────────
fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize) {
fn normalize_fastq(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) {
loop {
skip_line(read); // skip header
let mut seg_start = write.tell().unwrap_or(0);
'seq: loop {
let Some(c) = read.read_next().ok() else {
end_segment(write, &mut seg_start, k);
end_segment(wc, k);
return;
};
@@ -95,15 +102,14 @@ fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
let upper = c & !0x20u8;
if is_acgt(upper) {
write.write(upper).ok();
wc.write(upper).ok();
} else {
end_segment(write, &mut seg_start, k);
end_segment(wc, k);
skip_until_acgt_or_newline(read);
seg_start = write.tell().unwrap_or(0);
}
}
end_segment(write, &mut seg_start, k);
end_segment(&mut wc, k);
skip_line(read); // skip + line
skip_line(read); // skip quality line
@@ -116,6 +122,21 @@ fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
// ── shared helpers ────────────────────────────────────────────────────────────
/// Commit or discard the current segment.
///
/// `wc.tell()` is the segment length (local coords start at 0 = segment start).
/// - Long enough: write `0x00`, then `wc = wc.cursor()` to anchor the next segment.
/// - Too short: `wc.reset()` to stay at the same rope position.
fn end_segment(wc: &mut ForwardCursor<'_>, k: usize) {
if wc.tell().unwrap_or(0) >= k {
wc.write(0x00).ok();
let next = wc.cursor();
*wc = next;
} else {
wc.reset();
}
}
fn skip_line(read: &ForwardCursor<'_>) {
while let Some(c) = read.read_next().ok() {
if is_newline(c) {
@@ -140,19 +161,15 @@ fn skip_until_acgt_or_newline(read: &ForwardCursor<'_>) {
}
}
fn end_segment(write: &ForwardCursor<'_>, seg_start: &mut usize, k: usize) {
let seg_len = write.tell().unwrap_or(0) - *seg_start;
if seg_len >= k {
write.write(0x00).ok();
} else if seg_len > 0 {
write.rewind(seg_len).ok();
#[inline]
fn is_newline(c: u8) -> bool {
c == b'\n' || c == b'\r'
}
*seg_start = write.tell().unwrap_or(0);
#[inline]
fn is_acgt(upper: u8) -> bool {
matches!(upper, b'A' | b'C' | b'G' | b'T')
}
#[inline] fn is_newline(c: u8) -> bool { c == b'\n' || c == b'\r' }
#[inline] fn is_acgt(upper: u8) -> bool { matches!(upper, b'A' | b'C' | b'G' | b'T') }
// ── tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
@@ -247,17 +264,26 @@ mod tests {
#[test]
fn ambiguous_splits_into_two_segments() {
assert_eq!(run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), b"ACGT\x00ACGT\x00");
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4),
b"ACGT\x00ACGT\x00"
);
}
#[test]
fn segment_after_ambiguous_too_short_discarded() {
assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), b"ACGTACGT\x00");
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4),
b"ACGTACGT\x00"
);
}
#[test]
fn consecutive_ambiguous_produce_no_empty_segment() {
assert_eq!(run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), b"ACGT\x00ACGT\x00");
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4),
b"ACGT\x00ACGT\x00"
);
}
#[test]
@@ -283,32 +309,47 @@ mod tests {
let mut rope = Rope::new();
rope.push(data[..mid].to_vec());
rope.push(data[mid..].to_vec());
assert_eq!(flat(normalize_fastq_chunk(rope, 4)), b"ACGTACGT\x00TTTTTTTT\x00");
assert_eq!(
flat(normalize_fastq_chunk(rope, 4)),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
// ── FASTA ─────────────────────────────────────────────────────────────────
#[test]
fn fasta_single_record() {
assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), b"ACGTACGT\x00");
assert_eq!(
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4),
b"ACGTACGT\x00"
);
}
#[test]
fn fasta_two_records() {
assert_eq!(
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), 4),
run_fasta(
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]),
4
),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
#[test]
fn fasta_multiline_sequence_concatenated() {
assert_eq!(run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), b"ACGTACGTACGT\x00");
assert_eq!(
run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4),
b"ACGTACGTACGT\x00"
);
}
#[test]
fn fasta_lowercase_uppercased() {
assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), b"ACGTACGT\x00");
assert_eq!(
run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4),
b"ACGTACGT\x00"
);
}
#[test]
@@ -319,7 +360,10 @@ mod tests {
#[test]
fn fasta_short_among_valid_discarded() {
assert_eq!(
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), 4),
run_fasta(
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]),
4
),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
@@ -346,7 +390,10 @@ mod tests {
#[test]
fn fasta_crlf_line_endings() {
assert_eq!(run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), b"ACGTACGT\x00TTTT\x00");
assert_eq!(
run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4),
b"ACGTACGT\x00TTTT\x00"
);
}
#[test]
@@ -356,6 +403,9 @@ mod tests {
let mut rope = Rope::new();
rope.push(data[..mid].to_vec());
rope.push(data[mid..].to_vec());
assert_eq!(flat(normalize_fasta_chunk(rope, 4)), b"ACGTACGT\x00TTTTTTTT\x00");
assert_eq!(
flat(normalize_fasta_chunk(rope, 4)),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
}