🔧 refactor cursor API and normalize chunking logic
- Change `rope_tell()` return type from Option<usize> to usize, always returning cursor's absolute position (offset if unmoved). - Update all call sites to remove `.unwrap_or(...)` around `rope_tell()`. - Add new method `<Rope>::truncate(pos)`, replacing `split_off(...).map(|_| ())`. - Refactor FASTA/FASTQ normalizers to use a single mutable write cursor (`wc`) and document the protocol. - Simplify `end_segment()` logic: commit segment with 0x00 if length ≥ k, else reset. - Improve documentation for write-cursor protocol and rope truncation semantics.
This commit is contained in:
+15
-10
@@ -185,11 +185,16 @@ pub trait RopeCursor<'a> {
|
||||
Some(abs.saturating_sub(self.state().offset.get()))
|
||||
}
|
||||
|
||||
/// Current position as an absolute rope index, or `None` if the cursor
|
||||
/// has not moved yet. Use this when you need to pass a value to
|
||||
/// [`seek`](RopeCursor::seek) with [`SeekMode::Absolute`].
|
||||
fn rope_tell(&self) -> Option<usize> {
|
||||
self.state().current.get()
|
||||
/// Current position as an absolute rope index.
|
||||
///
|
||||
/// Unlike [`tell`](RopeCursor::tell), this method **always** returns a
|
||||
/// value: if the cursor has not moved yet, it returns the cursor's offset
|
||||
/// (the rope index of local position 0).
|
||||
///
|
||||
/// Use the returned value with [`SeekMode::Rope`] to restore a position,
|
||||
/// or as a truncation point after a write pass.
|
||||
fn rope_tell(&self) -> usize {
|
||||
self.state().current.get().unwrap_or(self.state().offset.get())
|
||||
}
|
||||
|
||||
/// Number of bytes visible through this cursor (`rope.len() - offset`).
|
||||
@@ -295,7 +300,7 @@ impl<'a> ForwardCursor<'a> {
|
||||
/// an `offset` equal to `self.absolute_tell()`. If `self` has not moved
|
||||
/// yet, the new cursor starts at the same offset as `self`.
|
||||
pub fn cursor(&self) -> ForwardCursor<'a> {
|
||||
let new_offset = self.rope_tell().unwrap_or(self.state.offset.get());
|
||||
let new_offset = self.rope_tell();
|
||||
ForwardCursor {
|
||||
rope: self.rope,
|
||||
state: CursorState::with_offset(new_offset),
|
||||
@@ -431,7 +436,7 @@ impl<'a> BackwardCursor<'a> {
|
||||
/// position of `self`. If `self` has not moved yet, the new cursor has the
|
||||
/// same offset as `self` (no restriction).
|
||||
pub fn cursor(&self) -> BackwardCursor<'a> {
|
||||
let new_offset = self.rope_tell().unwrap_or(self.state.offset.get());
|
||||
let new_offset = self.rope_tell();
|
||||
BackwardCursor {
|
||||
rope: self.rope,
|
||||
state: CursorState::with_offset(new_offset),
|
||||
@@ -723,7 +728,7 @@ mod tests {
|
||||
let sub = c.cursor(); // offset=2 (absolute_tell=2)
|
||||
assert_eq!(sub.read_next().unwrap(), b'C'); // reads index 2
|
||||
assert_eq!(sub.tell(), Some(0)); // relative: 2-2=0
|
||||
assert_eq!(sub.rope_tell(), Some(2));
|
||||
assert_eq!(sub.rope_tell(), 2);
|
||||
assert_eq!(sub.read_next().unwrap(), b'D');
|
||||
assert_eq!(sub.tell(), Some(1)); // relative: 3-2=1
|
||||
}
|
||||
@@ -810,11 +815,11 @@ mod tests {
|
||||
sub.read_next().unwrap(); // reads index 0, absolute_tell=0
|
||||
sub.read_next().unwrap(); // reads index 1, absolute_tell=1
|
||||
assert_eq!(sub.tell(), Some(1));
|
||||
assert_eq!(sub.rope_tell(), Some(1));
|
||||
assert_eq!(sub.rope_tell(), 1);
|
||||
// sub2 with offset=1
|
||||
let sub2 = sub.cursor(); // offset=1
|
||||
sub2.read_next().unwrap(); // reads index 1, absolute=1
|
||||
assert_eq!(sub2.tell(), Some(0)); // relative: 1-1=0
|
||||
assert_eq!(sub2.rope_tell(), Some(1));
|
||||
assert_eq!(sub2.rope_tell(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,6 +155,14 @@ impl Rope {
|
||||
})
|
||||
}
|
||||
|
||||
/// Discard all bytes at and after `pos`, keeping `[0, pos)`.
|
||||
///
|
||||
/// Equivalent to `split_off(pos)` with the tail dropped.
|
||||
/// Returns `Err` if `pos > self.length`.
|
||||
pub fn truncate(&mut self, pos: usize) -> Result<(), RopeError> {
|
||||
self.split_off(pos).map(|_| ())
|
||||
}
|
||||
|
||||
/// Create a forward cursor positioned before the first byte.
|
||||
pub fn fw_cursor(&self) -> ForwardCursor<'_> {
|
||||
ForwardCursor::new(self)
|
||||
|
||||
@@ -4,6 +4,18 @@
|
||||
//! and produce a compact byte stream: ACGT segments of length ≥ k separated
|
||||
//! by `0x00`, uppercased. Ambiguous bases terminate the current segment.
|
||||
//! Segments shorter than k are silently discarded.
|
||||
//!
|
||||
//! # Write-cursor protocol
|
||||
//!
|
||||
//! A single write cursor `wc` tracks the current segment. `wc.tell()`
|
||||
//! always gives the **segment length so far** (local position 0 = segment
|
||||
//! start, because `cursor()` and `reset()` both move the offset forward or
|
||||
//! keep it in place):
|
||||
//!
|
||||
//! - Segment accepted (length ≥ k): write `0x00`, then `wc = wc.cursor()`.
|
||||
//! - Segment rejected (length < k): `wc.reset()`.
|
||||
//!
|
||||
//! After the automaton returns, `wc.rope_start()` is the rope truncation point.
|
||||
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
|
||||
@@ -11,50 +23,48 @@ use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
|
||||
/// Normalise a FASTA chunk into a compact ACGT\x00-separated rope.
|
||||
pub fn normalize_fasta_chunk(mut rope: Rope, k: usize) -> Rope {
|
||||
let write_pos = {
|
||||
let end = {
|
||||
let read = rope.fw_cursor();
|
||||
let write = rope.fw_cursor();
|
||||
normalize_fasta(&read, &write, k);
|
||||
write.tell().unwrap_or(0)
|
||||
let mut wc = rope.fw_cursor();
|
||||
normalize_fasta(&read, &mut wc, k);
|
||||
wc.rope_tell()
|
||||
};
|
||||
let _ = rope.split_off(write_pos);
|
||||
rope.truncate(end).ok();
|
||||
rope
|
||||
}
|
||||
|
||||
/// Normalise a FASTQ chunk into a compact ACGT\x00-separated rope.
|
||||
pub fn normalize_fastq_chunk(mut rope: Rope, k: usize) -> Rope {
|
||||
let write_pos = {
|
||||
let end = {
|
||||
let read = rope.fw_cursor();
|
||||
let write = rope.fw_cursor();
|
||||
normalize_fastq(&read, &write, k);
|
||||
write.tell().unwrap_or(0)
|
||||
let mut wc = rope.fw_cursor();
|
||||
normalize_fastq(&read, &mut wc, k);
|
||||
wc.rope_tell()
|
||||
};
|
||||
let _ = rope.split_off(write_pos);
|
||||
rope.truncate(end).ok();
|
||||
rope
|
||||
}
|
||||
|
||||
// ── FASTA automaton ───────────────────────────────────────────────────────────
|
||||
|
||||
fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize) {
|
||||
fn normalize_fasta(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) {
|
||||
skip_line(read);
|
||||
|
||||
loop {
|
||||
let mut seg_start = write.tell().unwrap_or(0);
|
||||
|
||||
'seq: loop {
|
||||
let Some(c) = read.read_next().ok() else {
|
||||
end_segment(write, &mut seg_start, k);
|
||||
end_segment(wc, k);
|
||||
return;
|
||||
};
|
||||
|
||||
if is_newline(c) {
|
||||
match read.read_ahead(1).ok() {
|
||||
None => {
|
||||
end_segment(write, &mut seg_start, k);
|
||||
end_segment(wc, k);
|
||||
return;
|
||||
}
|
||||
Some(b'>') => {
|
||||
end_segment(write, &mut seg_start, k);
|
||||
end_segment(wc, k);
|
||||
skip_line(read);
|
||||
break 'seq;
|
||||
}
|
||||
@@ -64,11 +74,10 @@ fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
|
||||
|
||||
let upper = c & !0x20u8;
|
||||
if is_acgt(upper) {
|
||||
write.write(upper).ok();
|
||||
wc.write(upper).ok();
|
||||
} else {
|
||||
end_segment(write, &mut seg_start, k);
|
||||
end_segment(wc, k);
|
||||
skip_until_acgt_or_newline(read);
|
||||
seg_start = write.tell().unwrap_or(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -76,15 +85,13 @@ fn normalize_fasta(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
|
||||
|
||||
// ── FASTQ automaton ───────────────────────────────────────────────────────────
|
||||
|
||||
fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize) {
|
||||
fn normalize_fastq(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) {
|
||||
loop {
|
||||
skip_line(read); // skip header
|
||||
|
||||
let mut seg_start = write.tell().unwrap_or(0);
|
||||
|
||||
'seq: loop {
|
||||
let Some(c) = read.read_next().ok() else {
|
||||
end_segment(write, &mut seg_start, k);
|
||||
end_segment(wc, k);
|
||||
return;
|
||||
};
|
||||
|
||||
@@ -95,15 +102,14 @@ fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
|
||||
|
||||
let upper = c & !0x20u8;
|
||||
if is_acgt(upper) {
|
||||
write.write(upper).ok();
|
||||
wc.write(upper).ok();
|
||||
} else {
|
||||
end_segment(write, &mut seg_start, k);
|
||||
end_segment(wc, k);
|
||||
skip_until_acgt_or_newline(read);
|
||||
seg_start = write.tell().unwrap_or(0);
|
||||
}
|
||||
}
|
||||
|
||||
end_segment(write, &mut seg_start, k);
|
||||
end_segment(&mut wc, k);
|
||||
|
||||
skip_line(read); // skip + line
|
||||
skip_line(read); // skip quality line
|
||||
@@ -116,6 +122,21 @@ fn normalize_fastq(read: &ForwardCursor<'_>, write: &ForwardCursor<'_>, k: usize
|
||||
|
||||
// ── shared helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Commit or discard the current segment.
|
||||
///
|
||||
/// `wc.tell()` is the segment length (local coords start at 0 = segment start).
|
||||
/// - Long enough: write `0x00`, then `wc = wc.cursor()` to anchor the next segment.
|
||||
/// - Too short: `wc.reset()` to stay at the same rope position.
|
||||
fn end_segment(wc: &mut ForwardCursor<'_>, k: usize) {
|
||||
if wc.tell().unwrap_or(0) >= k {
|
||||
wc.write(0x00).ok();
|
||||
let next = wc.cursor();
|
||||
*wc = next;
|
||||
} else {
|
||||
wc.reset();
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_line(read: &ForwardCursor<'_>) {
|
||||
while let Some(c) = read.read_next().ok() {
|
||||
if is_newline(c) {
|
||||
@@ -140,18 +161,14 @@ fn skip_until_acgt_or_newline(read: &ForwardCursor<'_>) {
|
||||
}
|
||||
}
|
||||
|
||||
fn end_segment(write: &ForwardCursor<'_>, seg_start: &mut usize, k: usize) {
|
||||
let seg_len = write.tell().unwrap_or(0) - *seg_start;
|
||||
if seg_len >= k {
|
||||
write.write(0x00).ok();
|
||||
} else if seg_len > 0 {
|
||||
write.rewind(seg_len).ok();
|
||||
}
|
||||
*seg_start = write.tell().unwrap_or(0);
|
||||
#[inline]
|
||||
fn is_newline(c: u8) -> bool {
|
||||
c == b'\n' || c == b'\r'
|
||||
}
|
||||
#[inline]
|
||||
fn is_acgt(upper: u8) -> bool {
|
||||
matches!(upper, b'A' | b'C' | b'G' | b'T')
|
||||
}
|
||||
|
||||
#[inline] fn is_newline(c: u8) -> bool { c == b'\n' || c == b'\r' }
|
||||
#[inline] fn is_acgt(upper: u8) -> bool { matches!(upper, b'A' | b'C' | b'G' | b'T') }
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -247,17 +264,26 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn ambiguous_splits_into_two_segments() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), b"ACGT\x00ACGT\x00");
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4),
|
||||
b"ACGT\x00ACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn segment_after_ambiguous_too_short_discarded() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), b"ACGTACGT\x00");
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4),
|
||||
b"ACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn consecutive_ambiguous_produce_no_empty_segment() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), b"ACGT\x00ACGT\x00");
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4),
|
||||
b"ACGT\x00ACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -283,32 +309,47 @@ mod tests {
|
||||
let mut rope = Rope::new();
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
assert_eq!(flat(normalize_fastq_chunk(rope, 4)), b"ACGTACGT\x00TTTTTTTT\x00");
|
||||
assert_eq!(
|
||||
flat(normalize_fastq_chunk(rope, 4)),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
// ── FASTA ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn fasta_single_record() {
|
||||
assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), b"ACGTACGT\x00");
|
||||
assert_eq!(
|
||||
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4),
|
||||
b"ACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_two_records() {
|
||||
assert_eq!(
|
||||
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), 4),
|
||||
run_fasta(
|
||||
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]),
|
||||
4
|
||||
),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_multiline_sequence_concatenated() {
|
||||
assert_eq!(run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), b"ACGTACGTACGT\x00");
|
||||
assert_eq!(
|
||||
run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4),
|
||||
b"ACGTACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_lowercase_uppercased() {
|
||||
assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), b"ACGTACGT\x00");
|
||||
assert_eq!(
|
||||
run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4),
|
||||
b"ACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -319,7 +360,10 @@ mod tests {
|
||||
#[test]
|
||||
fn fasta_short_among_valid_discarded() {
|
||||
assert_eq!(
|
||||
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), 4),
|
||||
run_fasta(
|
||||
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]),
|
||||
4
|
||||
),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
@@ -346,7 +390,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn fasta_crlf_line_endings() {
|
||||
assert_eq!(run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), b"ACGTACGT\x00TTTT\x00");
|
||||
assert_eq!(
|
||||
run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4),
|
||||
b"ACGTACGT\x00TTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -356,6 +403,9 @@ mod tests {
|
||||
let mut rope = Rope::new();
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
assert_eq!(flat(normalize_fasta_chunk(rope, 4)), b"ACGTACGT\x00TTTTTTTT\x00");
|
||||
assert_eq!(
|
||||
flat(normalize_fasta_chunk(rope, 4)),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user