feat: introduce nucstream abstraction and comprehensive test suite

Introduces a unified NucStream abstraction with NucPageCursor for byte-offset tracking and MIME-type dispatch to instantiate format-specific parsers. Exposes nuc_stream and open_nuc_stream APIs that return boxed, Send-compatible iterators. Additionally, adds a comprehensive test suite covering chunk boundary alignment, FASTA/FASTQ record parsing, sequence normalization, and edge cases such as CRLF line endings, @ in quality strings, and multi-slice rope processing.
This commit is contained in:
Eric Coissac
2026-05-28 22:45:47 +02:00
parent cfadf63bbc
commit eaa52eaab5
10 changed files with 765 additions and 497 deletions
+2 -108
View File
@@ -191,111 +191,5 @@ pub fn fastq_chunks<R: Read>(source: R) -> SeqChunkIter<R> {
}
#[cfg(test)]
mod tests {
// Tests for `SeqChunkIter`: each case feeds an in-memory byte slice through the
// iterator with a given block size and inspects the ropes it yields.
use super::*;
use crate::fasta::end_of_last_fasta_entry;
use crate::fastq::end_of_last_fastq_entry;
// Chunk iterator over `data` that uses `end_of_last_fasta_entry` as its boundary
// finder. The trailing `None` is forwarded to `SeqChunkIter::new` as-is.
fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None)
}
// Same as `fasta_iter`, but with `end_of_last_fastq_entry` as the boundary finder.
fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None)
}
// Collects every byte produced by the rope's forward cursor into one `Vec`.
fn rope_to_vec(rope: &Rope) -> Vec<u8> {
rope.fw_cursor().collect()
}
// ── FASTA ─────────────────────────────────────────────────────────────────
// One record with a block size larger than the input: exactly one chunk, unchanged.
#[test]
fn fasta_single_record_one_chunk() {
let data: &[u8] = b">s1\nACGT\n";
let chunks: Vec<_> = fasta_iter(data, 64).collect::<Result<_, _>>().unwrap();
assert_eq!(chunks.len(), 1);
assert_eq!(rope_to_vec(&chunks[0]), b">s1\nACGT\n");
}
// Concatenating all chunks must reproduce the input byte for byte.
#[test]
fn fasta_two_records_split_across_chunks() {
let data: &[u8] = b">s1\nACGT\n>s2\nTTTT\n";
let chunks: Vec<_> = fasta_iter(data, 10).collect::<Result<_, _>>().unwrap();
let all: Vec<u8> = chunks.iter().flat_map(|r| rope_to_vec(r)).collect();
assert_eq!(all, b">s1\nACGT\n>s2\nTTTT\n");
}
// For several block sizes, every chunk must begin with '>' and end with '\n'.
#[test]
fn fasta_each_chunk_ends_on_complete_record() {
let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n";
for block in [8, 12, 20, 100] {
let chunks: Vec<_> = fasta_iter(data, block).collect::<Result<_, _>>().unwrap();
for rope in &chunks {
let flat = rope_to_vec(rope);
assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'");
assert_eq!(
*flat.last().unwrap(),
b'\n',
"block={block}: chunk doesn't end with newline"
);
}
}
}
// ── FASTQ ─────────────────────────────────────────────────────────────────
// Serialises `(sequence, quality)` pairs as four-line records with the fixed
// header "@hdr" and a bare "+" separator line.
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
let mut buf = Vec::new();
for (seq, qual) in records {
buf.extend_from_slice(b"@hdr\n");
buf.extend_from_slice(seq);
buf.push(b'\n');
buf.extend_from_slice(b"+\n");
buf.extend_from_slice(qual);
buf.push(b'\n');
}
buf
}
// One record with a large block size yields exactly one chunk.
#[test]
fn fastq_single_record_one_chunk() {
// `Box::leak` turns the generated buffer into the `'static` slice `fastq_iter` requires.
let data = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice());
let chunks: Vec<_> = fastq_iter(data, 64).collect::<Result<_, _>>().unwrap();
assert_eq!(chunks.len(), 1);
}
// A quality line starting with '@' must not lose or duplicate bytes across chunks.
#[test]
fn fastq_at_in_quality_handled() {
let data = Box::leak(
make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")])
.into_boxed_slice(),
);
let chunks: Vec<_> = fastq_iter(data, 16).collect::<Result<_, _>>().unwrap();
let all: Vec<u8> = chunks.iter().flat_map(|r| rope_to_vec(r)).collect();
assert_eq!(all, *data);
}
// For several block sizes, every chunk must begin with the '@' of a record header.
#[test]
fn fastq_each_chunk_starts_with_at() {
let data = Box::leak(
make_fastq(&[
(b"ACGT", b"IIII"),
(b"CCCC", b"JJJJ"),
(b"GGGG", b"KKKK"),
(b"TTTT", b"LLLL"),
])
.into_boxed_slice(),
);
for block in [18, 30, 60] {
let chunks: Vec<_> = fastq_iter(data, block).collect::<Result<_, _>>().unwrap();
for rope in &chunks {
let first_byte = rope_to_vec(rope)[0];
assert_eq!(
first_byte, b'@',
"block={block}: chunk doesn't start with '@'"
);
}
}
}
}
#[path = "tests/chunk.rs"]
mod tests;
+2 -68
View File
@@ -35,71 +35,5 @@ pub fn end_of_last_fasta_entry(rope: &Rope) -> Option<usize> {
}
#[cfg(test)]
mod tests {
// Tests for `end_of_last_fasta_entry`: each case builds a rope from literal
// bytes and checks where the returned offset splits the flattened content.
use super::*;
// Rope holding `data` as a single pushed slice.
fn rope(data: &[u8]) -> Rope {
let mut r = Rope::new(None);
r.push(data.to_vec());
r
}
// Rope holding `a` and `b` as two separately pushed slices.
fn rope2(a: &[u8], b: &[u8]) -> Rope {
let mut r = Rope::new(None);
r.push(a.to_vec());
r.push(b.to_vec());
r
}
// Collects the rope's forward cursor into one contiguous `Vec`.
fn flat(r: &Rope) -> Vec<u8> {
r.fw_cursor().collect()
}
// A lone entry has no boundary to cut at, so the result is `None`.
#[test]
fn single_entry_no_boundary() {
assert_eq!(end_of_last_fasta_entry(&rope(b">seq1\nACGT\n")), None);
}
// The offset separates the first entry from the second entry's header.
#[test]
fn two_entries_cuts_at_second_header() {
let data = b">seq1\nACGT\n>seq2\nTTTT\n";
let r = rope(data);
let pos = end_of_last_fasta_entry(&r).unwrap();
assert_eq!(&flat(&r)[pos..], b">seq2\nTTTT\n");
assert_eq!(&flat(&r)[..pos], b">seq1\nACGT\n");
}
// With three entries the cut is at the last header, not an earlier one.
#[test]
fn three_entries_cuts_at_last_header() {
let data = b">s1\nAA\n>s2\nCC\n>s3\nGG\n";
let r = rope(data);
let pos = end_of_last_fasta_entry(&r).unwrap();
assert_eq!(&flat(&r)[pos..], b">s3\nGG\n");
}
// Sequence data wrapped over several lines does not move the cut.
#[test]
fn multiline_sequence() {
let data = b">s1\nACGT\nACGT\n>s2\nTTTT\n";
let r = rope(data);
let pos = end_of_last_fasta_entry(&r).unwrap();
assert_eq!(&flat(&r)[pos..], b">s2\nTTTT\n");
}
// CRLF-terminated input is cut at the same logical place.
#[test]
fn crlf_line_endings() {
let data = b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n";
let r = rope(data);
let pos = end_of_last_fasta_entry(&r).unwrap();
assert_eq!(&flat(&r)[pos..], b">s2\r\nTTTT\r\n");
}
// The second entry starts exactly at the start of the rope's second slice.
#[test]
fn boundary_spans_two_blocks() {
let a = b">s1\nACGT\n";
let b = b">s2\nTTTT\n";
let r = rope2(a, b);
let all: Vec<u8> = flat(&r);
let pos = end_of_last_fasta_entry(&r).unwrap();
assert_eq!(&all[pos..], b">s2\nTTTT\n");
}
}
#[path = "tests/fasta.rs"]
mod tests;
+2 -75
View File
@@ -107,78 +107,5 @@ pub fn end_of_last_fastq_entry(rope: &Rope) -> Option<usize> {
}
#[cfg(test)]
mod tests {
// Tests for `end_of_last_fastq_entry`: each case builds a single-slice rope and
// checks where the returned offset splits the flattened content.
use super::*;
// Rope holding `data` as a single pushed slice.
fn rope(data: &[u8]) -> Rope {
let mut r = Rope::new(None);
r.push(data.to_vec());
r
}
// Serialises `(sequence, quality)` pairs as four-line records with the fixed
// header "@header" and a bare "+" separator line.
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
let mut buf = Vec::new();
for (seq, qual) in records {
buf.extend_from_slice(b"@header\n");
buf.extend_from_slice(seq);
buf.push(b'\n');
buf.extend_from_slice(b"+\n");
buf.extend_from_slice(qual);
buf.push(b'\n');
}
buf
}
// Collects the rope's forward cursor into one contiguous `Vec`.
fn flat(r: &Rope) -> Vec<u8> {
r.fw_cursor().collect()
}
// A lone record has no boundary to cut at, so the result is `None`.
#[test]
fn single_record_no_boundary() {
let buf = make_fastq(&[(b"ACGT", b"IIII")]);
assert_eq!(end_of_last_fastq_entry(&rope(&buf)), None);
}
// The offset points at the '@' that opens the second record.
#[test]
fn two_records_cuts_at_second() {
let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]);
let r = rope(&buf);
let pos = end_of_last_fastq_entry(&r).unwrap();
assert_eq!(flat(&r)[pos], b'@');
assert_eq!(
&flat(&r)[pos..],
make_fastq(&[(b"TTTT", b"HHHH")]).as_slice()
);
}
// With three records the tail after the cut is exactly the last record.
#[test]
fn three_records_cuts_at_last() {
let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]);
let r = rope(&buf);
let pos = end_of_last_fastq_entry(&r).unwrap();
assert_eq!(
&flat(&r)[pos..],
make_fastq(&[(b"GGGG", b"KKKK")]).as_slice()
);
}
// A quality line beginning with '@' must not be taken for a record header.
#[test]
fn at_sign_in_quality_does_not_confuse() {
let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]);
let r = rope(&buf);
let pos = end_of_last_fastq_entry(&r).unwrap();
assert_eq!(
&flat(&r)[pos..],
make_fastq(&[(b"TTTT", b"HHHH")]).as_slice()
);
}
// CRLF-terminated input is cut at the second record's '@'.
#[test]
fn crlf_line_endings() {
let data = b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n";
let r = rope(data);
let pos = end_of_last_fastq_entry(&r).unwrap();
assert_eq!(flat(&r)[pos], b'@');
assert_eq!(&flat(&r)[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n");
}
}
#[path = "tests/fastq.rs"]
mod tests;
+2 -236
View File
@@ -215,239 +215,5 @@ fn is_acgt(upper: u8) -> bool {
// ── tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
// Tests for `normalize_fastq_chunk` / `normalize_fasta_chunk`. Every expected
// value below is the flattened output rope: upper-case segments, each followed
// by a single 0x00 byte. All cases call the normaliser with k = 4.
use super::*;
// Rope holding `data` as a single pushed slice.
fn make_rope(data: &[u8]) -> Rope {
let mut r = Rope::new(None);
r.push(data.to_vec());
r
}
// Consumes the rope and collects its forward cursor into one `Vec`.
fn flat(r: Rope) -> Vec<u8> {
r.fw_cursor().collect()
}
// Normalises `data` as a FASTQ chunk and returns the flattened result.
fn run_fastq(data: &[u8], k: usize) -> Vec<u8> {
flat(normalize_fastq_chunk(make_rope(data), k))
}
// Normalises `data` as a FASTA chunk and returns the flattened result.
fn run_fasta(data: &[u8], k: usize) -> Vec<u8> {
flat(normalize_fasta_chunk(make_rope(data), k))
}
// Serialises sequences as four-line FASTQ records with header "@hdr" and a
// quality line of 'I' repeated to the sequence length.
fn make_fastq(records: &[&[u8]]) -> Vec<u8> {
let mut buf = Vec::new();
for seq in records {
buf.extend_from_slice(b"@hdr\n");
buf.extend_from_slice(seq);
buf.push(b'\n');
buf.extend_from_slice(b"+\n");
buf.extend_from_slice(&vec![b'I'; seq.len()]);
buf.push(b'\n');
}
buf
}
// Serialises `(id, sequence)` pairs as two-line FASTA records.
fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec<u8> {
let mut buf = Vec::new();
for (id, seq) in records {
buf.push(b'>');
buf.extend_from_slice(id);
buf.push(b'\n');
buf.extend_from_slice(seq);
buf.push(b'\n');
}
buf
}
// ── FASTQ basic ──────────────────────────────────────────────────────────
#[test]
fn single_record_produces_seq_then_null() {
assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGT"]), 4), b"ACGTACGT\x00");
}
#[test]
fn two_records_concatenated() {
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]), 4),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
#[test]
fn lowercase_input_uppercased() {
assert_eq!(run_fastq(&make_fastq(&[b"acgtacgt"]), 4), b"ACGTACGT\x00");
}
#[test]
fn mixed_case_uppercased() {
assert_eq!(run_fastq(&make_fastq(&[b"AcGtAcGt"]), 4), b"ACGTACGT\x00");
}
// Segments shorter than k produce no output at all (not even a separator).
#[test]
fn sequence_shorter_than_k_discarded() {
assert_eq!(run_fastq(&make_fastq(&[b"ACG"]), 4), b"");
}
#[test]
fn sequence_exactly_k_kept() {
assert_eq!(run_fastq(&make_fastq(&[b"ACGT"]), 4), b"ACGT\x00");
}
#[test]
fn short_record_among_valid_ones_discarded() {
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]), 4),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
// An 'N' inside a sequence ends the current segment and starts a new one.
#[test]
fn ambiguous_splits_into_two_segments() {
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4),
b"ACGT\x00ACGT\x00"
);
}
#[test]
fn segment_after_ambiguous_too_short_discarded() {
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4),
b"ACGTACGT\x00"
);
}
#[test]
fn consecutive_ambiguous_produce_no_empty_segment() {
assert_eq!(
run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4),
b"ACGT\x00ACGT\x00"
);
}
#[test]
fn ambiguous_at_start_skipped() {
assert_eq!(run_fastq(&make_fastq(&[b"NNACGTACGT"]), 4), b"ACGTACGT\x00");
}
#[test]
fn ambiguous_at_end_produces_no_trailing_empty() {
assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNN"]), 4), b"ACGTACGT\x00");
}
#[test]
fn crlf_handled() {
let data = b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n";
assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00");
}
// Same input split at its midpoint into two rope slices: output must not change.
#[test]
fn multi_slice_rope() {
let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
let mid = data.len() / 2;
let mut rope = Rope::new(None);
rope.push(data[..mid].to_vec());
rope.push(data[mid..].to_vec());
assert_eq!(
flat(normalize_fastq_chunk(rope, 4)),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
// ── FASTA ─────────────────────────────────────────────────────────────────
#[test]
fn fasta_single_record() {
assert_eq!(
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4),
b"ACGTACGT\x00"
);
}
#[test]
fn fasta_two_records() {
assert_eq!(
run_fasta(
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]),
4
),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
// Line breaks inside one record's sequence are removed, not treated as separators.
#[test]
fn fasta_multiline_sequence_concatenated() {
assert_eq!(
run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4),
b"ACGTACGTACGT\x00"
);
}
#[test]
fn fasta_lowercase_uppercased() {
assert_eq!(
run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4),
b"ACGTACGT\x00"
);
}
#[test]
fn fasta_short_record_discarded() {
assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACG")]), 4), b"");
}
#[test]
fn fasta_short_among_valid_discarded() {
assert_eq!(
run_fasta(
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]),
4
),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
#[test]
fn fasta_ambiguous_splits_segments() {
assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00");
}
#[test]
fn fasta_ambiguous_across_line_boundary() {
assert_eq!(run_fasta(b">s1\nACGT\nNACGT\n", 4), b"ACGT\x00ACGT\x00");
}
#[test]
fn fasta_ambiguous_short_segment_discarded() {
assert_eq!(run_fasta(b">s1\nACGTACGTNAC\n", 4), b"ACGTACGT\x00");
}
// The last record is still emitted when the input lacks a final newline.
#[test]
fn fasta_no_trailing_newline() {
assert_eq!(run_fasta(b">s1\nACGTACGT", 4), b"ACGTACGT\x00");
}
#[test]
fn fasta_crlf_line_endings() {
assert_eq!(
run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4),
b"ACGTACGT\x00TTTT\x00"
);
}
// Same input split at its midpoint into two rope slices: output must not change.
#[test]
fn fasta_multi_slice_rope() {
let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
let mid = data.len() / 2;
let mut rope = Rope::new(None);
rope.push(data[..mid].to_vec());
rope.push(data[mid..].to_vec());
assert_eq!(
flat(normalize_fasta_chunk(rope, 4)),
b"ACGTACGT\x00TTTTTTTT\x00"
);
}
}
#[path = "tests/normalize.rs"]
mod tests;
+6 -5
View File
@@ -708,10 +708,7 @@ fn dispatch<R: Read>(
/// Wraps an already-open reader in a nucleotide stream, detecting its format.
/// Returns `None` if the format is not recognised.
pub(crate) fn nuc_stream<R: Read>(
reader: R,
k: usize,
) -> Option<AnyNucStream<MimeTypeGuesser<R>>> {
pub(crate) fn nuc_stream<R: Read>(reader: R, k: usize) -> Option<AnyNucStream<MimeTypeGuesser<R>>> {
    // Wrap the reader in the MIME guesser first; `dispatch` picks the parser from it.
    let guessed = MimeTypeGuesser::new(reader);
    dispatch(guessed, k)
}
@@ -726,7 +723,11 @@ pub fn open_nuc_stream(
k: usize,
) -> io::Result<Box<dyn Iterator<Item = NucPage> + Send>> {
let reader = open_raw(source)?;
dispatch(MimeTypeGuesser::new(reader), k)
nuc_stream(reader, k)
.map(|s| Box::new(s) as Box<dyn Iterator<Item = NucPage> + Send>)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "unknown sequence format"))
}
#[cfg(test)]
#[path = "tests/nucstream.rs"]
mod tests;
+106
View File
@@ -0,0 +1,106 @@
use super::*;
use crate::fasta::end_of_last_fasta_entry;
use crate::fastq::end_of_last_fastq_entry;
/// Builds a chunk iterator over `data` that uses `end_of_last_fasta_entry` as its
/// boundary finder; `block_size` and the trailing `None` are forwarded unchanged.
fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
    let boundary = end_of_last_fasta_entry;
    SeqChunkIter::new(data, block_size, boundary, None)
}
/// Builds a chunk iterator over `data` that uses `end_of_last_fastq_entry` as its
/// boundary finder; `block_size` and the trailing `None` are forwarded unchanged.
fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
    let boundary = end_of_last_fastq_entry;
    SeqChunkIter::new(data, block_size, boundary, None)
}
/// Flattens a rope into one contiguous byte vector via its forward cursor.
fn rope_to_vec(rope: &Rope) -> Vec<u8> {
    let bytes: Vec<u8> = rope.fw_cursor().collect();
    bytes
}
// ── FASTA ─────────────────────────────────────────────────────────────────
/// One record with a block size larger than the input: one chunk, unchanged.
#[test]
fn fasta_single_record_one_chunk() {
    let input: &[u8] = b">s1\nACGT\n";
    let chunks = fasta_iter(input, 64)
        .collect::<Result<Vec<_>, _>>()
        .unwrap();
    assert_eq!(chunks.len(), 1);
    let only = rope_to_vec(&chunks[0]);
    assert_eq!(only, b">s1\nACGT\n");
}
/// Concatenating every chunk must reproduce the input byte for byte.
#[test]
fn fasta_two_records_split_across_chunks() {
    let input: &[u8] = b">s1\nACGT\n>s2\nTTTT\n";
    let chunks = fasta_iter(input, 10)
        .collect::<Result<Vec<_>, _>>()
        .unwrap();
    let mut joined: Vec<u8> = Vec::new();
    for chunk in &chunks {
        joined.extend(rope_to_vec(chunk));
    }
    assert_eq!(joined, b">s1\nACGT\n>s2\nTTTT\n");
}
/// For several block sizes, every chunk must begin with '>' and end with '\n'.
///
/// The first/last bytes are read with `first()`/`last()` so that an empty chunk
/// fails the assertion with the `block=` context message instead of aborting
/// with an index-out-of-bounds or `unwrap` panic.
#[test]
fn fasta_each_chunk_ends_on_complete_record() {
    let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n";
    for block in [8, 12, 20, 100] {
        let chunks: Vec<_> = fasta_iter(data, block).collect::<Result<_, _>>().unwrap();
        for rope in &chunks {
            let flat = rope_to_vec(rope);
            assert_eq!(
                flat.first().copied(),
                Some(b'>'),
                "block={block}: chunk doesn't start with '>'"
            );
            assert_eq!(
                flat.last().copied(),
                Some(b'\n'),
                "block={block}: chunk doesn't end with newline"
            );
        }
    }
}
// ── FASTQ ─────────────────────────────────────────────────────────────────
/// Serialises `(sequence, quality)` pairs as four-line FASTQ records with the
/// fixed header `@hdr` and a bare `+` separator line.
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
    let mut out = Vec::new();
    for &(bases, quals) in records {
        // Header, sequence, separator and quality, each newline-terminated.
        for part in [&b"@hdr\n"[..], bases, &b"\n+\n"[..], quals, &b"\n"[..]] {
            out.extend_from_slice(part);
        }
    }
    out
}
/// One record with a block size larger than the input: one chunk, unchanged.
///
/// Mirrors `fasta_single_record_one_chunk` by checking the chunk's content as
/// well as the chunk count.
#[test]
fn fastq_single_record_one_chunk() {
    // `Box::leak` yields the `'static` slice that `fastq_iter` requires.
    let data = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice());
    let chunks: Vec<_> = fastq_iter(data, 64).collect::<Result<_, _>>().unwrap();
    assert_eq!(chunks.len(), 1);
    assert_eq!(rope_to_vec(&chunks[0]), *data);
}
/// A quality line starting with '@' must not lose or duplicate bytes across chunks.
#[test]
fn fastq_at_in_quality_handled() {
    let records = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")]);
    // `Box::leak` yields the `'static` slice that `fastq_iter` requires.
    let data = Box::leak(records.into_boxed_slice());
    let chunks = fastq_iter(data, 16)
        .collect::<Result<Vec<_>, _>>()
        .unwrap();
    let mut all: Vec<u8> = Vec::new();
    for chunk in &chunks {
        all.extend(rope_to_vec(chunk));
    }
    assert_eq!(all, *data);
}
/// For several block sizes, every chunk must begin with the '@' of a record header.
///
/// The first byte is read with `first()` so that an empty chunk fails the
/// assertion with the `block=` context message instead of an index panic.
#[test]
fn fastq_each_chunk_starts_with_at() {
    // `Box::leak` yields the `'static` slice that `fastq_iter` requires.
    let data = Box::leak(
        make_fastq(&[
            (b"ACGT", b"IIII"),
            (b"CCCC", b"JJJJ"),
            (b"GGGG", b"KKKK"),
            (b"TTTT", b"LLLL"),
        ])
        .into_boxed_slice(),
    );
    for block in [18, 30, 60] {
        let chunks: Vec<_> = fastq_iter(data, block).collect::<Result<_, _>>().unwrap();
        for rope in &chunks {
            let first_byte = rope_to_vec(rope).first().copied();
            assert_eq!(
                first_byte,
                Some(b'@'),
                "block={block}: chunk doesn't start with '@'"
            );
        }
    }
}
+66
View File
@@ -0,0 +1,66 @@
use super::*;
/// Builds a rope holding `data` as a single pushed slice.
fn rope(data: &[u8]) -> Rope {
    let mut single = Rope::new(None);
    let owned = data.to_vec();
    single.push(owned);
    single
}
/// Builds a rope holding `a` then `b` as two separately pushed slices.
fn rope2(a: &[u8], b: &[u8]) -> Rope {
    let mut two = Rope::new(None);
    for part in [a, b] {
        two.push(part.to_vec());
    }
    two
}
/// Collects the rope's forward cursor into one contiguous byte vector.
fn flat(r: &Rope) -> Vec<u8> {
    let bytes: Vec<u8> = r.fw_cursor().collect();
    bytes
}
/// A lone entry offers no boundary to cut at, so the result is `None`.
#[test]
fn single_entry_no_boundary() {
    let r = rope(b">seq1\nACGT\n");
    let boundary = end_of_last_fasta_entry(&r);
    assert_eq!(boundary, None);
}
/// The offset separates the first entry from the second entry's header.
#[test]
fn two_entries_cuts_at_second_header() {
    let r = rope(b">seq1\nACGT\n>seq2\nTTTT\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    let (head, tail) = bytes.split_at(pos);
    assert_eq!(tail, b">seq2\nTTTT\n");
    assert_eq!(head, b">seq1\nACGT\n");
}
/// With three entries the cut is at the last header, not an earlier one.
#[test]
fn three_entries_cuts_at_last_header() {
    let r = rope(b">s1\nAA\n>s2\nCC\n>s3\nGG\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(&bytes[pos..], b">s3\nGG\n");
}
/// Sequence data wrapped over several lines does not move the cut.
#[test]
fn multiline_sequence() {
    let r = rope(b">s1\nACGT\nACGT\n>s2\nTTTT\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(&bytes[pos..], b">s2\nTTTT\n");
}
/// CRLF-terminated input is cut at the second entry's header.
#[test]
fn crlf_line_endings() {
    let r = rope(b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(&bytes[pos..], b">s2\r\nTTTT\r\n");
}
/// The second entry starts exactly where the rope's second slice starts.
#[test]
fn boundary_spans_two_blocks() {
    let r = rope2(b">s1\nACGT\n", b">s2\nTTTT\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let all = flat(&r);
    assert_eq!(&all[pos..], b">s2\nTTTT\n");
}
+73
View File
@@ -0,0 +1,73 @@
use super::*;
/// Builds a rope holding `data` as a single pushed slice.
fn rope(data: &[u8]) -> Rope {
    let mut single = Rope::new(None);
    let owned = data.to_vec();
    single.push(owned);
    single
}
/// Serialises `(sequence, quality)` pairs as four-line FASTQ records with the
/// fixed header `@header` and a bare `+` separator line.
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
    let mut out = Vec::new();
    for &(bases, quals) in records {
        // Header, sequence, separator and quality, each newline-terminated.
        for part in [&b"@header\n"[..], bases, &b"\n+\n"[..], quals, &b"\n"[..]] {
            out.extend_from_slice(part);
        }
    }
    out
}
/// Collects the rope's forward cursor into one contiguous byte vector.
fn flat(r: &Rope) -> Vec<u8> {
    let bytes: Vec<u8> = r.fw_cursor().collect();
    bytes
}
/// A lone record offers no boundary to cut at, so the result is `None`.
#[test]
fn single_record_no_boundary() {
    let r = rope(&make_fastq(&[(b"ACGT", b"IIII")]));
    let boundary = end_of_last_fastq_entry(&r);
    assert_eq!(boundary, None);
}
/// The offset points at the '@' that opens the second record.
#[test]
fn two_records_cuts_at_second() {
    let r = rope(&make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]));
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(bytes[pos], b'@');
    let expected_tail = make_fastq(&[(b"TTTT", b"HHHH")]);
    assert_eq!(&bytes[pos..], expected_tail.as_slice());
}
/// With three records the tail after the cut is exactly the last record.
#[test]
fn three_records_cuts_at_last() {
    let r = rope(&make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]));
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    let expected_tail = make_fastq(&[(b"GGGG", b"KKKK")]);
    assert_eq!(&bytes[pos..], expected_tail.as_slice());
}
/// A quality line beginning with '@' must not be taken for a record header.
#[test]
fn at_sign_in_quality_does_not_confuse() {
    let r = rope(&make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]));
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    let expected_tail = make_fastq(&[(b"TTTT", b"HHHH")]);
    assert_eq!(&bytes[pos..], expected_tail.as_slice());
}
/// CRLF-terminated input is cut at the second record's '@'.
#[test]
fn crlf_line_endings() {
    let r = rope(b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n");
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(bytes[pos], b'@');
    assert_eq!(&bytes[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n");
}
+234
View File
@@ -0,0 +1,234 @@
use super::*;
/// Builds a rope holding `data` as a single pushed slice.
fn make_rope(data: &[u8]) -> Rope {
    let mut single = Rope::new(None);
    let owned = data.to_vec();
    single.push(owned);
    single
}
/// Consumes the rope and collects its forward cursor into one byte vector.
fn flat(r: Rope) -> Vec<u8> {
    let bytes: Vec<u8> = r.fw_cursor().collect();
    bytes
}
/// Normalises `data` as a FASTQ chunk with k-mer size `k` and flattens the result.
fn run_fastq(data: &[u8], k: usize) -> Vec<u8> {
    let input = make_rope(data);
    let normalized = normalize_fastq_chunk(input, k);
    flat(normalized)
}
/// Normalises `data` as a FASTA chunk with k-mer size `k` and flattens the result.
fn run_fasta(data: &[u8], k: usize) -> Vec<u8> {
    let input = make_rope(data);
    let normalized = normalize_fasta_chunk(input, k);
    flat(normalized)
}
/// Serialises sequences as four-line FASTQ records.
///
/// Each record uses the fixed header `@hdr`, a bare `+` separator line, and a
/// quality line made of `I` repeated once per base of the sequence.
fn make_fastq(records: &[&[u8]]) -> Vec<u8> {
    let mut buf = Vec::new();
    for seq in records {
        buf.extend_from_slice(b"@hdr\n");
        buf.extend_from_slice(seq);
        buf.extend_from_slice(b"\n+\n");
        // Grow the buffer in place for the quality line rather than building
        // a temporary vector only to copy it.
        buf.resize(buf.len() + seq.len(), b'I');
        buf.push(b'\n');
    }
    buf
}
/// Serialises `(id, sequence)` pairs as two-line FASTA records.
fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec<u8> {
    let mut out = Vec::new();
    for &(name, bases) in records {
        // Header line ('>' + id), then the sequence line.
        for part in [&b">"[..], name, &b"\n"[..], bases, &b"\n"[..]] {
            out.extend_from_slice(part);
        }
    }
    out
}
// ── FASTQ basic ──────────────────────────────────────────────────────────
/// One record yields its sequence followed by a single 0x00 separator.
#[test]
fn single_record_produces_seq_then_null() {
    let input = make_fastq(&[b"ACGTACGT"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Two records yield two separator-terminated segments, in input order.
#[test]
fn two_records_concatenated() {
    let input = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
/// Lower-case bases come out upper-cased.
#[test]
fn lowercase_input_uppercased() {
    let input = make_fastq(&[b"acgtacgt"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Mixed-case bases come out upper-cased.
#[test]
fn mixed_case_uppercased() {
    let input = make_fastq(&[b"AcGtAcGt"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// A 3-base sequence with k = 4 produces no output at all.
#[test]
fn sequence_shorter_than_k_discarded() {
    let input = make_fastq(&[b"ACG"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"");
}
/// A sequence of exactly k bases is kept.
#[test]
fn sequence_exactly_k_kept() {
    let input = make_fastq(&[b"ACGT"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGT\x00");
}
/// A too-short record between two valid ones is dropped without a separator.
#[test]
fn short_record_among_valid_ones_discarded() {
    let input = make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
/// An 'N' in the middle splits the sequence into two segments.
#[test]
fn ambiguous_splits_into_two_segments() {
    let input = make_fastq(&[b"ACGTNACGT"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGT\x00ACGT\x00");
}
/// A segment shorter than k following an 'N' is dropped.
#[test]
fn segment_after_ambiguous_too_short_discarded() {
    let input = make_fastq(&[b"ACGTACGTNAC"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// A run of 'N's yields one split, with no empty segments in between.
#[test]
fn consecutive_ambiguous_produce_no_empty_segment() {
    let input = make_fastq(&[b"ACGTNNNNACGT"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGT\x00ACGT\x00");
}
/// Leading 'N's are skipped without emitting anything for them.
#[test]
fn ambiguous_at_start_skipped() {
    let input = make_fastq(&[b"NNACGTACGT"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Trailing 'N's do not add an empty segment after the last separator.
#[test]
fn ambiguous_at_end_produces_no_trailing_empty() {
    let input = make_fastq(&[b"ACGTACGTNN"]);
    let out = run_fastq(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// CRLF line endings leave no '\r' in the output.
#[test]
fn crlf_handled() {
    let out = run_fastq(b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Splitting the input at its midpoint into two rope slices must not change the output.
#[test]
fn multi_slice_rope() {
    let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
    let (left, right) = data.split_at(data.len() / 2);
    let mut rope = Rope::new(None);
    rope.push(left.to_vec());
    rope.push(right.to_vec());
    let out = flat(normalize_fastq_chunk(rope, 4));
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
// ── FASTA ─────────────────────────────────────────────────────────────────
/// One FASTA record yields its sequence followed by a single 0x00 separator.
#[test]
fn fasta_single_record() {
    let input = make_fasta(&[(b"s1", b"ACGTACGT")]);
    let out = run_fasta(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Two FASTA records yield two separator-terminated segments, in input order.
#[test]
fn fasta_two_records() {
    let input = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
    let out = run_fasta(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
/// Lines of one record are joined into a single segment.
#[test]
fn fasta_multiline_sequence_concatenated() {
    let out = run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4);
    assert_eq!(out, b"ACGTACGTACGT\x00");
}
/// Lower-case bases come out upper-cased.
#[test]
fn fasta_lowercase_uppercased() {
    let input = make_fasta(&[(b"s1", b"acgtacgt")]);
    let out = run_fasta(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// A 3-base record with k = 4 produces no output at all.
#[test]
fn fasta_short_record_discarded() {
    let input = make_fasta(&[(b"s1", b"ACG")]);
    let out = run_fasta(&input, 4);
    assert_eq!(out, b"");
}
/// A too-short record between two valid ones is dropped without a separator.
#[test]
fn fasta_short_among_valid_discarded() {
    let input = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]);
    let out = run_fasta(&input, 4);
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
/// An 'N' in the middle splits the sequence into two segments.
#[test]
fn fasta_ambiguous_splits_segments() {
    let out = run_fasta(b">s1\nACGTNACGT\n", 4);
    assert_eq!(out, b"ACGT\x00ACGT\x00");
}
/// An 'N' that opens the next line still splits at that point.
#[test]
fn fasta_ambiguous_across_line_boundary() {
    let out = run_fasta(b">s1\nACGT\nNACGT\n", 4);
    assert_eq!(out, b"ACGT\x00ACGT\x00");
}
/// A segment shorter than k following an 'N' is dropped.
#[test]
fn fasta_ambiguous_short_segment_discarded() {
    let out = run_fasta(b">s1\nACGTACGTNAC\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// The final record is still emitted when the input lacks a closing newline.
#[test]
fn fasta_no_trailing_newline() {
    let out = run_fasta(b">s1\nACGTACGT", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// CRLF line endings leave no '\r' in the output and keep records separate.
#[test]
fn fasta_crlf_line_endings() {
    let out = run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4);
    assert_eq!(out, b"ACGTACGT\x00TTTT\x00");
}
/// Splitting the input at its midpoint into two rope slices must not change the output.
#[test]
fn fasta_multi_slice_rope() {
    let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
    let (left, right) = data.split_at(data.len() / 2);
    let mut rope = Rope::new(None);
    rope.push(left.to_vec());
    rope.push(right.to_vec());
    let out = flat(normalize_fasta_chunk(rope, 4));
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
+267
View File
@@ -0,0 +1,267 @@
use super::*;
use std::io::Cursor;
use std::ops::Deref;
// ── helpers ───────────────────────────────────────────────────────────────
/// Streams `data` through a FASTA `NucStream` and concatenates the bytes of every page.
fn run_fasta(data: &[u8], k: usize) -> Vec<u8> {
    let reader = Cursor::new(data.to_vec());
    let mut out = Vec::new();
    for page in NucStream::<_, FastaParser>::new(reader, k) {
        out.extend(page.deref().to_vec());
    }
    out
}
/// Streams `data` through a FASTQ `NucStream` and concatenates the bytes of every page.
fn run_fastq(data: &[u8], k: usize) -> Vec<u8> {
    let reader = Cursor::new(data.to_vec());
    let mut out = Vec::new();
    for page in NucStream::<_, FastqParser>::new(reader, k) {
        out.extend(page.deref().to_vec());
    }
    out
}
/// Streams `data` through a GenBank `NucStream` and concatenates the bytes of every page.
fn run_genbank(data: &[u8], k: usize) -> Vec<u8> {
    let reader = Cursor::new(data.to_vec());
    let mut out = Vec::new();
    for page in NucStream::<_, GenbankParser>::new(reader, k) {
        out.extend(page.deref().to_vec());
    }
    out
}
/// Streams `data` through a FASTA `NucStream` and returns each page's bytes separately.
fn pages_fasta(data: &[u8], k: usize) -> Vec<Vec<u8>> {
    let reader = Cursor::new(data.to_vec());
    let mut pages = Vec::new();
    for page in NucStream::<_, FastaParser>::new(reader, k) {
        pages.push(page.deref().to_vec());
    }
    pages
}
// ── FastaParser ───────────────────────────────────────────────────────────
/// One FASTA record yields its sequence followed by a single 0x00 separator.
#[test]
fn fasta_single_sequence() {
    let out = run_fasta(b">s1\nACGTACGT\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Lower-case bases come out upper-cased.
#[test]
fn fasta_lowercase_uppercased() {
    let out = run_fasta(b">s1\nacgtacgt\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Lines of one record are joined into a single segment.
#[test]
fn fasta_multiline_sequence_concatenated() {
    let out = run_fasta(b">s1\nACGT\nACGT\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Two records yield two separator-terminated segments, in input order.
#[test]
fn fasta_two_sequences() {
    let out = run_fasta(b">s1\nACGTACGT\n>s2\nTTTTTTTT\n", 4);
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
/// Empty input produces no bytes.
#[test]
fn fasta_empty_input_yields_no_pages() {
    let out = run_fasta(b"", 4);
    assert_eq!(out, b"");
}
/// A fragment shorter than k that is still pending at EOF is dropped.
#[test]
fn fasta_sequence_shorter_than_k_at_eof_discarded() {
    // The 3-base fragment is held back as overlap and discarded at EOF (< k).
    let out = run_fasta(b">s1\nACG\n", 4);
    assert_eq!(out, b"");
}
/// An 'N' in the middle splits the sequence into two segments.
#[test]
fn fasta_ambiguous_splits_into_two_segments() {
    let out = run_fasta(b">s1\nACGTNACGT\n", 4);
    assert_eq!(out, b"ACGT\x00ACGT\x00");
}
/// A segment shorter than k that precedes an 'N' is still written out.
#[test]
fn fasta_short_segment_before_ambiguous_emitted() {
    // "AC" (< k = 4) ahead of the N is emitted with its separator: length
    // filtering is left to the superkmer builder rather than done here.
    let out = run_fasta(b">s1\nACNACGTACGT\n", 4);
    assert_eq!(out, b"AC\x00ACGTACGT\x00");
}
/// Leading 'N's are skipped without emitting anything for them.
#[test]
fn fasta_ambiguous_at_start_skipped() {
    let out = run_fasta(b">s1\nNNNACGTACGT\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
// ── FastqParser ───────────────────────────────────────────────────────────
/// One FASTQ record yields its sequence followed by a single 0x00 separator.
#[test]
fn fastq_single_record() {
    let out = run_fastq(b"@r1\nACGTACGT\n+\nIIIIIIII\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Lower-case bases come out upper-cased.
#[test]
fn fastq_lowercase_uppercased() {
    let out = run_fastq(b"@r1\nacgtacgt\n+\nIIIIIIII\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Quality characters never leak into the emitted sequence bytes.
#[test]
fn fastq_quality_bytes_not_in_output() {
    // The quality line is all '@' (ASCII 64, Phred 31); none may show up in the output.
    let out = run_fastq(b"@r1\nACGTACGT\n+\n@@@@@@@@\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Two records yield two separator-terminated segments, in input order.
#[test]
fn fastq_two_records() {
    let out = run_fastq(b"@r1\nACGTACGT\n+\nIIIIIIII\n@r2\nTTTTTTTT\n+\nIIIIIIII\n", 4);
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
/// An 'N' in the middle splits the sequence into two segments.
#[test]
fn fastq_ambiguous_splits_sequence() {
    let out = run_fastq(b"@r1\nACGTNACGT\n+\nIIIIIIIII\n", 4);
    assert_eq!(out, b"ACGT\x00ACGT\x00");
}
/// A quality line made of '@' must not be parsed as the start of a new record.
#[test]
fn fastq_at_in_quality_line_not_a_record_start() {
    // The first record's quality line begins with '@'; the second record must still parse.
    let out = run_fastq(b"@r1\nACGTACGT\n+\n@@@@@@@@\n@r2\nTTTTTTTT\n+\nIIIIIIII\n", 4);
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
// ── GenbankParser ─────────────────────────────────────────────────────────
/// Sequence data between `ORIGIN` and `//` is extracted and upper-cased.
#[test]
fn genbank_origin_to_slash() {
    let out = run_genbank(b"LOCUS ...\nORIGIN\n 1 acgtacgt\n//\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Position numbers and spaces inside the ORIGIN block do not reach the output.
#[test]
fn genbank_position_numbers_and_spaces_skipped() {
    let out = run_genbank(b"ORIGIN\n 1 acgt acgt\n//\n", 4);
    assert_eq!(out, b"ACGTACGT\x00");
}
/// Two GenBank records yield two separator-terminated segments, in input order.
#[test]
fn genbank_two_records() {
    let out = run_genbank(
        b"ORIGIN\n 1 acgtacgt\n//\nLOCUS ...\nORIGIN\n 1 tttttttt\n//\n",
        4,
    );
    assert_eq!(out, b"ACGTACGT\x00TTTTTTTT\x00");
}
/// An 'n' in the middle splits the sequence into two segments.
#[test]
fn genbank_ambiguous_splits_sequence() {
    let out = run_genbank(b"ORIGIN\n 1 acgtnacgt\n//\n", 4);
    assert_eq!(out, b"ACGT\x00ACGT\x00");
}
// ── NucPage ───────────────────────────────────────────────────────────────
/// Dereferencing the first page exposes exactly the normalised bytes.
#[test]
fn nuc_page_deref_correct_bytes() {
    let reader = Cursor::new(b">s1\nACGT\n".to_vec());
    let first = NucStream::<_, FastaParser>::new(reader, 4).next();
    let page = first.expect("page");
    assert_eq!(page.deref(), b"ACGT\x00");
}
// ── NucPageCursor ─────────────────────────────────────────────────────────
/// Returns the first page a FASTA `NucStream` produces for `data`.
///
/// Panics with "at least one page" when the stream yields nothing.
fn make_page(data: &[u8], k: usize) -> NucPage {
    let reader = Cursor::new(data.to_vec());
    let first = NucStream::<_, FastaParser>::new(reader, k).next();
    first.expect("at least one page")
}
/// The cursor hands out the page's bytes front to back.
#[test]
fn cursor_reads_bytes_in_order() {
    let page = make_page(b">s1\nACGTACGT\n", 4);
    let mut cur = page.cursor();
    for expected in [b'A', b'C', b'G', b'T'] {
        assert_eq!(cur.next_byte(), Some(expected));
    }
}
/// `rewind(n)` steps back `n` bytes so they are returned again by `next_byte`.
#[test]
fn cursor_rewind_rereads_bytes() {
    let page = make_page(b">s1\nACGTACGT\n", 4);
    let mut cursor = page.cursor();
    // Consume 'A' then 'C'.
    cursor.next_byte();
    cursor.next_byte();
    // One step back lands on 'C' again.
    cursor.rewind(1);
    let after_one = cursor.next_byte();
    assert_eq!(after_one, Some(b'C'));
    // Two steps back from there lands on 'A'.
    cursor.rewind(2);
    let after_two = cursor.next_byte();
    assert_eq!(after_two, Some(b'A'));
}
/// After every byte of the page has been read, `next_byte` returns `None`.
///
/// Each of the five reads is asserted to yield a byte, so the test cannot pass
/// vacuously when the cursor runs dry before the page's real end.
#[test]
fn cursor_returns_none_at_end() {
    // "ACGT\x00" = 5 bytes; consume all then expect None.
    let page = make_page(b">s1\nACGT\n", 4);
    let mut cur = page.cursor();
    for i in 0..5 {
        assert!(
            cur.next_byte().is_some(),
            "byte {} should be available before the end of the page",
            i
        );
    }
    assert_eq!(cur.next_byte(), None);
}
/// A fresh cursor reports the page's full length and is not empty.
#[test]
fn cursor_len_matches_page_content() {
    let page = make_page(b">s1\nACGTACGT\n", 4);
    let cursor = page.cursor();
    // Eight bases plus the 0x00 separator.
    assert_eq!(cursor.len(), 9);
    assert!(!cursor.is_empty());
}
// ── Overlap at page boundary ──────────────────────────────────────────────
/// The last K-1 sequence bytes of one page must reappear at the start of the next.
#[test]
fn overlap_last_km1_bytes_prepended_to_next_page() {
    const K: usize = 11;
    // One record of PAGE_SIZE + K bases, long enough to span two pages; the
    // repeating ACGT pattern keeps the boundary bytes unambiguous.
    let mut input = b">seq\n".to_vec();
    input.extend((0..PAGE_SIZE + K).map(|i| b"ACGT"[i % 4]));
    input.push(b'\n');

    let pages = pages_fasta(&input, K);
    assert!(pages.len() >= 2, "need at least two pages");
    let (first, second) = (&pages[0], &pages[1]);

    // The first page has to close with a 0x00 separator.
    assert_eq!(*first.last().unwrap(), 0x00, "page1 must end with separator");

    // Compare the K-1 bytes just before that separator with the head of page two.
    let overlap = K - 1;
    let sep_at = first.len() - 1;
    let first_tail = &first[sep_at - overlap..sep_at];
    let second_head = &second[..overlap];
    assert_eq!(
        first_tail, second_head,
        "overlap bytes mismatch at page boundary"
    );
}
// ── Pool ──────────────────────────────────────────────────────────────────
/// Dropping the first page before requesting the second must not corrupt the second.
#[test]
fn pool_buffer_reused_after_drop() {
    const K: usize = 11;
    // One record of PAGE_SIZE + K 'A's, so the stream has to produce a second page.
    let mut input = b">seq\n".to_vec();
    input.resize(input.len() + PAGE_SIZE + K, b'A');
    input.push(b'\n');

    let mut stream = NucStream::<_, FastaParser>::new(Cursor::new(input), K);
    {
        let page1 = stream.next().expect("page 1");
        assert!(!page1.deref().is_empty());
        // page1 is dropped at the end of this scope, before page 2 is requested.
    }

    let page2 = stream.next().expect("page 2");
    assert!(!page2.deref().is_empty());
    // The second page must still open with an 'A'.
    assert_eq!(page2[0], b'A');
}