Push nvyqwlpspwvl #11
+2
-108
@@ -191,111 +191,5 @@ pub fn fastq_chunks<R: Read>(source: R) -> SeqChunkIter<R> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::fasta::end_of_last_fasta_entry;
|
||||
use crate::fastq::end_of_last_fastq_entry;
|
||||
|
||||
fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
||||
SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None)
|
||||
}
|
||||
|
||||
fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
||||
SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None)
|
||||
}
|
||||
|
||||
fn rope_to_vec(rope: &Rope) -> Vec<u8> {
|
||||
rope.fw_cursor().collect()
|
||||
}
|
||||
|
||||
// ── FASTA ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn fasta_single_record_one_chunk() {
|
||||
let data: &[u8] = b">s1\nACGT\n";
|
||||
let chunks: Vec<_> = fasta_iter(data, 64).collect::<Result<_, _>>().unwrap();
|
||||
assert_eq!(chunks.len(), 1);
|
||||
assert_eq!(rope_to_vec(&chunks[0]), b">s1\nACGT\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_two_records_split_across_chunks() {
|
||||
let data: &[u8] = b">s1\nACGT\n>s2\nTTTT\n";
|
||||
let chunks: Vec<_> = fasta_iter(data, 10).collect::<Result<_, _>>().unwrap();
|
||||
let all: Vec<u8> = chunks.iter().flat_map(|r| rope_to_vec(r)).collect();
|
||||
assert_eq!(all, b">s1\nACGT\n>s2\nTTTT\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_each_chunk_ends_on_complete_record() {
|
||||
let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n";
|
||||
for block in [8, 12, 20, 100] {
|
||||
let chunks: Vec<_> = fasta_iter(data, block).collect::<Result<_, _>>().unwrap();
|
||||
for rope in &chunks {
|
||||
let flat = rope_to_vec(rope);
|
||||
assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'");
|
||||
assert_eq!(
|
||||
*flat.last().unwrap(),
|
||||
b'\n',
|
||||
"block={block}: chunk doesn't end with newline"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── FASTQ ─────────────────────────────────────────────────────────────────
|
||||
|
||||
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
|
||||
let mut buf = Vec::new();
|
||||
for (seq, qual) in records {
|
||||
buf.extend_from_slice(b"@hdr\n");
|
||||
buf.extend_from_slice(seq);
|
||||
buf.push(b'\n');
|
||||
buf.extend_from_slice(b"+\n");
|
||||
buf.extend_from_slice(qual);
|
||||
buf.push(b'\n');
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fastq_single_record_one_chunk() {
|
||||
let data = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice());
|
||||
let chunks: Vec<_> = fastq_iter(data, 64).collect::<Result<_, _>>().unwrap();
|
||||
assert_eq!(chunks.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fastq_at_in_quality_handled() {
|
||||
let data = Box::leak(
|
||||
make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")])
|
||||
.into_boxed_slice(),
|
||||
);
|
||||
let chunks: Vec<_> = fastq_iter(data, 16).collect::<Result<_, _>>().unwrap();
|
||||
let all: Vec<u8> = chunks.iter().flat_map(|r| rope_to_vec(r)).collect();
|
||||
assert_eq!(all, *data);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fastq_each_chunk_starts_with_at() {
|
||||
let data = Box::leak(
|
||||
make_fastq(&[
|
||||
(b"ACGT", b"IIII"),
|
||||
(b"CCCC", b"JJJJ"),
|
||||
(b"GGGG", b"KKKK"),
|
||||
(b"TTTT", b"LLLL"),
|
||||
])
|
||||
.into_boxed_slice(),
|
||||
);
|
||||
for block in [18, 30, 60] {
|
||||
let chunks: Vec<_> = fastq_iter(data, block).collect::<Result<_, _>>().unwrap();
|
||||
for rope in &chunks {
|
||||
let first_byte = rope_to_vec(rope)[0];
|
||||
assert_eq!(
|
||||
first_byte, b'@',
|
||||
"block={block}: chunk doesn't start with '@'"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#[path = "tests/chunk.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -35,71 +35,5 @@ pub fn end_of_last_fasta_entry(rope: &Rope) -> Option<usize> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(data.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
fn rope2(a: &[u8], b: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(a.to_vec());
|
||||
r.push(b.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
fn flat(r: &Rope) -> Vec<u8> {
|
||||
r.fw_cursor().collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn single_entry_no_boundary() {
|
||||
assert_eq!(end_of_last_fasta_entry(&rope(b">seq1\nACGT\n")), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn two_entries_cuts_at_second_header() {
|
||||
let data = b">seq1\nACGT\n>seq2\nTTTT\n";
|
||||
let r = rope(data);
|
||||
let pos = end_of_last_fasta_entry(&r).unwrap();
|
||||
assert_eq!(&flat(&r)[pos..], b">seq2\nTTTT\n");
|
||||
assert_eq!(&flat(&r)[..pos], b">seq1\nACGT\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn three_entries_cuts_at_last_header() {
|
||||
let data = b">s1\nAA\n>s2\nCC\n>s3\nGG\n";
|
||||
let r = rope(data);
|
||||
let pos = end_of_last_fasta_entry(&r).unwrap();
|
||||
assert_eq!(&flat(&r)[pos..], b">s3\nGG\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multiline_sequence() {
|
||||
let data = b">s1\nACGT\nACGT\n>s2\nTTTT\n";
|
||||
let r = rope(data);
|
||||
let pos = end_of_last_fasta_entry(&r).unwrap();
|
||||
assert_eq!(&flat(&r)[pos..], b">s2\nTTTT\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn crlf_line_endings() {
|
||||
let data = b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n";
|
||||
let r = rope(data);
|
||||
let pos = end_of_last_fasta_entry(&r).unwrap();
|
||||
assert_eq!(&flat(&r)[pos..], b">s2\r\nTTTT\r\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn boundary_spans_two_blocks() {
|
||||
let a = b">s1\nACGT\n";
|
||||
let b = b">s2\nTTTT\n";
|
||||
let r = rope2(a, b);
|
||||
let all: Vec<u8> = flat(&r);
|
||||
let pos = end_of_last_fasta_entry(&r).unwrap();
|
||||
assert_eq!(&all[pos..], b">s2\nTTTT\n");
|
||||
}
|
||||
}
|
||||
#[path = "tests/fasta.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -107,78 +107,5 @@ pub fn end_of_last_fastq_entry(rope: &Rope) -> Option<usize> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(data.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
|
||||
let mut buf = Vec::new();
|
||||
for (seq, qual) in records {
|
||||
buf.extend_from_slice(b"@header\n");
|
||||
buf.extend_from_slice(seq);
|
||||
buf.push(b'\n');
|
||||
buf.extend_from_slice(b"+\n");
|
||||
buf.extend_from_slice(qual);
|
||||
buf.push(b'\n');
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
fn flat(r: &Rope) -> Vec<u8> {
|
||||
r.fw_cursor().collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn single_record_no_boundary() {
|
||||
let buf = make_fastq(&[(b"ACGT", b"IIII")]);
|
||||
assert_eq!(end_of_last_fastq_entry(&rope(&buf)), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn two_records_cuts_at_second() {
|
||||
let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]);
|
||||
let r = rope(&buf);
|
||||
let pos = end_of_last_fastq_entry(&r).unwrap();
|
||||
assert_eq!(flat(&r)[pos], b'@');
|
||||
assert_eq!(
|
||||
&flat(&r)[pos..],
|
||||
make_fastq(&[(b"TTTT", b"HHHH")]).as_slice()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn three_records_cuts_at_last() {
|
||||
let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]);
|
||||
let r = rope(&buf);
|
||||
let pos = end_of_last_fastq_entry(&r).unwrap();
|
||||
assert_eq!(
|
||||
&flat(&r)[pos..],
|
||||
make_fastq(&[(b"GGGG", b"KKKK")]).as_slice()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn at_sign_in_quality_does_not_confuse() {
|
||||
let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]);
|
||||
let r = rope(&buf);
|
||||
let pos = end_of_last_fastq_entry(&r).unwrap();
|
||||
assert_eq!(
|
||||
&flat(&r)[pos..],
|
||||
make_fastq(&[(b"TTTT", b"HHHH")]).as_slice()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn crlf_line_endings() {
|
||||
let data = b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n";
|
||||
let r = rope(data);
|
||||
let pos = end_of_last_fastq_entry(&r).unwrap();
|
||||
assert_eq!(flat(&r)[pos], b'@');
|
||||
assert_eq!(&flat(&r)[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n");
|
||||
}
|
||||
}
|
||||
#[path = "tests/fastq.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -215,239 +215,5 @@ fn is_acgt(upper: u8) -> bool {
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn make_rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(data.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
fn flat(r: Rope) -> Vec<u8> {
|
||||
r.fw_cursor().collect()
|
||||
}
|
||||
|
||||
fn run_fastq(data: &[u8], k: usize) -> Vec<u8> {
|
||||
flat(normalize_fastq_chunk(make_rope(data), k))
|
||||
}
|
||||
|
||||
fn run_fasta(data: &[u8], k: usize) -> Vec<u8> {
|
||||
flat(normalize_fasta_chunk(make_rope(data), k))
|
||||
}
|
||||
|
||||
fn make_fastq(records: &[&[u8]]) -> Vec<u8> {
|
||||
let mut buf = Vec::new();
|
||||
for seq in records {
|
||||
buf.extend_from_slice(b"@hdr\n");
|
||||
buf.extend_from_slice(seq);
|
||||
buf.push(b'\n');
|
||||
buf.extend_from_slice(b"+\n");
|
||||
buf.extend_from_slice(&vec![b'I'; seq.len()]);
|
||||
buf.push(b'\n');
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec<u8> {
|
||||
let mut buf = Vec::new();
|
||||
for (id, seq) in records {
|
||||
buf.push(b'>');
|
||||
buf.extend_from_slice(id);
|
||||
buf.push(b'\n');
|
||||
buf.extend_from_slice(seq);
|
||||
buf.push(b'\n');
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
// ── FASTQ basic ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn single_record_produces_seq_then_null() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGT"]), 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn two_records_concatenated() {
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]), 4),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lowercase_input_uppercased() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"acgtacgt"]), 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mixed_case_uppercased() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"AcGtAcGt"]), 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sequence_shorter_than_k_discarded() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"ACG"]), 4), b"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sequence_exactly_k_kept() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"ACGT"]), 4), b"ACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn short_record_among_valid_ones_discarded() {
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]), 4),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ambiguous_splits_into_two_segments() {
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4),
|
||||
b"ACGT\x00ACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn segment_after_ambiguous_too_short_discarded() {
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4),
|
||||
b"ACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn consecutive_ambiguous_produce_no_empty_segment() {
|
||||
assert_eq!(
|
||||
run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4),
|
||||
b"ACGT\x00ACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ambiguous_at_start_skipped() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"NNACGTACGT"]), 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ambiguous_at_end_produces_no_trailing_empty() {
|
||||
assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNN"]), 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn crlf_handled() {
|
||||
let data = b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n";
|
||||
assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multi_slice_rope() {
|
||||
let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
|
||||
let mid = data.len() / 2;
|
||||
let mut rope = Rope::new(None);
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
assert_eq!(
|
||||
flat(normalize_fastq_chunk(rope, 4)),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
// ── FASTA ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn fasta_single_record() {
|
||||
assert_eq!(
|
||||
run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4),
|
||||
b"ACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_two_records() {
|
||||
assert_eq!(
|
||||
run_fasta(
|
||||
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]),
|
||||
4
|
||||
),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_multiline_sequence_concatenated() {
|
||||
assert_eq!(
|
||||
run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4),
|
||||
b"ACGTACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_lowercase_uppercased() {
|
||||
assert_eq!(
|
||||
run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4),
|
||||
b"ACGTACGT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_short_record_discarded() {
|
||||
assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACG")]), 4), b"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_short_among_valid_discarded() {
|
||||
assert_eq!(
|
||||
run_fasta(
|
||||
&make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]),
|
||||
4
|
||||
),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_ambiguous_splits_segments() {
|
||||
assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_ambiguous_across_line_boundary() {
|
||||
assert_eq!(run_fasta(b">s1\nACGT\nNACGT\n", 4), b"ACGT\x00ACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_ambiguous_short_segment_discarded() {
|
||||
assert_eq!(run_fasta(b">s1\nACGTACGTNAC\n", 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_no_trailing_newline() {
|
||||
assert_eq!(run_fasta(b">s1\nACGTACGT", 4), b"ACGTACGT\x00");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_crlf_line_endings() {
|
||||
assert_eq!(
|
||||
run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4),
|
||||
b"ACGTACGT\x00TTTT\x00"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fasta_multi_slice_rope() {
|
||||
let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
|
||||
let mid = data.len() / 2;
|
||||
let mut rope = Rope::new(None);
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
assert_eq!(
|
||||
flat(normalize_fasta_chunk(rope, 4)),
|
||||
b"ACGTACGT\x00TTTTTTTT\x00"
|
||||
);
|
||||
}
|
||||
}
|
||||
#[path = "tests/normalize.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -708,10 +708,7 @@ fn dispatch<R: Read>(
|
||||
|
||||
/// Wraps an already-open reader in a nucleotide stream, detecting its format.
|
||||
/// Returns `None` if the format is not recognised.
|
||||
pub(crate) fn nuc_stream<R: Read>(
|
||||
reader: R,
|
||||
k: usize,
|
||||
) -> Option<AnyNucStream<MimeTypeGuesser<R>>> {
|
||||
pub(crate) fn nuc_stream<R: Read>(reader: R, k: usize) -> Option<AnyNucStream<MimeTypeGuesser<R>>> {
|
||||
dispatch(MimeTypeGuesser::new(reader), k)
|
||||
}
|
||||
|
||||
@@ -726,7 +723,11 @@ pub fn open_nuc_stream(
|
||||
k: usize,
|
||||
) -> io::Result<Box<dyn Iterator<Item = NucPage> + Send>> {
|
||||
let reader = open_raw(source)?;
|
||||
dispatch(MimeTypeGuesser::new(reader), k)
|
||||
nuc_stream(reader, k)
|
||||
.map(|s| Box::new(s) as Box<dyn Iterator<Item = NucPage> + Send>)
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "unknown sequence format"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/nucstream.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
use super::*;
|
||||
use crate::fasta::end_of_last_fasta_entry;
|
||||
use crate::fastq::end_of_last_fastq_entry;
|
||||
|
||||
fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
||||
SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None)
|
||||
}
|
||||
|
||||
fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
||||
SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None)
|
||||
}
|
||||
|
||||
fn rope_to_vec(rope: &Rope) -> Vec<u8> {
|
||||
rope.fw_cursor().collect()
|
||||
}
|
||||
|
||||
// ── FASTA ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// One FASTA record with a block size of 64 must come back as exactly one unmodified chunk.
#[test]
fn fasta_single_record_one_chunk() {
    let data: &[u8] = b">s1\nACGT\n";
    let chunks = fasta_iter(data, 64)
        .collect::<Result<Vec<_>, _>>()
        .unwrap();
    assert_eq!(chunks.len(), 1);
    let only = rope_to_vec(&chunks[0]);
    assert_eq!(only, b">s1\nACGT\n");
}
|
||||
|
||||
/// With a block size of 10, the chunks of a two-record FASTA input must concatenate back to
/// the original bytes.
#[test]
fn fasta_two_records_split_across_chunks() {
    let data: &[u8] = b">s1\nACGT\n>s2\nTTTT\n";
    let chunks = fasta_iter(data, 10)
        .collect::<Result<Vec<_>, _>>()
        .unwrap();
    let mut all: Vec<u8> = Vec::new();
    for rope in &chunks {
        all.extend(rope_to_vec(rope));
    }
    assert_eq!(all, b">s1\nACGT\n>s2\nTTTT\n");
}
|
||||
|
||||
/// For several block sizes, every FASTA chunk must begin with `>` and end with a newline.
#[test]
fn fasta_each_chunk_ends_on_complete_record() {
    let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n";
    for block in [8, 12, 20, 100] {
        let chunks = fasta_iter(data, block)
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        for rope in chunks.iter() {
            let flat = rope_to_vec(rope);
            let (first, last) = (flat[0], *flat.last().unwrap());
            assert_eq!(first, b'>', "block={block}: chunk doesn't start with '>'");
            assert_eq!(last, b'\n', "block={block}: chunk doesn't end with newline");
        }
    }
}
|
||||
|
||||
// ── FASTQ ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Serialises `(sequence, quality)` pairs as four-line FASTQ records.
///
/// Every record gets the fixed header line `@hdr` and a bare `+` separator line.
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
    records
        .iter()
        .fold(Vec::<u8>::new(), |mut out, &(bases, quals)| {
            out.extend_from_slice(b"@hdr\n");
            out.extend_from_slice(bases);
            out.push(b'\n');
            out.extend_from_slice(b"+\n");
            out.extend_from_slice(quals);
            out.push(b'\n');
            out
        })
}
|
||||
|
||||
/// One FASTQ record with a block size of 64 must come back as exactly one chunk whose bytes
/// equal the input (the same two checks the FASTA single-record test makes).
#[test]
fn fastq_single_record_one_chunk() {
    // `fastq_iter` takes a `&'static [u8]`, so the generated buffer is leaked for the test.
    let data: &'static [u8] = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice());
    let chunks: Vec<_> = fastq_iter(data, 64).collect::<Result<_, _>>().unwrap();
    assert_eq!(chunks.len(), 1);
    // Counting chunks alone would not catch a chunk with missing or altered bytes.
    assert_eq!(rope_to_vec(&chunks[0]), data);
}
|
||||
|
||||
/// A quality line starting with `@` must not disturb chunking: with a block size of 16 the
/// chunks still concatenate back to the original input.
#[test]
fn fastq_at_in_quality_handled() {
    let records = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")]);
    let data = Box::leak(records.into_boxed_slice());
    let chunks = fastq_iter(data, 16)
        .collect::<Result<Vec<_>, _>>()
        .unwrap();
    let mut all: Vec<u8> = Vec::new();
    for rope in &chunks {
        all.extend(rope_to_vec(rope));
    }
    assert_eq!(all, *data);
}
|
||||
|
||||
/// For several block sizes, every FASTQ chunk must begin with the `@` header marker.
#[test]
fn fastq_each_chunk_starts_with_at() {
    let records = make_fastq(&[
        (b"ACGT", b"IIII"),
        (b"CCCC", b"JJJJ"),
        (b"GGGG", b"KKKK"),
        (b"TTTT", b"LLLL"),
    ]);
    let data = Box::leak(records.into_boxed_slice());
    for block in [18, 30, 60] {
        let chunks = fastq_iter(data, block)
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        for rope in chunks.iter() {
            let first_byte = rope_to_vec(rope)[0];
            assert_eq!(first_byte, b'@', "block={block}: chunk doesn't start with '@'");
        }
    }
}
|
||||
@@ -0,0 +1,66 @@
|
||||
use super::*;
|
||||
|
||||
fn rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(data.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
fn rope2(a: &[u8], b: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(a.to_vec());
|
||||
r.push(b.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
fn flat(r: &Rope) -> Vec<u8> {
|
||||
r.fw_cursor().collect()
|
||||
}
|
||||
|
||||
/// A rope holding a single FASTA entry yields no boundary (`None`).
#[test]
fn single_entry_no_boundary() {
    let only = rope(b">seq1\nACGT\n");
    let boundary = end_of_last_fasta_entry(&only);
    assert_eq!(boundary, None);
}
|
||||
|
||||
/// With two entries the returned position splits the data exactly before the second header.
#[test]
fn two_entries_cuts_at_second_header() {
    let r = rope(b">seq1\nACGT\n>seq2\nTTTT\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    let (head, tail) = bytes.split_at(pos);
    assert_eq!(tail, b">seq2\nTTTT\n");
    assert_eq!(head, b">seq1\nACGT\n");
}
|
||||
|
||||
/// With three entries the returned position points at the header of the last one.
#[test]
fn three_entries_cuts_at_last_header() {
    let r = rope(b">s1\nAA\n>s2\nCC\n>s3\nGG\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(&bytes[pos..], b">s3\nGG\n");
}
|
||||
|
||||
/// A sequence wrapped over several lines does not move the cut: it is still at the last header.
#[test]
fn multiline_sequence() {
    let r = rope(b">s1\nACGT\nACGT\n>s2\nTTTT\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(&bytes[pos..], b">s2\nTTTT\n");
}
|
||||
|
||||
/// CRLF-terminated FASTA input is cut at the last header as well.
#[test]
fn crlf_line_endings() {
    let r = rope(b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n");
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(&bytes[pos..], b">s2\r\nTTTT\r\n");
}
|
||||
|
||||
/// When each entry sits in its own pushed buffer, the cut position still lands on the second
/// entry's header in the flattened data.
#[test]
fn boundary_spans_two_blocks() {
    let r = rope2(b">s1\nACGT\n", b">s2\nTTTT\n");
    let all: Vec<u8> = flat(&r);
    let pos = end_of_last_fasta_entry(&r).unwrap();
    let tail = &all[pos..];
    assert_eq!(tail, b">s2\nTTTT\n");
}
|
||||
@@ -0,0 +1,73 @@
|
||||
use super::*;
|
||||
|
||||
fn rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(data.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
/// Serialises `(sequence, quality)` pairs as four-line FASTQ records.
///
/// Every record gets the fixed header line `@header` and a bare `+` separator line.
fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec<u8> {
    records
        .iter()
        .fold(Vec::<u8>::new(), |mut out, &(bases, quals)| {
            out.extend_from_slice(b"@header\n");
            out.extend_from_slice(bases);
            out.push(b'\n');
            out.extend_from_slice(b"+\n");
            out.extend_from_slice(quals);
            out.push(b'\n');
            out
        })
}
|
||||
|
||||
fn flat(r: &Rope) -> Vec<u8> {
|
||||
r.fw_cursor().collect()
|
||||
}
|
||||
|
||||
/// A rope holding a single FASTQ record yields no boundary (`None`).
#[test]
fn single_record_no_boundary() {
    let only = rope(&make_fastq(&[(b"ACGT", b"IIII")]));
    let boundary = end_of_last_fastq_entry(&only);
    assert_eq!(boundary, None);
}
|
||||
|
||||
/// With two records the returned position is the `@` that starts the second record.
#[test]
fn two_records_cuts_at_second() {
    let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]);
    let r = rope(&buf);
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    let expected_tail = make_fastq(&[(b"TTTT", b"HHHH")]);
    assert_eq!(bytes[pos], b'@');
    assert_eq!(&bytes[pos..], expected_tail.as_slice());
}
|
||||
|
||||
/// With three records the tail after the returned position is exactly the last record.
#[test]
fn three_records_cuts_at_last() {
    let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]);
    let r = rope(&buf);
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    let expected_tail = make_fastq(&[(b"GGGG", b"KKKK")]);
    assert_eq!(&bytes[pos..], expected_tail.as_slice());
}
|
||||
|
||||
/// A quality line beginning with `@` must not be taken for a header: the cut still lands on
/// the second record.
#[test]
fn at_sign_in_quality_does_not_confuse() {
    let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]);
    let r = rope(&buf);
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    let expected_tail = make_fastq(&[(b"TTTT", b"HHHH")]);
    assert_eq!(&bytes[pos..], expected_tail.as_slice());
}
|
||||
|
||||
/// CRLF-terminated FASTQ input is cut at the `@` of the last record as well.
#[test]
fn crlf_line_endings() {
    let r = rope(b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n");
    let pos = end_of_last_fastq_entry(&r).unwrap();
    let bytes = flat(&r);
    assert_eq!(bytes[pos], b'@');
    assert_eq!(&bytes[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n");
}
|
||||
@@ -0,0 +1,234 @@
|
||||
use super::*;
|
||||
|
||||
fn make_rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new(None);
|
||||
r.push(data.to_vec());
|
||||
r
|
||||
}
|
||||
|
||||
fn flat(r: Rope) -> Vec<u8> {
|
||||
r.fw_cursor().collect()
|
||||
}
|
||||
|
||||
fn run_fastq(data: &[u8], k: usize) -> Vec<u8> {
|
||||
flat(normalize_fastq_chunk(make_rope(data), k))
|
||||
}
|
||||
|
||||
fn run_fasta(data: &[u8], k: usize) -> Vec<u8> {
|
||||
flat(normalize_fasta_chunk(make_rope(data), k))
|
||||
}
|
||||
|
||||
/// Serialises sequences as four-line FASTQ records.
///
/// Every record gets the fixed header line `@hdr`, a bare `+` separator line, and a quality
/// line made of as many `I` bytes as the sequence has bases.
fn make_fastq(records: &[&[u8]]) -> Vec<u8> {
    let mut buf = Vec::new();
    for seq in records {
        buf.extend_from_slice(b"@hdr\n");
        buf.extend_from_slice(seq);
        buf.push(b'\n');
        buf.extend_from_slice(b"+\n");
        // Grow the buffer in place with the filler byte instead of building a temporary
        // vector per record only to copy it over.
        buf.resize(buf.len() + seq.len(), b'I');
        buf.push(b'\n');
    }
    buf
}
|
||||
|
||||
/// Serialises `(id, sequence)` pairs as two-line FASTA records: `>id` then the sequence.
fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec<u8> {
    records
        .iter()
        .fold(Vec::<u8>::new(), |mut out, &(id, seq)| {
            out.push(b'>');
            out.extend_from_slice(id);
            out.push(b'\n');
            out.extend_from_slice(seq);
            out.push(b'\n');
            out
        })
}
|
||||
|
||||
// ── FASTQ basic ──────────────────────────────────────────────────────────
|
||||
|
||||
/// One 8-base record with `k = 4` normalises to the sequence followed by a single NUL byte.
#[test]
fn single_record_produces_seq_then_null() {
    let input = make_fastq(&[b"ACGTACGT"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Two records come out back to back, each terminated by its own NUL byte.
#[test]
fn two_records_concatenated() {
    let input = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00TTTTTTTT\x00");
}
|
||||
|
||||
/// Lowercase bases are emitted in uppercase.
#[test]
fn lowercase_input_uppercased() {
    let input = make_fastq(&[b"acgtacgt"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Mixed-case bases are emitted in uppercase.
#[test]
fn mixed_case_uppercased() {
    let input = make_fastq(&[b"AcGtAcGt"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// A lone 3-base record with `k = 4` produces no output at all.
#[test]
fn sequence_shorter_than_k_discarded() {
    let input = make_fastq(&[b"ACG"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"");
}
|
||||
|
||||
/// A record whose length equals `k` (4 bases, `k = 4`) is kept.
#[test]
fn sequence_exactly_k_kept() {
    let input = make_fastq(&[b"ACGT"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGT\x00");
}
|
||||
|
||||
/// A 2-base record between two 8-base records is dropped; its neighbours are kept.
#[test]
fn short_record_among_valid_ones_discarded() {
    let input = make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00TTTTTTTT\x00");
}
|
||||
|
||||
/// An `N` in the middle of a read splits it into two NUL-terminated segments.
#[test]
fn ambiguous_splits_into_two_segments() {
    let input = make_fastq(&[b"ACGTNACGT"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGT\x00ACGT\x00");
}
|
||||
|
||||
/// A 2-base segment following an `N` is dropped with `k = 4`; the leading segment is kept.
#[test]
fn segment_after_ambiguous_too_short_discarded() {
    let input = make_fastq(&[b"ACGTACGTNAC"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// A run of several `N`s yields the two flanking segments only, with nothing in between.
#[test]
fn consecutive_ambiguous_produce_no_empty_segment() {
    let input = make_fastq(&[b"ACGTNNNNACGT"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGT\x00ACGT\x00");
}
|
||||
|
||||
/// Leading `N`s are skipped; the output starts with the first unambiguous base.
#[test]
fn ambiguous_at_start_skipped() {
    let input = make_fastq(&[b"NNACGTACGT"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Trailing `N`s add nothing after the NUL that terminates the preceding segment.
#[test]
fn ambiguous_at_end_produces_no_trailing_empty() {
    let input = make_fastq(&[b"ACGTACGTNN"]);
    let normalized = run_fastq(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// A CRLF-terminated FASTQ record normalises to the bare sequence plus NUL (no `\r` kept).
#[test]
fn crlf_handled() {
    let normalized = run_fastq(b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n", 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Input pushed as two buffers, split at the midpoint, normalises like contiguous input.
#[test]
fn multi_slice_rope() {
    let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
    let (front, back) = data.split_at(data.len() / 2);
    let mut rope = Rope::new(None);
    rope.push(front.to_vec());
    rope.push(back.to_vec());
    let normalized = flat(normalize_fastq_chunk(rope, 4));
    assert_eq!(normalized, b"ACGTACGT\x00TTTTTTTT\x00");
}
|
||||
|
||||
// ── FASTA ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// One 8-base FASTA record with `k = 4` normalises to the sequence followed by a NUL byte.
#[test]
fn fasta_single_record() {
    let input = make_fasta(&[(b"s1", b"ACGTACGT")]);
    let normalized = run_fasta(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Two FASTA records come out back to back, each terminated by its own NUL byte.
#[test]
fn fasta_two_records() {
    let input = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
    let normalized = run_fasta(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00TTTTTTTT\x00");
}
|
||||
|
||||
/// The lines of a wrapped FASTA sequence are joined into one segment.
#[test]
fn fasta_multiline_sequence_concatenated() {
    let normalized = run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4);
    assert_eq!(normalized, b"ACGTACGTACGT\x00");
}
|
||||
|
||||
/// Lowercase FASTA bases are emitted in uppercase.
#[test]
fn fasta_lowercase_uppercased() {
    let input = make_fasta(&[(b"s1", b"acgtacgt")]);
    let normalized = run_fasta(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// A lone 3-base FASTA record with `k = 4` produces no output at all.
#[test]
fn fasta_short_record_discarded() {
    let input = make_fasta(&[(b"s1", b"ACG")]);
    let normalized = run_fasta(&input, 4);
    assert_eq!(normalized, b"");
}
|
||||
|
||||
/// A 2-base FASTA record between two 8-base records is dropped; its neighbours are kept.
#[test]
fn fasta_short_among_valid_discarded() {
    let input = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]);
    let normalized = run_fasta(&input, 4);
    assert_eq!(normalized, b"ACGTACGT\x00TTTTTTTT\x00");
}
|
||||
|
||||
/// An `N` inside a FASTA sequence line splits it into two NUL-terminated segments.
#[test]
fn fasta_ambiguous_splits_segments() {
    let normalized = run_fasta(b">s1\nACGTNACGT\n", 4);
    assert_eq!(normalized, b"ACGT\x00ACGT\x00");
}
|
||||
|
||||
/// An `N` that opens the next sequence line splits the segments just the same.
#[test]
fn fasta_ambiguous_across_line_boundary() {
    let normalized = run_fasta(b">s1\nACGT\nNACGT\n", 4);
    assert_eq!(normalized, b"ACGT\x00ACGT\x00");
}
|
||||
|
||||
/// A 2-base segment following an `N` is dropped with `k = 4`; the leading segment is kept.
#[test]
fn fasta_ambiguous_short_segment_discarded() {
    let normalized = run_fasta(b">s1\nACGTACGTNAC\n", 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Input that ends without a final newline still yields the full NUL-terminated sequence.
#[test]
fn fasta_no_trailing_newline() {
    let normalized = run_fasta(b">s1\nACGTACGT", 4);
    assert_eq!(normalized, b"ACGTACGT\x00");
}
|
||||
|
||||
/// CRLF-terminated FASTA input normalises to the bare sequences, one NUL after each record.
#[test]
fn fasta_crlf_line_endings() {
    let normalized = run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4);
    assert_eq!(normalized, b"ACGTACGT\x00TTTT\x00");
}
|
||||
|
||||
/// FASTA input pushed as two buffers, split at the midpoint, normalises like contiguous input.
#[test]
fn fasta_multi_slice_rope() {
    let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
    let (front, back) = data.split_at(data.len() / 2);
    let mut rope = Rope::new(None);
    rope.push(front.to_vec());
    rope.push(back.to_vec());
    let normalized = flat(normalize_fasta_chunk(rope, 4));
    assert_eq!(normalized, b"ACGTACGT\x00TTTTTTTT\x00");
}
|
||||
@@ -0,0 +1,267 @@
|
||||
use super::*;
|
||||
use std::io::Cursor;
|
||||
use std::ops::Deref;
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
fn run_fasta(data: &[u8], k: usize) -> Vec<u8> {
|
||||
NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k)
|
||||
.flat_map(|p| p.deref().to_vec())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn run_fastq(data: &[u8], k: usize) -> Vec<u8> {
|
||||
NucStream::<_, FastqParser>::new(Cursor::new(data.to_vec()), k)
|
||||
.flat_map(|p| p.deref().to_vec())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn run_genbank(data: &[u8], k: usize) -> Vec<u8> {
|
||||
NucStream::<_, GenbankParser>::new(Cursor::new(data.to_vec()), k)
|
||||
.flat_map(|p| p.deref().to_vec())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn pages_fasta(data: &[u8], k: usize) -> Vec<Vec<u8>> {
|
||||
NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k)
|
||||
.map(|p| p.deref().to_vec())
|
||||
.collect()
|
||||
}
|
||||
|
||||
// ── FastaParser ───────────────────────────────────────────────────────────
|
||||
|
||||
/// One 8-base FASTA sequence with `k = 4` streams out as the sequence followed by a NUL byte.
#[test]
fn fasta_single_sequence() {
    let streamed = run_fasta(b">s1\nACGTACGT\n", 4);
    assert_eq!(streamed, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Lowercase FASTA bases are streamed out in uppercase.
#[test]
fn fasta_lowercase_uppercased() {
    let streamed = run_fasta(b">s1\nacgtacgt\n", 4);
    assert_eq!(streamed, b"ACGTACGT\x00");
}
|
||||
|
||||
/// The lines of a wrapped FASTA sequence are joined into one segment.
#[test]
fn fasta_multiline_sequence_concatenated() {
    let streamed = run_fasta(b">s1\nACGT\nACGT\n", 4);
    assert_eq!(streamed, b"ACGTACGT\x00");
}
|
||||
|
||||
/// Two FASTA sequences stream out back to back, each terminated by its own NUL byte.
#[test]
fn fasta_two_sequences() {
    let streamed = run_fasta(b">s1\nACGTACGT\n>s2\nTTTTTTTT\n", 4);
    assert_eq!(streamed, b"ACGTACGT\x00TTTTTTTT\x00");
}
|
||||
|
||||
/// Empty input produces no output bytes.
///
/// Note: the check is made on the flattened byte stream returned by
/// `run_fasta`, so it verifies "no bytes" rather than the page count itself.
#[test]
fn fasta_empty_input_yields_no_pages() {
    let input = b"";
    let expected = b"";
    assert_eq!(run_fasta(input, 4), expected);
}
|
||||
|
||||
/// A record shorter than `k` that is still pending at EOF yields nothing.
#[test]
fn fasta_sequence_shorter_than_k_at_eof_discarded() {
    // The 3-base fragment is kept as overlap and then dropped at EOF because
    // it is shorter than k.
    let input = b">s1\nACG\n";
    let expected = b"";
    assert_eq!(run_fasta(input, 4), expected);
}
|
||||
|
||||
/// An `N` inside a FASTA sequence ends the current segment and starts a new one.
#[test]
fn fasta_ambiguous_splits_into_two_segments() {
    let input = b">s1\nACGTNACGT\n";
    let expected = b"ACGT\x00ACGT\x00";
    assert_eq!(run_fasta(input, 4), expected);
}
|
||||
|
||||
/// A segment shorter than `k` that precedes an `N` is still written out.
#[test]
fn fasta_short_segment_before_ambiguous_emitted() {
    // "AC" (< k=4) ahead of the N is emitted with its separator: length
    // filtering is left to the superkmer builder rather than done here.
    let input = b">s1\nACNACGTACGT\n";
    let expected = b"AC\x00ACGTACGT\x00";
    assert_eq!(run_fasta(input, 4), expected);
}
|
||||
|
||||
/// Leading `N` bases produce no empty segment; output starts at the first
/// unambiguous base.
#[test]
fn fasta_ambiguous_at_start_skipped() {
    let input = b">s1\nNNNACGTACGT\n";
    let expected = b"ACGTACGT\x00";
    assert_eq!(run_fasta(input, 4), expected);
}
|
||||
|
||||
// ── FastqParser ───────────────────────────────────────────────────────────
|
||||
|
||||
/// A single FASTQ record yields its bases followed by one `\x00` separator.
#[test]
fn fastq_single_record() {
    let input = b"@r1\nACGTACGT\n+\nIIIIIIII\n";
    let expected = b"ACGTACGT\x00";
    assert_eq!(run_fastq(input, 4), expected);
}
|
||||
|
||||
/// Lowercase bases in a FASTQ record come out uppercased.
#[test]
fn fastq_lowercase_uppercased() {
    let input = b"@r1\nacgtacgt\n+\nIIIIIIII\n";
    let expected = b"ACGTACGT\x00";
    assert_eq!(run_fastq(input, 4), expected);
}
|
||||
|
||||
/// Quality-line bytes never leak into the emitted sequence data.
#[test]
fn fastq_quality_bytes_not_in_output() {
    // The quality line is all '@' (Phred 31 = ASCII 64); none of it may
    // show up in the output.
    let input = b"@r1\nACGTACGT\n+\n@@@@@@@@\n";
    let expected = b"ACGTACGT\x00";
    assert_eq!(run_fastq(input, 4), expected);
}
|
||||
|
||||
/// Two FASTQ records produce two segments, each terminated by `\x00`.
#[test]
fn fastq_two_records() {
    let input = b"@r1\nACGTACGT\n+\nIIIIIIII\n@r2\nTTTTTTTT\n+\nIIIIIIII\n";
    let expected = b"ACGTACGT\x00TTTTTTTT\x00";
    assert_eq!(run_fastq(input, 4), expected);
}
|
||||
|
||||
/// An `N` inside a FASTQ sequence ends the current segment and starts a new one.
#[test]
fn fastq_ambiguous_splits_sequence() {
    let input = b"@r1\nACGTNACGT\n+\nIIIIIIIII\n";
    let expected = b"ACGT\x00ACGT\x00";
    assert_eq!(run_fastq(input, 4), expected);
}
|
||||
|
||||
/// A quality line made of `@` characters is not mistaken for a record header.
#[test]
fn fastq_at_in_quality_line_not_a_record_start() {
    // The first record's quality line starts with '@'; parsing must still
    // resume at the real "@r2" header.
    let input = b"@r1\nACGTACGT\n+\n@@@@@@@@\n@r2\nTTTTTTTT\n+\nIIIIIIII\n";
    let expected = b"ACGTACGT\x00TTTTTTTT\x00";
    assert_eq!(run_fastq(input, 4), expected);
}
|
||||
|
||||
// ── GenbankParser ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Bases between `ORIGIN` and `//` are emitted (uppercased) as one segment.
#[test]
fn genbank_origin_to_slash() {
    let input = b"LOCUS ...\nORIGIN\n 1 acgtacgt\n//\n";
    let expected = b"ACGTACGT\x00";
    assert_eq!(run_genbank(input, 4), expected);
}
|
||||
|
||||
/// Position numbers and spaces inside the `ORIGIN` block are not part of
/// the emitted sequence.
#[test]
fn genbank_position_numbers_and_spaces_skipped() {
    let input = b"ORIGIN\n 1 acgt acgt\n//\n";
    let expected = b"ACGTACGT\x00";
    assert_eq!(run_genbank(input, 4), expected);
}
|
||||
|
||||
/// Two GenBank records produce two segments, each terminated by `\x00`.
#[test]
fn genbank_two_records() {
    let input = b"ORIGIN\n 1 acgtacgt\n//\nLOCUS ...\nORIGIN\n 1 tttttttt\n//\n";
    let expected = b"ACGTACGT\x00TTTTTTTT\x00";
    assert_eq!(run_genbank(input, 4), expected);
}
|
||||
|
||||
/// An `n` inside a GenBank sequence ends the current segment and starts a
/// new one.
#[test]
fn genbank_ambiguous_splits_sequence() {
    let input = b"ORIGIN\n 1 acgtnacgt\n//\n";
    let expected = b"ACGT\x00ACGT\x00";
    assert_eq!(run_genbank(input, 4), expected);
}
|
||||
|
||||
// ── NucPage ───────────────────────────────────────────────────────────────
|
||||
|
||||
/// Dereferencing the first page of a one-record stream exposes exactly the
/// sequence bytes plus the trailing separator.
#[test]
fn nuc_page_deref_correct_bytes() {
    let reader = Cursor::new(b">s1\nACGT\n".to_vec());
    let mut stream = NucStream::<_, FastaParser>::new(reader, 4);
    let page = stream.next().expect("page");
    assert_eq!(page.deref(), b"ACGT\x00");
}
|
||||
|
||||
// ── NucPageCursor ─────────────────────────────────────────────────────────
|
||||
|
||||
fn make_page(data: &[u8], k: usize) -> NucPage {
|
||||
NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k)
|
||||
.next()
|
||||
.expect("at least one page")
|
||||
}
|
||||
|
||||
/// The cursor hands out the first four page bytes in their original order.
#[test]
fn cursor_reads_bytes_in_order() {
    let page = make_page(b">s1\nACGTACGT\n", 4);
    let mut cursor = page.cursor();
    for &expected in b"ACGT".iter() {
        assert_eq!(cursor.next_byte(), Some(expected));
    }
}
|
||||
|
||||
/// Rewinding by `n` makes the cursor hand out the last `n` bytes again.
#[test]
fn cursor_rewind_rereads_bytes() {
    let page = make_page(b">s1\nACGTACGT\n", 4);
    let mut cursor = page.cursor();

    // Consume 'A' then 'C'.
    cursor.next_byte();
    cursor.next_byte();

    // One step back: 'C' is read again.
    cursor.rewind(1);
    assert_eq!(cursor.next_byte(), Some(b'C'));

    // Two steps back from there: back at 'A'.
    cursor.rewind(2);
    assert_eq!(cursor.next_byte(), Some(b'A'));
}
|
||||
|
||||
/// Once every byte of the page has been consumed the cursor returns `None`.
#[test]
fn cursor_returns_none_at_end() {
    // The page holds "ACGT\x00", i.e. 5 bytes; drain them all first.
    let page = make_page(b">s1\nACGT\n", 4);
    let mut cursor = page.cursor();
    (0..5).for_each(|_| {
        cursor.next_byte();
    });
    assert_eq!(cursor.next_byte(), None);
}
|
||||
|
||||
/// A fresh cursor reports the full page length and is not empty.
#[test]
fn cursor_len_matches_page_content() {
    // Page content is "ACGTACGT\x00": 8 bases + 1 separator = 9 bytes.
    let page = make_page(b">s1\nACGTACGT\n", 4);
    let cursor = page.cursor();
    assert_eq!(cursor.len(), 9);
    assert!(!cursor.is_empty());
}
|
||||
|
||||
// ── Overlap at page boundary ──────────────────────────────────────────────
|
||||
|
||||
/// When a sequence spills over a page boundary, the last `K - 1` sequence
/// bytes of the first page reappear at the start of the second page.
#[test]
fn overlap_last_km1_bytes_prepended_to_next_page() {
    const K: usize = 11;

    // PAGE_SIZE + K bases so the record cannot fit in one page. The
    // repeating ACGT pattern keeps the bytes around the boundary distinct.
    let seq: Vec<u8> = b"ACGT".iter().copied().cycle().take(PAGE_SIZE + K).collect();
    let input = [&b">seq\n"[..], &seq[..], &b"\n"[..]].concat();

    let pages = pages_fasta(&input, K);
    assert!(pages.len() >= 2, "need at least two pages");
    let (first, second) = (&pages[0], &pages[1]);

    // page1 must end with a \x00 separator (written by save_overlap)
    let (&terminator, first_bases) = first.split_last().unwrap();
    assert_eq!(terminator, 0x00, "page1 must end with separator");

    // The K-1 bases right before that separator must open page2.
    let overlap = K - 1;
    let tail_of_first = &first_bases[first_bases.len() - overlap..];
    let head_of_second = &second[..overlap];
    assert_eq!(
        tail_of_first, head_of_second,
        "overlap bytes mismatch at page boundary"
    );
}
|
||||
|
||||
// ── Pool ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Dropping the first page before requesting the second must not corrupt
/// the second page or cause a panic.
#[test]
fn pool_buffer_reused_after_drop() {
    const K: usize = 11;

    // All-'A' sequence long enough to need a second page.
    let seq: Vec<u8> = vec![b'A'; PAGE_SIZE + K];
    let input = [&b">seq\n"[..], &seq[..], &b"\n"[..]].concat();

    let mut stream = NucStream::<_, FastaParser>::new(Cursor::new(input), K);

    {
        let first = stream.next().expect("page 1");
        assert!(!first.deref().is_empty());
        // `first` goes out of scope here, handing its buffer back to the pool.
    }

    let second = stream.next().expect("page 2");
    assert!(!second.deref().is_empty());
    // page2 must still begin with A's (the overlap carried from page1).
    assert_eq!(second[0], b'A');
}
|
||||
Reference in New Issue
Block a user