diff --git a/src/obiread/src/chunk.rs b/src/obiread/src/chunk.rs index 691e6d9..1bd7f0b 100644 --- a/src/obiread/src/chunk.rs +++ b/src/obiread/src/chunk.rs @@ -191,111 +191,5 @@ pub fn fastq_chunks(source: R) -> SeqChunkIter { } #[cfg(test)] -mod tests { - use super::*; - use crate::fasta::end_of_last_fasta_entry; - use crate::fastq::end_of_last_fastq_entry; - - fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { - SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None) - } - - fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { - SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None) - } - - fn rope_to_vec(rope: &Rope) -> Vec { - rope.fw_cursor().collect() - } - - // ── FASTA ───────────────────────────────────────────────────────────────── - - #[test] - fn fasta_single_record_one_chunk() { - let data: &[u8] = b">s1\nACGT\n"; - let chunks: Vec<_> = fasta_iter(data, 64).collect::>().unwrap(); - assert_eq!(chunks.len(), 1); - assert_eq!(rope_to_vec(&chunks[0]), b">s1\nACGT\n"); - } - - #[test] - fn fasta_two_records_split_across_chunks() { - let data: &[u8] = b">s1\nACGT\n>s2\nTTTT\n"; - let chunks: Vec<_> = fasta_iter(data, 10).collect::>().unwrap(); - let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); - assert_eq!(all, b">s1\nACGT\n>s2\nTTTT\n"); - } - - #[test] - fn fasta_each_chunk_ends_on_complete_record() { - let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n"; - for block in [8, 12, 20, 100] { - let chunks: Vec<_> = fasta_iter(data, block).collect::>().unwrap(); - for rope in &chunks { - let flat = rope_to_vec(rope); - assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'"); - assert_eq!( - *flat.last().unwrap(), - b'\n', - "block={block}: chunk doesn't end with newline" - ); - } - } - } - - // ── FASTQ ───────────────────────────────────────────────────────────────── - - fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { - let mut buf = Vec::new(); - for (seq, qual) in records { - buf.extend_from_slice(b"@hdr\n"); - buf.extend_from_slice(seq); - buf.push(b'\n'); - buf.extend_from_slice(b"+\n"); - buf.extend_from_slice(qual); - buf.push(b'\n'); - } - buf - } - - #[test] - fn fastq_single_record_one_chunk() { - let data = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice()); - let chunks: Vec<_> = fastq_iter(data, 64).collect::>().unwrap(); - assert_eq!(chunks.len(), 1); - } - - #[test] - fn fastq_at_in_quality_handled() { - let data = Box::leak( - make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")]) - .into_boxed_slice(), - ); - let chunks: Vec<_> = fastq_iter(data, 16).collect::>().unwrap(); - let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); - assert_eq!(all, *data); - } - - #[test] - fn fastq_each_chunk_starts_with_at() { - let data = Box::leak( - make_fastq(&[ - (b"ACGT", b"IIII"), - (b"CCCC", b"JJJJ"), - (b"GGGG", b"KKKK"), - (b"TTTT", b"LLLL"), - ]) - .into_boxed_slice(), - ); - for block in [18, 30, 60] { - let chunks: Vec<_> = fastq_iter(data, block).collect::>().unwrap(); - for rope in &chunks { - let first_byte = rope_to_vec(rope)[0]; - assert_eq!( - first_byte, b'@', - "block={block}: chunk doesn't start with '@'" - ); - } - } - } -} +#[path = "tests/chunk.rs"] +mod tests; diff --git a/src/obiread/src/fasta.rs b/src/obiread/src/fasta.rs index b426ed6..13f6380 100644 --- a/src/obiread/src/fasta.rs +++ b/src/obiread/src/fasta.rs @@ -35,71 +35,5 @@ pub fn end_of_last_fasta_entry(rope: &Rope) -> Option { } #[cfg(test)] -mod tests { - use super::*; - - fn rope(data: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(data.to_vec()); - r - } - - fn rope2(a: &[u8], b: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(a.to_vec()); - r.push(b.to_vec()); - r - } - - fn flat(r: &Rope) -> Vec { - r.fw_cursor().collect() - } - - #[test] - fn single_entry_no_boundary() { - assert_eq!(end_of_last_fasta_entry(&rope(b">seq1\nACGT\n")), None); - } - - #[test] - fn two_entries_cuts_at_second_header() { - let data = b">seq1\nACGT\n>seq2\nTTTT\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">seq2\nTTTT\n"); - assert_eq!(&flat(&r)[..pos], b">seq1\nACGT\n"); - } - - #[test] - fn three_entries_cuts_at_last_header() { - let data = b">s1\nAA\n>s2\nCC\n>s3\nGG\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">s3\nGG\n"); - } - - #[test] - fn multiline_sequence() { - let data = b">s1\nACGT\nACGT\n>s2\nTTTT\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">s2\nTTTT\n"); - } - - #[test] - fn crlf_line_endings() { - let data = b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n"; - let r = rope(data); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&flat(&r)[pos..], b">s2\r\nTTTT\r\n"); - } - - #[test] - fn boundary_spans_two_blocks() { - let a = b">s1\nACGT\n"; - let b = b">s2\nTTTT\n"; - let r = rope2(a, b); - let all: Vec = flat(&r); - let pos = end_of_last_fasta_entry(&r).unwrap(); - assert_eq!(&all[pos..], b">s2\nTTTT\n"); - } -} +#[path = "tests/fasta.rs"] +mod tests; diff --git a/src/obiread/src/fastq.rs b/src/obiread/src/fastq.rs index 39d3665..46da1b5 100644 --- a/src/obiread/src/fastq.rs +++ b/src/obiread/src/fastq.rs @@ -107,78 +107,5 @@ pub fn end_of_last_fastq_entry(rope: &Rope) -> Option { } #[cfg(test)] -mod tests { - use super::*; - - fn rope(data: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(data.to_vec()); - r - } - - fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { - let mut buf = Vec::new(); - for (seq, qual) in records { - buf.extend_from_slice(b"@header\n"); - buf.extend_from_slice(seq); - buf.push(b'\n'); - buf.extend_from_slice(b"+\n"); - buf.extend_from_slice(qual); - buf.push(b'\n'); - } - buf - } - - fn flat(r: &Rope) -> Vec { - r.fw_cursor().collect() - } - - #[test] - fn single_record_no_boundary() { - let buf = make_fastq(&[(b"ACGT", b"IIII")]); - assert_eq!(end_of_last_fastq_entry(&rope(&buf)), None); - } - - #[test] - fn two_records_cuts_at_second() { - let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]); - let r = rope(&buf); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!(flat(&r)[pos], b'@'); - assert_eq!( - &flat(&r)[pos..], - make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() - ); - } - - #[test] - fn three_records_cuts_at_last() { - let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]); - let r = rope(&buf); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!( - &flat(&r)[pos..], - make_fastq(&[(b"GGGG", b"KKKK")]).as_slice() - ); - } - - #[test] - fn at_sign_in_quality_does_not_confuse() { - let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]); - let r = rope(&buf); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!( - &flat(&r)[pos..], - make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() - ); - } - - #[test] - fn crlf_line_endings() { - let data = b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n"; - let r = rope(data); - let pos = end_of_last_fastq_entry(&r).unwrap(); - assert_eq!(flat(&r)[pos], b'@'); - assert_eq!(&flat(&r)[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n"); - } -} +#[path = "tests/fastq.rs"] +mod tests; diff --git a/src/obiread/src/normalize.rs b/src/obiread/src/normalize.rs index b92eb41..3a5d43b 100644 --- a/src/obiread/src/normalize.rs +++ b/src/obiread/src/normalize.rs @@ -215,239 +215,5 @@ fn is_acgt(upper: u8) -> bool { // ── tests ───────────────────────────────────────────────────────────────────── #[cfg(test)] -mod tests { - use super::*; - - fn make_rope(data: &[u8]) -> Rope { - let mut r = Rope::new(None); - r.push(data.to_vec()); - r - } - - fn flat(r: Rope) -> Vec { - r.fw_cursor().collect() - } - - fn run_fastq(data: &[u8], k: usize) -> Vec { - flat(normalize_fastq_chunk(make_rope(data), k)) - } - - fn run_fasta(data: &[u8], k: usize) -> Vec { - flat(normalize_fasta_chunk(make_rope(data), k)) - } - - fn make_fastq(records: &[&[u8]]) -> Vec { - let mut buf = Vec::new(); - for seq in records { - buf.extend_from_slice(b"@hdr\n"); - buf.extend_from_slice(seq); - buf.push(b'\n'); - buf.extend_from_slice(b"+\n"); - buf.extend_from_slice(&vec![b'I'; seq.len()]); - buf.push(b'\n'); - } - buf - } - - fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec { - let mut buf = Vec::new(); - for (id, seq) in records { - buf.push(b'>'); - buf.extend_from_slice(id); - buf.push(b'\n'); - buf.extend_from_slice(seq); - buf.push(b'\n'); - } - buf - } - - // ── FASTQ basic ────────────────────────────────────────────────────────── - - #[test] - fn single_record_produces_seq_then_null() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGT"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn two_records_concatenated() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]), 4), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn lowercase_input_uppercased() { - assert_eq!(run_fastq(&make_fastq(&[b"acgtacgt"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn mixed_case_uppercased() { - assert_eq!(run_fastq(&make_fastq(&[b"AcGtAcGt"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn sequence_shorter_than_k_discarded() { - assert_eq!(run_fastq(&make_fastq(&[b"ACG"]), 4), b""); - } - - #[test] - fn sequence_exactly_k_kept() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGT"]), 4), b"ACGT\x00"); - } - - #[test] - fn short_record_among_valid_ones_discarded() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]), 4), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn ambiguous_splits_into_two_segments() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), - b"ACGT\x00ACGT\x00" - ); - } - - #[test] - fn segment_after_ambiguous_too_short_discarded() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), - b"ACGTACGT\x00" - ); - } - - #[test] - fn consecutive_ambiguous_produce_no_empty_segment() { - assert_eq!( - run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), - b"ACGT\x00ACGT\x00" - ); - } - - #[test] - fn ambiguous_at_start_skipped() { - assert_eq!(run_fastq(&make_fastq(&[b"NNACGTACGT"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn ambiguous_at_end_produces_no_trailing_empty() { - assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNN"]), 4), b"ACGTACGT\x00"); - } - - #[test] - fn crlf_handled() { - let data = b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n"; - assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00"); - } - - #[test] - fn multi_slice_rope() { - let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]); - let mid = data.len() / 2; - let mut rope = Rope::new(None); - rope.push(data[..mid].to_vec()); - rope.push(data[mid..].to_vec()); - assert_eq!( - flat(normalize_fastq_chunk(rope, 4)), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - // ── FASTA ───────────────────────────────────────────────────────────────── - - #[test] - fn fasta_single_record() { - assert_eq!( - run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), - b"ACGTACGT\x00" - ); - } - - #[test] - fn fasta_two_records() { - assert_eq!( - run_fasta( - &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), - 4 - ), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn fasta_multiline_sequence_concatenated() { - assert_eq!( - run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), - b"ACGTACGTACGT\x00" - ); - } - - #[test] - fn fasta_lowercase_uppercased() { - assert_eq!( - run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), - b"ACGTACGT\x00" - ); - } - - #[test] - fn fasta_short_record_discarded() { - assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACG")]), 4), b""); - } - - #[test] - fn fasta_short_among_valid_discarded() { - assert_eq!( - run_fasta( - &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), - 4 - ), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } - - #[test] - fn fasta_ambiguous_splits_segments() { - assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00"); - } - - #[test] - fn fasta_ambiguous_across_line_boundary() { - assert_eq!(run_fasta(b">s1\nACGT\nNACGT\n", 4), b"ACGT\x00ACGT\x00"); - } - - #[test] - fn fasta_ambiguous_short_segment_discarded() { - assert_eq!(run_fasta(b">s1\nACGTACGTNAC\n", 4), b"ACGTACGT\x00"); - } - - #[test] - fn fasta_no_trailing_newline() { - assert_eq!(run_fasta(b">s1\nACGTACGT", 4), b"ACGTACGT\x00"); - } - - #[test] - fn fasta_crlf_line_endings() { - assert_eq!( - run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), - b"ACGTACGT\x00TTTT\x00" - ); - } - - #[test] - fn fasta_multi_slice_rope() { - let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]); - let mid = data.len() / 2; - let mut rope = Rope::new(None); - rope.push(data[..mid].to_vec()); - rope.push(data[mid..].to_vec()); - assert_eq!( - flat(normalize_fasta_chunk(rope, 4)), - b"ACGTACGT\x00TTTTTTTT\x00" - ); - } -} +#[path = "tests/normalize.rs"] +mod tests; diff --git a/src/obiread/src/nucstream.rs b/src/obiread/src/nucstream.rs index eb252d6..e6bc431 100644 --- a/src/obiread/src/nucstream.rs +++ b/src/obiread/src/nucstream.rs @@ -561,7 +561,7 @@ impl Drop for NucPage { /// [`obiskbuilder::SuperKmerStreamIter`]. pub struct NucPageCursor<'a> { data: &'a [u8], - pos: usize, + pos: usize, } impl NucPageCursor<'_> { @@ -687,8 +687,8 @@ impl Iterator for AnyNucStream { type Item = NucPage; fn next(&mut self) -> Option { match self { - AnyNucStream::Fasta(s) => s.next(), - AnyNucStream::Fastq(s) => s.next(), + AnyNucStream::Fasta(s) => s.next(), + AnyNucStream::Fastq(s) => s.next(), AnyNucStream::Genbank(s) => s.next(), } } @@ -701,17 +701,14 @@ fn dispatch( match guesser.mime_type() { Some("text/fasta") => Some(AnyNucStream::Fasta(NucStream::new(guesser, k))), Some("text/fastq") => Some(AnyNucStream::Fastq(NucStream::new(guesser, k))), - Some("text/gbff") => Some(AnyNucStream::Genbank(NucStream::new(guesser, k))), - _ => None, + Some("text/gbff") => Some(AnyNucStream::Genbank(NucStream::new(guesser, k))), + _ => None, } } /// Wraps an already-open reader in a nucleotide stream, detecting its format. /// Returns `None` if the format is not recognised. -pub(crate) fn nuc_stream( - reader: R, - k: usize, -) -> Option>> { +pub(crate) fn nuc_stream(reader: R, k: usize) -> Option>> { dispatch(MimeTypeGuesser::new(reader), k) } @@ -726,7 +723,11 @@ pub fn open_nuc_stream( k: usize, ) -> io::Result + Send>> { let reader = open_raw(source)?; - dispatch(MimeTypeGuesser::new(reader), k) + nuc_stream(reader, k) .map(|s| Box::new(s) as Box + Send>) .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "unknown sequence format")) } + +#[cfg(test)] +#[path = "tests/nucstream.rs"] +mod tests; diff --git a/src/obiread/src/tests/chunk.rs b/src/obiread/src/tests/chunk.rs new file mode 100644 index 0000000..ec5a51c --- /dev/null +++ b/src/obiread/src/tests/chunk.rs @@ -0,0 +1,106 @@ +use super::*; +use crate::fasta::end_of_last_fasta_entry; +use crate::fastq::end_of_last_fastq_entry; + +fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { + SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None) +} + +fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> { + SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None) +} + +fn rope_to_vec(rope: &Rope) -> Vec { + rope.fw_cursor().collect() +} + +// ── FASTA ───────────────────────────────────────────────────────────────── + +#[test] +fn fasta_single_record_one_chunk() { + let data: &[u8] = b">s1\nACGT\n"; + let chunks: Vec<_> = fasta_iter(data, 64).collect::>().unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(rope_to_vec(&chunks[0]), b">s1\nACGT\n"); +} + +#[test] +fn fasta_two_records_split_across_chunks() { + let data: &[u8] = b">s1\nACGT\n>s2\nTTTT\n"; + let chunks: Vec<_> = fasta_iter(data, 10).collect::>().unwrap(); + let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); + assert_eq!(all, b">s1\nACGT\n>s2\nTTTT\n"); +} + +#[test] +fn fasta_each_chunk_ends_on_complete_record() { + let data: &[u8] = b">s1\nACGT\n>s2\nCCCC\n>s3\nGGGG\n>s4\nTTTT\n"; + for block in [8, 12, 20, 100] { + let chunks: Vec<_> = fasta_iter(data, block).collect::>().unwrap(); + for rope in &chunks { + let flat = rope_to_vec(rope); + assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'"); + assert_eq!( + *flat.last().unwrap(), + b'\n', + "block={block}: chunk doesn't end with newline" + ); + } + } +} + +// ── FASTQ ───────────────────────────────────────────────────────────────── + +fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { + let mut buf = Vec::new(); + for (seq, qual) in records { + buf.extend_from_slice(b"@hdr\n"); + buf.extend_from_slice(seq); + buf.push(b'\n'); + buf.extend_from_slice(b"+\n"); + buf.extend_from_slice(qual); + buf.push(b'\n'); + } + buf +} + +#[test] +fn fastq_single_record_one_chunk() { + let data = Box::leak(make_fastq(&[(b"ACGT", b"IIII")]).into_boxed_slice()); + let chunks: Vec<_> = fastq_iter(data, 64).collect::>().unwrap(); + assert_eq!(chunks.len(), 1); +} + +#[test] +fn fastq_at_in_quality_handled() { + let data = Box::leak( + make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")]) + .into_boxed_slice(), + ); + let chunks: Vec<_> = fastq_iter(data, 16).collect::>().unwrap(); + let all: Vec = chunks.iter().flat_map(|r| rope_to_vec(r)).collect(); + assert_eq!(all, *data); +} + +#[test] +fn fastq_each_chunk_starts_with_at() { + let data = Box::leak( + make_fastq(&[ + (b"ACGT", b"IIII"), + (b"CCCC", b"JJJJ"), + (b"GGGG", b"KKKK"), + (b"TTTT", b"LLLL"), + ]) + .into_boxed_slice(), + ); + for block in [18, 30, 60] { + let chunks: Vec<_> = fastq_iter(data, block).collect::>().unwrap(); + for rope in &chunks { + let first_byte = rope_to_vec(rope)[0]; + assert_eq!( + first_byte, b'@', + "block={block}: chunk doesn't start with '@'" + ); + } + } +} diff --git a/src/obiread/src/tests/fasta.rs b/src/obiread/src/tests/fasta.rs new file mode 100644 index 0000000..f52fd79 --- /dev/null +++ b/src/obiread/src/tests/fasta.rs @@ -0,0 +1,66 @@ +use super::*; + +fn rope(data: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(data.to_vec()); + r +} + +fn rope2(a: &[u8], b: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(a.to_vec()); + r.push(b.to_vec()); + r +} + +fn flat(r: &Rope) -> Vec { + r.fw_cursor().collect() +} + +#[test] +fn single_entry_no_boundary() { + assert_eq!(end_of_last_fasta_entry(&rope(b">seq1\nACGT\n")), None); +} + +#[test] +fn two_entries_cuts_at_second_header() { + let data = b">seq1\nACGT\n>seq2\nTTTT\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">seq2\nTTTT\n"); + assert_eq!(&flat(&r)[..pos], b">seq1\nACGT\n"); +} + +#[test] +fn three_entries_cuts_at_last_header() { + let data = b">s1\nAA\n>s2\nCC\n>s3\nGG\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">s3\nGG\n"); +} + +#[test] +fn multiline_sequence() { + let data = b">s1\nACGT\nACGT\n>s2\nTTTT\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">s2\nTTTT\n"); +} + +#[test] +fn crlf_line_endings() { + let data = b">s1\r\nACGT\r\n>s2\r\nTTTT\r\n"; + let r = rope(data); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&flat(&r)[pos..], b">s2\r\nTTTT\r\n"); +} + +#[test] +fn boundary_spans_two_blocks() { + let a = b">s1\nACGT\n"; + let b = b">s2\nTTTT\n"; + let r = rope2(a, b); + let all: Vec = flat(&r); + let pos = end_of_last_fasta_entry(&r).unwrap(); + assert_eq!(&all[pos..], b">s2\nTTTT\n"); +} diff --git a/src/obiread/src/tests/fastq.rs b/src/obiread/src/tests/fastq.rs new file mode 100644 index 0000000..db78c37 --- /dev/null +++ b/src/obiread/src/tests/fastq.rs @@ -0,0 +1,73 @@ +use super::*; + +fn rope(data: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(data.to_vec()); + r +} + +fn make_fastq(records: &[(&[u8], &[u8])]) -> Vec { + let mut buf = Vec::new(); + for (seq, qual) in records { + buf.extend_from_slice(b"@header\n"); + buf.extend_from_slice(seq); + buf.push(b'\n'); + buf.extend_from_slice(b"+\n"); + buf.extend_from_slice(qual); + buf.push(b'\n'); + } + buf +} + +fn flat(r: &Rope) -> Vec { + r.fw_cursor().collect() +} + +#[test] +fn single_record_no_boundary() { + let buf = make_fastq(&[(b"ACGT", b"IIII")]); + assert_eq!(end_of_last_fastq_entry(&rope(&buf)), None); +} + +#[test] +fn two_records_cuts_at_second() { + let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"TTTT", b"HHHH")]); + let r = rope(&buf); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!(flat(&r)[pos], b'@'); + assert_eq!( + &flat(&r)[pos..], + make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() + ); +} + +#[test] +fn three_records_cuts_at_last() { + let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]); + let r = rope(&buf); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!( + &flat(&r)[pos..], + make_fastq(&[(b"GGGG", b"KKKK")]).as_slice() + ); +} + +#[test] +fn at_sign_in_quality_does_not_confuse() { + let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]); + let r = rope(&buf); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!( + &flat(&r)[pos..], + make_fastq(&[(b"TTTT", b"HHHH")]).as_slice() + ); +} + +#[test] +fn crlf_line_endings() { + let data = b"@h\r\nACGT\r\n+\r\nIIII\r\n@h\r\nTTTT\r\n+\r\nHHHH\r\n"; + let r = rope(data); + let pos = end_of_last_fastq_entry(&r).unwrap(); + assert_eq!(flat(&r)[pos], b'@'); + assert_eq!(&flat(&r)[pos..], b"@h\r\nTTTT\r\n+\r\nHHHH\r\n"); +} diff --git a/src/obiread/src/tests/normalize.rs b/src/obiread/src/tests/normalize.rs new file mode 100644 index 0000000..a9cc52b --- /dev/null +++ b/src/obiread/src/tests/normalize.rs @@ -0,0 +1,234 @@ +use super::*; + +fn make_rope(data: &[u8]) -> Rope { + let mut r = Rope::new(None); + r.push(data.to_vec()); + r +} + +fn flat(r: Rope) -> Vec { + r.fw_cursor().collect() +} + +fn run_fastq(data: &[u8], k: usize) -> Vec { + flat(normalize_fastq_chunk(make_rope(data), k)) +} + +fn run_fasta(data: &[u8], k: usize) -> Vec { + flat(normalize_fasta_chunk(make_rope(data), k)) +} + +fn make_fastq(records: &[&[u8]]) -> Vec { + let mut buf = Vec::new(); + for seq in records { + buf.extend_from_slice(b"@hdr\n"); + buf.extend_from_slice(seq); + buf.push(b'\n'); + buf.extend_from_slice(b"+\n"); + buf.extend_from_slice(&vec![b'I'; seq.len()]); + buf.push(b'\n'); + } + buf +} + +fn make_fasta(records: &[(&[u8], &[u8])]) -> Vec { + let mut buf = Vec::new(); + for (id, seq) in records { + buf.push(b'>'); + buf.extend_from_slice(id); + buf.push(b'\n'); + buf.extend_from_slice(seq); + buf.push(b'\n'); + } + buf +} + +// ── FASTQ basic ────────────────────────────────────────────────────────── + +#[test] +fn single_record_produces_seq_then_null() { + assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGT"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn two_records_concatenated() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]), 4), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn lowercase_input_uppercased() { + assert_eq!(run_fastq(&make_fastq(&[b"acgtacgt"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn mixed_case_uppercased() { + assert_eq!(run_fastq(&make_fastq(&[b"AcGtAcGt"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn sequence_shorter_than_k_discarded() { + assert_eq!(run_fastq(&make_fastq(&[b"ACG"]), 4), b""); +} + +#[test] +fn sequence_exactly_k_kept() { + assert_eq!(run_fastq(&make_fastq(&[b"ACGT"]), 4), b"ACGT\x00"); +} + +#[test] +fn short_record_among_valid_ones_discarded() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTACGT", b"AC", b"TTTTTTTT"]), 4), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn ambiguous_splits_into_two_segments() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTNACGT"]), 4), + b"ACGT\x00ACGT\x00" + ); +} + +#[test] +fn segment_after_ambiguous_too_short_discarded() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTACGTNAC"]), 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn consecutive_ambiguous_produce_no_empty_segment() { + assert_eq!( + run_fastq(&make_fastq(&[b"ACGTNNNNACGT"]), 4), + b"ACGT\x00ACGT\x00" + ); +} + +#[test] +fn ambiguous_at_start_skipped() { + assert_eq!(run_fastq(&make_fastq(&[b"NNACGTACGT"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn ambiguous_at_end_produces_no_trailing_empty() { + assert_eq!(run_fastq(&make_fastq(&[b"ACGTACGTNN"]), 4), b"ACGTACGT\x00"); +} + +#[test] +fn crlf_handled() { + let data = b"@hdr\r\nACGTACGT\r\n+\r\nIIIIIIII\r\n"; + assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00"); +} + +#[test] +fn multi_slice_rope() { + let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]); + let mid = data.len() / 2; + let mut rope = Rope::new(None); + rope.push(data[..mid].to_vec()); + rope.push(data[mid..].to_vec()); + assert_eq!( + flat(normalize_fastq_chunk(rope, 4)), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +// ── FASTA ───────────────────────────────────────────────────────────────── + +#[test] +fn fasta_single_record() { + assert_eq!( + run_fasta(&make_fasta(&[(b"s1", b"ACGTACGT")]), 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fasta_two_records() { + assert_eq!( + run_fasta( + &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]), + 4 + ), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn fasta_multiline_sequence_concatenated() { + assert_eq!( + run_fasta(b">s1\nACGT\nACGT\nACGT\n", 4), + b"ACGTACGTACGT\x00" + ); +} + +#[test] +fn fasta_lowercase_uppercased() { + assert_eq!( + run_fasta(&make_fasta(&[(b"s1", b"acgtacgt")]), 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fasta_short_record_discarded() { + assert_eq!(run_fasta(&make_fasta(&[(b"s1", b"ACG")]), 4), b""); +} + +#[test] +fn fasta_short_among_valid_discarded() { + assert_eq!( + run_fasta( + &make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"AC"), (b"s3", b"TTTTTTTT")]), + 4 + ), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} + +#[test] +fn fasta_ambiguous_splits_segments() { + assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00"); +} + +#[test] +fn fasta_ambiguous_across_line_boundary() { + assert_eq!(run_fasta(b">s1\nACGT\nNACGT\n", 4), b"ACGT\x00ACGT\x00"); +} + +#[test] +fn fasta_ambiguous_short_segment_discarded() { + assert_eq!(run_fasta(b">s1\nACGTACGTNAC\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_no_trailing_newline() { + assert_eq!(run_fasta(b">s1\nACGTACGT", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_crlf_line_endings() { + assert_eq!( + run_fasta(b">s1\r\nACGT\r\nACGT\r\n>s2\r\nTTTT\r\n", 4), + b"ACGTACGT\x00TTTT\x00" + ); +} + +#[test] +fn fasta_multi_slice_rope() { + let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]); + let mid = data.len() / 2; + let mut rope = Rope::new(None); + rope.push(data[..mid].to_vec()); + rope.push(data[mid..].to_vec()); + assert_eq!( + flat(normalize_fasta_chunk(rope, 4)), + b"ACGTACGT\x00TTTTTTTT\x00" + ); +} diff --git a/src/obiread/src/tests/nucstream.rs b/src/obiread/src/tests/nucstream.rs new file mode 100644 index 0000000..bc135cb --- /dev/null +++ b/src/obiread/src/tests/nucstream.rs @@ -0,0 +1,267 @@ +use super::*; +use std::io::Cursor; +use std::ops::Deref; + +// ── helpers ─────────────────────────────────────────────────────────────── + +fn run_fasta(data: &[u8], k: usize) -> Vec { + NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k) + .flat_map(|p| p.deref().to_vec()) + .collect() +} + +fn run_fastq(data: &[u8], k: usize) -> Vec { + NucStream::<_, FastqParser>::new(Cursor::new(data.to_vec()), k) + .flat_map(|p| p.deref().to_vec()) + .collect() +} + +fn run_genbank(data: &[u8], k: usize) -> Vec { + NucStream::<_, GenbankParser>::new(Cursor::new(data.to_vec()), k) + .flat_map(|p| p.deref().to_vec()) + .collect() +} + +fn pages_fasta(data: &[u8], k: usize) -> Vec> { + NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k) + .map(|p| p.deref().to_vec()) + .collect() +} + +// ── FastaParser ─────────────────────────────────────────────────────────── + +#[test] +fn fasta_single_sequence() { + assert_eq!(run_fasta(b">s1\nACGTACGT\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_lowercase_uppercased() { + assert_eq!(run_fasta(b">s1\nacgtacgt\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_multiline_sequence_concatenated() { + assert_eq!(run_fasta(b">s1\nACGT\nACGT\n", 4), b"ACGTACGT\x00"); +} + +#[test] +fn fasta_two_sequences() { + let data = b">s1\nACGTACGT\n>s2\nTTTTTTTT\n"; + assert_eq!(run_fasta(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +#[test] +fn fasta_empty_input_yields_no_pages() { + assert_eq!(run_fasta(b"", 4), b""); +} + +#[test] +fn fasta_sequence_shorter_than_k_at_eof_discarded() { + // The 3-base fragment is saved as overlap and dropped at EOF (< k). + assert_eq!(run_fasta(b">s1\nACG\n", 4), b""); +} + +#[test] +fn fasta_ambiguous_splits_into_two_segments() { + assert_eq!(run_fasta(b">s1\nACGTNACGT\n", 4), b"ACGT\x00ACGT\x00"); +} + +#[test] +fn fasta_short_segment_before_ambiguous_emitted() { + // "AC" (< k=4) before N is written with a separator — filtering by + // length is deferred to the superkmer builder, not done here. + assert_eq!(run_fasta(b">s1\nACNACGTACGT\n", 4), b"AC\x00ACGTACGT\x00"); +} + +#[test] +fn fasta_ambiguous_at_start_skipped() { + assert_eq!(run_fasta(b">s1\nNNNACGTACGT\n", 4), b"ACGTACGT\x00"); +} + +// ── FastqParser ─────────────────────────────────────────────────────────── + +#[test] +fn fastq_single_record() { + assert_eq!( + run_fastq(b"@r1\nACGTACGT\n+\nIIIIIIII\n", 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fastq_lowercase_uppercased() { + assert_eq!( + run_fastq(b"@r1\nacgtacgt\n+\nIIIIIIII\n", 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fastq_quality_bytes_not_in_output() { + // '@' (Phred 31 = ASCII 64) in quality must not appear in output. + assert_eq!( + run_fastq(b"@r1\nACGTACGT\n+\n@@@@@@@@\n", 4), + b"ACGTACGT\x00" + ); +} + +#[test] +fn fastq_two_records() { + let data = b"@r1\nACGTACGT\n+\nIIIIIIII\n@r2\nTTTTTTTT\n+\nIIIIIIII\n"; + assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +#[test] +fn fastq_ambiguous_splits_sequence() { + assert_eq!( + run_fastq(b"@r1\nACGTNACGT\n+\nIIIIIIIII\n", 4), + b"ACGT\x00ACGT\x00" + ); +} + +#[test] +fn fastq_at_in_quality_line_not_a_record_start() { + // '@' in the quality line must not trigger a new record parse. + let data = b"@r1\nACGTACGT\n+\n@@@@@@@@\n@r2\nTTTTTTTT\n+\nIIIIIIII\n"; + assert_eq!(run_fastq(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +// ── GenbankParser ───────────────────────────────────────────────────────── + +#[test] +fn genbank_origin_to_slash() { + let data = b"LOCUS ...\nORIGIN\n 1 acgtacgt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGTACGT\x00"); +} + +#[test] +fn genbank_position_numbers_and_spaces_skipped() { + let data = b"ORIGIN\n 1 acgt acgt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGTACGT\x00"); +} + +#[test] +fn genbank_two_records() { + let data = b"ORIGIN\n 1 acgtacgt\n//\nLOCUS ...\nORIGIN\n 1 tttttttt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGTACGT\x00TTTTTTTT\x00"); +} + +#[test] +fn genbank_ambiguous_splits_sequence() { + let data = b"ORIGIN\n 1 acgtnacgt\n//\n"; + assert_eq!(run_genbank(data, 4), b"ACGT\x00ACGT\x00"); +} + +// ── NucPage ─────────────────────────────────────────────────────────────── + +#[test] +fn nuc_page_deref_correct_bytes() { + let page = NucStream::<_, FastaParser>::new(Cursor::new(b">s1\nACGT\n".to_vec()), 4) + .next() + .expect("page"); + assert_eq!(page.deref(), b"ACGT\x00"); +} + +// ── NucPageCursor ───────────────────────────────────────────────────────── + +fn make_page(data: &[u8], k: usize) -> NucPage { + NucStream::<_, FastaParser>::new(Cursor::new(data.to_vec()), k) + .next() + .expect("at least one page") +} + +#[test] +fn cursor_reads_bytes_in_order() { + let page = make_page(b">s1\nACGTACGT\n", 4); + let mut cur = page.cursor(); + assert_eq!(cur.next_byte(), Some(b'A')); + assert_eq!(cur.next_byte(), Some(b'C')); + assert_eq!(cur.next_byte(), Some(b'G')); + assert_eq!(cur.next_byte(), Some(b'T')); +} + +#[test] +fn cursor_rewind_rereads_bytes() { + let page = make_page(b">s1\nACGTACGT\n", 4); + let mut cur = page.cursor(); + cur.next_byte(); // A + cur.next_byte(); // C + cur.rewind(1); + assert_eq!(cur.next_byte(), Some(b'C')); + cur.rewind(2); + assert_eq!(cur.next_byte(), Some(b'A')); +} + +#[test] +fn cursor_returns_none_at_end() { + // "ACGT\x00" = 5 bytes; consume all then expect None. + let page = make_page(b">s1\nACGT\n", 4); + let mut cur = page.cursor(); + for _ in 0..5 { + cur.next_byte(); + } + assert_eq!(cur.next_byte(), None); +} + +#[test] +fn cursor_len_matches_page_content() { + // "ACGTACGT\x00" = 9 bytes + let page = make_page(b">s1\nACGTACGT\n", 4); + let cur = page.cursor(); + assert_eq!(cur.len(), 9); + assert!(!cur.is_empty()); +} + +// ── Overlap at page boundary ────────────────────────────────────────────── + +#[test] +fn overlap_last_km1_bytes_prepended_to_next_page() { + const K: usize = 11; + // Sequence long enough to span two pages: PAGE_SIZE + K bytes. + // Pattern chosen so boundary bytes are unambiguous. + let seq: Vec = (0..PAGE_SIZE + K).map(|i| b"ACGT"[i % 4]).collect(); + let mut input = b">seq\n".to_vec(); + input.extend_from_slice(&seq); + input.push(b'\n'); + + let pages = pages_fasta(&input, K); + assert!(pages.len() >= 2, "need at least two pages"); + + let p1 = &pages[0]; + let p2 = &pages[1]; + + // page1 must end with a \x00 separator (written by save_overlap) + assert_eq!(*p1.last().unwrap(), 0x00, "page1 must end with separator"); + + // last K-1 ACGT bytes of page1 == first K-1 bytes of page2 + let ol = K - 1; + let p1_seq_end = &p1[p1.len() - 1 - ol..p1.len() - 1]; + let p2_start = &p2[..ol]; + assert_eq!( + p1_seq_end, p2_start, + "overlap bytes mismatch at page boundary" + ); +} + +// ── Pool ────────────────────────────────────────────────────────────────── + +#[test] +fn pool_buffer_reused_after_drop() { + // Drop page1 so its buffer returns to the pool, then verify page2 + // is produced correctly (no corruption, no panic). + const K: usize = 11; + let seq: Vec = vec![b'A'; PAGE_SIZE + K]; + let mut input = b">seq\n".to_vec(); + input.extend_from_slice(&seq); + input.push(b'\n'); + + let mut stream = NucStream::<_, FastaParser>::new(Cursor::new(input), K); + let page1 = stream.next().expect("page 1"); + assert!(!page1.deref().is_empty()); + drop(page1); // returns buffer to pool + let page2 = stream.next().expect("page 2"); + assert!(!page2.deref().is_empty()); + // page2 must still start with A's (overlap from page1) + assert_eq!(page2[0], b'A'); +}