⬆️ refactor superkmer to use obipipeline
- Replace manual threading with the Pipeline abstraction from `obipipeline` - Remove crossbeam-channel dependency and format detection logic - Introduce typed `PipelineData` enum for pipeline stages (RawChunk, NormChunk, Batch) - Implement shared normalization and extraction steps as `SharedFn` - Add unsafe Send/Sync impls for PipelineData (Rope ownership is moved, not shared) - Replace manual reader/worker/output threads with a single Pipeline execution - Use `make_source_fallible!`, shared transform functions, and a sink for output - Simplify argument handling (remove `--format` flag) - Update Cargo.toml: remove crossbeam-channel, add obipipeline
This commit is contained in:
@@ -194,7 +194,10 @@ pub trait RopeCursor<'a> {
|
|||||||
/// Use the returned value with [`SeekMode::Rope`] to restore a position,
|
/// Use the returned value with [`SeekMode::Rope`] to restore a position,
|
||||||
/// or as a truncation point after a write pass.
|
/// or as a truncation point after a write pass.
|
||||||
fn rope_tell(&self) -> usize {
|
fn rope_tell(&self) -> usize {
|
||||||
self.state().current.get().unwrap_or(self.state().offset.get())
|
self.state()
|
||||||
|
.current
|
||||||
|
.get()
|
||||||
|
.unwrap_or(self.state().offset.get())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Number of bytes visible through this cursor (`rope.len() - offset`).
|
/// Number of bytes visible through this cursor (`rope.len() - offset`).
|
||||||
@@ -528,13 +531,13 @@ mod tests {
|
|||||||
use crate::Rope;
|
use crate::Rope;
|
||||||
|
|
||||||
fn rope(data: &[u8]) -> Rope {
|
fn rope(data: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(data.to_vec());
|
r.push(data.to_vec());
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
|
|
||||||
fn rope2(a: &[u8], b: &[u8]) -> Rope {
|
fn rope2(a: &[u8], b: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(a.to_vec());
|
r.push(a.to_vec());
|
||||||
r.push(b.to_vec());
|
r.push(b.to_vec());
|
||||||
r
|
r
|
||||||
@@ -703,14 +706,14 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn backward_empty_rope_returns_error() {
|
fn backward_empty_rope_returns_error() {
|
||||||
let r = Rope::new();
|
let r = Rope::new(None);
|
||||||
let c = r.bw_cursor();
|
let c = r.bw_cursor();
|
||||||
assert!(c.read_next().is_err());
|
assert!(c.read_next().is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn forward_empty_rope_returns_error() {
|
fn forward_empty_rope_returns_error() {
|
||||||
let r = Rope::new();
|
let r = Rope::new(None);
|
||||||
let c = r.fw_cursor();
|
let c = r.fw_cursor();
|
||||||
assert!(c.read_next().is_err());
|
assert!(c.read_next().is_err());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ use std::cell::Cell;
|
|||||||
///
|
///
|
||||||
/// See the [module-level documentation][crate::rope] for a full overview.
|
/// See the [module-level documentation][crate::rope] for a full overview.
|
||||||
pub struct Rope {
|
pub struct Rope {
|
||||||
|
pub(crate) mime_type: Option<&'static str>,
|
||||||
pub(crate) blocks: Vec<Vec<Cell<u8>>>,
|
pub(crate) blocks: Vec<Vec<Cell<u8>>>,
|
||||||
pub(crate) length: usize,
|
pub(crate) length: usize,
|
||||||
pub(crate) start_block_idx: Vec<usize>,
|
pub(crate) start_block_idx: Vec<usize>,
|
||||||
@@ -35,8 +36,9 @@ pub struct Rope {
|
|||||||
|
|
||||||
impl Rope {
|
impl Rope {
|
||||||
/// Create an empty rope (no allocations).
|
/// Create an empty rope (no allocations).
|
||||||
pub fn new() -> Self {
|
pub fn new(mime_type: Option<&'static str>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
|
mime_type,
|
||||||
blocks: Vec::new(),
|
blocks: Vec::new(),
|
||||||
length: 0,
|
length: 0,
|
||||||
start_block_idx: Vec::new(),
|
start_block_idx: Vec::new(),
|
||||||
@@ -59,6 +61,11 @@ impl Rope {
|
|||||||
self.length += block_len;
|
self.length += block_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The MIME type of the rope, if known.
|
||||||
|
pub fn mime_type(&self) -> Option<&str> {
|
||||||
|
self.mime_type.as_deref()
|
||||||
|
}
|
||||||
|
|
||||||
/// Total number of blocks.
|
/// Total number of blocks.
|
||||||
pub fn n_blocks(&self) -> usize {
|
pub fn n_blocks(&self) -> usize {
|
||||||
self.blocks.len()
|
self.blocks.len()
|
||||||
@@ -115,7 +122,7 @@ impl Rope {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if pos == self.length {
|
if pos == self.length {
|
||||||
return Ok(Rope::new());
|
return Ok(Rope::new(self.mime_type.clone()));
|
||||||
}
|
}
|
||||||
|
|
||||||
let (block_idx, from, _) = self.lookup(pos).ok_or_else(|| {
|
let (block_idx, from, _) = self.lookup(pos).ok_or_else(|| {
|
||||||
@@ -149,6 +156,7 @@ impl Rope {
|
|||||||
self.length = pos;
|
self.length = pos;
|
||||||
|
|
||||||
Ok(Rope {
|
Ok(Rope {
|
||||||
|
mime_type: self.mime_type.clone(),
|
||||||
blocks: tail_blocks,
|
blocks: tail_blocks,
|
||||||
length: tail_length,
|
length: tail_length,
|
||||||
start_block_idx: tail_starts,
|
start_block_idx: tail_starts,
|
||||||
@@ -185,13 +193,13 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn make(data: &[u8]) -> Rope {
|
fn make(data: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(data.to_vec());
|
r.push(data.to_vec());
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
|
|
||||||
fn make2(a: &[u8], b: &[u8]) -> Rope {
|
fn make2(a: &[u8], b: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(a.to_vec());
|
r.push(a.to_vec());
|
||||||
r.push(b.to_vec());
|
r.push(b.to_vec());
|
||||||
r
|
r
|
||||||
@@ -201,7 +209,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn empty_rope_is_empty() {
|
fn empty_rope_is_empty() {
|
||||||
let r = Rope::new();
|
let r = Rope::new(None);
|
||||||
assert!(r.is_empty());
|
assert!(r.is_empty());
|
||||||
assert_eq!(r.len(), 0);
|
assert_eq!(r.len(), 0);
|
||||||
assert_eq!(r.n_blocks(), 0);
|
assert_eq!(r.n_blocks(), 0);
|
||||||
@@ -213,6 +221,7 @@ mod tests {
|
|||||||
assert!(!r.is_empty());
|
assert!(!r.is_empty());
|
||||||
assert_eq!(r.len(), 5);
|
assert_eq!(r.len(), 5);
|
||||||
assert_eq!(r.n_blocks(), 1);
|
assert_eq!(r.n_blocks(), 1);
|
||||||
|
assert_eq!(r.mime_type(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -220,6 +229,7 @@ mod tests {
|
|||||||
let r = make2(b"abc", b"de");
|
let r = make2(b"abc", b"de");
|
||||||
assert_eq!(r.len(), 5);
|
assert_eq!(r.len(), 5);
|
||||||
assert_eq!(r.n_blocks(), 2);
|
assert_eq!(r.n_blocks(), 2);
|
||||||
|
assert_eq!(r.mime_type(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -261,7 +271,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn lookup_empty_rope_returns_none() {
|
fn lookup_empty_rope_returns_none() {
|
||||||
assert!(Rope::new().lookup(0).is_none());
|
assert!(Rope::new(None).lookup(0).is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
+124
-14
@@ -3,6 +3,12 @@
|
|||||||
//! Each `Rope` yielded by [`SeqChunkIter`] contains one or more blocks that
|
//! Each `Rope` yielded by [`SeqChunkIter`] contains one or more blocks that
|
||||||
//! together form a self-contained block of complete sequence records.
|
//! together form a self-contained block of complete sequence records.
|
||||||
|
|
||||||
|
use crate::DEFAULT_BLOCK_SIZE;
|
||||||
|
use crate::MimeTypeGuesser;
|
||||||
|
use crate::fasta;
|
||||||
|
use crate::fastq;
|
||||||
|
use crate::xopen::xopen;
|
||||||
|
|
||||||
use std::io::{self, Read};
|
use std::io::{self, Read};
|
||||||
|
|
||||||
use obikrope::Rope;
|
use obikrope::Rope;
|
||||||
@@ -14,6 +20,7 @@ pub type Splitter = fn(&Rope) -> Option<usize>;
|
|||||||
/// Iterator that reads from `R` in blocks and yields `Rope` chunks,
|
/// Iterator that reads from `R` in blocks and yields `Rope` chunks,
|
||||||
/// each ending on a complete sequence record boundary.
|
/// each ending on a complete sequence record boundary.
|
||||||
pub struct SeqChunkIter<R> {
|
pub struct SeqChunkIter<R> {
|
||||||
|
mime_type: Option<&'static str>,
|
||||||
source: R,
|
source: R,
|
||||||
rope: Rope,
|
rope: Rope,
|
||||||
block_size: usize,
|
block_size: usize,
|
||||||
@@ -23,10 +30,16 @@ pub struct SeqChunkIter<R> {
|
|||||||
|
|
||||||
impl<R: Read> SeqChunkIter<R> {
|
impl<R: Read> SeqChunkIter<R> {
|
||||||
/// Create a new iterator.
|
/// Create a new iterator.
|
||||||
pub fn new(source: R, block_size: usize, splitter: Splitter) -> Self {
|
pub fn new(
|
||||||
|
source: R,
|
||||||
|
block_size: usize,
|
||||||
|
splitter: Splitter,
|
||||||
|
mime_type: Option<&'static str>,
|
||||||
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
|
mime_type,
|
||||||
source,
|
source,
|
||||||
rope: Rope::new(),
|
rope: Rope::new(mime_type),
|
||||||
block_size,
|
block_size,
|
||||||
splitter,
|
splitter,
|
||||||
eof: false,
|
eof: false,
|
||||||
@@ -66,7 +79,10 @@ impl<R: Read> Iterator for SeqChunkIter<R> {
|
|||||||
if self.rope.is_empty() {
|
if self.rope.is_empty() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
return Some(Ok(std::mem::replace(&mut self.rope, Rope::new())));
|
return Some(Ok(std::mem::replace(
|
||||||
|
&mut self.rope,
|
||||||
|
Rope::new(self.mime_type),
|
||||||
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
match self.read_block() {
|
match self.read_block() {
|
||||||
@@ -81,7 +97,9 @@ impl<R: Read> Iterator for SeqChunkIter<R> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(abs_offset) = (self.splitter)(&self.rope) {
|
if let Some(abs_offset) = (self.splitter)(&self.rope) {
|
||||||
let tail = self.rope.split_off(abs_offset)
|
let tail = self
|
||||||
|
.rope
|
||||||
|
.split_off(abs_offset)
|
||||||
.expect("splitter returned valid offset");
|
.expect("splitter returned valid offset");
|
||||||
let chunk = std::mem::replace(&mut self.rope, tail);
|
let chunk = std::mem::replace(&mut self.rope, tail);
|
||||||
return Some(Ok(chunk));
|
return Some(Ok(chunk));
|
||||||
@@ -90,6 +108,88 @@ impl<R: Read> Iterator for SeqChunkIter<R> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Open `path` and iterate over its FASTA records as chunks.
|
||||||
|
pub fn read_fasta_chunks(
|
||||||
|
path: &str,
|
||||||
|
) -> io::Result<SeqChunkIter<MimeTypeGuesser<Box<dyn Read + Send>>>> {
|
||||||
|
let input = match xopen(path) {
|
||||||
|
Ok(mut i) => {
|
||||||
|
if i.mime_type() == Some("text/fasta") {
|
||||||
|
i
|
||||||
|
} else {
|
||||||
|
return Err(io::Error::new(
|
||||||
|
io::ErrorKind::InvalidData,
|
||||||
|
"not a FASTA file",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
};
|
||||||
|
Ok(fasta_chunks(input))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Open `path` and iterate over its FASTQ records as chunks.
|
||||||
|
pub fn read_fastq_chunks(
|
||||||
|
path: &str,
|
||||||
|
) -> io::Result<SeqChunkIter<MimeTypeGuesser<Box<dyn Read + Send>>>> {
|
||||||
|
let input = match xopen(path) {
|
||||||
|
Ok(mut i) => {
|
||||||
|
if i.mime_type() == Some("text/fastq") {
|
||||||
|
i
|
||||||
|
} else {
|
||||||
|
return Err(io::Error::new(
|
||||||
|
io::ErrorKind::InvalidData,
|
||||||
|
"not a FASTQ file",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
};
|
||||||
|
Ok(fastq_chunks(input))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Open `path` and auto-detect whether it is FASTA or FASTQ, then iterate over its records.
|
||||||
|
///
|
||||||
|
/// Returns an error if the format cannot be identified as `text/fasta` or `text/fastq`.
|
||||||
|
pub fn read_sequence_chunks(
|
||||||
|
path: &str,
|
||||||
|
) -> io::Result<SeqChunkIter<MimeTypeGuesser<Box<dyn Read + Send>>>> {
|
||||||
|
let input = match xopen(path) {
|
||||||
|
Ok(mut i) => match i.mime_type() {
|
||||||
|
Some("text/fasta") => fasta_chunks(i),
|
||||||
|
Some("text/fastq") => fastq_chunks(i),
|
||||||
|
_ => {
|
||||||
|
return Err(io::Error::new(
|
||||||
|
io::ErrorKind::InvalidData,
|
||||||
|
"not a FASTA or FASTQ file",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
};
|
||||||
|
Ok(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a FASTA chunk iterator over `source`.
|
||||||
|
pub fn fasta_chunks<R: Read>(source: R) -> SeqChunkIter<R> {
|
||||||
|
SeqChunkIter::new(
|
||||||
|
source,
|
||||||
|
DEFAULT_BLOCK_SIZE,
|
||||||
|
fasta::end_of_last_fasta_entry,
|
||||||
|
Some("text/fasta"),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a FASTQ chunk iterator over `source`.
|
||||||
|
pub fn fastq_chunks<R: Read>(source: R) -> SeqChunkIter<R> {
|
||||||
|
SeqChunkIter::new(
|
||||||
|
source,
|
||||||
|
DEFAULT_BLOCK_SIZE,
|
||||||
|
fastq::end_of_last_fastq_entry,
|
||||||
|
Some("text/fastq"),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -97,11 +197,11 @@ mod tests {
|
|||||||
use crate::fastq::end_of_last_fastq_entry;
|
use crate::fastq::end_of_last_fastq_entry;
|
||||||
|
|
||||||
fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
fn fasta_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
||||||
SeqChunkIter::new(data, block_size, end_of_last_fasta_entry)
|
SeqChunkIter::new(data, block_size, end_of_last_fasta_entry, None)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
fn fastq_iter(data: &'static [u8], block_size: usize) -> SeqChunkIter<&'static [u8]> {
|
||||||
SeqChunkIter::new(data, block_size, end_of_last_fastq_entry)
|
SeqChunkIter::new(data, block_size, end_of_last_fastq_entry, None)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn rope_to_vec(rope: &Rope) -> Vec<u8> {
|
fn rope_to_vec(rope: &Rope) -> Vec<u8> {
|
||||||
@@ -134,7 +234,11 @@ mod tests {
|
|||||||
for rope in &chunks {
|
for rope in &chunks {
|
||||||
let flat = rope_to_vec(rope);
|
let flat = rope_to_vec(rope);
|
||||||
assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'");
|
assert_eq!(flat[0], b'>', "block={block}: chunk doesn't start with '>'");
|
||||||
assert_eq!(*flat.last().unwrap(), b'\n', "block={block}: chunk doesn't end with newline");
|
assert_eq!(
|
||||||
|
*flat.last().unwrap(),
|
||||||
|
b'\n',
|
||||||
|
"block={block}: chunk doesn't end with newline"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -163,10 +267,10 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn fastq_at_in_quality_handled() {
|
fn fastq_at_in_quality_handled() {
|
||||||
let data = Box::leak(make_fastq(&[
|
let data = Box::leak(
|
||||||
(b"ACGTACGT", b"@@@@IIII"),
|
make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTTTTTT", b"HHHHHHHH")])
|
||||||
(b"TTTTTTTT", b"HHHHHHHH"),
|
.into_boxed_slice(),
|
||||||
]).into_boxed_slice());
|
);
|
||||||
let chunks: Vec<_> = fastq_iter(data, 16).collect::<Result<_, _>>().unwrap();
|
let chunks: Vec<_> = fastq_iter(data, 16).collect::<Result<_, _>>().unwrap();
|
||||||
let all: Vec<u8> = chunks.iter().flat_map(|r| rope_to_vec(r)).collect();
|
let all: Vec<u8> = chunks.iter().flat_map(|r| rope_to_vec(r)).collect();
|
||||||
assert_eq!(all, *data);
|
assert_eq!(all, *data);
|
||||||
@@ -174,17 +278,23 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn fastq_each_chunk_starts_with_at() {
|
fn fastq_each_chunk_starts_with_at() {
|
||||||
let data = Box::leak(make_fastq(&[
|
let data = Box::leak(
|
||||||
|
make_fastq(&[
|
||||||
(b"ACGT", b"IIII"),
|
(b"ACGT", b"IIII"),
|
||||||
(b"CCCC", b"JJJJ"),
|
(b"CCCC", b"JJJJ"),
|
||||||
(b"GGGG", b"KKKK"),
|
(b"GGGG", b"KKKK"),
|
||||||
(b"TTTT", b"LLLL"),
|
(b"TTTT", b"LLLL"),
|
||||||
]).into_boxed_slice());
|
])
|
||||||
|
.into_boxed_slice(),
|
||||||
|
);
|
||||||
for block in [18, 30, 60] {
|
for block in [18, 30, 60] {
|
||||||
let chunks: Vec<_> = fastq_iter(data, block).collect::<Result<_, _>>().unwrap();
|
let chunks: Vec<_> = fastq_iter(data, block).collect::<Result<_, _>>().unwrap();
|
||||||
for rope in &chunks {
|
for rope in &chunks {
|
||||||
let first_byte = rope_to_vec(rope)[0];
|
let first_byte = rope_to_vec(rope)[0];
|
||||||
assert_eq!(first_byte, b'@', "block={block}: chunk doesn't start with '@'");
|
assert_eq!(
|
||||||
|
first_byte, b'@',
|
||||||
|
"block={block}: chunk doesn't start with '@'"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,13 +39,13 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
fn rope(data: &[u8]) -> Rope {
|
fn rope(data: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(data.to_vec());
|
r.push(data.to_vec());
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
|
|
||||||
fn rope2(a: &[u8], b: &[u8]) -> Rope {
|
fn rope2(a: &[u8], b: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(a.to_vec());
|
r.push(a.to_vec());
|
||||||
r.push(b.to_vec());
|
r.push(b.to_vec());
|
||||||
r
|
r
|
||||||
|
|||||||
+15
-13
@@ -111,7 +111,7 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
fn rope(data: &[u8]) -> Rope {
|
fn rope(data: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(data.to_vec());
|
r.push(data.to_vec());
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
@@ -145,30 +145,32 @@ mod tests {
|
|||||||
let r = rope(&buf);
|
let r = rope(&buf);
|
||||||
let pos = end_of_last_fastq_entry(&r).unwrap();
|
let pos = end_of_last_fastq_entry(&r).unwrap();
|
||||||
assert_eq!(flat(&r)[pos], b'@');
|
assert_eq!(flat(&r)[pos], b'@');
|
||||||
assert_eq!(&flat(&r)[pos..], make_fastq(&[(b"TTTT", b"HHHH")]).as_slice());
|
assert_eq!(
|
||||||
|
&flat(&r)[pos..],
|
||||||
|
make_fastq(&[(b"TTTT", b"HHHH")]).as_slice()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn three_records_cuts_at_last() {
|
fn three_records_cuts_at_last() {
|
||||||
let buf = make_fastq(&[
|
let buf = make_fastq(&[(b"ACGT", b"IIII"), (b"CCCC", b"JJJJ"), (b"GGGG", b"KKKK")]);
|
||||||
(b"ACGT", b"IIII"),
|
|
||||||
(b"CCCC", b"JJJJ"),
|
|
||||||
(b"GGGG", b"KKKK"),
|
|
||||||
]);
|
|
||||||
let r = rope(&buf);
|
let r = rope(&buf);
|
||||||
let pos = end_of_last_fastq_entry(&r).unwrap();
|
let pos = end_of_last_fastq_entry(&r).unwrap();
|
||||||
assert_eq!(&flat(&r)[pos..], make_fastq(&[(b"GGGG", b"KKKK")]).as_slice());
|
assert_eq!(
|
||||||
|
&flat(&r)[pos..],
|
||||||
|
make_fastq(&[(b"GGGG", b"KKKK")]).as_slice()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn at_sign_in_quality_does_not_confuse() {
|
fn at_sign_in_quality_does_not_confuse() {
|
||||||
let buf = make_fastq(&[
|
let buf = make_fastq(&[(b"ACGTACGT", b"@@@@IIII"), (b"TTTT", b"HHHH")]);
|
||||||
(b"ACGTACGT", b"@@@@IIII"),
|
|
||||||
(b"TTTT", b"HHHH"),
|
|
||||||
]);
|
|
||||||
let r = rope(&buf);
|
let r = rope(&buf);
|
||||||
let pos = end_of_last_fastq_entry(&r).unwrap();
|
let pos = end_of_last_fastq_entry(&r).unwrap();
|
||||||
assert_eq!(&flat(&r)[pos..], make_fastq(&[(b"TTTT", b"HHHH")]).as_slice());
|
assert_eq!(
|
||||||
|
&flat(&r)[pos..],
|
||||||
|
make_fastq(&[(b"TTTT", b"HHHH")]).as_slice()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
+5
-15
@@ -8,29 +8,19 @@
|
|||||||
pub mod chunk;
|
pub mod chunk;
|
||||||
mod fasta;
|
mod fasta;
|
||||||
mod fastq;
|
mod fastq;
|
||||||
pub mod mimetype;
|
mod mimetype;
|
||||||
pub mod normalize;
|
pub mod normalize;
|
||||||
mod path_iterator;
|
mod path_iterator;
|
||||||
pub mod peakreader;
|
pub mod peakreader;
|
||||||
pub mod xopen;
|
pub mod xopen;
|
||||||
|
|
||||||
|
pub use chunk::{SeqChunkIter, fasta_chunks, fastq_chunks,
|
||||||
|
read_fasta_chunks, read_fastq_chunks, read_sequence_chunks};
|
||||||
|
pub use normalize::{normalize_fasta_chunk, normalize_fastq_chunk, normalize_sequence_chunk};
|
||||||
|
pub use mimetype::MimeTypeGuesser;
|
||||||
pub use path_iterator::{PathIter, path_iter};
|
pub use path_iterator::{PathIter, path_iter};
|
||||||
pub use peakreader::PeekReader;
|
pub use peakreader::PeekReader;
|
||||||
|
|
||||||
use std::io::Read;
|
|
||||||
|
|
||||||
use chunk::SeqChunkIter;
|
|
||||||
pub use xopen::xopen;
|
pub use xopen::xopen;
|
||||||
|
|
||||||
/// Default read block size: 1 MiB.
|
/// Default read block size: 1 MiB.
|
||||||
pub const DEFAULT_BLOCK_SIZE: usize = 1024 * 1024;
|
pub const DEFAULT_BLOCK_SIZE: usize = 1024 * 1024;
|
||||||
|
|
||||||
/// Create a FASTA chunk iterator over `source`.
|
|
||||||
pub fn fasta_chunks<R: Read>(source: R) -> SeqChunkIter<R> {
|
|
||||||
SeqChunkIter::new(source, DEFAULT_BLOCK_SIZE, fasta::end_of_last_fasta_entry)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a FASTQ chunk iterator over `source`.
|
|
||||||
pub fn fastq_chunks<R: Read>(source: R) -> SeqChunkIter<R> {
|
|
||||||
SeqChunkIter::new(source, DEFAULT_BLOCK_SIZE, fastq::end_of_last_fastq_entry)
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
//! Guesses the MIME type of a reader by peeking at its first few bytes.
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::sync::LazyLock;
|
use std::sync::LazyLock;
|
||||||
|
|
||||||
@@ -35,13 +36,16 @@ static INFER: LazyLock<Infer> = LazyLock::new(|| {
|
|||||||
infer
|
infer
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/// A struct that guesses the MIME type of a reader by peeking at the first few bytes.
|
||||||
pub struct MimeTypeGuesser<R: std::io::Read>(PeekReader<R>);
|
pub struct MimeTypeGuesser<R: std::io::Read>(PeekReader<R>);
|
||||||
|
|
||||||
impl<R: std::io::Read> MimeTypeGuesser<R> {
|
impl<R: std::io::Read> MimeTypeGuesser<R> {
|
||||||
|
/// Creates a new MimeTypeGuesser that wraps the given reader.
|
||||||
pub fn new(reader: R) -> Self {
|
pub fn new(reader: R) -> Self {
|
||||||
Self(PeekReader::new(reader, BUF_SIZE))
|
Self(PeekReader::new(reader, BUF_SIZE))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the detected MIME type, if any.
|
||||||
pub fn mime_type(&mut self) -> Option<&'static str> {
|
pub fn mime_type(&mut self) -> Option<&'static str> {
|
||||||
let buf = self.0.header();
|
let buf = self.0.header();
|
||||||
INFER.get(buf).map(|kind| kind.mime_type())
|
INFER.get(buf).map(|kind| kind.mime_type())
|
||||||
|
|||||||
@@ -17,10 +17,31 @@
|
|||||||
//!
|
//!
|
||||||
//! After the automaton returns, `wc.rope_start()` is the rope truncation point.
|
//! After the automaton returns, `wc.rope_start()` is the rope truncation point.
|
||||||
|
|
||||||
|
use std::io;
|
||||||
|
|
||||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||||
|
|
||||||
// ── public entry points ───────────────────────────────────────────────────────
|
// ── public entry points ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Normalise a sequence chunk into a compact ACGT\x00-separated rope,
|
||||||
|
/// dispatching to the FASTA or FASTQ automaton based on the rope's mime type.
|
||||||
|
///
|
||||||
|
/// Returns an error if the rope carries no mime type or an unsupported one.
|
||||||
|
pub fn normalize_sequence_chunk(rope: Rope, k: usize) -> io::Result<Rope> {
|
||||||
|
match rope.mime_type() {
|
||||||
|
Some("text/fasta") => Ok(normalize_fasta_chunk(rope, k)),
|
||||||
|
Some("text/fastq") => Ok(normalize_fastq_chunk(rope, k)),
|
||||||
|
Some(other) => Err(io::Error::new(
|
||||||
|
io::ErrorKind::InvalidData,
|
||||||
|
format!("unsupported sequence format: {other}"),
|
||||||
|
)),
|
||||||
|
None => Err(io::Error::new(
|
||||||
|
io::ErrorKind::InvalidData,
|
||||||
|
"rope has no mime type",
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Normalise a FASTA chunk into a compact ACGT\x00-separated rope.
|
/// Normalise a FASTA chunk into a compact ACGT\x00-separated rope.
|
||||||
pub fn normalize_fasta_chunk(mut rope: Rope, k: usize) -> Rope {
|
pub fn normalize_fasta_chunk(mut rope: Rope, k: usize) -> Rope {
|
||||||
let end = {
|
let end = {
|
||||||
@@ -47,7 +68,17 @@ pub fn normalize_fastq_chunk(mut rope: Rope, k: usize) -> Rope {
|
|||||||
|
|
||||||
// ── FASTA automaton ───────────────────────────────────────────────────────────
|
// ── FASTA automaton ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn normalize_fasta(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) {
|
/// Normalise a FASTA chunk into a compact ACGT\x00-separated rope.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `read` - The input FASTA chunk cursor.
|
||||||
|
/// * `wc` - The output rope cursor.
|
||||||
|
/// * `k` - The k-mer size.
|
||||||
|
///
|
||||||
|
/// The k-mer size is used to determine the minimum length of a sequence
|
||||||
|
/// to be considered valid.
|
||||||
|
fn normalize_fasta(read: &ForwardCursor<'_>, wc: &mut ForwardCursor<'_>, k: usize) {
|
||||||
skip_line(read);
|
skip_line(read);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
@@ -85,6 +116,17 @@ fn normalize_fasta(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k:
|
|||||||
|
|
||||||
// ── FASTQ automaton ───────────────────────────────────────────────────────────
|
// ── FASTQ automaton ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Normalizes a FASTQ read by skipping the header and sequence lines,
|
||||||
|
/// and writing the sequence to `wc`.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `read` - The input FASTQ chunk cursor.
|
||||||
|
/// * `wc` - The output rope cursor.
|
||||||
|
/// * `k` - The k-mer size.
|
||||||
|
///
|
||||||
|
/// The k-mer size is used to determine the minimum length of a sequence
|
||||||
|
/// to be considered valid.
|
||||||
fn normalize_fastq(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) {
|
fn normalize_fastq(read: &ForwardCursor<'_>, mut wc: &mut ForwardCursor<'_>, k: usize) {
|
||||||
loop {
|
loop {
|
||||||
skip_line(read); // skip header
|
skip_line(read); // skip header
|
||||||
@@ -177,7 +219,7 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
fn make_rope(data: &[u8]) -> Rope {
|
fn make_rope(data: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(data.to_vec());
|
r.push(data.to_vec());
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
@@ -306,7 +348,7 @@ mod tests {
|
|||||||
fn multi_slice_rope() {
|
fn multi_slice_rope() {
|
||||||
let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
|
let data = make_fastq(&[b"ACGTACGT", b"TTTTTTTT"]);
|
||||||
let mid = data.len() / 2;
|
let mid = data.len() / 2;
|
||||||
let mut rope = Rope::new();
|
let mut rope = Rope::new(None);
|
||||||
rope.push(data[..mid].to_vec());
|
rope.push(data[..mid].to_vec());
|
||||||
rope.push(data[mid..].to_vec());
|
rope.push(data[mid..].to_vec());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -400,7 +442,7 @@ mod tests {
|
|||||||
fn fasta_multi_slice_rope() {
|
fn fasta_multi_slice_rope() {
|
||||||
let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
|
let data = make_fasta(&[(b"s1", b"ACGTACGT"), (b"s2", b"TTTTTTTT")]);
|
||||||
let mid = data.len() / 2;
|
let mid = data.len() / 2;
|
||||||
let mut rope = Rope::new();
|
let mut rope = Rope::new(None);
|
||||||
rope.push(data[..mid].to_vec());
|
rope.push(data[..mid].to_vec());
|
||||||
rope.push(data[mid..].to_vec());
|
rope.push(data[mid..].to_vec());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|||||||
@@ -12,8 +12,9 @@
|
|||||||
//! | `http://` or `https://` | HTTP GET via `ureq` |
|
//! | `http://` or `https://` | HTTP GET via `ureq` |
|
||||||
//! | anything else | local file; `~/` is expanded to the home directory |
|
//! | anything else | local file; `~/` is expanded to the home directory |
|
||||||
|
|
||||||
use std::io::{self, Read};
|
use crate::mimetype::MimeTypeGuesser;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::{self, Read};
|
||||||
|
|
||||||
// ── public API ────────────────────────────────────────────────────────────────
|
// ── public API ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -25,18 +26,17 @@ use std::fs::File;
|
|||||||
/// # Errors
|
/// # Errors
|
||||||
/// Returns an `io::Error` if the file cannot be opened, the URL cannot be
|
/// Returns an `io::Error` if the file cannot be opened, the URL cannot be
|
||||||
/// fetched, or the compression header is malformed.
|
/// fetched, or the compression header is malformed.
|
||||||
pub fn xopen(source: &str) -> io::Result<Box<dyn Read + Send>> {
|
pub fn xopen(source: &str) -> io::Result<MimeTypeGuesser<Box<dyn Read + Send>>> {
|
||||||
let raw: Box<dyn Read + Send> = match source {
|
let raw: Box<dyn Read + Send> = match source {
|
||||||
"-" => Box::new(io::stdin()),
|
"-" => Box::new(io::stdin()),
|
||||||
s if s.starts_with("http://") || s.starts_with("https://") => {
|
s if s.starts_with("http://") || s.starts_with("https://") => http_reader(s)?,
|
||||||
http_reader(s)?
|
|
||||||
}
|
|
||||||
path => {
|
path => {
|
||||||
let expanded = expand_tilde(path);
|
let expanded = expand_tilde(path);
|
||||||
Box::new(File::open(expanded.as_ref())?)
|
Box::new(File::open(expanded.as_ref())?)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
decompress(raw)
|
let decompressed = decompress(raw)?;
|
||||||
|
Ok(MimeTypeGuesser::new(decompressed))
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── internal helpers ──────────────────────────────────────────────────────────
|
// ── internal helpers ──────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -156,7 +156,7 @@ mod tests {
|
|||||||
use obikrope::Rope;
|
use obikrope::Rope;
|
||||||
|
|
||||||
fn make_rope(data: &[u8]) -> Rope {
|
fn make_rope(data: &[u8]) -> Rope {
|
||||||
let mut r = Rope::new();
|
let mut r = Rope::new(None);
|
||||||
r.push(data.to_vec());
|
r.push(data.to_vec());
|
||||||
r
|
r
|
||||||
}
|
}
|
||||||
@@ -210,7 +210,7 @@ mod tests {
|
|||||||
fn multi_slice_rope() {
|
fn multi_slice_rope() {
|
||||||
let data = b"ACGTACGTACGT\x00";
|
let data = b"ACGTACGTACGT\x00";
|
||||||
let mid = data.len() / 2;
|
let mid = data.len() / 2;
|
||||||
let mut rope = Rope::new();
|
let mut rope = Rope::new(None);
|
||||||
rope.push(data[..mid].to_vec());
|
rope.push(data[..mid].to_vec());
|
||||||
rope.push(data[mid..].to_vec());
|
rope.push(data[mid..].to_vec());
|
||||||
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
|
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
|
||||||
|
|||||||
Reference in New Issue
Block a user