cfadf63bbc
Replace the existing chunk and Rope-based processing pipeline with a fixed-size NucPage architecture. Introduce a new nucstream module featuring buffer-pooled, in-place parsing that auto-detects and decompresses FASTA/FASTQ/GenBank inputs into normalized ACGT streams with k-mer overlap preservation. Update obikmer scatter and superkmer stages to consume NucPage iterators and cursor-based navigation, eliminating std::io::Read dependencies and optimizing memory management. Add a configurable max_open_files CLI argument and update implementation documentation to reflect the new record vs. stream reading paths.
74 lines
2.9 KiB
Rust
74 lines
2.9 KiB
Rust
//! Transparent reader for local files, HTTP/HTTPS URLs, and stdin.
|
|
//!
|
|
//! Compression is detected from the magic bytes (not the file extension),
|
|
//! so gzip, bzip2, xz and zstd files are decompressed automatically regardless
|
|
//! of whether they carry a `.gz` / `.bz2` / `.xz` / `.zst` suffix.
|
|
//!
|
|
//! # Source strings
|
|
//!
|
|
//! | Prefix | Behaviour |
|
|
//! |--------|-----------|
|
|
//! | `-` | reads from stdin |
|
|
//! | `http://` or `https://` | HTTP GET via `ureq` |
|
|
//! | anything else | local file; `~/` is expanded to the home directory |
|
|
|
|
use crate::mimetype::MimeTypeGuesser;
|
|
use std::fs::File;
|
|
use std::io::{self, Read};
|
|
|
|
// ── public API ────────────────────────────────────────────────────────────────
|
|
|
|
/// Open any source for reading, with transparent decompression.
|
|
///
|
|
/// Returns a `Box<dyn Read + Send>` that yields uncompressed bytes regardless
|
|
/// of whether the underlying source is plain text, gzip, bzip2, xz or zstd.
|
|
///
|
|
/// # Errors
|
|
/// Returns an `io::Error` if the file cannot be opened, the URL cannot be
|
|
/// fetched, or the compression header is malformed.
|
|
pub(crate) fn open_raw(source: &str) -> io::Result<Box<dyn Read + Send>> {
|
|
let raw: Box<dyn Read + Send> = match source {
|
|
"-" => Box::new(io::stdin()),
|
|
s if s.starts_with("http://") || s.starts_with("https://") => http_reader(s)?,
|
|
path => {
|
|
let expanded = expand_tilde(path);
|
|
Box::new(File::open(expanded.as_ref())?)
|
|
}
|
|
};
|
|
decompress(raw)
|
|
}
|
|
|
|
/// Open any source for reading, with transparent decompression and MIME detection.
|
|
///
|
|
/// Wraps [`open_raw`] in a [`MimeTypeGuesser`] so callers can inspect the
|
|
/// format before consuming the stream.
|
|
pub fn xopen(source: &str) -> io::Result<MimeTypeGuesser<Box<dyn Read + Send>>> {
|
|
Ok(MimeTypeGuesser::new(open_raw(source)?))
|
|
}
|
|
|
|
// ── internal helpers ──────────────────────────────────────────────────────────
|
|
|
|
fn http_reader(url: &str) -> io::Result<Box<dyn Read + Send>> {
|
|
ureq::get(url)
|
|
.call()
|
|
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))
|
|
.map(|resp| -> Box<dyn Read + Send> { Box::new(resp.into_reader()) })
|
|
}
|
|
|
|
fn decompress(raw: Box<dyn Read + Send>) -> io::Result<Box<dyn Read + Send>> {
|
|
niffler::send::get_reader(raw)
|
|
.map(|(reader, _fmt)| reader)
|
|
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e.to_string()))
|
|
}
|
|
|
|
/// Replace a leading `~/` in `path` with the value of the `HOME` environment
/// variable.
///
/// A path that does not start with `~/` (including a bare `~` or `~user/...`)
/// is returned borrowed and unchanged, and so is every path when `HOME` is
/// not set. A `HOME` value that is not valid UTF-8 is converted lossily.
fn expand_tilde(path: &str) -> std::borrow::Cow<'_, str> {
    // `tail` keeps its leading '/' so it can be appended straight after the
    // home directory without inserting a separator.
    let tail = match path.strip_prefix('~') {
        Some(rest) if rest.starts_with('/') => rest,
        _ => return std::borrow::Cow::Borrowed(path),
    };

    match std::env::var_os("HOME") {
        Some(home) => std::borrow::Cow::Owned(format!("{}{}", home.to_string_lossy(), tail)),
        None => std::borrow::Cow::Borrowed(path),
    }
}
|