(deps) Add regex crate and improve MIME type detection
- Added the `regex` dependency to the obiread crate.
- Replaced manual byte checks with regex-based detection for FASTA/FASTQ formats in mimetype.rs.
- Switched from `once_cell::sync::Lazy` to the standard library's `std::sync::LazyLock`.
- Added a generic text/plain fallback detection for ASCII-compatible content.
- Updated the `MimeTypeGuesser::new` constructor call syntax and simplified usage of PeekReader's `header` method.
- Implemented the `Read` trait for `MimeTypeGuesser` to allow transparent passthrough reading.
This commit is contained in:
@@ -10,3 +10,4 @@ ureq = "2"
|
||||
tracing = "0.1.44"
|
||||
tracing-subscriber = { version = "0.3.23", features = ["fmt", "env-filter"] }
|
||||
infer = "0.19.0"
|
||||
regex = "1"
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
pub mod chunk;
|
||||
mod fasta;
|
||||
mod fastq;
|
||||
pub mod mimetype;
|
||||
pub mod normalize;
|
||||
mod path_iterator;
|
||||
pub mod peakreader;
|
||||
|
||||
+26
-14
@@ -1,25 +1,37 @@
|
||||
use std::io;
|
||||
use std::sync::LazyLock;
|
||||
|
||||
use infer::Infer;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::peakreader::PeekReader;
|
||||
|
||||
const BUF_SIZE: usize = 4096;
|
||||
|
||||
static RE_FASTA: Lazy<Regex> = Lazy::new(|| Regex::new(r"^>[^ ]").unwrap());
|
||||
static RE_FASTA: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^>[^ ]").unwrap());
|
||||
fn is_fasta(buf: &[u8]) -> bool {
|
||||
RE_FASTA.is_match(buf)
|
||||
std::str::from_utf8(buf).map_or(false, |s| RE_FASTA.is_match(s))
|
||||
}
|
||||
|
||||
static RE_FASTQ: Lazy<Regex> = Lazy::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap());
|
||||
static RE_FASTQ: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap());
|
||||
fn is_fastq(buf: &[u8]) -> bool {
|
||||
RE_FASTQ.is_match(buf)
|
||||
std::str::from_utf8(buf).map_or(false, |s| RE_FASTQ.is_match(s))
|
||||
}
|
||||
|
||||
static INFER: Lazy<Infer> = Lazy::new(|| {
|
||||
fn is_text(buf: &[u8]) -> bool {
|
||||
!buf.is_empty()
|
||||
&& buf
|
||||
.iter()
|
||||
.all(|&b| b.is_ascii() && (b >= 0x20 || matches!(b, b'\t' | b'\n' | b'\r')))
|
||||
}
|
||||
|
||||
// Custom matchers are checked in insertion order (first added = first checked).
|
||||
// Most specific formats (fastq, fasta) come before the generic text/plain fallback.
|
||||
static INFER: LazyLock<Infer> = LazyLock::new(|| {
|
||||
let mut infer = Infer::new();
|
||||
infer.add("text/fasta", "fasta", |buf| buf.starts_with(b">"));
|
||||
infer.add("text/fastq", "fastq", |buf| {
|
||||
buf.starts_with(b"@") && !buf.starts_with(b"@param,")
|
||||
});
|
||||
infer.add("text/fastq", "fastq", is_fastq);
|
||||
infer.add("text/fasta", "fasta", is_fasta);
|
||||
infer.add("text/plain", "txt", is_text);
|
||||
infer
|
||||
});
|
||||
|
||||
@@ -27,16 +39,16 @@ pub struct MimeTypeGuesser<R: std::io::Read>(PeekReader<R>);
|
||||
|
||||
impl<R: std::io::Read> MimeTypeGuesser<R> {
|
||||
pub fn new(reader: R) -> Self {
|
||||
Self { PeekReader::new(reader, BUF_SIZE) }
|
||||
Self(PeekReader::new(reader, BUF_SIZE))
|
||||
}
|
||||
|
||||
pub fn mime_type(&mut self) -> Option<&'static str> {
|
||||
let buf = self.0.header(BUF_SIZE)?;
|
||||
INFER.get_mime_type_for_bytes(buf).map(|kind| kind.mime_type())
|
||||
let buf = self.0.header();
|
||||
INFER.get(buf).map(|kind| kind.mime_type())
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: std::io::Read> for MimeTypeGuesser<R> {
|
||||
impl<R: std::io::Read> io::Read for MimeTypeGuesser<R> {
|
||||
fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
|
||||
self.0.read(out)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user