(deps) Add regex crate and improve MIME type detection
- Added `regex` dependency to obiread crate
- Replaced manual byte checks with regex-based detection for FASTA/FASTQ formats in mimetype.rs
- Switched from `once_cell::sync::Lazy` to standard library's `std::sync::LazyLock`
- Added generic text/plain fallback detection for ASCII-compatible content
- Updated `MimeTypeGuesser::new` constructor call syntax and simplified API usage of PeekReader's header method
- Implemented `Read` trait for MimeTypeGuesser to allow transparent passthrough reading
This commit is contained in:
Generated
+13
@@ -706,6 +706,7 @@ dependencies = [
|
|||||||
"infer",
|
"infer",
|
||||||
"niffler",
|
"niffler",
|
||||||
"obikrope",
|
"obikrope",
|
||||||
|
"regex",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"ureq",
|
"ureq",
|
||||||
@@ -815,6 +816,18 @@ dependencies = [
|
|||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.12.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-automata"
|
name = "regex-automata"
|
||||||
version = "0.4.14"
|
version = "0.4.14"
|
||||||
|
|||||||
@@ -10,3 +10,4 @@ ureq = "2"
|
|||||||
tracing = "0.1.44"
|
tracing = "0.1.44"
|
||||||
tracing-subscriber = { version = "0.3.23", features = ["fmt", "env-filter"] }
|
tracing-subscriber = { version = "0.3.23", features = ["fmt", "env-filter"] }
|
||||||
infer = "0.19.0"
|
infer = "0.19.0"
|
||||||
|
regex = "1"
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
pub mod chunk;
|
pub mod chunk;
|
||||||
mod fasta;
|
mod fasta;
|
||||||
mod fastq;
|
mod fastq;
|
||||||
|
pub mod mimetype;
|
||||||
pub mod normalize;
|
pub mod normalize;
|
||||||
mod path_iterator;
|
mod path_iterator;
|
||||||
pub mod peakreader;
|
pub mod peakreader;
|
||||||
|
|||||||
+26
-14
@@ -1,25 +1,37 @@
|
|||||||
|
use std::io;
|
||||||
|
use std::sync::LazyLock;
|
||||||
|
|
||||||
use infer::Infer;
|
use infer::Infer;
|
||||||
use once_cell::sync::Lazy;
|
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
|
||||||
|
use crate::peakreader::PeekReader;
|
||||||
|
|
||||||
const BUF_SIZE: usize = 4096;
|
const BUF_SIZE: usize = 4096;
|
||||||
|
|
||||||
static RE_FASTA: Lazy<Regex> = Lazy::new(|| Regex::new(r"^>[^ ]").unwrap());
|
static RE_FASTA: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^>[^ ]").unwrap());
|
||||||
fn is_fasta(buf: &[u8]) -> bool {
|
fn is_fasta(buf: &[u8]) -> bool {
|
||||||
RE_FASTA.is_match(buf)
|
std::str::from_utf8(buf).map_or(false, |s| RE_FASTA.is_match(s))
|
||||||
}
|
}
|
||||||
|
|
||||||
static RE_FASTQ: Lazy<Regex> = Lazy::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap());
|
static RE_FASTQ: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap());
|
||||||
fn is_fastq(buf: &[u8]) -> bool {
|
fn is_fastq(buf: &[u8]) -> bool {
|
||||||
RE_FASTQ.is_match(buf)
|
std::str::from_utf8(buf).map_or(false, |s| RE_FASTQ.is_match(s))
|
||||||
}
|
}
|
||||||
|
|
||||||
static INFER: Lazy<Infer> = Lazy::new(|| {
|
fn is_text(buf: &[u8]) -> bool {
|
||||||
|
!buf.is_empty()
|
||||||
|
&& buf
|
||||||
|
.iter()
|
||||||
|
.all(|&b| b.is_ascii() && (b >= 0x20 || matches!(b, b'\t' | b'\n' | b'\r')))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Custom matchers are checked in insertion order (first added = first checked).
|
||||||
|
// Most specific formats (fastq, fasta) come before the generic text/plain fallback.
|
||||||
|
static INFER: LazyLock<Infer> = LazyLock::new(|| {
|
||||||
let mut infer = Infer::new();
|
let mut infer = Infer::new();
|
||||||
infer.add("text/fasta", "fasta", |buf| buf.starts_with(b">"));
|
infer.add("text/fastq", "fastq", is_fastq);
|
||||||
infer.add("text/fastq", "fastq", |buf| {
|
infer.add("text/fasta", "fasta", is_fasta);
|
||||||
buf.starts_with(b"@") && !buf.starts_with(b"@param,")
|
infer.add("text/plain", "txt", is_text);
|
||||||
});
|
|
||||||
infer
|
infer
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -27,16 +39,16 @@ pub struct MimeTypeGuesser<R: std::io::Read>(PeekReader<R>);
|
|||||||
|
|
||||||
impl<R: std::io::Read> MimeTypeGuesser<R> {
|
impl<R: std::io::Read> MimeTypeGuesser<R> {
|
||||||
pub fn new(reader: R) -> Self {
|
pub fn new(reader: R) -> Self {
|
||||||
Self { PeekReader::new(reader, BUF_SIZE) }
|
Self(PeekReader::new(reader, BUF_SIZE))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn mime_type(&mut self) -> Option<&'static str> {
|
pub fn mime_type(&mut self) -> Option<&'static str> {
|
||||||
let buf = self.0.header(BUF_SIZE)?;
|
let buf = self.0.header();
|
||||||
INFER.get_mime_type_for_bytes(buf).map(|kind| kind.mime_type())
|
INFER.get(buf).map(|kind| kind.mime_type())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R: std::io::Read> for MimeTypeGuesser<R> {
|
impl<R: std::io::Read> io::Read for MimeTypeGuesser<R> {
|
||||||
fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
|
fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
|
||||||
self.0.read(out)
|
self.0.read(out)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user