(deps) Add regex crate and improve MIME type detection

- Added `regex` dependency to obiread crate
- Replaced manual byte checks with regex-based detection for FASTA/FASTQ formats in mimetype.rs
- Switched from `once_cell::sync::Lazy` to the standard library's `std::sync::LazyLock`
- Added generic text/plain fallback detection for ASCII-compatible content
- Updated `MimeTypeGuesser::new` constructor call syntax and simplified API usage of PeekReader's header method
- Implemented the `Read` trait for MimeTypeGuesser to allow transparent passthrough reading
This commit is contained in:
Eric Coissac
2026-04-24 17:14:33 +02:00
parent 22951fb0e8
commit 75bf980046
4 changed files with 41 additions and 14 deletions
+1
View File
@@ -10,3 +10,4 @@ ureq = "2"
tracing = "0.1.44"
tracing-subscriber = { version = "0.3.23", features = ["fmt", "env-filter"] }
infer = "0.19.0"
regex = "1"
+1
View File
@@ -8,6 +8,7 @@
pub mod chunk;
mod fasta;
mod fastq;
pub mod mimetype;
pub mod normalize;
mod path_iterator;
pub mod peakreader;
+26 -14
View File
@@ -1,25 +1,37 @@
use std::io;
use std::sync::LazyLock;
use infer::Infer;
use once_cell::sync::Lazy;
use regex::Regex;
use crate::peakreader::PeekReader;
const BUF_SIZE: usize = 4096;
static RE_FASTA: Lazy<Regex> = Lazy::new(|| Regex::new(r"^>[^ ]").unwrap());
static RE_FASTA: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^>[^ ]").unwrap());
// NOTE(review): this view is a diff whose +/- markers were stripped, so two
// alternative body lines appear below; presumably the `from_utf8` line is the
// replacement for the direct `is_match(buf)` call — confirm against the repo.
// NOTE(review): the `from_utf8` form rejects the whole buffer if any byte is
// not valid UTF-8, including a multi-byte character cut at the end of the
// peeked window; `regex::bytes::Regex` would match on raw bytes — TODO confirm
// whether that case matters for the inputs seen here.
/// FASTA predicate: `true` when `buf` decodes as UTF-8 and `RE_FASTA` matches it.
fn is_fasta(buf: &[u8]) -> bool {
RE_FASTA.is_match(buf)
std::str::from_utf8(buf).map_or(false, |s| RE_FASTA.is_match(s))
}
static RE_FASTQ: Lazy<Regex> = Lazy::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap());
static RE_FASTQ: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap());
// NOTE(review): this view is a diff whose +/- markers were stripped, so two
// alternative body lines appear below; presumably the `from_utf8` line is the
// replacement for the direct `is_match(buf)` call — confirm against the repo.
// NOTE(review): any byte sequence that is not valid UTF-8 makes the
// `from_utf8` form return `false` before the regex is consulted — TODO confirm
// this is acceptable for headers containing non-UTF-8 bytes.
/// FASTQ predicate: `true` when `buf` decodes as UTF-8 and `RE_FASTQ` matches it.
fn is_fastq(buf: &[u8]) -> bool {
RE_FASTQ.is_match(buf)
std::str::from_utf8(buf).map_or(false, |s| RE_FASTQ.is_match(s))
}
static INFER: Lazy<Infer> = Lazy::new(|| {
/// Generic plain-text predicate used as the last-resort matcher.
///
/// Returns `true` only for a non-empty buffer in which every byte is 7-bit
/// ASCII and is either tab, line feed, carriage return, or has a value of
/// `0x20` or above. An empty buffer is never considered text.
fn is_text(buf: &[u8]) -> bool {
    if buf.is_empty() {
        return false;
    }
    buf.iter().all(|&byte| match byte {
        // The three whitespace control characters that are tolerated.
        b'\t' | b'\n' | b'\r' => true,
        // Everything else must be ASCII and not below the space character.
        _ => byte.is_ascii() && byte >= 0x20,
    })
}
// Custom matchers are checked in insertion order (first added = first checked).
// Most specific formats (fastq, fasta) come before the generic text/plain fallback.
// Lazily-built `Infer` instance carrying the custom matchers registered below.
// NOTE(review): diff markers are stripped in this view; the two closure-based
// `infer.add` calls look like the removed registrations and the three
// function-based calls like their replacements — confirm against the repo.
static INFER: LazyLock<Infer> = LazyLock::new(|| {
let mut infer = Infer::new();
infer.add("text/fasta", "fasta", |buf| buf.starts_with(b">"));
infer.add("text/fastq", "fastq", |buf| {
buf.starts_with(b"@") && !buf.starts_with(b"@param,")
});
infer.add("text/fastq", "fastq", is_fastq);
infer.add("text/fasta", "fasta", is_fasta);
// NOTE(review): the infer crate documents custom matchers as being tried
// before its built-in set, so pure-ASCII input that a built-in text matcher
// would recognise presumably reports text/plain instead — confirm intended.
infer.add("text/plain", "txt", is_text);
infer
});
@@ -27,16 +39,16 @@ pub struct MimeTypeGuesser<R: std::io::Read>(PeekReader<R>);
// Constructor and MIME query for the `PeekReader`-backed guesser.
// NOTE(review): diff markers are stripped in this view, so each method body
// shows alternative lines side by side (presumably old first, then new) —
// confirm against the repo.
impl<R: std::io::Read> MimeTypeGuesser<R> {
/// Wraps `reader` in a `PeekReader` constructed with `BUF_SIZE`.
pub fn new(reader: R) -> Self {
Self { PeekReader::new(reader, BUF_SIZE) }
Self(PeekReader::new(reader, BUF_SIZE))
}
/// Hands the bytes returned by the `PeekReader` header call to `INFER` and
/// maps a match to its MIME string.
// NOTE(review): presumably yields `None` when no registered or built-in
// matcher recognises the header — verify against the infer crate.
pub fn mime_type(&mut self) -> Option<&'static str> {
let buf = self.0.header(BUF_SIZE)?;
INFER.get_mime_type_for_bytes(buf).map(|kind| kind.mime_type())
let buf = self.0.header();
INFER.get(buf).map(|kind| kind.mime_type())
}
}
impl<R: std::io::Read> for MimeTypeGuesser<R> {
impl<R: std::io::Read> io::Read for MimeTypeGuesser<R> {
fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
self.0.read(out)
}