From 75bf98004684e707eb1349866da15e1083b215ad Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 24 Apr 2026 17:14:33 +0200 Subject: [PATCH] (deps) Add regex crate and improve MIME type detection - Added `regex` dependency to obiread crate - Replaced manual byte checks with regex-based detection for FASTA/FASTQ formats in mimetype.rs - Switched from `once_cell::sync::Lazy` to standard library's `std::sync::LazyLock` - Added generic text/plain fallback detection for ASCII-compatible content - Updated `MimeTypeGuesser::new` constructor call syntax and simplified API usage of PeekReader's header method - Implemented the `Read` trait for MimeTypeGuesser to allow transparent passthrough reading --- src/Cargo.lock | 13 ++++++++++++ src/obiread/Cargo.toml | 1 + src/obiread/src/lib.rs | 1 + src/obiread/src/mimetype.rs | 40 ++++++++++++++++++++++++------------- 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/src/Cargo.lock b/src/Cargo.lock index 2d0becb..5b41240 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -706,6 +706,7 @@ dependencies = [ "infer", "niffler", "obikrope", + "regex", "tracing", "tracing-subscriber", "ureq", @@ -815,6 +816,18 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.4.14" diff --git a/src/obiread/Cargo.toml b/src/obiread/Cargo.toml index b982392..9d4d0b1 100644 --- a/src/obiread/Cargo.toml +++ b/src/obiread/Cargo.toml @@ -10,3 +10,4 @@ ureq = "2" tracing = "0.1.44" tracing-subscriber = { version = "0.3.23", features = ["fmt", "env-filter"] } infer = "0.19.0" +regex = "1" diff --git a/src/obiread/src/lib.rs b/src/obiread/src/lib.rs index c758579..047d0b8 100644 --- a/src/obiread/src/lib.rs +++ 
b/src/obiread/src/lib.rs @@ -8,6 +8,7 @@ pub mod chunk; mod fasta; mod fastq; +pub mod mimetype; pub mod normalize; mod path_iterator; pub mod peakreader; diff --git a/src/obiread/src/mimetype.rs b/src/obiread/src/mimetype.rs index fda9f28..78a66c7 100644 --- a/src/obiread/src/mimetype.rs +++ b/src/obiread/src/mimetype.rs @@ -1,25 +1,37 @@ +use std::io; +use std::sync::LazyLock; + use infer::Infer; -use once_cell::sync::Lazy; use regex::Regex; +use crate::peakreader::PeekReader; + const BUF_SIZE: usize = 4096; -static RE_FASTA: Lazy = Lazy::new(|| Regex::new(r"^>[^ ]").unwrap()); +static RE_FASTA: LazyLock = LazyLock::new(|| Regex::new(r"^>[^ ]").unwrap()); fn is_fasta(buf: &[u8]) -> bool { - RE_FASTA.is_match(buf) + std::str::from_utf8(buf).map_or(false, |s| RE_FASTA.is_match(s)) } -static RE_FASTQ: Lazy = Lazy::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap()); +static RE_FASTQ: LazyLock = LazyLock::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap()); fn is_fastq(buf: &[u8]) -> bool { - RE_FASTQ.is_match(buf) + std::str::from_utf8(buf).map_or(false, |s| RE_FASTQ.is_match(s)) } -static INFER: Lazy = Lazy::new(|| { +fn is_text(buf: &[u8]) -> bool { + !buf.is_empty() + && buf + .iter() + .all(|&b| b.is_ascii() && (b >= 0x20 || matches!(b, b'\t' | b'\n' | b'\r'))) +} + +// Custom matchers are checked in insertion order (first added = first checked). +// Most specific formats (fastq, fasta) come before the generic text/plain fallback. 
+static INFER: LazyLock = LazyLock::new(|| { let mut infer = Infer::new(); - infer.add("text/fasta", "fasta", |buf| buf.starts_with(b">")); - infer.add("text/fastq", "fastq", |buf| { - buf.starts_with(b"@") && !buf.starts_with(b"@param,") - }); + infer.add("text/fastq", "fastq", is_fastq); + infer.add("text/fasta", "fasta", is_fasta); + infer.add("text/plain", "txt", is_text); infer }); @@ -27,16 +39,16 @@ pub struct MimeTypeGuesser(PeekReader); impl MimeTypeGuesser { pub fn new(reader: R) -> Self { - Self { PeekReader::new(reader, BUF_SIZE) } + Self(PeekReader::new(reader, BUF_SIZE)) } pub fn mime_type(&mut self) -> Option<&'static str> { - let buf = self.0.header(BUF_SIZE)?; - INFER.get_mime_type_for_bytes(buf).map(|kind| kind.mime_type()) + let buf = self.0.header(); + INFER.get(buf).map(|kind| kind.mime_type()) } } -impl for MimeTypeGuesser { +impl io::Read for MimeTypeGuesser { fn read(&mut self, out: &mut [u8]) -> io::Result { self.0.read(out) }