📦 Add infer and new pipeline infrastructure
- Update Cargo.lock with dependency additions (bumpalo, byteorder, cfb, fnv, infer, js-sys, uuid, wasm-bindgen)
- Refactor obikseq::superkmer: reorder imports and improve formatting
- Add `obipipeline` crate with scheduler, error handling & macros (WIP)
- Replace obiread::expand_paths logic with PathIter and path_iterator module
- Add mimetype detection using `infer` crate via PeekReader wrapper
This commit is contained in:
@@ -9,3 +9,4 @@ niffler = { version = "2", default-features = false, features = ["gz", "bz2", "l
|
||||
ureq = "2"
|
||||
tracing = "0.1.44"
|
||||
tracing-subscriber = { version = "0.3.23", features = ["fmt", "env-filter"] }
|
||||
infer = "0.19.0"
|
||||
|
||||
@@ -1,19 +1,21 @@
|
||||
use obiread::expand_paths;
|
||||
use tracing::{info, subscriber};
|
||||
use obiread::path_iter;
|
||||
use std::env;
|
||||
use tracing::info;
|
||||
use tracing_subscriber::{EnvFilter, fmt};
|
||||
|
||||
fn main() {
|
||||
// Build a subscriber with environment-based filtering
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
|
||||
.init();
|
||||
fmt().with_env_filter(EnvFilter::from_default_env()).init();
|
||||
|
||||
info!("Expanding paths...");
|
||||
let paths = vec![
|
||||
"/home/user/data".to_string(),
|
||||
"/home/user/sample.fastq.gz".to_string(),
|
||||
];
|
||||
let files = expand_paths(&paths);
|
||||
let args: Vec<String> = env::args().skip(1).collect();
|
||||
|
||||
if args.is_empty() {
|
||||
eprintln!("Usage: cargo run -p obiread --example expand_path -- <path> [path...]");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
let files = path_iter(&args);
|
||||
for f in files {
|
||||
println!("{}", f.display());
|
||||
}
|
||||
|
||||
@@ -8,11 +8,13 @@
|
||||
pub mod chunk;
|
||||
mod fasta;
|
||||
mod fastq;
|
||||
mod list_of_files;
|
||||
pub mod normalize;
|
||||
mod path_iterator;
|
||||
pub mod peakreader;
|
||||
pub mod xopen;
|
||||
|
||||
pub use list_of_files::expand_paths;
|
||||
pub use path_iterator::{PathIter, path_iter};
|
||||
pub use peakreader::PeekReader;
|
||||
|
||||
use std::io::Read;
|
||||
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use tracing::info;
|
||||
|
||||
/// Tells whether the final component of `path` carries one of the recognised
/// FASTA/FASTQ suffixes, optionally followed by `.gz`.
///
/// The comparison is case-sensitive. Paths without a final component, or whose
/// final component is not valid UTF-8, are reported as non-matching.
fn is_fasta_or_fastq(path: &Path) -> bool {
    const SUFFIXES: [&str; 8] = [
        ".fasta",
        ".fa",
        ".fastq",
        ".fq",
        ".fasta.gz",
        ".fa.gz",
        ".fastq.gz",
        ".fq.gz",
    ];

    match path.file_name().and_then(|n| n.to_str()) {
        Some(name) => SUFFIXES.iter().any(|&suffix| name.ends_with(suffix)),
        None => false,
    }
}
|
||||
|
||||
/// Walks a directory, collecting fasta or fastq files into the output vector.
|
||||
fn walk_dir(dir: &Path, out: &mut Vec<PathBuf>) {
|
||||
if let Ok(entries) = fs::read_dir(dir) {
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
walk_dir(&path, out);
|
||||
} else if path.is_file() && is_fasta_or_fastq(&path) {
|
||||
out.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Expands a list of paths, returning a vector of `PathBuf` for fasta or fastq files.
|
||||
pub fn expand_paths(paths: &[String]) -> Vec<PathBuf> {
|
||||
let mut result = Vec::new();
|
||||
for path_str in paths {
|
||||
info!("Current step: {}", path_str);
|
||||
let path = Path::new(path_str);
|
||||
if path.is_dir() {
|
||||
walk_dir(path, &mut result);
|
||||
} else if path.is_file() && is_fasta_or_fastq(path) {
|
||||
info!("Found fasta or fastq file: {}", path_str);
|
||||
result.push(path.to_path_buf());
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
use infer::Infer;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
const BUF_SIZE: usize = 4096;
|
||||
|
||||
static RE_FASTA: Lazy<Regex> = Lazy::new(|| Regex::new(r"^>[^ ]").unwrap());
|
||||
fn is_fasta(buf: &[u8]) -> bool {
|
||||
RE_FASTA.is_match(buf)
|
||||
}
|
||||
|
||||
static RE_FASTQ: Lazy<Regex> = Lazy::new(|| Regex::new(r"^@[^ ].*\n[A-Za-z.-]+").unwrap());
|
||||
fn is_fastq(buf: &[u8]) -> bool {
|
||||
RE_FASTQ.is_match(buf)
|
||||
}
|
||||
|
||||
static INFER: Lazy<Infer> = Lazy::new(|| {
|
||||
let mut infer = Infer::new();
|
||||
infer.add("text/fasta", "fasta", |buf| buf.starts_with(b">"));
|
||||
infer.add("text/fastq", "fastq", |buf| {
|
||||
buf.starts_with(b"@") && !buf.starts_with(b"@param,")
|
||||
});
|
||||
infer
|
||||
});
|
||||
|
||||
pub struct MimeTypeGuesser<R: std::io::Read>(PeekReader<R>);
|
||||
|
||||
impl<R: std::io::Read> MimeTypeGuesser<R> {
|
||||
pub fn new(reader: R) -> Self {
|
||||
Self { PeekReader::new(reader, BUF_SIZE) }
|
||||
}
|
||||
|
||||
pub fn mime_type(&mut self) -> Option<&'static str> {
|
||||
let buf = self.0.header(BUF_SIZE)?;
|
||||
INFER.get_mime_type_for_bytes(buf).map(|kind| kind.mime_type)
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: std::io::Read> for MimeTypeGuesser<R> {
|
||||
fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
|
||||
self.0.read(out)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// An iterator that yields paths to fasta or fastq files.
|
||||
pub struct PathIter {
|
||||
dir_stack: Vec<PathBuf>, // dossiers qu'il reste à explorer
|
||||
file_buffer: Vec<PathBuf>, // fichiers déjà listés (à distribuer)
|
||||
}
|
||||
|
||||
impl PathIter {
|
||||
/// Creates a new `PathIter` that will yield paths to fasta or fastq files.
|
||||
pub fn new(paths: Vec<PathBuf>) -> Self {
|
||||
let mut iter = PathIter {
|
||||
dir_stack: Vec::new(),
|
||||
file_buffer: Vec::new(),
|
||||
};
|
||||
for path in paths {
|
||||
if path.is_dir() {
|
||||
iter.dir_stack.push(path);
|
||||
} else if path.is_file() && is_fasta_or_fastq(&path) {
|
||||
iter.file_buffer.push(path);
|
||||
}
|
||||
}
|
||||
iter
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for PathIter {
|
||||
type Item = PathBuf;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// Tant qu'on n'a pas de fichier à donner
|
||||
while self.file_buffer.is_empty() {
|
||||
// Prendre un dossier dans la pile (s'il y en a)
|
||||
let dir = self.dir_stack.pop()?;
|
||||
// Lire le dossier
|
||||
let entries = match fs::read_dir(&dir) {
|
||||
Ok(entries) => entries,
|
||||
Err(_) => continue, // si erreur (perm, etc.), on ignore ce dossier
|
||||
};
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
self.dir_stack.push(path);
|
||||
} else if path.is_file() && is_fasta_or_fastq(&path) {
|
||||
self.file_buffer.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Maintenant file_buffer n'est pas vide
|
||||
self.file_buffer.pop()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator that yields paths to fasta or fastq files.
|
||||
pub fn path_iter(paths: &[String]) -> PathIter {
|
||||
let path_bufs: Vec<PathBuf> = paths.iter().map(PathBuf::from).collect();
|
||||
PathIter::new(path_bufs)
|
||||
}
|
||||
|
||||
/// Tells whether the final component of `path` carries one of the recognised
/// FASTA/FASTQ suffixes, optionally followed by `.gz`.
///
/// The comparison is case-sensitive. Paths without a final component, or whose
/// final component is not valid UTF-8, are reported as non-matching.
fn is_fasta_or_fastq(path: &Path) -> bool {
    const SUFFIXES: [&str; 8] = [
        ".fasta",
        ".fa",
        ".fastq",
        ".fq",
        ".fasta.gz",
        ".fa.gz",
        ".fastq.gz",
        ".fq.gz",
    ];

    match path.file_name().and_then(|n| n.to_str()) {
        Some(name) => SUFFIXES.iter().any(|&suffix| name.ends_with(suffix)),
        None => false,
    }
}
|
||||
@@ -0,0 +1,47 @@
|
||||
//! PeekReader implementation.
|
||||
|
||||
use std::io::Read;
|
||||
|
||||
/// A reader that caches the first `buf_size` bytes of a stream so they can be
/// inspected with [`PeekReader::header`] and are still returned by `read`.
pub struct PeekReader<R> {
    /// Underlying stream, positioned just after the cached bytes.
    reader: R,
    /// Leading bytes read from `reader` at construction time.
    buffer: Vec<u8>,
    /// Number of cached bytes already handed out through `read`.
    pos: usize,
    /// Error hit while filling `buffer`; reported by `read` once the cached
    /// bytes have been replayed, instead of panicking in the constructor.
    pending_error: Option<std::io::Error>,
}

impl<R: Read> PeekReader<R> {
    /// Creates a new PeekReader that caches up to the first `buf_size` bytes.
    ///
    /// The stream is read eagerly. If a read error occurs, the bytes obtained
    /// before it stay in the cache and the error is returned by the first
    /// `read` call that reaches that point in the stream.
    pub fn new(mut reader: R, buf_size: usize) -> Self {
        let mut buffer = Vec::with_capacity(buf_size);
        let limit = u64::try_from(buf_size).unwrap_or(u64::MAX);
        // `read_to_end` keeps already-read bytes in `buffer` on failure.
        let pending_error = reader
            .by_ref()
            .take(limit)
            .read_to_end(&mut buffer)
            .err();

        Self {
            reader,
            buffer,
            pos: 0,
            pending_error,
        }
    }

    /// Returns the cached header bytes (possibly fewer than `buf_size` when
    /// the stream is shorter or a read error occurred).
    pub fn header(&self) -> &[u8] {
        &self.buffer
    }
}

impl<R: Read> Read for PeekReader<R> {
    /// Serves cached bytes first, then any deferred construction error, then
    /// reads directly from the underlying stream.
    fn read(&mut self, out: &mut [u8]) -> std::io::Result<usize> {
        // Invariant: `pos <= buffer.len()`, so this slice never panics.
        let cached = &self.buffer[self.pos..];
        if !cached.is_empty() {
            let n = out.len().min(cached.len());
            out[..n].copy_from_slice(&cached[..n]);
            self.pos += n;
            return Ok(n);
        }

        // Report the error at the position where the stream actually failed.
        if let Some(err) = self.pending_error.take() {
            return Err(err);
        }

        self.reader.read(out)
    }
}
|
||||
Reference in New Issue
Block a user