diff --git a/docmd/theory/kmers.md b/docmd/kmers.md similarity index 100% rename from docmd/theory/kmers.md rename to docmd/kmers.md diff --git a/src/.~lock.Synthese.docx# b/src/.~lock.Synthese.docx# new file mode 100644 index 0000000..6631e2a --- /dev/null +++ b/src/.~lock.Synthese.docx# @@ -0,0 +1 @@ +Eric Coissac,coissac,mac.lan,20.04.2026 19:13,file:///Users/coissac/Library/Application%20Support/LibreOffice/4; \ No newline at end of file diff --git a/src/Cargo.lock b/src/Cargo.lock index ce0c031..78bb33b 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -8,6 +8,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "anes" version = "0.2.1" @@ -541,6 +550,15 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "memchr" version = "2.8.0" @@ -571,6 +589,15 @@ dependencies = [ "zstd", ] +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -623,6 +650,8 @@ version = "0.1.0" dependencies = [ "niffler", "obikrope", + "tracing", + "tracing-subscriber", "ureq", ] @@ -659,6 +688,12 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + [[package]] name = "pkg-config" version = "0.3.33" @@ -724,6 +759,23 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + [[package]] name = "ring" version = "0.17.14" @@ -825,6 +877,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" @@ -909,6 +970,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + [[package]] name = "tinystr" version = "0.8.3" @@ -919,6 +989,67 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + [[package]] name = "unicode-ident" version = "1.0.24" @@ -971,6 +1102,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "walkdir" version = "2.5.0" diff --git a/src/Synthese.docx b/src/Synthese.docx new file mode 100644 index 0000000..79b1209 Binary files /dev/null and b/src/Synthese.docx differ diff --git a/src/obiread/Cargo.toml b/src/obiread/Cargo.toml index d182b93..8310c68 100644 --- a/src/obiread/Cargo.toml +++ b/src/obiread/Cargo.toml @@ -7,3 +7,5 @@ edition = "2024" obikrope = { path = "../obikrope" } niffler = { version = "2", default-features = false, features = ["gz", "bz2", "lzma", "zstd"] } ureq = "2" +tracing = "0.1.44" +tracing-subscriber = { version = "0.3.23", features = ["fmt", "env-filter"] } diff --git a/src/obiread/examples/expand_path.rs b/src/obiread/examples/expand_path.rs new file mode 100644 index 0000000..e47d89d --- /dev/null +++ b/src/obiread/examples/expand_path.rs @@ -0,0 +1,20 @@ +use obiread::expand_paths; +use tracing::{info, subscriber}; +use tracing_subscriber::{EnvFilter, fmt}; + +fn main() { + // Build a subscriber with environment-based filtering + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); + + info!("Expanding paths..."); + let paths = vec![ + "/home/user/data".to_string(), + "/home/user/sample.fastq.gz".to_string(), + ]; + let files = expand_paths(&paths); + for f in files { + println!("{}", f.display()); + } +} diff --git a/src/obiread/src/lib.rs b/src/obiread/src/lib.rs index 47f438a..5476b99 100644 --- a/src/obiread/src/lib.rs +++ b/src/obiread/src/lib.rs @@ -5,12 +5,15 @@ #![deny(missing_docs)] +pub mod chunk; mod fasta; mod fastq; -pub mod chunk; +mod list_of_files; pub mod normalize; pub mod xopen; +pub use list_of_files::expand_paths; + use std::io::Read; use chunk::SeqChunkIter; diff --git a/src/obiread/src/list_of_files.rs b/src/obiread/src/list_of_files.rs new file mode 100644 index 0000000..f87ef81 --- /dev/null +++ b/src/obiread/src/list_of_files.rs @@ -0,0 +1,47 @@ +use std::fs; +use std::path::Path; +use std::path::PathBuf; +use tracing::info; + +/// Returns true if the path ends with a fasta or fastq file extension. +fn is_fasta_or_fastq(path: &Path) -> bool { + let name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); + name.ends_with(".fasta") + || name.ends_with(".fa") + || name.ends_with(".fastq") + || name.ends_with(".fq") + || name.ends_with(".fasta.gz") + || name.ends_with(".fa.gz") + || name.ends_with(".fastq.gz") + || name.ends_with(".fq.gz") +} + +/// Walks a directory, collecting fasta or fastq files into the output vector. +fn walk_dir(dir: &Path, out: &mut Vec) { + if let Ok(entries) = fs::read_dir(dir) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + walk_dir(&path, out); + } else if path.is_file() && is_fasta_or_fastq(&path) { + out.push(path); + } + } + } +} + +/// Expands a list of paths, returning a vector of `PathBuf` for fasta or fastq files. +pub fn expand_paths(paths: &[String]) -> Vec { + let mut result = Vec::new(); + for path_str in paths { + info!("Current step: {}", path_str); + let path = Path::new(path_str); + if path.is_dir() { + walk_dir(path, &mut result); + } else if path.is_file() && is_fasta_or_fastq(path) { + info!("Found fasta or fastq file: {}", path_str); + result.push(path.to_path_buf()); + } + } + result +}