feat: add input file logging and optimize path traversal

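Instrument the index and scatter stages with `tracing::info` to log input file paths for better runtime observability. Additionally, optimize the path iterator by dropping the per-path `is_dir()`/`is_file()` checks, each of which costs a `stat()`: paths passed to the constructor are now classified by file extension only, with directory expansion (and any metadata resolution) deferred to `next()`, and directory entries are classified with `DirEntry::file_type()` instead. This eliminates unnecessary `stat()` syscalls and improves traversal performance on high-latency network filesystems like Lustre and NFS.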
This commit is contained in:
Eric Coissac
2026-05-22 11:02:10 +02:00
parent fe0832190b
commit fe2127c463
3 changed files with 21 additions and 10 deletions
+17 -5
View File
@@ -9,16 +9,25 @@ pub struct PathIter {
impl PathIter {
/// Creates a new `PathIter` that will yield paths to fasta or fastq files.
///
/// For non-directory paths only the file extension is checked (no stat syscall).
/// Directories are left on the stack and expanded lazily in `next()`.
/// Invalid or missing paths are silently skipped when `open_chunks` fails later.
pub fn new(paths: Vec<PathBuf>) -> Self {
let mut iter = PathIter {
dir_stack: Vec::new(),
file_buffer: Vec::new(),
};
for path in paths {
if path.is_dir() {
iter.dir_stack.push(path);
} else if path.is_file() && is_fasta_or_fastq(&path) {
// Avoid stat() at construction time on network filesystems (Lustre, NFS)
// where metadata operations can be 100s of milliseconds each.
// Paths that look like sequence files are assumed to be files.
// Anything else is treated as a potential directory and expanded lazily
// in next(); read_dir errors are silently skipped.
if is_fasta_or_fastq(&path) {
iter.file_buffer.push(path);
} else {
iter.dir_stack.push(path);
}
}
iter
@@ -40,9 +49,12 @@ impl Iterator for PathIter {
};
for entry in entries.flatten() {
let path = entry.path();
if path.is_dir() {
// Prefer file_type() over path.is_dir()/is_file() — on most filesystems
// (including Lustre with DT_UNKNOWN fallback) this avoids an extra stat.
let is_dir = entry.file_type().map(|t| t.is_dir()).unwrap_or(false);
if is_dir {
self.dir_stack.push(path);
} else if path.is_file() && is_fasta_or_fastq(&path) {
} else if is_fasta_or_fastq(&path) {
self.file_buffer.push(path);
}
}