feat: add input file logging and optimize path traversal
Instrument the index and scatter stages with `tracing::info` to log input file paths for better runtime observability. Additionally, optimize the path iterator by replacing the `path.is_dir()` / `path.is_file()` checks with an extension-only test in `PathIter::new` and with `DirEntry::file_type()` during lazy directory expansion in `next()`, deferring metadata resolution until a path is actually opened. This eliminates unnecessary `stat()` syscalls and improves traversal performance on high-latency network filesystems such as Lustre and NFS.
This commit is contained in:
@@ -9,16 +9,25 @@ pub struct PathIter {
|
||||
|
||||
impl PathIter {
|
||||
/// Creates a new `PathIter` that will yield paths to fasta or fastq files.
|
||||
///
|
||||
/// For non-directory paths only the file extension is checked (no stat syscall).
|
||||
/// Directories are left on the stack and expanded lazily in `next()`.
|
||||
/// Invalid or missing paths are silently skipped when `open_chunks` fails later.
|
||||
pub fn new(paths: Vec<PathBuf>) -> Self {
|
||||
let mut iter = PathIter {
|
||||
dir_stack: Vec::new(),
|
||||
file_buffer: Vec::new(),
|
||||
};
|
||||
for path in paths {
|
||||
if path.is_dir() {
|
||||
iter.dir_stack.push(path);
|
||||
} else if path.is_file() && is_fasta_or_fastq(&path) {
|
||||
// Avoid stat() at construction time on network filesystems (Lustre, NFS)
|
||||
// where metadata operations can be 100s of milliseconds each.
|
||||
// Paths that look like sequence files are assumed to be files.
|
||||
// Anything else is treated as a potential directory and expanded lazily
|
||||
// in next(); read_dir errors are silently skipped.
|
||||
if is_fasta_or_fastq(&path) {
|
||||
iter.file_buffer.push(path);
|
||||
} else {
|
||||
iter.dir_stack.push(path);
|
||||
}
|
||||
}
|
||||
iter
|
||||
@@ -40,9 +49,12 @@ impl Iterator for PathIter {
|
||||
};
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
// Prefer file_type() over path.is_dir()/is_file() — on most filesystems
|
||||
// (including Lustre with DT_UNKNOWN fallback) this avoids an extra stat.
|
||||
let is_dir = entry.file_type().map(|t| t.is_dir()).unwrap_or(false);
|
||||
if is_dir {
|
||||
self.dir_stack.push(path);
|
||||
} else if path.is_file() && is_fasta_or_fastq(&path) {
|
||||
} else if is_fasta_or_fastq(&path) {
|
||||
self.file_buffer.push(path);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user