feat: add pipeline concurrency throttling and HPC build docs

Introduces a counting semaphore-based throttling mechanism to limit concurrent file I/O and pipeline processing. Replaces custom path wrappers with standardized `Throttled` types across `obikmer` and `obikpartitionner`, ensuring RAII-based resource cleanup and explicit backpressure. Additionally, documents how to redirect Cargo build artifacts to local scratch storage on HPC filesystems to prevent compilation slowdowns.
This commit is contained in:
Eric Coissac
2026-06-15 10:26:40 +02:00
parent c6ea0c53e3
commit 175ea5bbd0
7 changed files with 149 additions and 85 deletions
+5
View File
@@ -1,4 +1,5 @@
mod scheduler;
pub mod throttle;
pub use scheduler::Pipe;
pub use scheduler::PipeIter;
@@ -10,6 +11,10 @@ pub use scheduler::SinkFn;
pub use scheduler::SourceFn;
pub use scheduler::Stage;
pub use scheduler::WorkerPool;
pub use throttle::Throttle;
pub use throttle::ThrottleGuard;
pub use throttle::Throttled;
pub use throttle::throttle;
/// Re-export de `crossbeam_channel::Sender` utilisé dans les macros flat transform.
/// Permet aux macros `make_flat_transform!` / `make_flat_transform_fallible!` d'utiliser
+86
View File
@@ -0,0 +1,86 @@
use std::sync::{Arc, Condvar, Mutex};
// ── Throttle ──────────────────────────────────────────────────────────────────
/// Counting semaphore: limits how many items from a source are in-flight
/// simultaneously through the Flat stage of a pipeline.
///
/// Slots are acquired in the source thread before an item is emitted, and
/// released when the corresponding `ThrottleGuard` is dropped (i.e. when the
/// Flat worker finishes processing the item). Acquisition must never happen
/// inside a worker — only in the source thread — to prevent deadlocks.
pub struct Throttle {
count: Mutex<usize>,
condvar: Condvar,
max: usize,
}
impl Throttle {
pub fn new(max: usize) -> Self {
Self { count: Mutex::new(0), condvar: Condvar::new(), max }
}
pub fn acquire(&self) {
let mut count = self.count.lock().unwrap();
while *count >= self.max {
count = self.condvar.wait(count).unwrap();
}
*count += 1;
}
fn release(&self) {
let mut count = self.count.lock().unwrap();
*count -= 1;
self.condvar.notify_one();
}
}
// ── ThrottleGuard ─────────────────────────────────────────────────────────────
/// RAII guard: releases one slot in the `Throttle` when dropped.
pub struct ThrottleGuard(Arc<Throttle>);
impl Drop for ThrottleGuard {
fn drop(&mut self) {
self.0.release();
}
}
// ── Throttled<T> ──────────────────────────────────────────────────────────────
/// An item paired with its throttle guard.
///
/// The guard keeps a slot acquired until this value is dropped. In a Flat
/// pipeline stage, carry the guard inside the worker closure until the item
/// is fully processed, then let it drop.
pub struct Throttled<T> {
pub item: T,
pub guard: ThrottleGuard,
}
// ── throttle() ────────────────────────────────────────────────────────────────
/// Wrap `source` so that at most `max_concurrent` items are emitted before
/// earlier ones have been fully processed (i.e. their `ThrottleGuard` dropped).
///
/// Acquisition blocks the source thread until a slot is available. This must
/// be called in the source thread, never inside a pipeline worker.
///
/// # Example
///
/// ```ignore
/// let throttled = obipipeline::throttle(file_paths, n_workers - 1);
/// // Use `throttled` as the pipeline source; carry `item.guard` through the
/// // Flat stage and let it drop when the file is fully read.
/// ```
pub fn throttle<I>(source: I, max_concurrent: usize) -> impl Iterator<Item = Throttled<I::Item>>
where
I: Iterator,
I::Item: Send + 'static,
{
let t = Arc::new(Throttle::new(max_concurrent));
source.map(move |item| {
t.acquire();
Throttled { item, guard: ThrottleGuard(Arc::clone(&t)) }
})
}