feat: add pipeline concurrency throttling and HPC build docs

Introduces a throttling mechanism, based on a counting semaphore, that limits concurrent file I/O and pipeline processing. Replaces custom path wrappers with the standardized `Throttled` types across `obikmer` and `obikpartitionner`, ensuring RAII-based resource cleanup and explicit backpressure. Additionally, documents how to redirect Cargo build artifacts to local scratch storage on HPC filesystems to prevent compilation slowdowns.
This commit is contained in:
Eric Coissac
2026-06-15 10:26:40 +02:00
parent c6ea0c53e3
commit 175ea5bbd0
7 changed files with 149 additions and 85 deletions
+22 -10
View File
@@ -6,6 +6,7 @@ use std::sync::{Arc, Mutex};
use tracing::debug;
use obipipeline::{
Pipeline, PipelineError, PipelineSender, SharedFlatFn, Stage, WorkerPool,
ThrottleGuard, throttle,
make_sink, make_source, make_transform,
};
@@ -221,16 +222,18 @@ impl KmerPartition {
debug!("partition {i}: de Bruijn graph build start — {n_src_layers} source layer(s)");
enum Pass1Data {
File(PathBuf),
File((PathBuf, ThrottleGuard)),
Batch(Vec<CanonicalKmer>),
NewKmers(Vec<CanonicalKmer>),
}
const BATCH: usize = 4096;
// Inside pool.install() this returns the per-NUMA pool size; outside
// it returns the global pool size. Both are the right value here.
let n_workers = rayon::current_num_threads().max(1);
let capacity = n_workers * 8;
let n_workers = rayon::current_num_threads().min(16).max(4);
let capacity = 2;
// At most 2 files open simultaneously: keeps n_workers-2 workers free
// for the Transform stage. Each open file monopolises one worker for the
// full duration of its read, so this must stay well below n_workers.
let max_open = 2;
let dst_filter = Arc::clone(&dst_map);
let g_shared = Arc::new(Mutex::new(GraphDeBruijn::new()));
@@ -238,15 +241,18 @@ impl KmerPartition {
let pass1_err: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
let err_cap = Arc::clone(&pass1_err);
let throttled_paths = throttle(unitig_paths.into_iter(), max_open);
let pipeline = Pipeline::new(
make_source!(Pass1Data, unitig_paths, File),
make_source!(Pass1Data, throttled_paths.map(|t| (t.item, t.guard)), File),
vec![
Stage::Flat(Arc::new(
move |data: Pass1Data,
push: &PipelineSender<Result<Pass1Data, PipelineError>>,
delta: &PipelineSender<isize>|
{
if let Pass1Data::File(path) = data {
if let Pass1Data::File((path, _guard)) = data {
// _guard is dropped at end of this block, releasing the slot.
let reader = match UnitigFileReader::open_sequential(&path) {
Ok(r) => r,
Err(e) => {
@@ -455,7 +461,7 @@ impl KmerPartition {
}
enum Pass2Data {
SrcLayer((usize, usize, PathBuf)),
SrcLayer((usize, usize, PathBuf, ThrottleGuard)),
RawBatch((usize, usize, Arc<SrcLayerData>, Vec<CanonicalKmer>)),
WriteBatch(Vec<(Option<usize>, usize, usize, u32)>),
}
@@ -477,15 +483,21 @@ impl KmerPartition {
let pass2_err: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
let err_cap2 = Arc::clone(&pass2_err);
let throttled_pass2 = throttle(pass2_items.into_iter(), max_open);
let pipeline2 = Pipeline::new(
make_source!(Pass2Data, pass2_items, SrcLayer),
make_source!(Pass2Data, throttled_pass2.map(|t| {
let (col_offset, src_n, src_layer_dir) = t.item;
(col_offset, src_n, src_layer_dir, t.guard)
}), SrcLayer),
vec![
Stage::Flat(Arc::new(
move |data: Pass2Data,
push: &PipelineSender<Result<Pass2Data, PipelineError>>,
delta: &PipelineSender<isize>|
{
if let Pass2Data::SrcLayer((col_offset, src_n, src_layer_dir)) = data {
if let Pass2Data::SrcLayer((col_offset, src_n, src_layer_dir, _guard)) = data {
// _guard is dropped at the end of this block, releasing the slot.
let reader = match UnitigFileReader::open_sequential(
&src_layer_dir.join("unitigs.bin"),
) {