feat: add pipeline concurrency throttling and HPC build docs
Introduces a throttling mechanism, based on a counting semaphore, to limit concurrent file I/O and pipeline processing. Replaces custom path wrappers with standardized `Throttled` types across `obikmer` and `obikpartitionner`, ensuring RAII-based resource cleanup and explicit backpressure. Additionally, documents how to redirect Cargo build artifacts to local scratch storage on HPC filesystems to prevent compilation slowdowns.
This commit is contained in:
@@ -6,6 +6,7 @@ use std::sync::{Arc, Mutex};
|
||||
use tracing::debug;
|
||||
use obipipeline::{
|
||||
Pipeline, PipelineError, PipelineSender, SharedFlatFn, Stage, WorkerPool,
|
||||
ThrottleGuard, throttle,
|
||||
make_sink, make_source, make_transform,
|
||||
};
|
||||
|
||||
@@ -221,16 +222,18 @@ impl KmerPartition {
|
||||
debug!("partition {i}: de Bruijn graph build start — {n_src_layers} source layer(s)");
|
||||
|
||||
enum Pass1Data {
|
||||
File(PathBuf),
|
||||
File((PathBuf, ThrottleGuard)),
|
||||
Batch(Vec<CanonicalKmer>),
|
||||
NewKmers(Vec<CanonicalKmer>),
|
||||
}
|
||||
|
||||
const BATCH: usize = 4096;
|
||||
// Inside pool.install() this returns the per-NUMA pool size; outside
|
||||
// it returns the global pool size. Both are the right value here.
|
||||
let n_workers = rayon::current_num_threads().max(1);
|
||||
let capacity = n_workers * 8;
|
||||
let n_workers = rayon::current_num_threads().min(16).max(4);
|
||||
let capacity = 2;
|
||||
// At most 2 files open simultaneously: keeps n_workers-2 workers free
|
||||
// for the Transform stage. Each open file monopolises one worker for the
|
||||
// full duration of its read, so this must stay well below n_workers.
|
||||
let max_open = 2;
|
||||
|
||||
let dst_filter = Arc::clone(&dst_map);
|
||||
let g_shared = Arc::new(Mutex::new(GraphDeBruijn::new()));
|
||||
@@ -238,15 +241,18 @@ impl KmerPartition {
|
||||
let pass1_err: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
|
||||
let err_cap = Arc::clone(&pass1_err);
|
||||
|
||||
let throttled_paths = throttle(unitig_paths.into_iter(), max_open);
|
||||
|
||||
let pipeline = Pipeline::new(
|
||||
make_source!(Pass1Data, unitig_paths, File),
|
||||
make_source!(Pass1Data, throttled_paths.map(|t| (t.item, t.guard)), File),
|
||||
vec![
|
||||
Stage::Flat(Arc::new(
|
||||
move |data: Pass1Data,
|
||||
push: &PipelineSender<Result<Pass1Data, PipelineError>>,
|
||||
delta: &PipelineSender<isize>|
|
||||
{
|
||||
if let Pass1Data::File(path) = data {
|
||||
if let Pass1Data::File((path, _guard)) = data {
|
||||
// _guard is dropped at end of this block, releasing the slot.
|
||||
let reader = match UnitigFileReader::open_sequential(&path) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
@@ -455,7 +461,7 @@ impl KmerPartition {
|
||||
}
|
||||
|
||||
enum Pass2Data {
|
||||
SrcLayer((usize, usize, PathBuf)),
|
||||
SrcLayer((usize, usize, PathBuf, ThrottleGuard)),
|
||||
RawBatch((usize, usize, Arc<SrcLayerData>, Vec<CanonicalKmer>)),
|
||||
WriteBatch(Vec<(Option<usize>, usize, usize, u32)>),
|
||||
}
|
||||
@@ -477,15 +483,21 @@ impl KmerPartition {
|
||||
let pass2_err: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
|
||||
let err_cap2 = Arc::clone(&pass2_err);
|
||||
|
||||
let throttled_pass2 = throttle(pass2_items.into_iter(), max_open);
|
||||
|
||||
let pipeline2 = Pipeline::new(
|
||||
make_source!(Pass2Data, pass2_items, SrcLayer),
|
||||
make_source!(Pass2Data, throttled_pass2.map(|t| {
|
||||
let (col_offset, src_n, src_layer_dir) = t.item;
|
||||
(col_offset, src_n, src_layer_dir, t.guard)
|
||||
}), SrcLayer),
|
||||
vec![
|
||||
Stage::Flat(Arc::new(
|
||||
move |data: Pass2Data,
|
||||
push: &PipelineSender<Result<Pass2Data, PipelineError>>,
|
||||
delta: &PipelineSender<isize>|
|
||||
{
|
||||
if let Pass2Data::SrcLayer((col_offset, src_n, src_layer_dir)) = data {
|
||||
if let Pass2Data::SrcLayer((col_offset, src_n, src_layer_dir, _guard)) = data {
|
||||
// _guard dropped at end of block, releasing the slot.
|
||||
let reader = match UnitigFileReader::open_sequential(
|
||||
&src_layer_dir.join("unitigs.bin"),
|
||||
) {
|
||||
|
||||
Reference in New Issue
Block a user