♻️ refactor pipeline architecture and fix macOS memory detection

- Replace WorkerPool-based pipelines with typed `Pipe` abstraction in obipipeline
  - Introduce Pipe/PipeIter for composable, sourceless/sinkless pipelines
- Update partition and superkmer commands to use new Pipe API via make_pipe!
  - Remove Arc<Mutex<...>> patterns; simplify state management
- Fix macOS available_memory() returning 0 by falling back to half total memory in dereplicate()
- Remove unused `format: "zstd"` field from partition.meta
This commit is contained in:
Eric Coissac
2026-04-28 08:40:07 +02:00
parent 4c19882f03
commit 97e65bd831
8 changed files with 264 additions and 48 deletions
+2
View File
@@ -1,5 +1,7 @@
mod scheduler;
pub use scheduler::Pipe;
pub use scheduler::PipeIter;
pub use scheduler::Pipeline;
pub use scheduler::PipelineError;
pub use scheduler::SharedFlatFn;
+192
View File
@@ -1,6 +1,7 @@
use crossbeam_channel::{Receiver, Select, Sender, bounded};
use std::error::Error;
use std::fmt;
use std::marker::PhantomData;
use std::sync::Arc;
use std::thread;
@@ -371,6 +372,115 @@ where
}
}
// ── Pipe ──────────────────────────────────────────────────────────────────────
/// Typed, composable description of a sequence of pipeline stages.
///
/// A `Pipe<D, In, Out>` only stores data: the stage list plus the two
/// converters that bridge the caller-facing types and the carrier type `D`.
/// Nothing is executed until [`Pipe::apply`] is called, which hands back a
/// [`PipeIter`] over the `Out` values.
///
/// Two pipes sharing the same `D` can be joined with [`Pipe::then`]; the
/// resulting `Pipe` holds the concatenated stage list.
///
/// # Type parameters
/// * `D` – carrier type handled by every stage.
/// * `In` – item type accepted at the head of the pipe.
/// * `Out` – item type produced at the tail of the pipe.
pub struct Pipe<D, In, Out> {
    /// Stage list handed to `Pipeline::new` when the pipe is applied.
    stages: Vec<Stage<D>>,
    /// Converts an incoming `In` into the carrier type `D`.
    wrap: Arc<dyn Fn(In) -> D + Send + Sync>,
    /// Converts a carrier `D` into the outgoing `Out`.
    unwrap: Arc<dyn Fn(D) -> Out + Send + Sync>,
    /// Type-level marker only: a `Pipe` never stores an `In` or an `Out`.
    ///
    /// The `fn(In) -> Out` form is `Send + Sync` whatever `In` and `Out`
    /// are, so the marker never restricts the auto traits of `Pipe`
    /// (a tuple marker would make `Pipe` `!Sync` as soon as `In` or `Out`
    /// is `!Sync`, even though no such value is held).
    _phantom: PhantomData<fn(In) -> Out>,
}
impl<D, In, Out> Pipe<D, In, Out> {
    /// Assembles a `Pipe` from an explicit stage list and its two converters.
    ///
    /// * `stages` – stage list, stored as given.
    /// * `wrap` – turns an `In` into the carrier type `D`.
    /// * `unwrap` – turns a carrier `D` into an `Out`.
    ///
    /// The `make_pipe!` macro is the preferred way to build a `Pipe`.
    pub fn new(
        stages: Vec<Stage<D>>,
        wrap: Arc<dyn Fn(In) -> D + Send + Sync>,
        unwrap: Arc<dyn Fn(D) -> Out + Send + Sync>,
    ) -> Self {
        Pipe {
            stages,
            wrap,
            unwrap,
            _phantom: PhantomData,
        }
    }

    /// Chains `other` after `self`, producing a single `Pipe`.
    ///
    /// The output type of `self` must be the input type of `other`. The
    /// result keeps `self`'s `wrap`, `other`'s `unwrap`, and a stage list
    /// made of `self`'s stages followed by `other`'s, so one `.apply()` call
    /// sees the whole combined list.
    pub fn then<Next>(self, other: Pipe<D, Out, Next>) -> Pipe<D, In, Next> {
        let Pipe {
            stages: mut combined,
            wrap,
            ..
        } = self;
        let Pipe {
            stages: tail,
            unwrap,
            ..
        } = other;

        // Append in place: same resulting order as chaining both lists.
        combined.extend(tail);

        Pipe {
            stages: combined,
            wrap,
            unwrap,
            _phantom: PhantomData,
        }
    }
}
impl<D, In, Out> Pipe<D, In, Out>
where
D: Send + Sync + 'static,
In: Send + 'static,
Out: Send + 'static,
{
/// Run the pipeline in a background thread; returns an iterator over the output.
pub fn apply(
self,
input: impl Iterator<Item = In> + Send + 'static,
n_workers: usize,
capacity: usize,
) -> PipeIter<Out> {
let wrap = Arc::clone(&self.wrap);
let unwrap = Arc::clone(&self.unwrap);
let mut iter = input;
let source: SourceFn<D> = Box::new(move || match iter.next() {
Some(x) => Ok(wrap(x)),
None => Err(PipelineError::EndOfStream),
});
let (out_tx, out_rx) = bounded::<Out>(capacity);
let sink: SinkFn<D> = Box::new(move |data: D| {
out_tx.send(unwrap(data)).map_err(|_| {
PipelineError::StepError(Box::new(std::io::Error::new(
std::io::ErrorKind::BrokenPipe,
"output channel closed",
)))
})
});
let pipeline = Pipeline::new(source, self.stages, sink);
let handle = thread::spawn(move || {
WorkerPool::new(pipeline, n_workers, capacity).run();
});
PipeIter { rx: out_rx, handle: Some(handle) }
}
}
// ── PipeIter ──────────────────────────────────────────────────────────────────
/// Iterator over the output of `Pipe::apply()`.
///
/// Pairs the receiving end of the output channel with the join handle of the
/// thread that `Pipe::apply()` spawns to run the pipeline.
pub struct PipeIter<Out> {
/// Receiving end of the bounded output channel created in `Pipe::apply()`.
rx: Receiver<Out>,
/// Join handle of the pipeline thread; stored as an `Option` so that `Drop`
/// can `take()` it and join the thread.
handle: Option<thread::JoinHandle<()>>,
}
impl<Out> Iterator for PipeIter<Out> {
    type Item = Out;

    /// Blocks until the pipeline delivers its next output value.
    ///
    /// Yields `None` as soon as receiving from the output channel fails,
    /// which marks the end of the stream.
    fn next(&mut self) -> Option<Out> {
        match self.rx.recv() {
            Ok(item) => Some(item),
            Err(_) => None,
        }
    }
}
impl<Out> Drop for PipeIter<Out> {
fn drop(&mut self) {
// Drain buffered items so the scheduler can unblock if the channel is full.
while self.rx.try_recv().is_ok() {}
if let Some(h) = self.handle.take() {
let _ = h.join();
}
}
}
/// Sends `data` to stage `stage_idx`.
/// For a `Transform`, pushes a `WorkerTask::Transform`.
/// For a `Flat`, increments `flat_workers_active` and pushes a `WorkerTask::Flat`.
@@ -684,3 +794,85 @@ macro_rules! make_pipeline {
)
};
}
/// Builds a typed `Pipe<D, In, Out>` — sourceless and sinkless.
///
/// The macro expands to a `$crate::Pipe::new(..)` call whose arguments are:
/// * a `vec![..]` of stage expressions, one per stage line, in the order written;
/// * a `wrap` closure that builds `Enum::<InVariant of the first stage>(x)`
///   from an `InType` value;
/// * an `unwrap` closure that extracts the payload of
///   `Enum::<OutVariant of the last stage>` as an `OutType`.
///
/// Syntax:
/// ```ignore
/// make_pipe! {
/// MyData : InType => OutType,
/// | func : InVariant => OutVariant, // transform 1→1
/// |? func : InVariant => OutVariant, // transform 1→1 fallible
/// || func : InVariant => OutVariant, // flat transform 1→N
/// ||? func : InVariant => OutVariant, // flat transform 1→N fallible
/// }
/// ```
///
/// Each sigil forwards to a sibling macro: `|` → `make_transform!`,
/// `|?` → `make_transform_fallible!`, `||` → `make_flat_transform!`,
/// `||?` → `make_flat_transform_fallible!`.
///
/// Every stage line must end with a comma, and at least one stage is
/// required: no rule matches an empty stage list.
///
/// The macro does not check that a stage's input variant matches the output
/// variant of the stage before it.
///
/// # Panics
/// The generated `unwrap` closure reaches `unreachable!` if the value it
/// receives is not the output variant of the last stage.
#[macro_export]
macro_rules! make_pipe {
// Internal `@build` arguments, in order:
//   enum : in_ty => out_ty, first-stage input variant,
//   [accumulated stage expressions], output variant of the latest stage,
//   remaining stage lines.
//
// ── Entry: first stage `|` (1→1 transform) ───────────────────────────────
($enum:ident : $in_ty:ty => $out_ty:ty,
| $tf:tt : $fi:ident => $fo:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$crate::make_transform!($enum, $tf, $fi, $fo),], $fo, $($rest)*)
};
// ── Entry: first stage `|?` (fallible 1→1 transform) ─────────────────────
($enum:ident : $in_ty:ty => $out_ty:ty,
|? $tf:tt : $fi:ident => $fo:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$crate::make_transform_fallible!($enum, $tf, $fi, $fo),], $fo, $($rest)*)
};
// ── Entry: first stage `||` (1→N flat transform) ─────────────────────────
($enum:ident : $in_ty:ty => $out_ty:ty,
|| $tf:tt : $fi:ident => $fo:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$crate::make_flat_transform!($enum, $tf, $fi, $fo),], $fo, $($rest)*)
};
// ── Entry: first stage `||?` (fallible 1→N flat transform) ───────────────
($enum:ident : $in_ty:ty => $out_ty:ty,
||? $tf:tt : $fi:ident => $fo:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$crate::make_flat_transform_fallible!($enum, $tf, $fi, $fo),], $fo, $($rest)*)
};
// ── Accumulation: `|` — append a 1→1 transform, `$to` becomes the latest output
(@build $enum:ident : $in_ty:ty => $out_ty:ty, $fi:ident,
[$($acc:tt)*], $lo:ident,
| $tf:tt : $ti:ident => $to:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$($acc)* $crate::make_transform!($enum, $tf, $ti, $to),], $to, $($rest)*)
};
// ── Accumulation: `|?` — append a fallible 1→1 transform ─────────────────
(@build $enum:ident : $in_ty:ty => $out_ty:ty, $fi:ident,
[$($acc:tt)*], $lo:ident,
|? $tf:tt : $ti:ident => $to:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$($acc)* $crate::make_transform_fallible!($enum, $tf, $ti, $to),], $to, $($rest)*)
};
// ── Accumulation: `||` — append a 1→N flat transform ─────────────────────
(@build $enum:ident : $in_ty:ty => $out_ty:ty, $fi:ident,
[$($acc:tt)*], $lo:ident,
|| $tf:tt : $ti:ident => $to:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$($acc)* $crate::make_flat_transform!($enum, $tf, $ti, $to),], $to, $($rest)*)
};
// ── Accumulation: `||?` — append a fallible 1→N flat transform ───────────
(@build $enum:ident : $in_ty:ty => $out_ty:ty, $fi:ident,
[$($acc:tt)*], $lo:ident,
||? $tf:tt : $ti:ident => $to:ident, $($rest:tt)*) => {
$crate::make_pipe!(@build $enum : $in_ty => $out_ty, $fi,
[$($acc)* $crate::make_flat_transform_fallible!($enum, $tf, $ti, $to),], $to, $($rest)*)
};
// ── Termination: no stage lines left — emit the `Pipe::new` call ─────────
// `wrap` uses the first stage's input variant (`$fi`); `unwrap` expects the
// last stage's output variant (`$lo`).
(@build $enum:ident : $in_ty:ty => $out_ty:ty, $fi:ident,
[$($acc:tt)*], $lo:ident $(,)?) => {
$crate::Pipe::new(
vec![$($acc)*],
::std::sync::Arc::new(|x: $in_ty| $enum::$fi(x)),
::std::sync::Arc::new(|d: $enum| -> $out_ty {
if let $enum::$lo(x) = d { x }
else { ::std::unreachable!("unexpected pipeline data variant in make_pipe!") }
}),
)
};
}