refactor: migrate pipeline stages and improve graph processing
Refactored neighbor resolution to explicitly track unvisited indices for degree-1 nodes, updated display formatting, and added timing and debug logging to the degree computation routine. Migrated the first stage of the pass-1 and pass-2 pipelines from `make_flat_transform!` closures, which returned an eagerly collected `Vec` of batches, to explicit `Stage::Flat` closures that send each batch through a `PipelineSender` as soon as it is filled, enabling backpressure-aware streaming, configurable batch processing, incremental yielding, and progress tracking via a delta channel.
This commit is contained in:
@@ -3,7 +3,11 @@ use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use obipipeline::{Pipeline, WorkerPool, make_flat_transform, make_sink, make_source, make_transform};
|
||||
use tracing::debug;
|
||||
use obipipeline::{
|
||||
Pipeline, PipelineError, PipelineSender, SharedFlatFn, Stage, WorkerPool,
|
||||
make_sink, make_source, make_transform,
|
||||
};
|
||||
|
||||
use obicompactvec::{
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||
@@ -232,23 +236,38 @@ impl KmerPartition {
|
||||
let pipeline = Pipeline::new(
|
||||
make_source!(Pass1Data, unitig_paths, File),
|
||||
vec![
|
||||
make_flat_transform!(Pass1Data, {
|
||||
move |path: PathBuf| -> Vec<Vec<CanonicalKmer>> {
|
||||
match UnitigFileReader::open_sequential(&path) {
|
||||
Err(e) => {
|
||||
*err_cap.lock().unwrap() = Some(e.to_string());
|
||||
vec![]
|
||||
Stage::Flat(Arc::new(
|
||||
move |data: Pass1Data,
|
||||
push: &PipelineSender<Result<Pass1Data, PipelineError>>,
|
||||
delta: &PipelineSender<isize>|
|
||||
{
|
||||
if let Pass1Data::File(path) = data {
|
||||
let reader = match UnitigFileReader::open_sequential(&path) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
*err_cap.lock().unwrap() = Some(e.to_string());
|
||||
delta.send(-1).ok();
|
||||
return;
|
||||
}
|
||||
};
|
||||
let mut batch: Vec<CanonicalKmer> = Vec::with_capacity(BATCH);
|
||||
let mut count: isize = 0;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
batch.push(kmer);
|
||||
if batch.len() == BATCH {
|
||||
let b = std::mem::replace(&mut batch, Vec::with_capacity(BATCH));
|
||||
push.send(Ok(Pass1Data::Batch(b))).ok();
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
Ok(reader) => {
|
||||
let kmers: Vec<CanonicalKmer> = reader
|
||||
.iter_indexed_canonical_kmers()
|
||||
.map(|(k, _, _)| k)
|
||||
.collect();
|
||||
kmers.chunks(BATCH).map(|c| c.to_vec()).collect()
|
||||
if !batch.is_empty() {
|
||||
push.send(Ok(Pass1Data::Batch(batch))).ok();
|
||||
count += 1;
|
||||
}
|
||||
delta.send(count - 1).ok();
|
||||
}
|
||||
}
|
||||
}, File, Batch),
|
||||
) as SharedFlatFn<Pass1Data>),
|
||||
make_transform!(Pass1Data, {
|
||||
move |batch: Vec<CanonicalKmer>| -> Vec<CanonicalKmer> {
|
||||
batch.into_iter()
|
||||
@@ -278,6 +297,7 @@ impl KmerPartition {
|
||||
.into_inner()
|
||||
.unwrap_or_else(|e| e.into_inner());
|
||||
let any_new = g.len() > 0;
|
||||
debug!("partition {i}: de Bruijn graph done — {} new kmers", g.len());
|
||||
|
||||
// Build new layer from de Bruijn graph if there are new kmers.
|
||||
let new_layer_idx = n_dst_layers;
|
||||
@@ -430,36 +450,52 @@ impl KmerPartition {
|
||||
let pipeline2 = Pipeline::new(
|
||||
make_source!(Pass2Data, pass2_items, SrcLayer),
|
||||
vec![
|
||||
make_flat_transform!(Pass2Data, {
|
||||
move |(col_offset, src_n, src_layer_dir): (usize, usize, PathBuf)|
|
||||
-> Vec<(usize, usize, Arc<SrcLayerData>, Vec<CanonicalKmer>)>
|
||||
Stage::Flat(Arc::new(
|
||||
move |data: Pass2Data,
|
||||
push: &PipelineSender<Result<Pass2Data, PipelineError>>,
|
||||
delta: &PipelineSender<isize>|
|
||||
{
|
||||
let reader = match UnitigFileReader::open_sequential(
|
||||
&src_layer_dir.join("unitigs.bin"),
|
||||
) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
*err_cap2.lock().unwrap() = Some(e.to_string());
|
||||
return vec![];
|
||||
if let Pass2Data::SrcLayer((col_offset, src_n, src_layer_dir)) = data {
|
||||
let reader = match UnitigFileReader::open_sequential(
|
||||
&src_layer_dir.join("unitigs.bin"),
|
||||
) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
*err_cap2.lock().unwrap() = Some(e.to_string());
|
||||
delta.send(-1).ok();
|
||||
return;
|
||||
}
|
||||
};
|
||||
let src_data = match SrcLayerData::open(&src_layer_dir, mode) {
|
||||
Ok(d) => Arc::new(d),
|
||||
Err(e) => {
|
||||
*err_cap2.lock().unwrap() = Some(e.to_string());
|
||||
delta.send(-1).ok();
|
||||
return;
|
||||
}
|
||||
};
|
||||
let mut batch: Vec<CanonicalKmer> = Vec::with_capacity(BATCH);
|
||||
let mut count: isize = 0;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
batch.push(kmer);
|
||||
if batch.len() == BATCH {
|
||||
let b = std::mem::replace(&mut batch, Vec::with_capacity(BATCH));
|
||||
push.send(Ok(Pass2Data::RawBatch((
|
||||
col_offset, src_n, Arc::clone(&src_data), b,
|
||||
)))).ok();
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
let src_data = match SrcLayerData::open(&src_layer_dir, mode) {
|
||||
Ok(d) => Arc::new(d),
|
||||
Err(e) => {
|
||||
*err_cap2.lock().unwrap() = Some(e.to_string());
|
||||
return vec![];
|
||||
if !batch.is_empty() {
|
||||
push.send(Ok(Pass2Data::RawBatch((
|
||||
col_offset, src_n, src_data, batch,
|
||||
)))).ok();
|
||||
count += 1;
|
||||
}
|
||||
};
|
||||
let all_kmers: Vec<CanonicalKmer> = reader
|
||||
.iter_indexed_canonical_kmers()
|
||||
.map(|(kmer, _, _)| kmer)
|
||||
.collect();
|
||||
all_kmers
|
||||
.chunks(BATCH)
|
||||
.map(|c| (col_offset, src_n, Arc::clone(&src_data), c.to_vec()))
|
||||
.collect()
|
||||
delta.send(count - 1).ok();
|
||||
}
|
||||
}
|
||||
}, SrcLayer, RawBatch),
|
||||
) as SharedFlatFn<Pass2Data>),
|
||||
make_transform!(Pass2Data, {
|
||||
move |(col_offset, src_n, src_data, kmers): (usize, usize, Arc<SrcLayerData>, Vec<CanonicalKmer>)|
|
||||
-> Vec<(Option<usize>, usize, usize, u32)>
|
||||
|
||||
Reference in New Issue
Block a user