refactor: replace unitig extraction with de Bruijn graph pipeline

This change replaces direct partition-based extraction with a pipeline that reconstructs a de Bruijn graph from filtered k-mers. It introduces `FilterArgs` for k-mer selection, collects filtered k-mers in parallel into a `GraphDeBruijn`, computes node degrees, and enumerates unitigs from the graph for output instead of reading pre-computed partition files.
This commit is contained in:
Eric Coissac
2026-06-04 20:55:24 +02:00
parent 3e62ffe010
commit 712a03a3a6
+40 -29
View File
@@ -1,54 +1,65 @@
use std::io::{self, BufWriter, Write};
use std::path::PathBuf;
use std::sync::Mutex;
use clap::Args;
use obidebruinj::GraphDeBruijn;
use obifastwrite::write_unitig;
use obikindex::KmerIndex;
use obikseq::set_k;
use obiskio::UnitigFileReader;
use rayon::prelude::*;
use tracing::info;
use super::predicate::FilterArgs;
#[derive(Args)]
pub struct UnitigArgs {
/// Index directory produced by the `index` command
/// Index directory
pub index: PathBuf,
#[command(flatten)]
pub filter: FilterArgs,
}
pub fn run(args: UnitigArgs) {
let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
eprintln!("error opening index: {e}");
std::process::exit(1)
std::process::exit(1);
});
let k = idx.kmer_size();
set_k(k);
let n = idx.n_partitions();
info!("dumping unitigs from {n} partitions (k={k})");
let k = idx.kmer_size();
let n = idx.n_partitions();
let n_genomes = idx.meta().genomes.len().max(1);
let use_counts = idx.meta().config.with_counts;
let stdout = Mutex::new(BufWriter::new(io::stdout()));
info!("unitig: building de Bruijn graph from {n} partition(s) (k={k})");
(0..n).into_par_iter().for_each(|i| {
let path = idx.layer_unitigs_path(i, 0);
if !path.exists() {
return;
}
let filters = args.filter.build_filters(&idx.meta().genomes);
// open_sequential: works with and without .idx (approx or exact index)
let reader = UnitigFileReader::open_sequential(&path).unwrap_or_else(|e| {
eprintln!("error opening unitigs (partition {i}): {e}");
std::process::exit(1)
});
// ── Collect all filtered kmers into a single de Bruijn graph ─────────────
let mut g = GraphDeBruijn::new();
for (j, unitig) in reader.iter_unitigs() {
let mut out = stdout.lock().unwrap();
write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| {
eprintln!("write error: {e}");
std::process::exit(1)
for i in 0..n {
idx.partition()
.iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| {
g.push(kmer);
})
.unwrap_or_else(|e| {
eprintln!("error reading partition {i}: {e}");
std::process::exit(1);
});
}
});
}
stdout.into_inner().unwrap().flush().expect("flush error");
info!("unitig: {} distinct k-mers", g.len());
g.compute_degrees();
// ── Enumerate unitigs and write as FASTA ──────────────────────────────────
let stdout = io::stdout();
let mut out = BufWriter::new(stdout.lock());
for (j, unitig) in g.iter_unitig().enumerate() {
write_unitig(&unitig, k, 0, j, &mut out).unwrap_or_else(|e| {
eprintln!("write error: {e}");
std::process::exit(1);
});
}
out.flush().expect("flush error");
}