From 712a03a3a63157885079954aeba014240a00044e Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 4 Jun 2026 20:55:24 +0200 Subject: [PATCH] refactor: replace unitig extraction with de Bruijn graph pipeline This change replaces direct partition-based extraction with a pipeline that reconstructs a de Bruijn graph from filtered k-mers. It introduces `FilterArgs` for k-mer selection, collects filtered k-mers in parallel into a `GraphDeBruijn`, computes node degrees, and enumerates unitigs from the graph for output instead of reading pre-computed partition files. --- src/obikmer/src/cmd/unitig.rs | 69 ++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/src/obikmer/src/cmd/unitig.rs b/src/obikmer/src/cmd/unitig.rs index 82fe6bc..6c36bb6 100644 --- a/src/obikmer/src/cmd/unitig.rs +++ b/src/obikmer/src/cmd/unitig.rs @@ -1,54 +1,65 @@ use std::io::{self, BufWriter, Write}; use std::path::PathBuf; -use std::sync::Mutex; use clap::Args; +use obidebruinj::GraphDeBruijn; use obifastwrite::write_unitig; use obikindex::KmerIndex; -use obikseq::set_k; -use obiskio::UnitigFileReader; -use rayon::prelude::*; use tracing::info; +use super::predicate::FilterArgs; + #[derive(Args)] pub struct UnitigArgs { - /// Index directory produced by the `index` command + /// Index directory pub index: PathBuf, + + #[command(flatten)] + pub filter: FilterArgs, } pub fn run(args: UnitigArgs) { let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { eprintln!("error opening index: {e}"); - std::process::exit(1) + std::process::exit(1); }); - let k = idx.kmer_size(); - set_k(k); - let n = idx.n_partitions(); - info!("dumping unitigs from {n} partitions (k={k})"); + let k = idx.kmer_size(); + let n = idx.n_partitions(); + let n_genomes = idx.meta().genomes.len().max(1); + let use_counts = idx.meta().config.with_counts; - let stdout = Mutex::new(BufWriter::new(io::stdout())); + info!("unitig: building de Bruijn graph from {n} partition(s) (k={k})"); - (0..n).into_par_iter().for_each(|i| { - let path = idx.layer_unitigs_path(i, 0); - if !path.exists() { - return; - } + let filters = args.filter.build_filters(&idx.meta().genomes); - // open_sequential: works with and without .idx (approx or exact index) - let reader = UnitigFileReader::open_sequential(&path).unwrap_or_else(|e| { - eprintln!("error opening unitigs (partition {i}): {e}"); - std::process::exit(1) - }); + // ── Collect all filtered kmers into a single de Bruijn graph ───────────── + let mut g = GraphDeBruijn::new(); - for (j, unitig) in reader.iter_unitigs() { - let mut out = stdout.lock().unwrap(); - write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| { - eprintln!("write error: {e}"); - std::process::exit(1) + for i in 0..n { + idx.partition() + .iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| { + g.push(kmer); + }) + .unwrap_or_else(|e| { + eprintln!("error reading partition {i}: {e}"); + std::process::exit(1); }); - } - }); + } - stdout.into_inner().unwrap().flush().expect("flush error"); + info!("unitig: {} distinct k-mers", g.len()); + g.compute_degrees(); + + // ── Enumerate unitigs and write as FASTA ────────────────────────────────── + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout.lock()); + + for (j, unitig) in g.iter_unitig().enumerate() { + write_unitig(&unitig, k, 0, j, &mut out).unwrap_or_else(|e| { + eprintln!("write error: {e}"); + std::process::exit(1); + }); + } + + out.flush().expect("flush error"); }