From c5bcb7b8fa26dcc0388416005f8a4df8603ec592 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 20 May 2026 21:01:16 +0200 Subject: [PATCH 01/11] feat: introduce layered MPHF indexing and partition metadata Refactors obikindex and obikpartitionner to delegate index construction to a new layered MPHF implementation. Adds resume-safe building with abundance filtering and count persistence, while introducing a PartitionMeta struct for JSON configuration persistence. Updates OKIError to wrap layer-specific errors, replaces single-path extraction with full path collection and logging, and registers new internal dependencies across the workspace. --- TODO.md | 3 + src/Cargo.lock | 9 +- src/obikindex/Cargo.toml | 9 +- src/obikindex/src/error.rs | 8 -- src/obikindex/src/index.rs | 171 +++++------------------- src/obikmer/src/cmd/index.rs | 7 +- src/obikpartitionner/Cargo.toml | 6 +- src/obikpartitionner/src/index_layer.rs | 135 +++++++++++++++++++ src/obikpartitionner/src/lib.rs | 1 + src/obikpartitionner/src/partition.rs | 73 ++-------- 10 files changed, 193 insertions(+), 229 deletions(-) create mode 100644 src/obikpartitionner/src/index_layer.rs diff --git a/TODO.md b/TODO.md index fdb0998..b67b7c3 100644 --- a/TODO.md +++ b/TODO.md @@ -1,5 +1,6 @@ ## Chose à vérifier suite à la commande index +- il faudrait lister les fichier qui vont être indexés - partition.meta ne devrait plus exister - les spectrums globaux devrait etre identifier par génome - regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome @@ -26,3 +27,5 @@ - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche - dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence. + +- status : affiche le statut de l'index diff --git a/src/Cargo.lock b/src/Cargo.lock index 858c7c1..4c1929b 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1452,17 +1452,10 @@ dependencies = [ name = "obikindex" version = "0.1.0" dependencies = [ - "cacheline-ef", - "epserde", "indicatif", - "obicompactvec", - "obidebruinj", "obikpartitionner", - "obikseq", - "obilayeredmap", "obiskio", "obisys", - "ptr_hash", "rayon", "serde", "serde_json", @@ -1501,8 +1494,10 @@ dependencies = [ "memmap2", "niffler 3.0.0", "obicompactvec", + "obidebruinj", "obikrope", "obikseq", + "obilayeredmap", "obiread", "obiskbuilder", "obiskio", diff --git a/src/obikindex/Cargo.toml b/src/obikindex/Cargo.toml index caa12d6..bf4ebd6 100644 --- a/src/obikindex/Cargo.toml +++ b/src/obikindex/Cargo.toml @@ -5,15 +5,8 @@ edition = "2024" [dependencies] obikpartitionner = { path = "../obikpartitionner" } -obikseq = { path = "../obikseq" } -obisys = { path = "../obisys" } obiskio = { path = "../obiskio" } -obidebruinj = { path = "../obidebruinj" } -obilayeredmap = { path = "../obilayeredmap" } -obicompactvec = { path = "../obicompactvec" } -cacheline-ef = "1.1" -epserde = "0.8" -ptr_hash = "1.1" +obisys = { path = "../obisys" } rayon = "1" serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs index 55b2a79..df191ed 100644 --- a/src/obikindex/src/error.rs +++ b/src/obikindex/src/error.rs @@ -2,14 +2,12 @@ use std::fmt; use std::io; use obiskio::SKError; -use obilayeredmap::OLMError; #[derive(Debug)] pub enum OKIError { Io(io::Error), Json(serde_json::Error), Partition(SKError), - Layer(OLMError), } pub type OKIResult = Result; @@ -20,7 +18,6 @@ impl fmt::Display for OKIError { OKIError::Io(e) => write!(f, "I/O error: {e}"), OKIError::Json(e) => write!(f, "JSON error: {e}"), OKIError::Partition(e) => write!(f, "partition error: {e}"), - OKIError::Layer(e) => write!(f, "layer error: {e}"), } } } @@ -31,7 +28,6 @@ impl std::error::Error for OKIError { OKIError::Io(e) => Some(e), OKIError::Json(e) => Some(e), OKIError::Partition(e) => Some(e), - OKIError::Layer(e) => Some(e), } } } @@ -47,7 +43,3 @@ impl From for OKIError { impl From for OKIError { fn from(e: SKError) -> Self { OKIError::Partition(e) } } - -impl From for OKIError { - fn from(e: OLMError) -> Self { OKIError::Layer(e) } -} diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index 37b30c1..4e6bd7c 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -3,16 +3,9 @@ use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; -use cacheline_ef::{CachelineEf, CachelineEfVec}; -use epserde::prelude::*; use indicatif::{ProgressBar, ProgressStyle}; -use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec}; -use obidebruinj::GraphDeBruijn; use obikpartitionner::KmerPartition; -use obilayeredmap::layer::Layer; -use obiskio::{SKFileMeta, SKFileReader}; use obisys::{Reporter, Stage}; -use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64}; use rayon::prelude::*; use tracing::info; @@ -20,8 +13,6 @@ use crate::error::{OKIError, OKIResult}; use crate::meta::{IndexConfig, IndexMeta}; use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED}; -type Mphf = PtrHash>, Xx64, Vec>; - pub struct KmerIndex { root_path: PathBuf, meta: IndexMeta, @@ -59,7 +50,12 @@ impl KmerIndex { pub fn open>(path: P) -> OKIResult { let root_path = path.as_ref().to_owned(); let meta = IndexMeta::read(&root_path).map_err(OKIError::Io)?; - let partition = KmerPartition::open(&root_path)?; + let partition = KmerPartition::open_with_config( + &root_path, + meta.config.kmer_size, + meta.config.minimizer_size, + meta.config.n_bits, + )?; Ok(Self { root_path, meta, partition }) } @@ -87,13 +83,10 @@ impl KmerIndex { /// Mark scatter as complete and write `scatter.done`. /// /// If no genome label was set at creation time, one is derived from - /// `first_scatter_path` (filename stripped of all extensions). - /// If `first_scatter_path` is also `None`, the label defaults to `"unknown"`. - pub fn mark_scattered(&mut self, first_scatter_path: Option<&Path>) -> OKIResult<()> { + /// the index root directory name (stripped of all extensions). + pub fn mark_scattered(&mut self) -> OKIResult<()> { if self.meta.genomes.is_empty() { - let label = first_scatter_path - .map(label_from_path) - .unwrap_or_else(|| "unknown".to_string()); + let label = label_from_path(&self.root_path); self.meta.genomes.push(label); self.meta.write(&self.root_path)?; } @@ -116,20 +109,9 @@ impl KmerIndex { Ok(()) } - /// Build the layered MPHF index for all partitions. - /// - /// Default mode (`config.with_counts = false`): set membership only. - /// With counts: count matrix per kmer. + /// Build the layered MPHF index for all partitions in parallel. /// /// Writes `index.done` upon completion. - /// Path to the unitigs file for partition `part`, layer `layer`. - pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf { - self.partition.part_dir(part) - .join("index") - .join(format!("layer_{layer}")) - .join("unitigs.bin") - } - pub fn build_layers( &self, min_ab: u32, @@ -140,12 +122,8 @@ impl KmerIndex { let n = self.partition.n_partitions(); let t = Stage::start("index"); let with_counts = self.meta.config.with_counts; - let filter_active = min_ab > 1 || max_ab.is_some(); - let need_counts = filter_active || with_counts; let total_kmers = AtomicUsize::new(0); - let partition = &self.partition; - let pb = Arc::new(Mutex::new( ProgressBar::new(n as u64).with_style( ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(), @@ -153,101 +131,19 @@ impl KmerIndex { )); (0..n).into_par_iter().for_each(|i| { - let part_dir = partition.part_dir(i); - let dedup_path = part_dir.join("dereplicated.skmer.zst"); - if !dedup_path.exists() { - return; - } - - let layer_dir = part_dir.join("index").join("layer_0"); - if layer_dir.join("mphf.bin").exists() { - return; - } - - let mphf1_opt: Option = if need_counts { - let p = part_dir.join("mphf1.bin"); - p.exists().then(|| Mphf::load_full(&p).ok()).flatten() - } else { - None - }; - - let counts1_opt: Option = if need_counts { - let p = part_dir.join("counts1.bin"); - p.exists() - .then(|| PersistentCompactIntVec::open(&p).ok()) - .flatten() - } else { - None - }; - - let mut g = GraphDeBruijn::new(); - let mut reader = SKFileReader::open(&dedup_path).unwrap_or_else(|e| { - eprintln!("error opening {}: {e}", dedup_path.display()); - std::process::exit(1); - }); - for sk in reader.iter() { - for kmer in sk.iter_canonical_kmers() { - let accept = if filter_active { - match (&mphf1_opt, &counts1_opt) { - (Some(mphf), Some(counts)) => { - let ab = counts.get(mphf.index(&kmer.raw())); - ab >= min_ab && max_ab.map_or(true, |max| ab <= max) - } - _ => true, - } - } else { - true - }; - if accept { - g.push(kmer); - } + match self.partition.build_index_layer(i, min_ab, max_ab, with_counts) { + Ok(0) => {} + Ok(n_kmers) => { + total_kmers.fetch_add(n_kmers, Ordering::Relaxed); + let pb = pb.lock().unwrap(); + pb.inc(1); + pb.set_message(format!("{i}: {n_kmers} kmers")); + } + Err(e) => { + eprintln!("error building layer for partition {i}: {e}"); + std::process::exit(1); } } - - let n_kmers = g.len(); - total_kmers.fetch_add(n_kmers, Ordering::Relaxed); - g.compute_degrees(); - - fs::create_dir_all(&layer_dir).unwrap_or_else(|e| { - eprintln!("error creating {}: {e}", layer_dir.display()); - std::process::exit(1); - }); - let mut uw = Layer::<()>::unitig_writer(&layer_dir).unwrap_or_else(|e| { - eprintln!("error creating unitig writer (partition {i}): {e}"); - std::process::exit(1); - }); - for unitig in g.iter_unitig() { - uw.write(&unitig).unwrap_or_else(|e| { - eprintln!("error writing unitig (partition {i}): {e}"); - std::process::exit(1); - }); - } - uw.close().unwrap_or_else(|e| { - eprintln!("error closing unitig writer (partition {i}): {e}"); - std::process::exit(1); - }); - - if with_counts { - Layer::::build(&layer_dir, |kmer| { - match (&mphf1_opt, &counts1_opt) { - (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())), - _ => 1, - } - }) - .unwrap_or_else(|e| { - eprintln!("error building count layer (partition {i}): {e}"); - std::process::exit(1); - }); - } else { - Layer::<()>::build(&layer_dir).unwrap_or_else(|e| { - eprintln!("error building set layer (partition {i}): {e}"); - std::process::exit(1); - }); - } - - let pb = pb.lock().unwrap(); - pb.inc(1); - pb.set_message(format!("{i}: {n_kmers} kmers")); }); pb.lock().unwrap().finish_and_clear(); @@ -258,13 +154,7 @@ impl KmerIndex { if !keep_intermediate { for i in 0..n { - let part_dir = partition.part_dir(i); - remove_if_exists(&part_dir.join("dereplicated.skmer.zst")); - remove_if_exists(&SKFileMeta::sidecar_path( - &part_dir.join("dereplicated.skmer.zst"), - )); - remove_if_exists(&part_dir.join("mphf1.bin")); - remove_if_exists(&part_dir.join("counts1.bin")); + self.partition.remove_build_artifacts(i); } } @@ -272,9 +162,16 @@ impl KmerIndex { rep.push(t.stop()); Ok(()) } + + /// Path to the unitigs file for partition `part`, layer `layer`. + pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf { + self.partition.part_dir(part) + .join("index") + .join(format!("layer_{layer}")) + .join("unitigs.bin") + } } -/// Derive a genome label from a file path: filename stripped of all extensions. fn label_from_path(path: &Path) -> String { let name = path .file_name() @@ -291,11 +188,3 @@ fn label_from_path(path: &Path) -> String { fn touch(path: &Path) -> Result<(), std::io::Error> { fs::File::create(path).map(|_| ()) } - -fn remove_if_exists(path: &Path) { - if let Err(e) = fs::remove_file(path) { - if e.kind() != std::io::ErrorKind::NotFound { - eprintln!("warning: could not remove {}: {e}", path.display()); - } - } -} diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 40349e1..089402c 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -72,7 +72,10 @@ pub fn run(args: IndexArgs) { // ── Stage 1: scatter ───────────────────────────────────────────────────── if idx.state() < IndexState::Scattered { - let first_path = args.common.inputs.first().map(PathBuf::from); + let paths: Vec<_> = args.common.seqfile_paths().collect(); + for path in &paths { + info!("indexing: {}", path.display()); + } let k = idx.kmer_size(); let level_max = args.common.level_max; let theta = args.common.theta; @@ -80,7 +83,7 @@ pub fn run(args: IndexArgs) { scatter(idx.partition_mut(), args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep); - idx.mark_scattered(first_path.as_deref()).unwrap_or_else(|e| { + idx.mark_scattered().unwrap_or_else(|e| { eprintln!("error marking scatter done: {e}"); std::process::exit(1); }); diff --git a/src/obikpartitionner/Cargo.toml b/src/obikpartitionner/Cargo.toml index 8e94ff2..d62fdf4 100644 --- a/src/obikpartitionner/Cargo.toml +++ b/src/obikpartitionner/Cargo.toml @@ -13,8 +13,10 @@ obikrope = { path = "../obikrope" } [dependencies] niffler = "3.0.0" remove_dir_all = "0.8" -obikseq = { path = "../obikseq" } -obiskio = { path = "../obiskio" } +obikseq = { path = "../obikseq" } +obiskio = { path = "../obiskio" } +obidebruinj = { path = "../obidebruinj" } +obilayeredmap = { path = "../obilayeredmap" } rayon = "1" sysinfo = "0.33" serde = { version = "1", features = ["derive"] } diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs new file mode 100644 index 0000000..a6d873a --- /dev/null +++ b/src/obikpartitionner/src/index_layer.rs @@ -0,0 +1,135 @@ +use std::fs; +use std::io; + +use cacheline_ef::{CachelineEf, CachelineEfVec}; +use epserde::prelude::*; +use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec}; +use obidebruinj::GraphDeBruijn; +use obilayeredmap::{OLMError, layer::Layer}; +use obiskio::{SKError, SKFileMeta, SKFileReader}; +use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64}; + +use crate::partition::KmerPartition; + +type Mphf = PtrHash>, Xx64, Vec>; + +fn olm_to_sk(e: OLMError) -> SKError { + match e { + OLMError::Io(io_err) => SKError::Io(io_err), + other => SKError::InvalidData { context: "layer build", detail: other.to_string() }, + } +} + +fn remove_if_exists(path: &std::path::Path) { + if let Err(e) = fs::remove_file(path) { + if e.kind() != io::ErrorKind::NotFound { + eprintln!("warning: could not remove {}: {e}", path.display()); + } + } +} + +impl KmerPartition { + /// Build the layered MPHF index for partition `i`. + /// + /// Returns the number of canonical k-mers indexed, or 0 if the partition + /// has no data or its layer was already built (resume-safe). + /// + /// Abundance filtering is applied when `min_ab > 1` or `max_ab.is_some()`, + /// using `mphf1.bin` + `counts1.bin` if they exist. + /// Count payload is stored iff `with_counts` is true. + pub fn build_index_layer( + &self, + i: usize, + min_ab: u32, + max_ab: Option, + with_counts: bool, + ) -> Result { + let part_dir = self.part_dir(i); + let dedup_path = part_dir.join("dereplicated.skmer.zst"); + if !dedup_path.exists() { + return Ok(0); + } + + let layer_dir = part_dir.join("index").join("layer_0"); + if layer_dir.join("mphf.bin").exists() { + return Ok(0); + } + + let filter_active = min_ab > 1 || max_ab.is_some(); + let need_counts = filter_active || with_counts; + + let mphf1_opt: Option = if need_counts { + let p = part_dir.join("mphf1.bin"); + p.exists().then(|| Mphf::load_full(&p).ok()).flatten() + } else { + None + }; + + let counts1_opt: Option = if need_counts { + let p = part_dir.join("counts1.bin"); + p.exists() + .then(|| PersistentCompactIntVec::open(&p).ok()) + .flatten() + } else { + None + }; + + let mut g = GraphDeBruijn::new(); + let mut reader = SKFileReader::open(&dedup_path)?; + for sk in reader.iter() { + for kmer in sk.iter_canonical_kmers() { + let accept = if filter_active { + match (&mphf1_opt, &counts1_opt) { + (Some(mphf), Some(counts)) => { + let ab = counts.get(mphf.index(&kmer.raw())); + ab >= min_ab && max_ab.map_or(true, |max| ab <= max) + } + _ => true, + } + } else { + true + }; + if accept { + g.push(kmer); + } + } + } + + let n_kmers = g.len(); + g.compute_degrees(); + + fs::create_dir_all(&layer_dir)?; + + let mut uw = Layer::<()>::unitig_writer(&layer_dir).map_err(olm_to_sk)?; + for unitig in g.iter_unitig() { + uw.write(&unitig)?; + } + uw.close()?; + + if with_counts { + Layer::::build(&layer_dir, |kmer| { + match (&mphf1_opt, &counts1_opt) { + (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())), + _ => 1, + } + }) + .map_err(olm_to_sk)?; + } else { + Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?; + } + + Ok(n_kmers) + } + + /// Remove intermediate build artifacts for partition `i`. + /// + /// Deletes `dereplicated.skmer.zst` (+ sidecar), `mphf1.bin`, `counts1.bin`. + pub fn remove_build_artifacts(&self, i: usize) { + let part_dir = self.part_dir(i); + let dedup = part_dir.join("dereplicated.skmer.zst"); + remove_if_exists(&SKFileMeta::sidecar_path(&dedup)); + remove_if_exists(&dedup); + remove_if_exists(&part_dir.join("mphf1.bin")); + remove_if_exists(&part_dir.join("counts1.bin")); + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 09fc147..bd7934f 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -1,3 +1,4 @@ +mod index_layer; mod kmer_sort; mod partition; diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs index a638f93..5740aa2 100644 --- a/src/obikpartitionner/src/partition.rs +++ b/src/obikpartitionner/src/partition.rs @@ -18,7 +18,6 @@ use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult}; use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; use rayon::prelude::*; use remove_dir_all::remove_dir_all; -use serde::{Deserialize, Serialize}; use sysinfo::System; use niffler::Level; @@ -28,18 +27,9 @@ use crate::kmer_sort::{chunk_size_from_ram, sort_unique_kmers}; type Mphf = PtrHash>, Xx64, Vec>; -const META_FILENAME: &str = "partition.meta"; const SK_EXT: &str = "skmer.zst"; pub const PARTITIONS_SUBDIR: &str = "partitions"; -#[derive(Serialize, Deserialize)] -struct PartitionMeta { - n_bits: usize, - kmer_size: usize, - minimizer_size: usize, - level: u32, -} - pub struct KmerPartition { root_path: PathBuf, n_partitions: usize, @@ -98,11 +88,15 @@ impl KmerPartition { level, closed: false, }; - partition.write_meta(n_bits)?; Ok(partition) } - pub fn open>(path: P) -> SKResult { + pub fn open_with_config>( + path: P, + kmer_size: usize, + minimizer_size: usize, + n_bits: usize, + ) -> SKResult { let root_path = path.as_ref().to_owned(); if !root_path.exists() { return Err(io::Error::new( @@ -111,22 +105,17 @@ impl KmerPartition { ) .into()); } - let meta_path = root_path.join(META_FILENAME); - let meta: PartitionMeta = - serde_json::from_reader(fs::File::open(&meta_path)?).map_err(io::Error::other)?; - - let level = level_from_u32(meta.level); - let n_partitions = 1usize << meta.n_bits; + let n_partitions = 1usize << n_bits; let writers = (0..n_partitions).map(|_| None).collect(); Ok(Self { root_path, n_partitions, - partitions_mask: (1u64 << meta.n_bits) - 1, - kmer_size: meta.kmer_size, - minimizer_size: meta.minimizer_size, + partitions_mask: (1u64 << n_bits) - 1, + kmer_size, + minimizer_size, writers, - level, - closed: true, // read-only: writing is not allowed on an opened partition + level: Level::One, + closed: true, }) } @@ -339,18 +328,6 @@ impl KmerPartition { } } - fn write_meta(&self, n_bits: usize) -> SKResult<()> { - let meta = PartitionMeta { - n_bits, - kmer_size: self.kmer_size, - minimizer_size: self.minimizer_size, - level: u32::from(self.level), - }; - let f = fs::File::create(self.root_path.join(META_FILENAME))?; - serde_json::to_writer_pretty(f, &meta).map_err(|e| io::Error::other(e))?; - Ok(()) - } - fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> { if self.writers[partition].is_none() { let dir = self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{:05}", partition)); @@ -411,32 +388,6 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize { n.next_power_of_two() as usize } -fn level_from_u32(n: u32) -> Level { - match n { - 0 => Level::Zero, - 1 => Level::One, - 2 => Level::Two, - 3 => Level::Three, - 4 => Level::Four, - 5 => Level::Five, - 6 => Level::Six, - 7 => Level::Seven, - 8 => Level::Eight, - 9 => Level::Nine, - 10 => Level::Ten, - 11 => Level::Eleven, - 12 => Level::Twelve, - 13 => Level::Thirteen, - 14 => Level::Fourteen, - 15 => Level::Fifteen, - 16 => Level::Sixteen, - 17 => Level::Seventeen, - 18 => Level::Eighteen, - 19 => Level::Nineteen, - 20 => Level::Twenty, - _ => Level::TwentyOne, - } -} /// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header. const MAX_SK_COUNT: u64 = (1 << 24) - 1; -- 2.52.0 From 7d1b62ddf3828b0a7aa566ec960b01f96b6cc87e Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 20 May 2026 21:06:27 +0200 Subject: [PATCH 02/11] refactor: replace single spectrum file with per-partition outputs Replace the single `kmer_spectrum_raw.json` output with per-partition JSON files in a `spectrums/` directory. Add a `keep_intermediate` parameter to control intermediate file cleanup, and introduce a `write_spectrum` helper for serialization. Update the completion sentinel to `count.done` and align state documentation accordingly. --- src/obikindex/src/index.rs | 34 ++++++++++++++++++---- src/obikindex/src/state.rs | 4 +-- src/obikmer/src/cmd/index.rs | 2 +- src/obikpartitionner/src/lib.rs | 2 +- src/obikpartitionner/src/partition.rs | 41 ++++++++++++++------------- 5 files changed, 53 insertions(+), 30 deletions(-) diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index 4e6bd7c..c5ba267 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -1,17 +1,18 @@ +use std::collections::BTreeMap; use std::fs; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use indicatif::{ProgressBar, ProgressStyle}; -use obikpartitionner::KmerPartition; +use obikpartitionner::{KmerPartition, KmerSpectrum}; use obisys::{Reporter, Stage}; use rayon::prelude::*; use tracing::info; use crate::error::{OKIError, OKIResult}; use crate::meta::{IndexConfig, IndexMeta}; -use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED}; +use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; pub struct KmerIndex { root_path: PathBuf, @@ -96,16 +97,37 @@ impl KmerIndex { /// Dereplicate all partitions then compute kmer counts. /// - /// Writes `kmer_spectrum_raw.json` at the index root upon completion - /// (this file doubles as the `Counted` sentinel). - pub fn dereplicate_and_count(&self, rep: &mut Reporter) -> OKIResult<()> { + /// Writes `spectrums/{label}.json` and touches `count.done` upon completion. + /// Per-partition spectrum files are removed unless `keep_intermediate` is true. + pub fn dereplicate_and_count(&self, keep_intermediate: bool, rep: &mut Reporter) -> OKIResult<()> { let t = Stage::start("dereplicate"); self.partition.dereplicate()?; rep.push(t.stop()); let t = Stage::start("count_kmer"); - self.partition.count_kmer()?; + let spectrum = self.partition.count_kmer(keep_intermediate)?; rep.push(t.stop()); + + self.write_spectrum(&spectrum)?; + touch(&self.root_path.join(SENTINEL_COUNTED))?; + Ok(()) + } + + fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> { + let label = self.meta.genomes.first().map(String::as_str).unwrap_or("unknown"); + let spectrums_dir = self.root_path.join("spectrums"); + fs::create_dir_all(&spectrums_dir)?; + let path = spectrums_dir.join(format!("{label}.json")); + let spectrum_map: BTreeMap = sp.counts + .iter() + .map(|(&c, &f)| (format!("{c:010}"), f)) + .collect(); + let f = fs::File::create(&path)?; + serde_json::to_writer_pretty( + f, + &serde_json::json!({ "f0": sp.f0, "f1": sp.f1, "spectrum": spectrum_map }), + ) + .map_err(OKIError::Json)?; Ok(()) } diff --git a/src/obikindex/src/state.rs b/src/obikindex/src/state.rs index a32d4d8..4db50ce 100644 --- a/src/obikindex/src/state.rs +++ b/src/obikindex/src/state.rs @@ -3,7 +3,7 @@ use std::path::Path; use crate::meta::META_FILENAME; pub const SENTINEL_SCATTERED: &str = "scatter.done"; -pub const SENTINEL_COUNTED: &str = "kmer_spectrum_raw.json"; +pub const SENTINEL_COUNTED: &str = "count.done"; pub const SENTINEL_INDEXED: &str = "index.done"; /// Progression state of a `KmerIndex`. @@ -17,7 +17,7 @@ pub enum IndexState { Empty, /// `scatter.done` sentinel present — all super-kmers have been routed. Scattered, - /// `kmer_spectrum_raw.json` present — dereplicate + count complete. + /// `count.done` sentinel present — dereplicate + count complete. Counted, /// `index.done` sentinel present — layered MPHF index fully built. Indexed, diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 089402c..9f9bad2 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -93,7 +93,7 @@ pub fn run(args: IndexArgs) { // ── Stage 2: dereplicate + count ───────────────────────────────────────── if idx.state() < IndexState::Counted { - idx.dereplicate_and_count(&mut rep).unwrap_or_else(|e| { + idx.dereplicate_and_count(args.keep_intermediate, &mut rep).unwrap_or_else(|e| { eprintln!("error: {e}"); std::process::exit(1); }); diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index bd7934f..2e94e56 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -2,4 +2,4 @@ mod index_layer; mod kmer_sort; mod partition; -pub use partition::{KmerPartition, PARTITIONS_SUBDIR}; +pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR}; diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs index 5740aa2..c668309 100644 --- a/src/obikpartitionner/src/partition.rs +++ b/src/obikpartitionner/src/partition.rs @@ -27,6 +27,12 @@ use crate::kmer_sort::{chunk_size_from_ram, sort_unique_kmers}; type Mphf = PtrHash>, Xx64, Vec>; +pub struct KmerSpectrum { + pub f0: u64, + pub f1: u64, + pub counts: BTreeMap, +} + const SK_EXT: &str = "skmer.zst"; pub const PARTITIONS_SUBDIR: &str = "partitions"; @@ -238,11 +244,13 @@ impl KmerPartition { /// 3. Writes a flat binary count file (`counts1.bin`, one `u32` per slot, /// memory-mapped) accumulating kmer abundances from the superkmer counts. /// 4. Persists the MPHF to `mphf1.bin` for downstream use. - /// 5. Writes a global `kmer_spectrum_raw.json` at the partition root. + /// + /// Returns the aggregated `KmerSpectrum`. Per-partition spectrum files are + /// deleted after aggregation unless `keep_partial` is true. /// /// Partitions are processed in parallel via Rayon (one task per thread). /// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously. - pub fn count_kmer(&self) -> SKResult<()> { + pub fn count_kmer(&self, keep_partial: bool) -> SKResult { let sys = System::new_all(); let available = match sys.available_memory() { 0 => sys.total_memory() / 2, @@ -282,10 +290,10 @@ impl KmerPartition { r?; } - // Aggregate per-partition spectra into a global one at the root. - let mut global_spectrum: BTreeMap = BTreeMap::new(); - let mut global_f0: u64 = 0; - let mut global_f1: u64 = 0; + // Aggregate per-partition spectra. + let mut counts: BTreeMap = BTreeMap::new(); + let mut f0: u64 = 0; + let mut f1: u64 = 0; for i in 0..self.n_partitions { let path = self.part_dir(i).join("kmer_spectrum_raw.json"); @@ -294,28 +302,21 @@ impl KmerPartition { } let v: serde_json::Value = serde_json::from_str(&fs::read_to_string(&path)?).map_err(io::Error::other)?; - global_f0 += v["f0"].as_u64().unwrap_or(0); - global_f1 += v["f1"].as_u64().unwrap_or(0); + f0 += v["f0"].as_u64().unwrap_or(0); + f1 += v["f1"].as_u64().unwrap_or(0); if let Some(obj) = v["spectrum"].as_object() { for (c_str, freq) in obj { if let (Ok(c), Some(f)) = (c_str.parse::(), freq.as_u64()) { - *global_spectrum.entry(c).or_insert(0) += f; + *counts.entry(c).or_insert(0) += f; } } } + if !keep_partial { + let _ = fs::remove_file(&path); + } } - let global_spectrum_map: BTreeMap = global_spectrum - .iter() - .map(|(&c, &f)| (format!("{c:010}"), f)) - .collect(); - serde_json::to_writer_pretty( - fs::File::create(self.root_path.join("kmer_spectrum_raw.json"))?, - &serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }), - ) - .map_err(io::Error::other)?; - - Ok(()) + Ok(KmerSpectrum { f0, f1, counts }) } // ── private ─────────────────────────────────────────────────────────────── -- 2.52.0 From bfa436ad155966e055990de98e55426819dfe181 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 20 May 2026 21:18:19 +0200 Subject: [PATCH 03/11] feat: add merge operation specs and partition progress bar Added implementation specifications for the `merge` operation, detailing parallel partition processing, I/O paths, and kmer matrix aggregation across multiple indexes. Integrated an `indicatif` progress bar into the `rayon` parallel loop to monitor processing position, throughput, ETA, and recent partition duration. --- TODO.md | 11 +++++++++++ src/obikpartitionner/src/partition.rs | 17 ++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/TODO.md b/TODO.md index b67b7c3..038bd71 100644 --- a/TODO.md +++ b/TODO.md @@ -11,6 +11,17 @@ - merge : pour construire un index à partir d'index existants - deux modes : count et presence/absence. count exige que tous les index mergés soient déjà en mode count. mode presence/absence par defaut. Si passage de mode count à mode presence/absence, par defaut presence = count >= 1. Possibilité de spécifier un seuil personnalisé. + - le merge doit se faire en parallèle sur chaque partition + - en entrée : une liste de chemins vers les index à fusionner + - en sortie : un nouvel index fusionné (option -o ) + - j'imagine comme algo: + - on copie le premier index dans le nouvel index + - on ajoute a chaque partition une matrice de count ou de presence s'il n'y en avait pas déjà. + - si besoin, on cree la colone 0 de la matrice de count ou de presence pour le genome courant + - on parcourt les partitions et les index à fusionner en parallèle + - pour chaque partition, on ajoute les kmer présents dans les index à fusionner au nouvel index + - si le kmer est déjà présent dans le nouvel index on ajoute le compte ou la presence du kmer dans la matrice de count ou de presence + - sinon, on ajoute le kmer dans une nouvelle layer - filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés - quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs index c668309..ea7f810 100644 --- a/src/obikpartitionner/src/partition.rs +++ b/src/obikpartitionner/src/partition.rs @@ -219,19 +219,34 @@ impl KmerPartition { let n_threads = rayon::current_num_threads().max(1) as u64; let available_per_thread = available / n_threads; + let pb = ProgressBar::new(self.n_partitions as u64); + pb.set_style( + ProgressStyle::with_template( + "dereplicating [{bar:40}] {pos}/{len} ({percent}%) {per_sec} eta {eta} {msg}", + ) + .unwrap() + .progress_chars("█▌░"), + ); + let results: Vec> = (0..self.n_partitions) .into_par_iter() .map(|i| { let dir = self.part_dir(i); if !dir.exists() { + pb.inc(1); return Ok(()); } let raw_path = dir.join(format!("raw.{SK_EXT}")); + let t = Instant::now(); let n_buckets = optimal_buckets(&raw_path, available_per_thread); - dereplicate_partition(&dir, level, n_buckets) + let result = dereplicate_partition(&dir, level, n_buckets); + pb.set_message(format!("last {:.0}ms", t.elapsed().as_millis())); + pb.inc(1); + result }) .collect(); + pb.finish_and_clear(); for r in results { r?; } -- 2.52.0 From e1d59fde54b735eb799d4cb141a2f489482d99a9 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 05:53:55 +0200 Subject: [PATCH 04/11] feat: add merge command to consolidate k-mer indexes Introduces a new `merge` CLI subcommand and underlying implementation to consolidate multiple pre-indexed k-mer indexes into a single output. Adds `append_column` methods to persistent bit and int matrices to enable incremental genome column expansion without rebuilding the MPHF. Includes new error variants for index readiness and configuration mismatches, adds a `--force` flag to the index command, and updates documentation and navigation structure accordingly. --- docmd/architecture/index_architecture.md | 27 +++ docmd/implementation/merge.md | 133 ++++++++++ docmd/implementation/obilayeredmap.md | 54 +++++ mkdocs.yml | 1 + src/obicompactvec/src/bitmatrix.rs | 20 ++ src/obicompactvec/src/intmatrix.rs | 20 ++ src/obikindex/src/error.rs | 16 +- src/obikindex/src/index.rs | 6 +- src/obikindex/src/lib.rs | 2 + src/obikindex/src/merge.rs | 131 ++++++++++ src/obikmer/src/cmd/index.rs | 11 +- src/obikmer/src/cmd/merge.rs | 47 ++++ src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/main.rs | 3 + src/obikpartitionner/src/lib.rs | 2 + src/obikpartitionner/src/merge_layer.rs | 294 +++++++++++++++++++++++ src/obilayeredmap/src/layer.rs | 39 +++ 17 files changed, 799 insertions(+), 8 deletions(-) create mode 100644 docmd/implementation/merge.md create mode 100644 src/obikindex/src/merge.rs create mode 100644 src/obikmer/src/cmd/merge.rs create mode 100644 src/obikpartitionner/src/merge_layer.rs diff --git a/docmd/architecture/index_architecture.md b/docmd/architecture/index_architecture.md index cbc1083..0915a9c 100644 --- a/docmd/architecture/index_architecture.md +++ b/docmd/architecture/index_architecture.md @@ -327,3 +327,30 @@ Other derivations: threshold a count matrix → binary presence matrix; union tw 2. Replace `LayerData` trait with the `DataStore` / `ColumnWeights` / `CountPartials` / `BitPartials` system. 3. Rewire `LayeredMap` to hold `LayeredStore` (or bit variant) alongside the MPHF layers. 4. Implement `PartitionedIndex` using `LayeredStore>` for data and parallel dispatch for queries. + +--- + +## Multi-genome column invariant + +### The invariant + +After any merge operation, **every layer in every partition has exactly `n_genomes` columns**, where `n_genomes` is the current total genome count recorded in `index.meta`. This applies to both `PersistentCompactIntMatrix` (Count mode) and `PersistentBitMatrix` (Presence mode). + +### How it is maintained + +The invariant is established and preserved by three coordinated operations: + +**1. Existing layers — column append.** +When merging source genome G into an existing index with `n_existing_genomes` genomes, one column is appended to every existing layer via `append_genome_column`. Slots that contain a kmer present in source G receive its count or `true`; all other slots receive 0 or `false`. After this step, every pre-existing layer has `n_existing_genomes + 1` columns. + +**2. New layers — absent columns prepended.** +If source G introduces kmers not found in any existing layer, a new layer is created for those kmers. Before appending source G's own column, `n_existing_genomes` absent columns (all-zero or all-false) are prepended — one per genome already in the index. This ensures the new layer starts at the same column count as every other layer in the partition immediately after creation. + +**3. First merge, Presence mode — `init_presence_matrix`.** +The initial single-genome index has no `presence/` directory (presence is implicit: every kmer in the index is present in genome 0). On the first merge, before appending any column for source 1, `Layer<()>::init_presence_matrix` creates `presence/col_000000.pbiv` set entirely to `true` for each existing layer. This retroactively materialises genome 0's presence column, bringing the column count from 0 to 1 so that the subsequent append for source 1 raises it to 2. + +### Why the invariant is required + +The `LayeredStore` aggregation traits (`col_weights`, `partial_bray`, `partial_jaccard`, etc.) sum contributions across all `(partition, layer)` pairs without any shape check. If one layer had fewer columns than others, its contribution would silently produce a malformed result — wrong column sums, wrong distance matrices, and incorrect genome-level statistics. + +The invariant is the precondition that makes the progressive aggregation principle correct: each level can blindly sum matrices from the level below because all matrices have the same shape. diff --git a/docmd/implementation/merge.md b/docmd/implementation/merge.md new file mode 100644 index 0000000..8a51609 --- /dev/null +++ b/docmd/implementation/merge.md @@ -0,0 +1,133 @@ +# Merge command + +## Purpose + +`obikmer merge` combines multiple existing kmer indexes into a single index. The result contains all kmers from all sources, with per-genome presence/absence or count data for every genome across every layer. + +--- + +## Modes + +```rust +pub enum MergeMode { Presence, Count } +``` + +Default mode is `Presence`. `Count` mode requires **all** source indexes to have `with_counts=true`; mixing count and non-count sources is rejected at validation. + +| Mode | Column type | Constraint | +|---|---|---| +| `Presence` | `PersistentBitMatrix` (one bit per genome per slot) | none | +| `Count` | `PersistentCompactIntMatrix` (one u32 per genome per slot) | all sources `with_counts=true` | + +--- + +## Input / output constraints + +All source indexes must satisfy: + +- `IndexState::Indexed` (fully built — `index.done` sentinel present) +- Same `kmer_size`, `minimizer_size`, `n_bits` +- If `Count` mode: all sources must have `with_counts=true` + +`--force`: if the output directory already exists, it is deleted before the merge begins. + +--- + +## Algorithm + +### 1. Validation + +Check all sources against the constraints above. Abort on any mismatch. + +### 2. Bootstrap output from first source + +Recursive file copy of `sources[0]` → `output`. This establishes the partition layout, all existing MPHFs, unitigs, and evidence files. The first source's genome is genome 0 in the output. + +### 3. For each subsequent source (parallel across partitions) + +For each partition, process one source at a time sequentially: + +**a. Classify kmers** + +Iterate all kmers in the source partition (via `UnitigFileReader` + canonical kmer iteration). For each kmer, probe the destination `LayeredMap<()>`: + +- **Hit** `(dst_layer, slot)`: record `(dst_layer, slot, value)` where `value` is the source count (Count mode) or `1` (Presence mode). +- **Miss**: add kmer to a `GraphDeBruijn` accumulator; record its value in a `HashMap>`. + +**b. Extend existing layers** + +For each existing layer in the destination partition, call `append_genome_column` once per source genome being merged. Slots that received a hit are populated with their recorded value; all other slots receive 0 (absent). + +If this is the **first merge** and operating in Presence mode, call `Layer<()>::init_presence_matrix` before appending any source column. This creates `presence/` with `col_000000.pbiv` set all-true (genome 0 is present in every slot of this layer). + +**c. Build new layer for new kmers** + +If the `GraphDeBruijn` accumulator is non-empty (misses exist), write compact unitigs from the de Bruijn graph, then call the appropriate `Layer::build` variant. Before appending the source's own column, prepend `n_existing_genomes` absent columns (value 0) — one per genome already in the index — so the column count invariant holds immediately after layer creation. + +**d. Update partition metadata** + +Write updated `meta.json` with the incremented `n_layers` if a new layer was added. + +### 4. Update index metadata + +Append the merged source's genome label(s) to `index.meta.genomes`. Write updated `index.meta`. + +--- + +## Column count invariant + +After any merge, **every layer in every partition has exactly `n_genomes` columns**, where `n_genomes` is the total genome count in the index at that point. + +This is maintained by three mechanisms: + +1. **Existing layers**: one column appended per source genome (`append_genome_column`). +2. **New layers created during merge**: `n_existing_genomes` absent columns prepended before the source's own column. +3. **First merge, Presence mode**: `init_presence_matrix` retroactively creates `presence/col_0` all-true for genome 0 before any source column is appended. + +The invariant is a precondition of the `LayeredStore` aggregation traits: `col_weights()` and all partial distance methods assume every inner store has the same column count. + +--- + +## New layer construction + +All kmers absent from the destination index — across **all** sources being merged — accumulate into a **single** `GraphDeBruijn` per partition. One new layer is built from this graph, not one layer per source. This keeps the layer count bounded and preserves compact unitig representation. + +De Bruijn reconstruction ensures that overlapping k-1 suffixes/prefixes from different source kmers are merged into minimal unitigs before MPHF construction. + +--- + +## On-disk impact + +After merging `G` genomes (1 existing + G-1 new sources): + +``` +partitions/ + part_00000/ + index/ + meta.json ← n_layers updated if new layer added + layer_0/ + mphf.bin ← unchanged (MPHF immutable) + unitigs.bin ← unchanged + evidence.bin ← unchanged + presence/ ← created on first merge (Presence mode) + meta.json {"n": N, "n_cols": G} + col_000000.pbiv ← all-true (genome 0) + col_000001.pbiv ← source 1 presence + ... + counts/ ← extended (Count mode) + meta.json {"n": N, "n_cols": G} + col_000000.pciv ← genome 0 counts (from original build) + col_000001.pciv ← source 1 counts + ... + layer_1/ ← new layer (if new kmers found) + mphf.bin + unitigs.bin + evidence.bin + presence/ or counts/ + meta.json {"n": N1, "n_cols": G} + col_000000.pbiv ← all-false (genome 0 absent from this layer) + ... + col_000001.pbiv ← source 1 presence in this new layer +``` + +The `index.meta` genome list grows from 1 entry to G entries after all sources are merged. diff --git a/docmd/implementation/obilayeredmap.md b/docmd/implementation/obilayeredmap.md index cec43a2..d5ad7d2 100644 --- a/docmd/implementation/obilayeredmap.md +++ b/docmd/implementation/obilayeredmap.md @@ -255,3 +255,57 @@ Each partition's new layer is built independently; the operation is fully parall | `obicompactvec` | payload types + aggregation traits | | `rayon 1` | parallel MPHF construction pass | | `ndarray 0.16` | aggregation output arrays | + +--- + +## Column append and merge support + +These methods extend existing layers with new genome columns without touching the MPHF. They are the building blocks of the `merge` command. + +### Matrix column append + +```rust +impl PersistentCompactIntMatrix { + pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> OLMResult<()> +} + +impl PersistentBitMatrix { + pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> OLMResult<()> +} +``` + +Both methods write a new column file (`col_NNNNNN.pciv` / `col_NNNNNN.pbiv`) and update `meta.json` to increment `n_cols`. The `value_of` closure is called once per slot (indexed 0..n) to populate the column. The matrix `n` (row count) is read from the existing `meta.json` and must not change. + +### Presence matrix initialisation + +```rust +impl Layer<()> { + pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> +} +``` + +Called on the first merge of a Presence-mode index. Creates the `presence/` subdirectory with `meta.json {"n": n_kmers, "n_cols": 1}` and `col_000000.pbiv` set entirely to `true`. This retroactively records that genome 0 (the original source) is present in every slot of this layer, satisfying the column count invariant before any new-source column is appended. + +### Layer-level genome column append + +```rust +impl Layer { + pub fn append_genome_column( + layer_dir: &Path, + value_of: impl Fn(usize) -> bool, + ) -> OLMResult<()> +} + +impl Layer { + pub fn append_genome_column( + layer_dir: &Path, + value_of: impl Fn(usize) -> u32, + ) -> OLMResult<()> +} +``` + +These delegate directly to the corresponding `PersistentBitMatrix::append_column` / `PersistentCompactIntMatrix::append_column`. They are typed at the `Layer` level to make call sites mode-aware without exposing the inner matrix path construction. + +### Why the MPHF is never rebuilt + +The MPHF (`mphf.bin`, `evidence.bin`, `unitigs.bin`) is built once from the kmer set of a layer and is immutable for the lifetime of that layer. Adding a genome column does not change the kmer set — it only adds a new data column indexed by the same slot numbers. Rebuilding the MPHF would require re-running the full construction pipeline (two sequential passes over unitigs, parallel ptr_hash construction) and would invalidate any open memory maps. Column append avoids all of this: the only disk writes are one new `.pciv`/`.pbiv` file and a single `meta.json` update. Kmers absent from a given layer are represented as zero (count) or false (presence) values in the new column — no structural change to the layer is required. diff --git a/mkdocs.yml b/mkdocs.yml index af78836..9f2ea83 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -47,6 +47,7 @@ nav: - obilayeredmap crate: implementation/obilayeredmap.md - PersistentCompactIntVec: implementation/persistent_compact_int_vec.md - PersistentBitVec: implementation/persistent_bit_vec.md + - Merge command: implementation/merge.md - Architecture: - Sequences: architecture/sequences/invariant.md - Kmer index: architecture/index_architecture.md diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index df823d1..b7305ef 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -101,6 +101,26 @@ impl PersistentBitMatrix { } } +// ── Column append ───────────────────────────────────────────────────────────── + +impl PersistentBitMatrix { + /// Append a new column to an existing matrix on disk. + /// + /// Reads `meta.json` to obtain `n` and the current column count, writes + /// `col_{n_cols:06}.pbiv` filled by `value_of(slot)`, then increments + /// `n_cols` in `meta.json`. + pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> { + let mut meta = MatrixMeta::load(dir)?; + let mut b = PersistentBitVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?; + for slot in 0..meta.n { + b.set(slot, value_of(slot)); + } + b.close()?; + meta.n_cols += 1; + meta.save(dir) + } +} + fn upper_pairs(n: usize) -> Vec<(usize, usize)> { (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect() } diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index c77a135..a9aae75 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -203,6 +203,26 @@ where m } +// ── Column append ───────────────────────────────────────────────────────────── + +impl PersistentCompactIntMatrix { + /// Append a new column to an existing matrix on disk. + /// + /// Reads `meta.json` to obtain `n` and the current column count, writes + /// `col_{n_cols:06}.pciv` filled by `value_of(slot)`, then increments + /// `n_cols` in `meta.json`. + pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> { + let mut meta = MatrixMeta::load(dir)?; + let mut b = PersistentCompactIntVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?; + for slot in 0..meta.n { + b.set(slot, value_of(slot)); + } + b.close()?; + meta.n_cols += 1; + meta.save(dir) + } +} + // ── Trait impls ─────────────────────────────────────────────────────────────── use crate::traits::{ColumnWeights, CountPartials}; diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs index df191ed..74082fe 100644 --- a/src/obikindex/src/error.rs +++ b/src/obikindex/src/error.rs @@ -8,6 +8,12 @@ pub enum OKIError { Io(io::Error), Json(serde_json::Error), Partition(SKError), + /// Source index is not in `Indexed` state. + NotIndexed(std::path::PathBuf), + /// Source indexes have incompatible configurations (k, m, n_bits). + IncompatibleConfig, + /// Count mode requested but a source index lacks count data. + MismatchedMode, } pub type OKIResult = Result; @@ -15,9 +21,12 @@ pub type OKIResult = Result; impl fmt::Display for OKIError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - OKIError::Io(e) => write!(f, "I/O error: {e}"), - OKIError::Json(e) => write!(f, "JSON error: {e}"), - OKIError::Partition(e) => write!(f, "partition error: {e}"), + OKIError::Io(e) => write!(f, "I/O error: {e}"), + OKIError::Json(e) => write!(f, "JSON error: {e}"), + OKIError::Partition(e) => write!(f, "partition error: {e}"), + OKIError::NotIndexed(p) => write!(f, "index not fully built: {}", p.display()), + OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"), + OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"), } } } @@ -28,6 +37,7 @@ impl std::error::Error for OKIError { OKIError::Io(e) => Some(e), OKIError::Json(e) => Some(e), OKIError::Partition(e) => Some(e), + _ => None, } } } diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index c5ba267..16c1154 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -15,9 +15,9 @@ use crate::meta::{IndexConfig, IndexMeta}; use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; pub struct KmerIndex { - root_path: PathBuf, - meta: IndexMeta, - partition: KmerPartition, + pub(crate) root_path: PathBuf, + pub(crate) meta: IndexMeta, + pub(crate) partition: KmerPartition, } impl KmerIndex { diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index 91a728a..4dcf4ff 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -2,8 +2,10 @@ pub mod error; pub mod meta; pub mod state; mod index; +mod merge; pub use error::{OKIError, OKIResult}; pub use index::KmerIndex; +pub use merge::MergeMode; pub use meta::{IndexConfig, IndexMeta, META_FILENAME}; pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs new file mode 100644 index 0000000..be71a02 --- /dev/null +++ b/src/obikindex/src/merge.rs @@ -0,0 +1,131 @@ +use std::fs; +use std::io; +use std::path::Path; + +use obisys::{Reporter, Stage}; +use rayon::prelude::*; +use tracing::info; + +use crate::error::{OKIError, OKIResult}; +use crate::index::KmerIndex; +use crate::meta::IndexMeta; +use crate::state::IndexState; + +pub use obikpartitionner::MergeMode; + +impl KmerIndex { + /// Merge `sources` into a new index at `output`. + /// + /// All sources must be in `Indexed` state and share the same `kmer_size`, + /// `minimizer_size`, and `n_partitions`. Count mode additionally requires + /// every source to have `with_counts = true`. + /// + /// The first source is copied to `output`, then each subsequent source is + /// merged partition-by-partition in parallel. + pub fn merge>( + output: P, + sources: &[&KmerIndex], + mode: MergeMode, + force: bool, + ) -> OKIResult { + let output = output.as_ref(); + + if sources.is_empty() { + return Err(OKIError::Io(io::Error::new( + io::ErrorKind::InvalidInput, + "merge requires at least one source index", + ))); + } + + // ── Validate ────────────────────────────────────────────────────────── + let ref0 = sources[0]; + for src in sources { + if src.state() != IndexState::Indexed { + return Err(OKIError::NotIndexed(src.root_path.clone())); + } + if src.kmer_size() != ref0.kmer_size() + || src.minimizer_size() != ref0.minimizer_size() + || src.n_partitions() != ref0.n_partitions() + { + return Err(OKIError::IncompatibleConfig); + } + if mode == MergeMode::Count && !src.meta.config.with_counts { + return Err(OKIError::MismatchedMode); + } + } + + // ── Prepare output directory ────────────────────────────────────────── + if output.exists() { + if force { + fs::remove_dir_all(output)?; + } else { + return Err(OKIError::Io(io::Error::new( + io::ErrorKind::AlreadyExists, + format!("{}: output directory already exists", output.display()), + ))); + } + } + + // ── Bootstrap: copy first source to output ──────────────────────────── + info!("copying {} → {}", sources[0].root_path.display(), output.display()); + copy_dir_all(&sources[0].root_path, output)?; + + // Rewrite index.meta with all genome labels. + let all_genomes: Vec = sources + .iter() + .flat_map(|s| s.meta.genomes.iter().cloned()) + .collect(); + let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?; + meta.genomes = all_genomes; + meta.write(output)?; + + // Open the destination index. + let dst = KmerIndex::open(output)?; + let n_partitions = dst.n_partitions(); + + // ── Merge each subsequent source partition-by-partition ─────────────── + let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec(); + if !remaining_sources.is_empty() { + let mut rep = Reporter::new(); + let t = Stage::start("merge_partitions"); + + let dst_partition = &dst.partition; + + let errors: Vec = (0..n_partitions) + .into_par_iter() + .filter_map(|i| { + let srcs: Vec<&obikpartitionner::KmerPartition> = + remaining_sources.iter().map(|s| &s.partition).collect(); + // n_dst_genomes = 1 (copied from source_0 only) + dst_partition.merge_partition(i, &srcs, mode, 1).err() + }) + .collect(); + + if let Some(e) = errors.into_iter().next() { + return Err(OKIError::Partition(e)); + } + + rep.push(t.stop()); + } + + // Re-open to get the updated state. + KmerIndex::open(output) + } +} + +// ── Directory copy ──────────────────────────────────────────────────────────── + +fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> { + fs::create_dir_all(dst)?; + for entry in fs::read_dir(src)? { + let entry = entry?; + let src_path = entry.path(); + let dst_path = dst.join(entry.file_name()); + if src_path.is_dir() { + copy_dir_all(&src_path, &dst_path)?; + } else { + fs::copy(&src_path, &dst_path)?; + } + } + Ok(()) +} diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 9f9bad2..ccfa487 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -48,20 +48,27 @@ pub fn run(args: IndexArgs) { let mut rep = Reporter::new(); // ── Open or create the index ───────────────────────────────────────────── - let mut idx = if KmerIndex::exists(&output) { + let mut idx = if KmerIndex::exists(&output) && !args.force { info!("resuming from existing index at {}", output.display()); KmerIndex::open(&output).unwrap_or_else(|e| { eprintln!("error opening index: {e}"); std::process::exit(1); }) } else { + if args.force && KmerIndex::exists(&output) { + info!("--force: removing existing index at {}", output.display()); + std::fs::remove_dir_all(&output).unwrap_or_else(|e| { + eprintln!("error removing existing index: {e}"); + std::process::exit(1); + }); + } let config = IndexConfig { kmer_size: args.common.kmer_size, minimizer_size: args.common.minimizer_size, n_bits: args.common.partition_bits, with_counts: args.with_counts, }; - KmerIndex::create(&output, config, args.label.clone(), args.force).unwrap_or_else(|e| { + KmerIndex::create(&output, config, args.label.clone(), false).unwrap_or_else(|e| { eprintln!("error creating index: {e}"); std::process::exit(1); }) diff --git a/src/obikmer/src/cmd/merge.rs b/src/obikmer/src/cmd/merge.rs new file mode 100644 index 0000000..0109617 --- /dev/null +++ b/src/obikmer/src/cmd/merge.rs @@ -0,0 +1,47 @@ +use std::path::PathBuf; + +use clap::Args; +use obikindex::{KmerIndex, MergeMode}; +use obisys::Reporter; +use tracing::info; + +#[derive(Args)] +pub struct MergeArgs { + /// Source index directories to merge + #[arg(required = true)] + pub sources: Vec, + + /// Output index directory + #[arg(short, long)] + pub output: PathBuf, + + /// Overwrite output directory if it already exists + #[arg(long, default_value_t = false)] + pub force: bool, + + /// Merge in count mode (requires all sources to have with_counts=true) + #[arg(long, default_value_t = false)] + pub with_counts: bool, +} + +pub fn run(args: MergeArgs) { + let mode = if args.with_counts { MergeMode::Count } else { MergeMode::Presence }; + + let sources: Vec = args.sources.iter().map(|p| { + info!("opening source index: {}", p.display()); + KmerIndex::open(p).unwrap_or_else(|e| { + eprintln!("error opening source index {}: {e}", p.display()); + std::process::exit(1); + }) + }).collect(); + + let source_refs: Vec<&KmerIndex> = sources.iter().collect(); + + info!("merging {} indexes → {}", sources.len(), args.output.display()); + let rep = Reporter::new(); + KmerIndex::merge(&args.output, &source_refs, mode, args.force).unwrap_or_else(|e| { + eprintln!("error merging: {e}"); + std::process::exit(1); + }); + rep.print(); +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 1d2188e..aaccdbf 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,3 +1,4 @@ pub mod index; +pub mod merge; pub mod superkmer; pub mod unitig; diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 470da38..52c5953 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -18,6 +18,8 @@ enum Commands { Superkmer(cmd::superkmer::SuperkmerArgs), /// Build the complete genome index (scatter → dereplicate → count → layered MPHF) Index(cmd::index::IndexArgs), + /// Merge multiple built indexes into one + Merge(cmd::merge::MergeArgs), /// Dump unitigs from a built index to stdout (debug) Unitig(cmd::unitig::UnitigArgs), } @@ -43,6 +45,7 @@ fn main() { match cli.command { Commands::Superkmer(args) => cmd::superkmer::run(args), Commands::Index(args) => cmd::index::run(args), + Commands::Merge(args) => cmd::merge::run(args), Commands::Unitig(args) => cmd::unitig::run(args), } diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 2e94e56..672c118 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -1,5 +1,7 @@ mod index_layer; mod kmer_sort; +mod merge_layer; mod partition; +pub use merge_layer::MergeMode; pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR}; diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs new file mode 100644 index 0000000..7d77c45 --- /dev/null +++ b/src/obikpartitionner/src/merge_layer.rs @@ -0,0 +1,294 @@ +use std::fs; +use std::io; +use std::path::{Path, PathBuf}; + +use obidebruinj::GraphDeBruijn; +use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder, + PersistentBitVecBuilder, + PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, + PersistentCompactIntVecBuilder}; +use obiskio::{SKError, SKResult, UnitigFileReader}; +use obilayeredmap::{Layer, LayeredMap, MphfLayer, OLMError}; +use obilayeredmap::meta::PartitionMeta; + +use crate::partition::KmerPartition; + +// ── MergeMode ───────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MergeMode { Presence, Count } + +// ── ColBuilder — enum dispatch to avoid trait-object boxing issues ───────────── + +enum ColBuilder { + Bit(PersistentBitVecBuilder), + Int(PersistentCompactIntVecBuilder), +} + +impl ColBuilder { + fn set_val(&mut self, slot: usize, value: u32) { + match self { + ColBuilder::Bit(b) => b.set(slot, value > 0), + ColBuilder::Int(b) => b.set(slot, value), + } + } + + fn close(self) -> SKResult<()> { + match self { + ColBuilder::Bit(b) => b.close().map_err(SKError::Io), + ColBuilder::Int(b) => b.close().map_err(SKError::Io), + } + } +} + +// ── helpers ─────────────────────────────────────────────────────────────────── + +const INDEX_SUBDIR: &str = "index"; + +fn olm_to_sk(e: OLMError) -> SKError { + match e { + OLMError::Io(e) => SKError::Io(e), + other => SKError::InvalidData { context: "merge", detail: other.to_string() }, + } +} + +fn col_path_bit(dir: &Path, col: usize) -> PathBuf { + dir.join(format!("col_{col:06}.pbiv")) +} + +fn col_path_int(dir: &Path, col: usize) -> PathBuf { + dir.join(format!("col_{col:06}.pciv")) +} + +fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> { + fs::write(dir.join("meta.json"), format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n")) +} + +// ── KmerPartition::merge_partition ──────────────────────────────────────────── + +impl KmerPartition { + /// Merge `sources` into destination partition `i`. + /// + /// `n_dst_genomes` is the number of genome columns already in the dst + /// matrices (1 after copying source_0, more for subsequent merges). + /// + /// Two-pass algorithm: + /// 1. Classify each source kmer as dst-hit or new → build de Bruijn graph + /// of new kmers → write unitigs → build MPHF for the new layer. + /// 2. Iterate source kmers again → fill per-genome column builders + /// (memory-mapped) → close → update matrix metadata. + pub fn merge_partition( + &self, + i: usize, + sources: &[&KmerPartition], + mode: MergeMode, + n_dst_genomes: usize, + ) -> SKResult<()> { + let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR); + if !dst_index_dir.exists() { + return Ok(()); + } + + let dst_map = LayeredMap::<()>::open(&dst_index_dir).map_err(olm_to_sk)?; + let n_dst_layers = dst_map.n_layers(); + let n_src = sources.len(); + + // First merge in presence mode: init presence matrices on existing layers + // (all slots true — every kmer in a layer belongs to genome_0). + if n_dst_genomes == 1 && mode == MergeMode::Presence { + for l in 0..n_dst_layers { + let layer_dir = dst_index_dir.join(format!("layer_{l}")); + Layer::<()>::init_presence_matrix(&layer_dir, dst_map.layer(l).n()) + .map_err(olm_to_sk)?; + } + } + + // ── Pass 1: classify kmers, build new-kmer de Bruijn graph ─────────── + let mut g = GraphDeBruijn::new(); + let mut any_new = false; + + for src in sources.iter() { + let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR); + if !src_index_dir.exists() { continue; } + let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?; + + for l in 0..src_meta.n_layers { + let unitigs_path = src_index_dir + .join(format!("layer_{l}")).join("unitigs.bin"); + let reader = UnitigFileReader::open(&unitigs_path)?; + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if dst_map.query(kmer).is_none() { + g.push(kmer); + any_new = true; + } + } + } + } + + // Build new layer from de Bruijn graph if there are new kmers. + let new_layer_idx = n_dst_layers; + let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}")); + + if any_new { + g.compute_degrees(); + fs::create_dir_all(&new_layer_dir)?; + let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?; + for unitig in g.iter_unitig() { + uw.write(&unitig)?; + } + uw.close()?; + Layer::<()>::build(&new_layer_dir).map_err(olm_to_sk)?; + } + drop(g); + + let new_mphf = if any_new { + Some(MphfLayer::open(&new_layer_dir).map_err(olm_to_sk)?) + } else { + None + }; + let n_new = new_mphf.as_ref().map_or(0, |m| m.n()); + + // ── Prepare matrix directories for the new layer ────────────────────── + // Absent columns (dst genomes) are written now via append_column (all-zero/false). + // Source-genome columns are created as mutable builders for pass 2. + let mut new_src_builders: Vec = if any_new { + let data_dir = match mode { + MergeMode::Presence => new_layer_dir.join("presence"), + MergeMode::Count => new_layer_dir.join("counts"), + }; + fs::create_dir_all(&data_dir)?; + // Bootstrap meta with 0 cols. + match mode { + MergeMode::Presence => { + PersistentBitMatrixBuilder::new(n_new, &data_dir) + .map_err(SKError::Io)?.close().map_err(SKError::Io)?; + for _ in 0..n_dst_genomes { + PersistentBitMatrix::append_column(&data_dir, |_| false) + .map_err(SKError::Io)?; + } + (0..n_src).map(|g| -> SKResult { + let b = PersistentBitVecBuilder::new( + n_new, &col_path_bit(&data_dir, n_dst_genomes + g))?; + Ok(ColBuilder::Bit(b)) + }).collect::>()? + } + MergeMode::Count => { + PersistentCompactIntMatrixBuilder::new(n_new, &data_dir) + .map_err(SKError::Io)?.close().map_err(SKError::Io)?; + for _ in 0..n_dst_genomes { + PersistentCompactIntMatrix::append_column(&data_dir, |_| 0) + .map_err(SKError::Io)?; + } + (0..n_src).map(|g| -> SKResult { + let b = PersistentCompactIntVecBuilder::new( + n_new, &col_path_int(&data_dir, n_dst_genomes + g))?; + Ok(ColBuilder::Int(b)) + }).collect::>()? + } + } + } else { + vec![] + }; + + // Builders for existing layers: one per (layer, src_genome). + // Invariant: existing layers already have exactly n_dst_genomes columns. + // New source columns go at positions n_dst_genomes .. n_dst_genomes+n_src-1. + let mut exist_builders: Vec> = (0..n_dst_layers) + .map(|l| { + let layer_dir = dst_index_dir.join(format!("layer_{l}")); + let n = dst_map.layer(l).n(); + (0..n_src).map(|src_g| -> SKResult { + match mode { + MergeMode::Presence => { + let data_dir = layer_dir.join("presence"); + let b = PersistentBitVecBuilder::new( + n, &col_path_bit(&data_dir, n_dst_genomes + src_g))?; + Ok(ColBuilder::Bit(b)) + } + MergeMode::Count => { + let data_dir = layer_dir.join("counts"); + let b = PersistentCompactIntVecBuilder::new( + n, &col_path_int(&data_dir, n_dst_genomes + src_g))?; + Ok(ColBuilder::Int(b)) + } + } + }).collect::>() + }) + .collect::>()?; + + // ── Pass 2: fill builders ───────────────────────────────────────────── + for (src_g, src) in sources.iter().enumerate() { + let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR); + if !src_index_dir.exists() { continue; } + let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?; + + for l in 0..src_meta.n_layers { + let src_layer_dir = src_index_dir.join(format!("layer_{l}")); + let reader = UnitigFileReader::open(&src_layer_dir.join("unitigs.bin"))?; + + // Open source MPHF + count matrix for count mode. + let src_count_data: Option<(MphfLayer, PersistentCompactIntMatrix)> = + if mode == MergeMode::Count { + let counts_dir = src_layer_dir.join("counts"); + if counts_dir.exists() { + let mphf = MphfLayer::open(&src_layer_dir).map_err(olm_to_sk)?; + let mat = PersistentCompactIntMatrix::open(&counts_dir) + .map_err(SKError::Io)?; + Some((mphf, mat)) + } else { + None + } + } else { + None + }; + + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + let value: u32 = match &src_count_data { + Some((mphf, counts)) => { + mphf.find(kmer).map(|s| counts.col(0).get(s)).unwrap_or(1) + } + None => 1, + }; + + if let Some((dst_layer, hit)) = dst_map.query(kmer) { + exist_builders[dst_layer][src_g].set_val(hit.slot, value); + } else if let Some(ref mphf) = new_mphf { + if let Some(slot) = mphf.find(kmer) { + new_src_builders[src_g].set_val(slot, value); + } + } + } + } + } + + // ── Close builders and update metadata ──────────────────────────────── + for (l, builders) in exist_builders.into_iter().enumerate() { + let layer_dir = dst_index_dir.join(format!("layer_{l}")); + for b in builders { b.close()?; } + // Update the matrix meta to reflect the n_src new columns. + let n = dst_map.layer(l).n(); + let data_dir = match mode { + MergeMode::Presence => layer_dir.join("presence"), + MergeMode::Count => layer_dir.join("counts"), + }; + write_matrix_meta(&data_dir, n, n_dst_genomes + n_src).map_err(SKError::Io)?; + } + + for b in new_src_builders { b.close()?; } + // new layer matrix meta was already written by append_column calls above + // with n_dst_genomes cols; update to n_dst_genomes + n_src. + if any_new { + let data_dir = match mode { + MergeMode::Presence => new_layer_dir.join("presence"), + MergeMode::Count => new_layer_dir.join("counts"), + }; + write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src).map_err(SKError::Io)?; + + let mut part_meta = PartitionMeta::load(&dst_index_dir).map_err(olm_to_sk)?; + part_meta.n_layers = new_layer_idx + 1; + part_meta.save(&dst_index_dir).map_err(olm_to_sk)?; + } + + Ok(()) + } +} diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index 166a072..9f41675 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::fs; use std::path::Path; use obicompactvec::{ @@ -83,6 +84,22 @@ impl Layer<()> { pub fn build(out_dir: &Path) -> OLMResult { MphfLayer::build(out_dir, &mut |_, _| Ok(())) } + + /// Create a presence matrix for a set-membership layer (first merge). + /// + /// All `n_kmers` slots are set to `true`: every kmer in this layer belongs + /// to genome_0, so genome_0 is present at every slot. + pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> { + let presence_dir = layer_dir.join(PRESENCE_DIR); + fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?; + let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?; + let mut col = mb.add_col().map_err(OLMError::Io)?; + for slot in 0..n_kmers { + col.set(slot, true); + } + col.close().map_err(OLMError::Io)?; + mb.close().map_err(OLMError::Io) + } } // ── Mode 2 — count matrix (1 column per layer) ──────────────────────────────── @@ -111,9 +128,31 @@ impl Layer { } } +// ── Mode 2 — count matrix column append ────────────────────────────────────── + +impl Layer { + /// Append a genome column to an existing count matrix. + pub fn append_genome_column( + layer_dir: &Path, + value_of: impl Fn(usize) -> u32, + ) -> OLMResult<()> { + PersistentCompactIntMatrix::append_column(&layer_dir.join(COUNTS_DIR), value_of) + .map_err(OLMError::Io) + } +} + // ── Mode 3 — presence/absence matrix (1 column per genome) ─────────────────── impl Layer { + /// Append a genome column to an existing presence matrix. + pub fn append_genome_column( + layer_dir: &Path, + value_of: impl Fn(usize) -> bool, + ) -> OLMResult<()> { + PersistentBitMatrix::append_column(&layer_dir.join(PRESENCE_DIR), value_of) + .map_err(OLMError::Io) + } + pub fn build_presence( out_dir: &Path, n_genomes: usize, -- 2.52.0 From 1a1f95e59d3b129b0cfe0fc286793cd696d2c16f Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 06:02:27 +0200 Subject: [PATCH 05/11] feat: add CLI command to export indexed k-mers to CSV This change introduces a new `dump` subcommand that exports all indexed k-mers to a CSV stream. The implementation spans multiple crates, adding core export logic to `obikindex` and partition iteration to `obikpartitionner`. The command supports a `--force-presence` flag to output binary presence/absence data instead of stored counts, and includes necessary module registrations and structural updates across the codebase. --- src/obikindex/src/dump.rs | 50 +++++++++++++++++ src/obikindex/src/lib.rs | 1 + src/obikmer/src/cmd/dump.rs | 39 +++++++++++++ src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/main.rs | 3 + src/obikpartitionner/src/dump_layer.rs | 77 ++++++++++++++++++++++++++ src/obikpartitionner/src/lib.rs | 1 + 7 files changed, 172 insertions(+) create mode 100644 src/obikindex/src/dump.rs create mode 100644 src/obikmer/src/cmd/dump.rs create mode 100644 src/obikpartitionner/src/dump_layer.rs diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs new file mode 100644 index 0000000..31e5ed3 --- /dev/null +++ b/src/obikindex/src/dump.rs @@ -0,0 +1,50 @@ +use std::io::Write; + +use crate::error::{OKIError, OKIResult}; +use crate::index::KmerIndex; + +impl KmerIndex { + /// Write a CSV table of all indexed kmers to `out`. + /// + /// Columns: `kmer`, then one column per genome (in index order). + /// Values are counts (u32) when `use_counts = true`, otherwise 0/1. + /// + /// `force_presence` overrides `with_counts`: even if the index stores counts, + /// the output uses 0/1 presence columns. + /// + /// The caller must have set the global kmer length (`obikseq::set_k`) before + /// calling this method. + pub fn dump(&self, out: &mut W, force_presence: bool) -> OKIResult<()> { + + let genomes = &self.meta.genomes; + let use_counts = self.meta.config.with_counts && !force_presence; + let n_genomes = genomes.len().max(1); + + // ── Header ──────────────────────────────────────────────────────────── + write!(out, "kmer")?; + for g in genomes { + write!(out, ",{g}")?; + } + writeln!(out)?; + + // ── Rows ────────────────────────────────────────────────────────────── + let n = self.n_partitions(); + for i in 0..n { + self.partition + .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| { + let seq = String::from_utf8(kmer.to_ascii()) + .unwrap_or_else(|_| "?".repeat(self.kmer_size())); + // write is infallible inside a closure — propagate via a flag if needed + let _ = write!(out, "{seq}"); + for &v in row.iter() { + let _ = write!(out, ",{v}"); + } + let _ = writeln!(out); + }) + .map_err(OKIError::Partition)?; + } + + out.flush()?; + Ok(()) + } +} diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index 4dcf4ff..c4a66f5 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -1,6 +1,7 @@ pub mod error; pub mod meta; pub mod state; +mod dump; mod index; mod merge; diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs new file mode 100644 index 0000000..8775699 --- /dev/null +++ b/src/obikmer/src/cmd/dump.rs @@ -0,0 +1,39 @@ +use std::io::{self, BufWriter}; +use std::path::PathBuf; + +use clap::Args; +use obikindex::KmerIndex; +use obikseq::set_k; +use tracing::info; + +#[derive(Args)] +pub struct DumpArgs { + /// Index directory to dump + pub index: PathBuf, + + /// Output presence/absence (0/1) even if the index stores counts + #[arg(long, default_value_t = false)] + pub force_presence: bool, +} + +pub fn run(args: DumpArgs) { + let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + set_k(idx.kmer_size()); + info!( + "dumping {} partitions, {} genome(s)", + idx.n_partitions(), + idx.meta().genomes.len() + ); + + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout.lock()); + + idx.dump(&mut out, args.force_presence).unwrap_or_else(|e| { + eprintln!("dump error: {e}"); + std::process::exit(1); + }); +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index aaccdbf..85d8603 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,3 +1,4 @@ +pub mod dump; pub mod index; pub mod merge; pub mod superkmer; diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 52c5953..1a1f8bc 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -20,6 +20,8 @@ enum Commands { Index(cmd::index::IndexArgs), /// Merge multiple built indexes into one Merge(cmd::merge::MergeArgs), + /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) + Dump(cmd::dump::DumpArgs), /// Dump unitigs from a built index to stdout (debug) Unitig(cmd::unitig::UnitigArgs), } @@ -46,6 +48,7 @@ fn main() { Commands::Superkmer(args) => cmd::superkmer::run(args), Commands::Index(args) => cmd::index::run(args), Commands::Merge(args) => cmd::merge::run(args), + Commands::Dump(args) => cmd::dump::run(args), Commands::Unitig(args) => cmd::unitig::run(args), } diff --git a/src/obikpartitionner/src/dump_layer.rs b/src/obikpartitionner/src/dump_layer.rs new file mode 100644 index 0000000..71bcc93 --- /dev/null +++ b/src/obikpartitionner/src/dump_layer.rs @@ -0,0 +1,77 @@ +use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix}; +use obikseq::CanonicalKmer; +use obiskio::{SKError, SKResult, UnitigFileReader}; +use obilayeredmap::OLMError; +use obilayeredmap::meta::PartitionMeta; +use obilayeredmap::MphfLayer; + +use crate::partition::KmerPartition; + +const INDEX_SUBDIR: &str = "index"; + +fn olm_to_sk(e: OLMError) -> SKError { + match e { + OLMError::Io(e) => SKError::Io(e), + other => SKError::InvalidData { context: "dump", detail: other.to_string() }, + } +} + +impl KmerPartition { + /// Iterate all indexed kmers in partition `part`, calling `cb(kmer, row)` for each. + /// + /// `use_counts = true` → reads count columns (u32 values per genome). + /// `use_counts = false` → reads presence columns, converted to 0/1 u32. + /// + /// If no data matrix exists for a layer (pure set-membership, single genome), + /// a row of `n_genomes` ones is emitted for every kmer in that layer. + pub fn iter_partition_kmers( + &self, + part: usize, + use_counts: bool, + n_genomes: usize, + mut cb: impl FnMut(CanonicalKmer, Box<[u32]>), + ) -> SKResult<()> { + let index_dir = self.part_dir(part).join(INDEX_SUBDIR); + if !index_dir.exists() { + return Ok(()); + } + + let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?; + + for l in 0..meta.n_layers { + let layer_dir = index_dir.join(format!("layer_{l}")); + let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?; + let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?; + + let counts_dir = layer_dir.join("counts"); + let presence_dir = layer_dir.join("presence"); + + if use_counts && counts_dir.exists() { + let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if let Some(slot) = mphf.find(kmer) { + cb(kmer, mat.row(slot)); + } + } + } else if !use_counts && presence_dir.exists() { + let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if let Some(slot) = mphf.find(kmer) { + let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); + cb(kmer, row); + } + } + } else { + // No data matrix: pure set-membership layer, all kmers belong to every genome. + let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if mphf.find(kmer).is_some() { + cb(kmer, all_present.clone()); + } + } + } + } + + Ok(()) + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 672c118..3b098a2 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -1,3 +1,4 @@ +mod dump_layer; mod index_layer; mod kmer_sort; mod merge_layer; -- 2.52.0 From 11182005a2d1b9100e6f7cceb048675d0eaa7a61 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 08:12:02 +0200 Subject: [PATCH 06/11] feat: enhance merge label resolution, debug dump, and layer metadata This commit enhances the CLI and index pipelines by introducing `--force-presence` to normalize output to binary values, `--debug` to expose partition and layer metadata, and `--rename-duplicates` to automatically disambiguate overlapping genome labels. It updates the partitioner and index layers to auto-discover layers, persist `meta.json` for single-genome builds, and fix per-source column offsets during merging. A `DuplicateGenomeLabel` error variant is also added, and stale directories are properly managed in presence/absence mode. --- src/obikindex/src/dump.rs | 45 +++++-- src/obikindex/src/error.rs | 5 +- src/obikindex/src/merge.rs | 117 ++++++++++++++-- src/obikmer/src/cmd/dump.rs | 6 +- src/obikmer/src/cmd/merge.rs | 28 +++- src/obikpartitionner/src/dump_layer.rs | 64 ++++++++- src/obikpartitionner/src/index_layer.rs | 6 + src/obikpartitionner/src/merge_layer.rs | 170 ++++++++++++++++-------- 8 files changed, 347 insertions(+), 94 deletions(-) diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs index 31e5ed3..d1fb8db 100644 --- a/src/obikindex/src/dump.rs +++ b/src/obikindex/src/dump.rs @@ -14,13 +14,17 @@ impl KmerIndex { /// /// The caller must have set the global kmer length (`obikseq::set_k`) before /// calling this method. - pub fn dump(&self, out: &mut W, force_presence: bool) -> OKIResult<()> { + pub fn dump(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> { - let genomes = &self.meta.genomes; + let genomes = &self.meta.genomes; let use_counts = self.meta.config.with_counts && !force_presence; let n_genomes = genomes.len().max(1); + let kmer_size = self.kmer_size(); // ── Header ──────────────────────────────────────────────────────────── + if debug { + write!(out, "partition,layer,")?; + } write!(out, "kmer")?; for g in genomes { write!(out, ",{g}")?; @@ -30,18 +34,31 @@ impl KmerIndex { // ── Rows ────────────────────────────────────────────────────────────── let n = self.n_partitions(); for i in 0..n { - self.partition - .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| { - let seq = String::from_utf8(kmer.to_ascii()) - .unwrap_or_else(|_| "?".repeat(self.kmer_size())); - // write is infallible inside a closure — propagate via a flag if needed - let _ = write!(out, "{seq}"); - for &v in row.iter() { - let _ = write!(out, ",{v}"); - } - let _ = writeln!(out); - }) - .map_err(OKIError::Partition)?; + if debug { + self.partition + .iter_partition_kmers_located(i, use_counts, n_genomes, |part, layer, kmer, row| { + let seq = String::from_utf8(kmer.to_ascii()) + .unwrap_or_else(|_| "?".repeat(kmer_size)); + let _ = write!(out, "{part},{layer},{seq}"); + for &v in row.iter() { + let _ = write!(out, ",{v}"); + } + let _ = writeln!(out); + }) + .map_err(OKIError::Partition)?; + } else { + self.partition + .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| { + let seq = String::from_utf8(kmer.to_ascii()) + .unwrap_or_else(|_| "?".repeat(kmer_size)); + let _ = write!(out, "{seq}"); + for &v in row.iter() { + let _ = write!(out, ",{v}"); + } + let _ = writeln!(out); + }) + .map_err(OKIError::Partition)?; + } } out.flush()?; diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs index 74082fe..66f1abc 100644 --- a/src/obikindex/src/error.rs +++ b/src/obikindex/src/error.rs @@ -14,6 +14,8 @@ pub enum OKIError { IncompatibleConfig, /// Count mode requested but a source index lacks count data. MismatchedMode, + /// Two or more sources share the same genome label. + DuplicateGenomeLabel(String), } pub type OKIResult = Result; @@ -27,6 +29,7 @@ impl fmt::Display for OKIError { OKIError::NotIndexed(p) => write!(f, "index not fully built: {}", p.display()), OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"), OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"), + OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"), } } } @@ -37,7 +40,7 @@ impl std::error::Error for OKIError { OKIError::Io(e) => Some(e), OKIError::Json(e) => Some(e), OKIError::Partition(e) => Some(e), - _ => None, + _ => None, // IncompatibleConfig, MismatchedMode, DuplicateGenomeLabel } } } diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index be71a02..305d344 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::fs; use std::io; use std::path::Path; @@ -20,13 +21,16 @@ impl KmerIndex { /// `minimizer_size`, and `n_partitions`. Count mode additionally requires /// every source to have `with_counts = true`. /// - /// The first source is copied to `output`, then each subsequent source is - /// merged partition-by-partition in parallel. + /// Genome labels must be unique across all sources. If `rename_duplicates` + /// is true, repeated labels are disambiguated by appending `.1`, `.2`, … + /// to the second and subsequent occurrences. Otherwise a + /// `DuplicateGenomeLabel` error is returned on the first conflict. pub fn merge>( output: P, sources: &[&KmerIndex], mode: MergeMode, force: bool, + rename_duplicates: bool, ) -> OKIResult { let output = output.as_ref(); @@ -37,7 +41,7 @@ impl KmerIndex { ))); } - // ── Validate ────────────────────────────────────────────────────────── + // ── Validate config compatibility ───────────────────────────────────── let ref0 = sources[0]; for src in sources { if src.state() != IndexState::Indexed { @@ -54,6 +58,9 @@ impl KmerIndex { } } + // ── Compute final genome labels (rename duplicates if requested) ─────── + let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?; + // ── Prepare output directory ────────────────────────────────────────── if output.exists() { if force { @@ -70,18 +77,32 @@ impl KmerIndex { info!("copying {} → {}", sources[0].root_path.display(), output.display()); copy_dir_all(&sources[0].root_path, output)?; - // Rewrite index.meta with all genome labels. - let all_genomes: Vec = sources - .iter() - .flat_map(|s| s.meta.genomes.iter().cloned()) - .collect(); + // Rewrite index.meta with final genome labels and the effective mode. let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?; meta.genomes = all_genomes; + meta.config.with_counts = mode == MergeMode::Count; meta.write(output)?; + // In presence/absence mode, purge counts/ directories inherited from + // source_0 — they are stale data from the source's count index. + if mode == MergeMode::Presence { + remove_dirs_named(output, "counts")?; + } + + // Rebuild spectrums/ from all sources using the (possibly renamed) labels. + // Drop the spectrums/ that were copied from source_0 and rebuild from scratch. + let spectrums_dir = output.join("spectrums"); + if spectrums_dir.exists() { + fs::remove_dir_all(&spectrums_dir)?; + } + for (src, new_labels) in sources.iter().zip(&source_labels) { + copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?; + } + // Open the destination index. let dst = KmerIndex::open(output)?; let n_partitions = dst.n_partitions(); + let n_dst_genomes = sources[0].meta.genomes.len(); // ── Merge each subsequent source partition-by-partition ─────────────── let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec(); @@ -94,10 +115,9 @@ impl KmerIndex { let errors: Vec = (0..n_partitions) .into_par_iter() .filter_map(|i| { - let srcs: Vec<&obikpartitionner::KmerPartition> = - remaining_sources.iter().map(|s| &s.partition).collect(); - // n_dst_genomes = 1 (copied from source_0 only) - dst_partition.merge_partition(i, &srcs, mode, 1).err() + let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> = + remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect(); + dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err() }) .collect(); @@ -113,7 +133,78 @@ impl KmerIndex { } } -// ── Directory copy ──────────────────────────────────────────────────────────── +// ── Helpers ─────────────────────────────────────────────────────────────────── + +/// Compute the final genome label lists for all sources. +/// +/// Returns `(per_source_labels, all_genomes_flat)`. +/// The first occurrence of a label keeps the original name. Subsequent +/// occurrences receive `.1`, `.2`, … suffixes when `rename_duplicates` is true, +/// or trigger a `DuplicateGenomeLabel` error otherwise. +fn compute_labels( + sources: &[&KmerIndex], + rename_duplicates: bool, +) -> OKIResult<(Vec>, Vec)> { + let mut seen: HashMap = HashMap::new(); + let mut source_labels: Vec> = Vec::with_capacity(sources.len()); + let mut all_genomes: Vec = Vec::new(); + + for src in sources { + let mut labels = Vec::with_capacity(src.meta.genomes.len()); + for label in &src.meta.genomes { + let count = seen.entry(label.clone()).or_insert(0); + let new_label = if *count == 0 { + label.clone() + } else if rename_duplicates { + format!("{label}.{count}") + } else { + return Err(OKIError::DuplicateGenomeLabel(label.clone())); + }; + *count += 1; + labels.push(new_label.clone()); + all_genomes.push(new_label); + } + source_labels.push(labels); + } + + Ok((source_labels, all_genomes)) +} + +/// Copy spectrum JSON files from `src_root/spectrums/` to `dst_root/spectrums/`, +/// mapping each `old_labels[i]` filename to `new_labels[i]`. +fn copy_spectrums( + src_root: &Path, + dst_root: &Path, + old_labels: &[String], + new_labels: &[String], +) -> io::Result<()> { + let src_dir = src_root.join("spectrums"); + let dst_dir = dst_root.join("spectrums"); + fs::create_dir_all(&dst_dir)?; + for (old, new) in old_labels.iter().zip(new_labels.iter()) { + let src_file = src_dir.join(format!("{old}.json")); + if src_file.exists() { + fs::copy(&src_file, dst_dir.join(format!("{new}.json")))?; + } + } + Ok(()) +} + +/// Recursively remove every directory named `name` under `root`. +fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> { + for entry in fs::read_dir(root)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + if path.file_name().and_then(|n| n.to_str()) == Some(name) { + fs::remove_dir_all(&path)?; + } else { + remove_dirs_named(&path, name)?; + } + } + } + Ok(()) +} fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> { fs::create_dir_all(dst)?; diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs index 8775699..f7c324c 100644 --- a/src/obikmer/src/cmd/dump.rs +++ b/src/obikmer/src/cmd/dump.rs @@ -14,6 +14,10 @@ pub struct DumpArgs { /// Output presence/absence (0/1) even if the index stores counts #[arg(long, default_value_t = false)] pub force_presence: bool, + + /// Prepend partition and layer columns to each row + #[arg(long, default_value_t = false)] + pub debug: bool, } pub fn run(args: DumpArgs) { @@ -32,7 +36,7 @@ pub fn run(args: DumpArgs) { let stdout = io::stdout(); let mut out = BufWriter::new(stdout.lock()); - idx.dump(&mut out, args.force_presence).unwrap_or_else(|e| { + idx.dump(&mut out, args.force_presence, args.debug).unwrap_or_else(|e| { eprintln!("dump error: {e}"); std::process::exit(1); }); diff --git a/src/obikmer/src/cmd/merge.rs b/src/obikmer/src/cmd/merge.rs index 0109617..0b18ac8 100644 --- a/src/obikmer/src/cmd/merge.rs +++ b/src/obikmer/src/cmd/merge.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use clap::Args; use obikindex::{KmerIndex, MergeMode}; +use obikseq::{set_k, set_m}; use obisys::Reporter; use tracing::info; @@ -19,14 +20,16 @@ pub struct MergeArgs { #[arg(long, default_value_t = false)] pub force: bool, - /// Merge in count mode (requires all sources to have with_counts=true) + /// Force presence/absence mode even if all sources have count data #[arg(long, default_value_t = false)] - pub with_counts: bool, + pub force_presence: bool, + + /// Disambiguate duplicate genome labels by appending .1, .2, … instead of erroring + #[arg(long, default_value_t = false)] + pub rename_duplicates: bool, } pub fn run(args: MergeArgs) { - let mode = if args.with_counts { MergeMode::Count } else { MergeMode::Presence }; - let sources: Vec = args.sources.iter().map(|p| { info!("opening source index: {}", p.display()); KmerIndex::open(p).unwrap_or_else(|e| { @@ -35,11 +38,26 @@ pub fn run(args: MergeArgs) { }) }).collect(); + // Auto-detect mode: count if all sources have count data, presence otherwise. + // --force-presence overrides to presence regardless. + let all_have_counts = sources.iter().all(|s| s.meta().config.with_counts); + let mode = if !args.force_presence && all_have_counts { + MergeMode::Count + } else { + MergeMode::Presence + }; + info!( + "merge mode: {}", + if mode == MergeMode::Count { "count" } else { "presence/absence" } + ); + let source_refs: Vec<&KmerIndex> = sources.iter().collect(); + set_k(sources[0].kmer_size()); + set_m(sources[0].minimizer_size()); info!("merging {} indexes → {}", sources.len(), args.output.display()); let rep = Reporter::new(); - KmerIndex::merge(&args.output, &source_refs, mode, args.force).unwrap_or_else(|e| { + KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates).unwrap_or_else(|e| { eprintln!("error merging: {e}"); std::process::exit(1); }); diff --git a/src/obikpartitionner/src/dump_layer.rs b/src/obikpartitionner/src/dump_layer.rs index 71bcc93..eaf8231 100644 --- a/src/obikpartitionner/src/dump_layer.rs +++ b/src/obikpartitionner/src/dump_layer.rs @@ -2,7 +2,6 @@ use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix}; use obikseq::CanonicalKmer; use obiskio::{SKError, SKResult, UnitigFileReader}; use obilayeredmap::OLMError; -use obilayeredmap::meta::PartitionMeta; use obilayeredmap::MphfLayer; use crate::partition::KmerPartition; @@ -36,10 +35,14 @@ impl KmerPartition { return Ok(()); } - let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?; - - for l in 0..meta.n_layers { + // Discover layers by probing layer_0, layer_1, … until one is absent. + // PartitionMeta (meta.json) is only created by the merge path, not by + // the initial single-genome build, so we cannot rely on it here. + let mut l = 0; + loop { let layer_dir = index_dir.join(format!("layer_{l}")); + if !layer_dir.exists() { break; } + l += 1; let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?; let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?; @@ -74,4 +77,57 @@ impl KmerPartition { Ok(()) } + + /// Like [`iter_partition_kmers`] but the callback also receives the layer index, + /// enabling debug output that identifies where each kmer was stored. + pub fn iter_partition_kmers_located( + &self, + part: usize, + use_counts: bool, + n_genomes: usize, + mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>), + ) -> SKResult<()> { + let index_dir = self.part_dir(part).join(INDEX_SUBDIR); + if !index_dir.exists() { + return Ok(()); + } + + let mut layer = 0; + loop { + let layer_dir = index_dir.join(format!("layer_{layer}")); + if !layer_dir.exists() { break; } + let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?; + let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?; + + let counts_dir = layer_dir.join("counts"); + let presence_dir = layer_dir.join("presence"); + + if use_counts && counts_dir.exists() { + let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if let Some(slot) = mphf.find(kmer) { + cb(part, layer, kmer, mat.row(slot)); + } + } + } else if !use_counts && presence_dir.exists() { + let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if let Some(slot) = mphf.find(kmer) { + let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); + cb(part, layer, kmer, row); + } + } + } else { + let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if mphf.find(kmer).is_some() { + cb(part, layer, kmer, all_present.clone()); + } + } + } + layer += 1; + } + + Ok(()) + } } diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs index a6d873a..a6401b5 100644 --- a/src/obikpartitionner/src/index_layer.rs +++ b/src/obikpartitionner/src/index_layer.rs @@ -6,6 +6,7 @@ use epserde::prelude::*; use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec}; use obidebruinj::GraphDeBruijn; use obilayeredmap::{OLMError, layer::Layer}; +use obilayeredmap::meta::PartitionMeta; use obiskio::{SKError, SKFileMeta, SKFileReader}; use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64}; @@ -118,6 +119,11 @@ impl KmerPartition { Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?; } + // Write meta.json in the index/ directory so LayeredMap::open works + // (e.g. for subsequent merge operations). + let index_dir = layer_dir.parent().expect("layer_dir has a parent"); + PartitionMeta { n_layers: 1 }.save(index_dir).map_err(olm_to_sk)?; + Ok(n_kmers) } diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs index 7d77c45..c933966 100644 --- a/src/obikpartitionner/src/merge_layer.rs +++ b/src/obikpartitionner/src/merge_layer.rs @@ -7,6 +7,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder}; +use obikseq::CanonicalKmer; use obiskio::{SKError, SKResult, UnitigFileReader}; use obilayeredmap::{Layer, LayeredMap, MphfLayer, OLMError}; use obilayeredmap::meta::PartitionMeta; @@ -41,10 +42,88 @@ impl ColBuilder { } } +// ── SrcLayerData — opened source matrix for pass-2 lookup ───────────────────── + +enum SrcLayerData { + /// Pure set-membership layer (no data matrix): every kmer is present in all genomes. + SetMembership, + Presence(MphfLayer, PersistentBitMatrix), + Count(MphfLayer, PersistentCompactIntMatrix), +} + +impl SrcLayerData { + fn open(layer_dir: &Path, mode: MergeMode) -> SKResult { + let presence_dir = layer_dir.join("presence"); + let counts_dir = layer_dir.join("counts"); + match mode { + MergeMode::Presence => { + if presence_dir.exists() { + let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; + Ok(SrcLayerData::Presence(mphf, mat)) + } else if counts_dir.exists() { + // Source is a count index; treat count > 0 as present via ColBuilder::Bit. + let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; + Ok(SrcLayerData::Count(mphf, mat)) + } else { + Ok(SrcLayerData::SetMembership) + } + } + MergeMode::Count => { + if counts_dir.exists() { + let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; + Ok(SrcLayerData::Count(mphf, mat)) + } else { + Ok(SrcLayerData::SetMembership) + } + } + } + } + + /// Return one value per source genome for `kmer`. + fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec { + match self { + SrcLayerData::SetMembership => vec![1u32; n_genomes], + SrcLayerData::Presence(mphf, mat) => { + if let Some(slot) = mphf.find(kmer) { + mat.row(slot).iter().map(|&b| b as u32).collect() + } else { + vec![0u32; n_genomes] + } + } + SrcLayerData::Count(mphf, mat) => { + if let Some(slot) = mphf.find(kmer) { + mat.row(slot).iter().copied().collect() + } else { + vec![0u32; n_genomes] + } + } + } + } +} + // ── helpers ─────────────────────────────────────────────────────────────────── const INDEX_SUBDIR: &str = "index"; +/// Load PartitionMeta, or recover it by probing layer directories. +/// Indexes built before meta.json was introduced lack the file. +fn load_meta(dir: &Path) -> SKResult { + match PartitionMeta::load(dir) { + Ok(m) => Ok(m), + Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => { + let mut n = 0usize; + while dir.join(format!("layer_{n}")).exists() { n += 1; } + let m = PartitionMeta { n_layers: n }; + m.save(dir).map_err(olm_to_sk)?; + Ok(m) + } + Err(e) => Err(olm_to_sk(e)), + } +} + fn olm_to_sk(e: OLMError) -> SKError { match e { OLMError::Io(e) => SKError::Io(e), @@ -69,18 +148,17 @@ fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> { impl KmerPartition { /// Merge `sources` into destination partition `i`. /// - /// `n_dst_genomes` is the number of genome columns already in the dst - /// matrices (1 after copying source_0, more for subsequent merges). + /// Each entry in `sources` is `(partition, n_genomes)` where `n_genomes` is + /// the number of genome columns that source contributes. A merged index + /// contributes more than one. The total new columns added to the destination + /// is `sum(n_genomes)`. /// - /// Two-pass algorithm: - /// 1. Classify each source kmer as dst-hit or new → build de Bruijn graph - /// of new kmers → write unitigs → build MPHF for the new layer. - /// 2. Iterate source kmers again → fill per-genome column builders - /// (memory-mapped) → close → update matrix metadata. + /// `n_dst_genomes` is the number of genome columns already in the destination + /// matrices (copied from source_0 before this call). pub fn merge_partition( &self, i: usize, - sources: &[&KmerPartition], + sources: &[(&KmerPartition, usize)], mode: MergeMode, n_dst_genomes: usize, ) -> SKResult<()> { @@ -89,12 +167,13 @@ impl KmerPartition { return Ok(()); } + load_meta(&dst_index_dir)?; // ensure meta.json exists before LayeredMap::open let dst_map = LayeredMap::<()>::open(&dst_index_dir).map_err(olm_to_sk)?; let n_dst_layers = dst_map.n_layers(); - let n_src = sources.len(); + let n_src_total: usize = sources.iter().map(|(_, n)| *n).sum(); // First merge in presence mode: init presence matrices on existing layers - // (all slots true — every kmer in a layer belongs to genome_0). + // (all slots true — every kmer in those layers belongs to genome_0). if n_dst_genomes == 1 && mode == MergeMode::Presence { for l in 0..n_dst_layers { let layer_dir = dst_index_dir.join(format!("layer_{l}")); @@ -107,10 +186,10 @@ impl KmerPartition { let mut g = GraphDeBruijn::new(); let mut any_new = false; - for src in sources.iter() { + for (src, _) in sources.iter() { let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR); if !src_index_dir.exists() { continue; } - let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?; + let src_meta = load_meta(&src_index_dir)?; for l in 0..src_meta.n_layers { let unitigs_path = src_index_dir @@ -149,7 +228,7 @@ impl KmerPartition { let n_new = new_mphf.as_ref().map_or(0, |m| m.n()); // ── Prepare matrix directories for the new layer ────────────────────── - // Absent columns (dst genomes) are written now via append_column (all-zero/false). + // Absent columns (dst genomes) are written via append_column (all-zero/false). // Source-genome columns are created as mutable builders for pass 2. let mut new_src_builders: Vec = if any_new { let data_dir = match mode { @@ -157,7 +236,6 @@ impl KmerPartition { MergeMode::Count => new_layer_dir.join("counts"), }; fs::create_dir_all(&data_dir)?; - // Bootstrap meta with 0 cols. match mode { MergeMode::Presence => { PersistentBitMatrixBuilder::new(n_new, &data_dir) @@ -166,7 +244,7 @@ impl KmerPartition { PersistentBitMatrix::append_column(&data_dir, |_| false) .map_err(SKError::Io)?; } - (0..n_src).map(|g| -> SKResult { + (0..n_src_total).map(|g| -> SKResult { let b = PersistentBitVecBuilder::new( n_new, &col_path_bit(&data_dir, n_dst_genomes + g))?; Ok(ColBuilder::Bit(b)) @@ -179,7 +257,7 @@ impl KmerPartition { PersistentCompactIntMatrix::append_column(&data_dir, |_| 0) .map_err(SKError::Io)?; } - (0..n_src).map(|g| -> SKResult { + (0..n_src_total).map(|g| -> SKResult { let b = PersistentCompactIntVecBuilder::new( n_new, &col_path_int(&data_dir, n_dst_genomes + g))?; Ok(ColBuilder::Int(b)) @@ -190,14 +268,13 @@ impl KmerPartition { vec![] }; - // Builders for existing layers: one per (layer, src_genome). - // Invariant: existing layers already have exactly n_dst_genomes columns. - // New source columns go at positions n_dst_genomes .. n_dst_genomes+n_src-1. + // Builders for existing layers: n_src_total per layer. + // Columns n_dst_genomes .. n_dst_genomes + n_src_total - 1. let mut exist_builders: Vec> = (0..n_dst_layers) .map(|l| { let layer_dir = dst_index_dir.join(format!("layer_{l}")); let n = dst_map.layer(l).n(); - (0..n_src).map(|src_g| -> SKResult { + (0..n_src_total).map(|src_g| -> SKResult { match mode { MergeMode::Presence => { let data_dir = layer_dir.join("presence"); @@ -217,72 +294,53 @@ impl KmerPartition { .collect::>()?; // ── Pass 2: fill builders ───────────────────────────────────────────── - for (src_g, src) in sources.iter().enumerate() { + let mut col_offset = 0usize; + for (src, src_n) in sources.iter() { let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR); - if !src_index_dir.exists() { continue; } - let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?; + if !src_index_dir.exists() { col_offset += src_n; continue; } + let src_meta = load_meta(&src_index_dir)?; for l in 0..src_meta.n_layers { let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let reader = UnitigFileReader::open(&src_layer_dir.join("unitigs.bin"))?; - - // Open source MPHF + count matrix for count mode. - let src_count_data: Option<(MphfLayer, PersistentCompactIntMatrix)> = - if mode == MergeMode::Count { - let counts_dir = src_layer_dir.join("counts"); - if counts_dir.exists() { - let mphf = MphfLayer::open(&src_layer_dir).map_err(olm_to_sk)?; - let mat = PersistentCompactIntMatrix::open(&counts_dir) - .map_err(SKError::Io)?; - Some((mphf, mat)) - } else { - None - } - } else { - None - }; + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { - let value: u32 = match &src_count_data { - Some((mphf, counts)) => { - mphf.find(kmer).map(|s| counts.col(0).get(s)).unwrap_or(1) - } - None => 1, - }; - - if let Some((dst_layer, hit)) = dst_map.query(kmer) { - exist_builders[dst_layer][src_g].set_val(hit.slot, value); - } else if let Some(ref mphf) = new_mphf { - if let Some(slot) = mphf.find(kmer) { - new_src_builders[src_g].set_val(slot, value); + let values = src_data.lookup(kmer, *src_n); + for (g, &value) in values.iter().enumerate() { + let builder_idx = col_offset + g; + if let Some((dst_layer, hit)) = dst_map.query(kmer) { + exist_builders[dst_layer][builder_idx].set_val(hit.slot, value); + } else if let Some(ref mphf) = new_mphf { + if let Some(slot) = mphf.find(kmer) { + new_src_builders[builder_idx].set_val(slot, value); + } } } } } + col_offset += src_n; } // ── Close builders and update metadata ──────────────────────────────── for (l, builders) in exist_builders.into_iter().enumerate() { let layer_dir = dst_index_dir.join(format!("layer_{l}")); for b in builders { b.close()?; } - // Update the matrix meta to reflect the n_src new columns. let n = dst_map.layer(l).n(); let data_dir = match mode { MergeMode::Presence => layer_dir.join("presence"), MergeMode::Count => layer_dir.join("counts"), }; - write_matrix_meta(&data_dir, n, n_dst_genomes + n_src).map_err(SKError::Io)?; + write_matrix_meta(&data_dir, n, n_dst_genomes + n_src_total).map_err(SKError::Io)?; } for b in new_src_builders { b.close()?; } - // new layer matrix meta was already written by append_column calls above - // with n_dst_genomes cols; update to n_dst_genomes + n_src. if any_new { let data_dir = match mode { MergeMode::Presence => new_layer_dir.join("presence"), MergeMode::Count => new_layer_dir.join("counts"), }; - write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src).map_err(SKError::Io)?; + write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src_total).map_err(SKError::Io)?; let mut part_meta = PartitionMeta::load(&dst_index_dir).map_err(olm_to_sk)?; part_meta.n_layers = new_layer_idx + 1; -- 2.52.0 From 9e1d6f2f25615428e1f8af50ae848609516eda62 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 11:16:00 +0200 Subject: [PATCH 07/11] feat: implement partition-based merge command for k-mer indices Implements a new `merge` command that aggregates k-mer counts and presence/absence matrices from multiple source indices using a parallelized, partition-based algorithm. Adds CLI progress bars and execution timing across the bootstrap, spectrum rebuild, and merge phases. Updates logging to report the aggregate genome count and introduces a bounds check in the perfect hash layer to safely return `None` for unknown k-mers, preventing out-of-bounds access in downstream operations. --- TODO.md | 14 -------- src/obikindex/src/merge.rs | 56 +++++++++++++++++++++++++++-- src/obikmer/src/cmd/merge.rs | 12 +++++-- src/obilayeredmap/src/mphf_layer.rs | 2 ++ 4 files changed, 64 insertions(+), 20 deletions(-) diff --git a/TODO.md b/TODO.md index 038bd71..592fc74 100644 --- a/TODO.md +++ b/TODO.md @@ -9,20 +9,6 @@ ## commandes à ajouter -- merge : pour construire un index à partir d'index existants - - deux modes : count et presence/absence. count exige que tous les index mergés soient déjà en mode count. mode presence/absence par defaut. Si passage de mode count à mode presence/absence, par defaut presence = count >= 1. Possibilité de spécifier un seuil personnalisé. - - le merge doit se faire en parallèle sur chaque partition - - en entrée : une liste de chemins vers les index à fusionner - - en sortie : un nouvel index fusionné (option -o ) - - j'imagine comme algo: - - on copie le premier index dans le nouvel index - - on ajoute a chaque partition une matrice de count ou de presence s'il n'y en avait pas déjà. - - si besoin, on cree la colone 0 de la matrice de count ou de presence pour le genome courant - - on parcourt les partitions et les index à fusionner en parallèle - - pour chaque partition, on ajoute les kmer présents dans les index à fusionner au nouvel index - - si le kmer est déjà présent dans le nouvel index on ajoute le compte ou la presence du kmer dans la matrice de count ou de presence - - sinon, on ajoute le kmer dans une nouvelle layer - - filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés - quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index 305d344..c31e3e5 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -2,7 +2,9 @@ use std::collections::HashMap; use std::fs; use std::io; use std::path::Path; +use std::time::Duration; +use indicatif::{ProgressBar, ProgressStyle}; use obisys::{Reporter, Stage}; use rayon::prelude::*; use tracing::info; @@ -31,6 +33,7 @@ impl KmerIndex { mode: MergeMode, force: bool, rename_duplicates: bool, + rep: &mut Reporter, ) -> OKIResult { let output = output.as_ref(); @@ -74,7 +77,14 @@ impl KmerIndex { } // ── Bootstrap: copy first source to output ──────────────────────────── - info!("copying {} → {}", sources[0].root_path.display(), output.display()); + info!( + "bootstrap: copying {} → {} ({} genome(s))", + sources[0].root_path.display(), + output.display(), + sources[0].meta.genomes.len(), + ); + let t = Stage::start("bootstrap"); + let pb = spinner("bootstrap — copying index …"); copy_dir_all(&sources[0].root_path, output)?; // Rewrite index.meta with final genome labels and the effective mode. @@ -88,9 +98,14 @@ impl KmerIndex { if mode == MergeMode::Presence { remove_dirs_named(output, "counts")?; } + pb.finish_and_clear(); + rep.push(t.stop()); // Rebuild spectrums/ from all sources using the (possibly renamed) labels. // Drop the spectrums/ that were copied from source_0 and rebuild from scratch. + info!("rebuilding spectrums for {} source(s)", sources.len()); + let t = Stage::start("spectrums"); + let pb = spinner("spectrums — copying …"); let spectrums_dir = output.join("spectrums"); if spectrums_dir.exists() { fs::remove_dir_all(&spectrums_dir)?; @@ -98,6 +113,8 @@ impl KmerIndex { for (src, new_labels) in sources.iter().zip(&source_labels) { copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?; } + pb.finish_and_clear(); + rep.push(t.stop()); // Open the destination index. let dst = KmerIndex::open(output)?; @@ -107,8 +124,13 @@ impl KmerIndex { // ── Merge each subsequent source partition-by-partition ─────────────── let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec(); if !remaining_sources.is_empty() { - let mut rep = Reporter::new(); + let n_src_genomes: usize = remaining_sources.iter().map(|s| s.meta.genomes.len()).sum(); + info!( + "merging {} partition(s) × {} additional source genome(s) into {} destination genome(s)", + n_partitions, n_src_genomes, n_dst_genomes, + ); let t = Stage::start("merge_partitions"); + let pb = partition_bar(n_partitions as u64); let dst_partition = &dst.partition; @@ -117,10 +139,13 @@ impl KmerIndex { .filter_map(|i| { let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> = remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect(); - dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err() + let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err(); + pb.inc(1); + result }) .collect(); + pb.finish_and_clear(); if let Some(e) = errors.into_iter().next() { return Err(OKIError::Partition(e)); } @@ -206,6 +231,31 @@ fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> { Ok(()) } +fn spinner(msg: &'static str) -> ProgressBar { + let pb = ProgressBar::new_spinner(); + pb.set_style( + ProgressStyle::with_template("{spinner} {msg} {elapsed}") + .unwrap() + .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]), + ); + pb.set_message(msg); + pb.enable_steady_tick(Duration::from_millis(100)); + pb +} + +fn partition_bar(n: u64) -> ProgressBar { + let pb = ProgressBar::new(n); + pb.set_style( + ProgressStyle::with_template( + "{spinner} merge — {bar:40.cyan/blue} {pos}/{len} partitions {elapsed}", + ) + .unwrap() + .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]), + ); + pb.enable_steady_tick(Duration::from_millis(100)); + pb +} + fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> { fs::create_dir_all(dst)?; for entry in fs::read_dir(src)? { diff --git a/src/obikmer/src/cmd/merge.rs b/src/obikmer/src/cmd/merge.rs index 0b18ac8..3442273 100644 --- a/src/obikmer/src/cmd/merge.rs +++ b/src/obikmer/src/cmd/merge.rs @@ -55,9 +55,15 @@ pub fn run(args: MergeArgs) { set_k(sources[0].kmer_size()); set_m(sources[0].minimizer_size()); - info!("merging {} indexes → {}", sources.len(), args.output.display()); - let rep = Reporter::new(); - KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates).unwrap_or_else(|e| { + + let n_genomes: usize = sources.iter().map(|s| s.meta().genomes.len()).sum(); + info!( + "merging {} index(es), {} genome(s) total → {}", + sources.len(), n_genomes, args.output.display() + ); + + let mut rep = Reporter::new(); + KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates, &mut rep).unwrap_or_else(|e| { eprintln!("error merging: {e}"); std::process::exit(1); }); diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index 5fcb677..0844237 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -41,6 +41,8 @@ impl MphfLayer { #[inline] pub fn find(&self, kmer: CanonicalKmer) -> Option { let slot = self.mphf.index(&kmer.raw()); + // PtrHash guarantees slot < n only for its key set; arbitrary queries may exceed bounds. + if slot >= self.n { return None; } let (chunk_id, rank) = self.evidence.decode(slot); if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { Some(slot) -- 2.52.0 From 3fa1dbf8cc91c34289edc2277fb2f54197441ca4 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 11:47:35 +0200 Subject: [PATCH 08/11] feat: add pairwise distance computation and phylogenetic trees This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using configurable metrics (Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger). It optionally generates phylogenetic trees (NJ or UPGMA) in Newick format and outputs results as CSV. The implementation adds a robust distance computation backend that dynamically routes to optimized backends based on index configuration, supports parallel iteration, and gracefully handles missing data. Additionally, it adds a `dump` task for exporting k-mer to genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations. --- TODO.md | 2 - src/Cargo.lock | 100 ++++++++++++- src/obicompactvec/src/traits.rs | 7 + src/obikindex/Cargo.toml | 3 + src/obikindex/src/distance.rs | 132 ++++++++++++++++ src/obikindex/src/error.rs | 3 + src/obikindex/src/lib.rs | 2 + src/obikmer/Cargo.toml | 2 + src/obikmer/src/cmd/distance.rs | 216 +++++++++++++++++++++++++++ src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/main.rs | 3 + src/obikpartitionner/src/distance.rs | 47 ++++++ src/obikpartitionner/src/lib.rs | 1 + 13 files changed, 512 insertions(+), 7 deletions(-) create mode 100644 src/obikindex/src/distance.rs create mode 100644 src/obikmer/src/cmd/distance.rs create mode 100644 src/obikpartitionner/src/distance.rs diff --git a/TODO.md b/TODO.md index 592fc74..e6ba5d0 100644 --- a/TODO.md +++ b/TODO.md @@ -23,6 +23,4 @@ - les matrices sont sauvegardées en CSV - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche -- dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence. - - status : affiche le statut de l'index diff --git a/src/Cargo.lock b/src/Cargo.lock index 4c1929b..1de3042 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -176,6 +176,21 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "1.3.2" @@ -625,6 +640,12 @@ dependencies = [ "syn", ] +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + [[package]] name = "either" version = "1.15.0" @@ -1102,6 +1123,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kodama" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a44f3a71a44fbf49ce38152db7dc9adf959d4fe5c29344cd1858bdbda8d9091" +dependencies = [ + "num-traits", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1453,7 +1483,10 @@ name = "obikindex" version = "0.1.0" dependencies = [ "indicatif", + "ndarray", + "obicompactvec", "obikpartitionner", + "obilayeredmap", "obiskio", "obisys", "rayon", @@ -1468,6 +1501,7 @@ version = "0.1.0" dependencies = [ "clap", "indicatif", + "kodama", "obifastwrite", "obikindex", "obikpartitionner", @@ -1480,6 +1514,7 @@ dependencies = [ "obisys", "pprof", "rayon", + "speedytree", "tracing", "tracing-subscriber", ] @@ -1886,8 +1921,8 @@ dependencies = [ "lazy_static", "log", "mem_dbg", - "rand", - "rand_chacha", + "rand 0.9.4", + "rand_chacha 0.9.0", "rayon", "rdst", "rustc-hash", @@ -1918,14 +1953,35 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ - "rand_chacha", - "rand_core", + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", ] [[package]] @@ -1935,7 +1991,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", ] [[package]] @@ -1973,6 +2038,12 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rb_tree" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6724c78e033e1c4155b4d3f76593d09ad384739c6986c1addc2bd2f55b1aefe" + [[package]] name = "rdst" version = "0.20.14" @@ -2227,6 +2298,25 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "speedytree" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4522052445ce1b002c095d93e9dc545c326e9e2321c1315f1ab2a381c11666" +dependencies = [ + "bit-set", + "bit-vec", + "bitvec", + "clap", + "dtoa", + "fixedbitset", + "parking_lot", + "petgraph", + "rand 0.8.6", + "rayon", + "rb_tree", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index bc10097..7cccf48 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -80,6 +80,13 @@ pub trait CountPartials: ColumnWeights { let sq2 = std::f64::consts::SQRT_2; self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2) } + + /// Euclidean distance in the Hellinger (√relative-frequency) space. + /// Equal to √2 × hellinger_dist — unnormalised variant. + fn hellinger_euclidean_dist_matrix(&self) -> Array2 { + let global = self.col_weights(); + self.partial_hellinger(&global).mapv(|v| v.sqrt()) + } } /// Partial distance matrices for bit-based data (`PersistentBitMatrix`). diff --git a/src/obikindex/Cargo.toml b/src/obikindex/Cargo.toml index bf4ebd6..538cffb 100644 --- a/src/obikindex/Cargo.toml +++ b/src/obikindex/Cargo.toml @@ -7,6 +7,9 @@ edition = "2024" obikpartitionner = { path = "../obikpartitionner" } obiskio = { path = "../obiskio" } obisys = { path = "../obisys" } +obicompactvec = { path = "../obicompactvec" } +obilayeredmap = { path = "../obilayeredmap" } +ndarray = "0.16" rayon = "1" serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/src/obikindex/src/distance.rs b/src/obikindex/src/distance.rs new file mode 100644 index 0000000..a1c9cb0 --- /dev/null +++ b/src/obikindex/src/distance.rs @@ -0,0 +1,132 @@ +use ndarray::Array2; +use obicompactvec::traits::{BitPartials, CountPartials}; +use obilayeredmap::LayeredStore; +use rayon::prelude::*; + +use crate::error::{OKIError, OKIResult}; +use crate::index::KmerIndex; + +// ── Public API ──────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DistanceMetric { + /// Jaccard distance on presence/absence data. + Jaccard, + /// Hamming distance (number of differing kmer positions) on presence/absence data. + Hamming, + /// Bray-Curtis dissimilarity on raw counts. + BrayCurtis, + /// Bray-Curtis dissimilarity normalised by per-genome total counts. + RelfreqBrayCurtis, + /// Euclidean distance on raw counts. + Euclidean, + /// Euclidean distance on relative frequencies. + RelfreqEuclidean, + /// Hellinger distance on counts. + Hellinger, + /// Euclidean distance in the Hellinger (√relative-frequency) space (unnormalised variant). + HellingerEuclidean, +} + +pub struct DistanceOutput { + /// n×n pairwise distance matrix (genomes in index order). + pub matrix: Array2, + /// n×n shared-kmer count matrix (intersection), if requested. + pub shared_kmers: Option>, +} + +impl DistanceMetric { + pub fn requires_counts(self) -> bool { + matches!( + self, + DistanceMetric::BrayCurtis + | DistanceMetric::RelfreqBrayCurtis + | DistanceMetric::Euclidean + | DistanceMetric::RelfreqEuclidean + | DistanceMetric::Hellinger + | DistanceMetric::HellingerEuclidean + ) + } +} + +// ── KmerIndex::distance ─────────────────────────────────────────────────────── + +impl KmerIndex { + pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult { + let n_genomes = self.meta.genomes.len(); + if n_genomes < 2 { + return Err(OKIError::InvalidInput( + "distance requires at least 2 genomes in the index".into(), + )); + } + + let use_counts = self.meta.config.with_counts; + if metric.requires_counts() && !use_counts { + return Err(OKIError::InvalidInput(format!( + "{metric:?} requires a count index (with_counts = true)" + ))); + } + + let n_parts = self.n_partitions(); + + if use_counts { + let stores: Vec<_> = (0..n_parts) + .into_par_iter() + .map(|i| self.partition.count_store(i).map_err(OKIError::Partition)) + .collect::>()?; + let global = LayeredStore::new(stores); + + let matrix = match metric { + DistanceMetric::BrayCurtis => CountPartials::bray_dist_matrix(&global), + DistanceMetric::RelfreqBrayCurtis => CountPartials::relfreq_bray_dist_matrix(&global), + DistanceMetric::Euclidean => CountPartials::euclidean_dist_matrix(&global), + DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global), + DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global), + DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global), + // Jaccard on count data: threshold at 0 (present if count > 0) + DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0), + DistanceMetric::Hamming => { + return Err(OKIError::InvalidInput( + "Hamming is only available for presence/absence indexes".into(), + )); + } + }; + + let shared = if shared_kmers { + let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0); + Some(inter) + } else { + None + }; + + Ok(DistanceOutput { matrix, shared_kmers: shared }) + } else { + let stores: Vec<_> = (0..n_parts) + .into_par_iter() + .map(|i| self.partition.presence_store(i).map_err(OKIError::Partition)) + .collect::>()?; + let global = LayeredStore::new(stores); + + let matrix = match metric { + DistanceMetric::Jaccard => BitPartials::jaccard_dist_matrix(&global), + DistanceMetric::Hamming => { + BitPartials::hamming_dist_matrix(&global).mapv(|v| v as f64) + } + other => { + return Err(OKIError::InvalidInput(format!( + "{other:?} requires a count index; use --metric jaccard or --metric hamming" + ))); + } + }; + + let shared = if shared_kmers { + let (inter, _) = BitPartials::partial_jaccard(&global); + Some(inter) + } else { + None + }; + + Ok(DistanceOutput { matrix, shared_kmers: shared }) + } + } +} diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs index 66f1abc..96834cb 100644 --- a/src/obikindex/src/error.rs +++ b/src/obikindex/src/error.rs @@ -16,6 +16,8 @@ pub enum OKIError { MismatchedMode, /// Two or more sources share the same genome label. DuplicateGenomeLabel(String), + /// Operation not valid for this index configuration. + InvalidInput(String), } pub type OKIResult = Result; @@ -30,6 +32,7 @@ impl fmt::Display for OKIError { OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"), OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"), OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"), + OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"), } } } diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index c4a66f5..d6a0908 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -1,11 +1,13 @@ pub mod error; pub mod meta; pub mod state; +mod distance; mod dump; mod index; mod merge; pub use error::{OKIError, OKIResult}; +pub use distance::{DistanceMetric, DistanceOutput}; pub use index::KmerIndex; pub use merge::MergeMode; pub use meta::{IndexConfig, IndexMeta, META_FILENAME}; diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index f61a524..fffb28b 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -19,6 +19,8 @@ obisys = { path = "../obisys" } obiskio = { path = "../obiskio" } obikindex = { path = "../obikindex" } clap = { version = "4", features = ["derive"] } +kodama = "0.2" +speedytree = "0.1" rayon = "1" indicatif = "0.17" tracing = "0.1.44" diff --git a/src/obikmer/src/cmd/distance.rs b/src/obikmer/src/cmd/distance.rs new file mode 100644 index 0000000..e8574ae --- /dev/null +++ b/src/obikmer/src/cmd/distance.rs @@ -0,0 +1,216 @@ +use std::io::{self, BufWriter, Write}; +use std::path::PathBuf; + +use clap::Args; +use kodama::{Method, linkage}; +use obikindex::{DistanceMetric, KmerIndex}; +use obikseq::{set_k, set_m}; +use speedytree::{DistanceMatrix, Hybrid, NeighborJoiningSolver, to_newick}; +use tracing::info; + +#[derive(clap::ValueEnum, Clone, Copy, Debug)] +pub enum MetricArg { + Jaccard, + Hamming, + BrayCurtis, + #[value(name = "relfreq-bray-curtis")] + RelfreqBrayCurtis, + Euclidean, + #[value(name = "relfreq-euclidean")] + RelfreqEuclidean, + Hellinger, + #[value(name = "hellinger-euclidean")] + HellingerEuclidean, +} + +impl From for DistanceMetric { + fn from(m: MetricArg) -> Self { + match m { + MetricArg::Jaccard => DistanceMetric::Jaccard, + MetricArg::Hamming => DistanceMetric::Hamming, + MetricArg::BrayCurtis => DistanceMetric::BrayCurtis, + MetricArg::RelfreqBrayCurtis => DistanceMetric::RelfreqBrayCurtis, + MetricArg::Euclidean => DistanceMetric::Euclidean, + MetricArg::RelfreqEuclidean => DistanceMetric::RelfreqEuclidean, + MetricArg::Hellinger => DistanceMetric::Hellinger, + MetricArg::HellingerEuclidean => DistanceMetric::HellingerEuclidean, + } + } +} + +#[derive(Args)] +pub struct DistanceArgs { + /// Index directory + pub index: PathBuf, + + /// Distance metric to compute + #[arg(long, value_enum, default_value = "jaccard")] + pub metric: MetricArg, + + /// Also output the shared-kmer count matrix (CSV) + #[arg(long)] + pub shared_kmers: bool, + + /// Compute and write a Neighbor-Joining tree (Newick) + #[arg(long)] + pub nj: bool, + + /// Compute and write a UPGMA tree (Newick) + #[arg(long)] + pub upgma: bool, + + /// Output prefix: _dist.csv, _shared.csv, + /// _nj.nwk, _upgma.nwk. + /// If omitted, the distance matrix is written to stdout. + #[arg(short, long)] + pub output: Option, +} + +pub fn run(args: DistanceArgs) { + let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + set_k(idx.kmer_size()); + set_m(idx.minimizer_size()); + + let genomes = idx.meta().genomes.clone(); + let n = genomes.len(); + info!( + "computing {:?} distances for {} genome(s)", + args.metric, n + ); + + let need_shared = args.shared_kmers || args.nj || args.upgma; + let result = idx + .distance(args.metric.into(), need_shared) + .unwrap_or_else(|e| { + eprintln!("error computing distances: {e}"); + std::process::exit(1); + }); + + // ── Distance matrix → CSV ───────────────────────────────────────────────── + let write_dist_csv = |w: &mut dyn Write| { + write!(w, "genome").unwrap(); + for g in &genomes { write!(w, ",{g}").unwrap(); } + writeln!(w).unwrap(); + for (i, g) in genomes.iter().enumerate() { + write!(w, "{g}").unwrap(); + for j in 0..n { + write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap(); + } + writeln!(w).unwrap(); + } + }; + + match &args.output { + Some(prefix) => { + let path = format!("{}_dist.csv", prefix.display()); + let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| { + eprintln!("error creating {path}: {e}"); + std::process::exit(1); + })); + write_dist_csv(&mut f); + info!("distance matrix → {path}"); + } + None => { + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout.lock()); + write_dist_csv(&mut out); + } + } + + // ── Shared-kmer matrix → CSV ────────────────────────────────────────────── + if args.shared_kmers { + if let Some(shared) = &result.shared_kmers { + let path = args.output.as_ref() + .map(|p| format!("{}_shared.csv", p.display())) + .unwrap_or_else(|| "shared.csv".into()); + let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| { + eprintln!("error creating {path}: {e}"); + std::process::exit(1); + })); + write!(f, "genome").unwrap(); + for g in &genomes { write!(f, ",{g}").unwrap(); } + writeln!(f).unwrap(); + for (i, g) in genomes.iter().enumerate() { + write!(f, "{g}").unwrap(); + for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); } + writeln!(f).unwrap(); + } + info!("shared-kmer matrix → {path}"); + } + } + + // ── NJ tree via speedytree ──────────────────────────────────────────────── + if args.nj { + let rows: Vec> = (0..n) + .map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect()) + .collect(); + let dm = DistanceMatrix::build(rows, genomes.clone()).unwrap_or_else(|e| { + eprintln!("error building distance matrix for NJ: {e}"); + std::process::exit(1); + }); + let tree = NeighborJoiningSolver::::default(dm).solve().unwrap_or_else(|e| { + eprintln!("error computing NJ tree: {e}"); + std::process::exit(1); + }); + let newick = to_newick(&tree); + let path = args.output.as_ref() + .map(|p| format!("{}_nj.nwk", p.display())) + .unwrap_or_else(|| "nj.nwk".into()); + std::fs::write(&path, &newick).unwrap_or_else(|e| { + eprintln!("error writing {path}: {e}"); + std::process::exit(1); + }); + info!("NJ tree → {path}"); + } + + // ── UPGMA tree via kodama ───────────────────────────────────────────────── + if args.upgma { + let mut condensed: Vec = Vec::with_capacity(n * (n - 1) / 2); + for i in 0..n { + for j in (i + 1)..n { + condensed.push(result.matrix[[i, j]]); + } + } + let dendro = linkage(&mut condensed, n, Method::Average); + let newick = upgma_to_newick(&dendro, &genomes); + let path = args.output.as_ref() + .map(|p| format!("{}_upgma.nwk", p.display())) + .unwrap_or_else(|| "upgma.nwk".into()); + std::fs::write(&path, &newick).unwrap_or_else(|e| { + eprintln!("error writing {path}: {e}"); + std::process::exit(1); + }); + info!("UPGMA tree → {path}"); + } +} + +// ── UPGMA Newick from kodama dendrogram ─────────────────────────────────────── + +fn upgma_to_newick(dendro: &kodama::Dendrogram, names: &[String]) -> String { + let n = names.len(); + // node_labels[i]: Newick subtree string for node i (leaves 0..n, internals n..) + let mut labels: Vec = names.to_vec(); + // height of each node: leaves = 0, internal = dissimilarity/2 + let mut heights: Vec = vec![0.0; 2 * n - 1]; + + for (k, step) in dendro.steps().iter().enumerate() { + let new_node = n + k; + let h = step.dissimilarity / 2.0; + heights[new_node] = h; + let c1 = step.cluster1; + let c2 = step.cluster2; + let bl1 = (h - heights[c1]).max(0.0); + let bl2 = (h - heights[c2]).max(0.0); + labels.push(format!( + "({label1}:{bl1:.6},{label2}:{bl2:.6})", + label1 = labels[c1], + label2 = labels[c2], + )); + } + + format!("{};", labels.last().unwrap()) +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 85d8603..ed66c65 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,3 +1,4 @@ +pub mod distance; pub mod dump; pub mod index; pub mod merge; diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 1a1f8bc..c88428b 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -22,6 +22,8 @@ enum Commands { Merge(cmd::merge::MergeArgs), /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) Dump(cmd::dump::DumpArgs), + /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees + Distance(cmd::distance::DistanceArgs), /// Dump unitigs from a built index to stdout (debug) Unitig(cmd::unitig::UnitigArgs), } @@ -49,6 +51,7 @@ fn main() { Commands::Index(args) => cmd::index::run(args), Commands::Merge(args) => cmd::merge::run(args), Commands::Dump(args) => cmd::dump::run(args), + Commands::Distance(args) => cmd::distance::run(args), Commands::Unitig(args) => cmd::unitig::run(args), } diff --git a/src/obikpartitionner/src/distance.rs b/src/obikpartitionner/src/distance.rs new file mode 100644 index 0000000..b5e6914 --- /dev/null +++ b/src/obikpartitionner/src/distance.rs @@ -0,0 +1,47 @@ +use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix}; +use obilayeredmap::LayeredStore; +use obiskio::{SKError, SKResult}; + +use crate::partition::KmerPartition; + +const INDEX_SUBDIR: &str = "index"; + +fn probe_n_layers(index_dir: &std::path::Path) -> usize { + let mut n = 0; + while index_dir.join(format!("layer_{n}")).exists() { n += 1; } + n +} + +impl KmerPartition { + /// Open all count matrices for partition `part`, one per layer. + /// Layers without a `counts/` directory are skipped. + pub fn count_store(&self, part: usize) -> SKResult> { + let index_dir = self.part_dir(part).join(INDEX_SUBDIR); + if !index_dir.exists() { + return Ok(LayeredStore::new(vec![])); + } + let matrices = (0..probe_n_layers(&index_dir)) + .filter_map(|l| { + let dir = index_dir.join(format!("layer_{l}")).join("counts"); + dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io)) + }) + .collect::>>()?; + Ok(LayeredStore::new(matrices)) + } + + /// Open all presence matrices for partition `part`, one per layer. + /// Layers without a `presence/` directory are skipped. + pub fn presence_store(&self, part: usize) -> SKResult> { + let index_dir = self.part_dir(part).join(INDEX_SUBDIR); + if !index_dir.exists() { + return Ok(LayeredStore::new(vec![])); + } + let matrices = (0..probe_n_layers(&index_dir)) + .filter_map(|l| { + let dir = index_dir.join(format!("layer_{l}")).join("presence"); + dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io)) + }) + .collect::>>()?; + Ok(LayeredStore::new(matrices)) + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 3b098a2..4b03fce 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -1,3 +1,4 @@ +mod distance; mod dump_layer; mod index_layer; mod kmer_sort; -- 2.52.0 From d9aa211b8fc79a71a9cdbab3b939df312725ed8a Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 12:11:55 +0200 Subject: [PATCH 09/11] feat: add k-mer index rebuild and compaction feature This commit introduces a new `rebuild` CLI subcommand that reconstructs an existing multi-layer k-mer index into a compact, single-layer index. It implements a configurable filtering pipeline supporting min/max genome fraction/count and total count thresholds, parallel partition processing via `rayon`, and CLI progress tracking. The change also restructures module declarations across `obikindex` and `obikpartitionner` to integrate the new rebuild and layer-handling logic. --- src/obikindex/src/lib.rs | 1 + src/obikindex/src/rebuild.rs | 116 ++++++++++++ src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/rebuild.rs | 111 ++++++++++++ src/obikmer/src/main.rs | 3 + src/obikpartitionner/src/filter.rs | 87 +++++++++ src/obikpartitionner/src/lib.rs | 3 + src/obikpartitionner/src/merge_layer.rs | 6 +- src/obikpartitionner/src/rebuild_layer.rs | 205 ++++++++++++++++++++++ 9 files changed, 530 insertions(+), 3 deletions(-) create mode 100644 src/obikindex/src/rebuild.rs create mode 100644 src/obikmer/src/cmd/rebuild.rs create mode 100644 src/obikpartitionner/src/filter.rs create mode 100644 src/obikpartitionner/src/rebuild_layer.rs diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index d6a0908..65df6c0 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -5,6 +5,7 @@ mod distance; mod dump; mod index; mod merge; +mod rebuild; pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; diff --git a/src/obikindex/src/rebuild.rs b/src/obikindex/src/rebuild.rs new file mode 100644 index 0000000..b360030 --- /dev/null +++ b/src/obikindex/src/rebuild.rs @@ -0,0 +1,116 @@ +use std::fs; +use std::io; +use std::path::Path; +use std::time::Duration; + +use indicatif::{ProgressBar, ProgressStyle}; +use obikpartitionner::{KmerFilter, KmerPartition, MergeMode}; +use obisys::{Reporter, Stage}; +use rayon::prelude::*; +use tracing::info; + +use crate::error::{OKIError, OKIResult}; +use crate::index::KmerIndex; +use crate::meta::IndexMeta; +use crate::state::{IndexState, SENTINEL_INDEXED}; + +impl KmerIndex { + /// Rebuild `src` into a new compact single-layer index at `output`. + /// + /// Only k-mers whose per-genome row passes every filter in `filters` are + /// written. If `filters` is empty every k-mer is kept (pure compaction). + /// + /// `mode` controls whether the output stores counts or presence/absence. + /// A count source may be rebuilt in presence mode; a presence source + /// cannot be rebuilt in count mode. + pub fn rebuild>( + output: P, + src: &KmerIndex, + filters: &[Box], + mode: MergeMode, + force: bool, + rep: &mut Reporter, + ) -> OKIResult { + let output = output.as_ref(); + + if src.state() != IndexState::Indexed { + return Err(OKIError::NotIndexed(src.root_path.clone())); + } + + if mode == MergeMode::Count && !src.meta.config.with_counts { + return Err(OKIError::InvalidInput( + "cannot rebuild in count mode from a presence-only source index".into(), + )); + } + + if output.exists() { + if force { + fs::remove_dir_all(output)?; + } else { + return Err(OKIError::Io(io::Error::new( + io::ErrorKind::AlreadyExists, + format!("{}: output directory already exists", output.display()), + ))); + } + } + + // ── Create output directory + metadata ──────────────────────────────── + fs::create_dir_all(output)?; + let mut meta = IndexMeta::new(src.meta.config.clone()); + meta.config.with_counts = mode == MergeMode::Count; + meta.genomes = src.meta.genomes.clone(); + meta.write(output)?; + + let n_genomes = src.meta.genomes.len(); + let n_partitions = src.partition.n_partitions(); + + // ── Create an empty destination KmerPartition ───────────────────────── + // Create the partitions/ subdirectory so KmerPartition::open_with_config works. + fs::create_dir_all(output.join(obikpartitionner::PARTITIONS_SUBDIR))?; + let dst_partition = KmerPartition::open_with_config( + output, + meta.config.kmer_size, + meta.config.minimizer_size, + meta.config.n_bits, + )?; + + info!( + "rebuild: {} partition(s), {} genome(s), mode={:?}", + n_partitions, n_genomes, mode, + ); + + let t = Stage::start("rebuild"); + let pb = ProgressBar::new(n_partitions as u64).with_style( + ProgressStyle::with_template("rebuild — [{bar:20}] {pos}/{len} | {msg}") + .unwrap() + .progress_chars("=> "), + ); + pb.enable_steady_tick(Duration::from_millis(100)); + + let src_partition = &src.partition; + + let errors: Vec = (0..n_partitions) + .into_par_iter() + .filter_map(|i| { + let result = dst_partition + .rebuild_partition(src_partition, i, filters, mode, n_genomes) + .err(); + pb.inc(1); + result + }) + .collect(); + + pb.finish_and_clear(); + + if let Some(e) = errors.into_iter().next() { + return Err(OKIError::Partition(e)); + } + + rep.push(t.stop()); + + // Write SENTINEL_INDEXED — output is ready to use. + fs::File::create(output.join(SENTINEL_INDEXED))?; + + KmerIndex::open(output) + } +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index ed66c65..ac4057e 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -2,5 +2,6 @@ pub mod distance; pub mod dump; pub mod index; pub mod merge; +pub mod rebuild; pub mod superkmer; pub mod unitig; diff --git a/src/obikmer/src/cmd/rebuild.rs b/src/obikmer/src/cmd/rebuild.rs new file mode 100644 index 0000000..f172811 --- /dev/null +++ b/src/obikmer/src/cmd/rebuild.rs @@ -0,0 +1,111 @@ +use std::path::PathBuf; + +use clap::Args; +use obikindex::{KmerIndex, MergeMode}; +use obikpartitionner::filter::{ + KmerFilter, MaxGenomeCount, MaxGenomeFraction, MaxTotalCount, + MinGenomeCount, MinGenomeFraction, MinTotalCount, +}; +use obisys::Reporter; +use obikseq::{set_k, set_m}; +use tracing::info; + +#[derive(Args)] +pub struct RebuildArgs { + /// Source index directory + pub source: PathBuf, + + /// Output index directory + #[arg(short, long)] + pub output: PathBuf, + + /// Minimum fraction of genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub min_quorum_frac: Option, + + /// Maximum fraction of genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub max_quorum_frac: Option, + + /// Minimum number of genomes containing the k-mer + #[arg(long)] + pub min_quorum_count: Option, + + /// Maximum number of genomes containing the k-mer + #[arg(long)] + pub max_quorum_count: Option, + + /// Minimum total count across all genomes (count index only) + #[arg(long)] + pub min_total_count: Option, + + /// Maximum total count across all genomes (count index only) + #[arg(long)] + pub max_total_count: Option, + + /// Per-genome count threshold for presence in quorum filters (default 0) + #[arg(long, default_value = "0")] + pub presence_threshold: u32, + + /// Output as presence/absence instead of counts + #[arg(long)] + pub presence: bool, + + /// Overwrite existing output directory + #[arg(short, long)] + pub force: bool, +} + +pub fn run(args: RebuildArgs) { + let src = KmerIndex::open(&args.source).unwrap_or_else(|e| { + eprintln!("error opening source index: {e}"); + std::process::exit(1); + }); + + set_k(src.kmer_size()); + set_m(src.minimizer_size()); + + let n_genomes = src.meta().genomes.len(); + let mode = if args.presence || !src.meta().config.with_counts { + MergeMode::Presence + } else { + MergeMode::Count + }; + + info!( + "rebuild: {} genome(s), mode={:?}, source={}", + n_genomes, mode, args.source.display() + ); + + let pt = args.presence_threshold; + let mut filters: Vec> = Vec::new(); + + if let Some(v) = args.min_quorum_frac { + filters.push(Box::new(MinGenomeFraction { frac: v, threshold: pt })); + } + if let Some(v) = args.max_quorum_frac { + filters.push(Box::new(MaxGenomeFraction { frac: v, threshold: pt })); + } + if let Some(v) = args.min_quorum_count { + filters.push(Box::new(MinGenomeCount { count: v, threshold: pt })); + } + if let Some(v) = args.max_quorum_count { + filters.push(Box::new(MaxGenomeCount { count: v, threshold: pt })); + } + if let Some(v) = args.min_total_count { + filters.push(Box::new(MinTotalCount { total: v })); + } + if let Some(v) = args.max_total_count { + filters.push(Box::new(MaxTotalCount { total: v })); + } + + let mut rep = Reporter::new(); + KmerIndex::rebuild(&args.output, &src, &filters, mode, args.force, &mut rep) + .unwrap_or_else(|e| { + eprintln!("error rebuilding index: {e}"); + std::process::exit(1); + }); + + rep.print(); + info!("rebuilt index → {}", args.output.display()); +} diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index c88428b..bae4cdf 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -20,6 +20,8 @@ enum Commands { Index(cmd::index::IndexArgs), /// Merge multiple built indexes into one Merge(cmd::merge::MergeArgs), + /// Filter and compact an existing index into a new single-layer index + Rebuild(cmd::rebuild::RebuildArgs), /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) Dump(cmd::dump::DumpArgs), /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees @@ -51,6 +53,7 @@ fn main() { Commands::Index(args) => cmd::index::run(args), Commands::Merge(args) => cmd::merge::run(args), Commands::Dump(args) => cmd::dump::run(args), + Commands::Rebuild(args) => cmd::rebuild::run(args), Commands::Distance(args) => cmd::distance::run(args), Commands::Unitig(args) => cmd::unitig::run(args), } diff --git a/src/obikpartitionner/src/filter.rs b/src/obikpartitionner/src/filter.rs new file mode 100644 index 0000000..f35a3e8 --- /dev/null +++ b/src/obikpartitionner/src/filter.rs @@ -0,0 +1,87 @@ +/// Trait for kmer row filters used in the rebuild command. +/// +/// `row` contains raw per-genome counts (or 0/1 for presence/absence data). +/// `n_genomes` equals `row.len()`. +pub trait KmerFilter: Send + Sync { + fn passes(&self, row: &[u32], n_genomes: usize) -> bool; +} + +// ── Quorum filters ───────────────────────────────────────────────────────────── + +fn present_count(row: &[u32], threshold: u32) -> usize { + row.iter().filter(|&&v| v > threshold).count() +} + +/// At least `frac` fraction of genomes contain this kmer (count > `threshold`). +pub struct MinGenomeFraction { + pub frac: f64, + pub threshold: u32, +} + +impl KmerFilter for MinGenomeFraction { + fn passes(&self, row: &[u32], n_genomes: usize) -> bool { + let p = present_count(row, self.threshold); + p as f64 / n_genomes as f64 >= self.frac + } +} + +/// At most `frac` fraction of genomes contain this kmer (count > `threshold`). +pub struct MaxGenomeFraction { + pub frac: f64, + pub threshold: u32, +} + +impl KmerFilter for MaxGenomeFraction { + fn passes(&self, row: &[u32], n_genomes: usize) -> bool { + let p = present_count(row, self.threshold); + p as f64 / n_genomes as f64 <= self.frac + } +} + +/// At least `count` genomes contain this kmer (count > `threshold`). +pub struct MinGenomeCount { + pub count: usize, + pub threshold: u32, +} + +impl KmerFilter for MinGenomeCount { + fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { + present_count(row, self.threshold) >= self.count + } +} + +/// At most `count` genomes contain this kmer (count > `threshold`). +pub struct MaxGenomeCount { + pub count: usize, + pub threshold: u32, +} + +impl KmerFilter for MaxGenomeCount { + fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { + present_count(row, self.threshold) <= self.count + } +} + +// ── Total-count filters (count indexes only) ─────────────────────────────────── + +/// Sum of counts across all genomes >= `total`. +pub struct MinTotalCount { + pub total: u32, +} + +impl KmerFilter for MinTotalCount { + fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { + row.iter().sum::() >= self.total + } +} + +/// Sum of counts across all genomes <= `total`. +pub struct MaxTotalCount { + pub total: u32, +} + +impl KmerFilter for MaxTotalCount { + fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { + row.iter().sum::() <= self.total + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 4b03fce..48d72c1 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -1,9 +1,12 @@ +pub mod filter; mod distance; mod dump_layer; mod index_layer; mod kmer_sort; mod merge_layer; mod partition; +mod rebuild_layer; +pub use filter::KmerFilter; pub use merge_layer::MergeMode; pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR}; diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs index c933966..21785d0 100644 --- a/src/obikpartitionner/src/merge_layer.rs +++ b/src/obikpartitionner/src/merge_layer.rs @@ -44,7 +44,7 @@ impl ColBuilder { // ── SrcLayerData — opened source matrix for pass-2 lookup ───────────────────── -enum SrcLayerData { +pub(crate) enum SrcLayerData { /// Pure set-membership layer (no data matrix): every kmer is present in all genomes. SetMembership, Presence(MphfLayer, PersistentBitMatrix), @@ -52,7 +52,7 @@ enum SrcLayerData { } impl SrcLayerData { - fn open(layer_dir: &Path, mode: MergeMode) -> SKResult { + pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult { let presence_dir = layer_dir.join("presence"); let counts_dir = layer_dir.join("counts"); match mode { @@ -83,7 +83,7 @@ impl SrcLayerData { } /// Return one value per source genome for `kmer`. - fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec { + pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec { match self { SrcLayerData::SetMembership => vec![1u32; n_genomes], SrcLayerData::Presence(mphf, mat) => { diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs new file mode 100644 index 0000000..fa581f9 --- /dev/null +++ b/src/obikpartitionner/src/rebuild_layer.rs @@ -0,0 +1,205 @@ +use std::fs; +use std::io; +use std::path::{Path, PathBuf}; + +use obicompactvec::{PersistentBitMatrixBuilder, + PersistentBitVecBuilder, + PersistentCompactIntMatrixBuilder, + PersistentCompactIntVecBuilder}; +use obidebruinj::GraphDeBruijn; +use obiskio::{SKError, SKResult, UnitigFileReader}; +use obilayeredmap::{Layer, MphfLayer, OLMError}; +use obilayeredmap::meta::PartitionMeta; + +use crate::filter::KmerFilter; +use crate::merge_layer::{MergeMode, SrcLayerData}; +use crate::partition::KmerPartition; + +const INDEX_SUBDIR: &str = "index"; + +fn olm_to_sk(e: OLMError) -> SKError { + match e { + OLMError::Io(e) => SKError::Io(e), + other => SKError::InvalidData { context: "rebuild", detail: other.to_string() }, + } +} + +fn col_path_bit(dir: &Path, col: usize) -> PathBuf { + dir.join(format!("col_{col:06}.pbiv")) +} + +fn col_path_int(dir: &Path, col: usize) -> PathBuf { + dir.join(format!("col_{col:06}.pciv")) +} + +fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> { + fs::write(dir.join("meta.json"), format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n")) +} + +// ── ColBuilder ──────────────────────────────────────────────────────────────── + +enum ColBuilder { + Bit(PersistentBitVecBuilder), + Int(PersistentCompactIntVecBuilder), +} + +impl ColBuilder { + fn set_val(&mut self, slot: usize, value: u32) { + match self { + ColBuilder::Bit(b) => b.set(slot, value > 0), + ColBuilder::Int(b) => b.set(slot, value), + } + } + + fn close(self) -> SKResult<()> { + match self { + ColBuilder::Bit(b) => b.close().map_err(SKError::Io), + ColBuilder::Int(b) => b.close().map_err(SKError::Io), + } + } +} + +// ── Helpers ─────────────────────────────────────────────────────────────────── + +fn load_meta(dir: &Path) -> SKResult { + match PartitionMeta::load(dir) { + Ok(m) => Ok(m), + Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => { + let mut n = 0usize; + while dir.join(format!("layer_{n}")).exists() { n += 1; } + let m = PartitionMeta { n_layers: n }; + m.save(dir).map_err(olm_to_sk)?; + Ok(m) + } + Err(e) => Err(olm_to_sk(e)), + } +} + +fn passes_all(filters: &[Box], row: &[u32], n_genomes: usize) -> bool { + filters.iter().all(|f| f.passes(row, n_genomes)) +} + +// ── KmerPartition::rebuild_partition ───────────────────────────────────────── + +impl KmerPartition { + /// Rebuild partition `i` from `src` into `self` (an empty destination partition). + /// + /// Only k-mers whose per-genome row passes all `filters` are written. + /// The output is a single-layer index — regardless of how many layers the + /// source has. + /// + /// `n_genomes` is the number of genome columns in the source (and destination). + pub fn rebuild_partition( + &self, + src: &KmerPartition, + i: usize, + filters: &[Box], + mode: MergeMode, + n_genomes: usize, + ) -> SKResult<()> { + let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR); + if !src_index_dir.exists() { + return Ok(()); + } + + let src_meta = load_meta(&src_index_dir)?; + if src_meta.n_layers == 0 { + return Ok(()); + } + + // ── Pass 1: collect filtered kmers into de Bruijn graph ─────────────── + let mut g = GraphDeBruijn::new(); + + for l in 0..src_meta.n_layers { + let src_layer_dir = src_index_dir.join(format!("layer_{l}")); + let unitigs_path = src_layer_dir.join("unitigs.bin"); + if !unitigs_path.exists() { continue; } + + let reader = UnitigFileReader::open(&unitigs_path)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; + + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + let row = src_data.lookup(kmer, n_genomes); + if passes_all(filters, &row, n_genomes) { + g.push(kmer); + } + } + } + + if g.len() == 0 { + return Ok(()); + } + + let n_new = g.len(); + g.compute_degrees(); + + // ── Build MPHF in dst layer_0 ───────────────────────────────────────── + let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR); + let dst_layer_dir = dst_index_dir.join("layer_0"); + fs::create_dir_all(&dst_layer_dir)?; + + let mut uw = Layer::<()>::unitig_writer(&dst_layer_dir).map_err(olm_to_sk)?; + for unitig in g.iter_unitig() { + uw.write(&unitig)?; + } + uw.close()?; + drop(g); + + Layer::<()>::build(&dst_layer_dir).map_err(olm_to_sk)?; + let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?; + + // ── Prepare matrix builders (one column per genome) ─────────────────── + let data_dir = match mode { + MergeMode::Presence => dst_layer_dir.join("presence"), + MergeMode::Count => dst_layer_dir.join("counts"), + }; + fs::create_dir_all(&data_dir)?; + + let mut builders: Vec = match mode { + MergeMode::Presence => { + PersistentBitMatrixBuilder::new(n_new, &data_dir) + .map_err(SKError::Io)?.close().map_err(SKError::Io)?; + (0..n_genomes).map(|g| -> SKResult { + let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?; + Ok(ColBuilder::Bit(b)) + }).collect::>()? + } + MergeMode::Count => { + PersistentCompactIntMatrixBuilder::new(n_new, &data_dir) + .map_err(SKError::Io)?.close().map_err(SKError::Io)?; + (0..n_genomes).map(|g| -> SKResult { + let b = PersistentCompactIntVecBuilder::new(n_new, &col_path_int(&data_dir, g))?; + Ok(ColBuilder::Int(b)) + }).collect::>()? + } + }; + + // ── Pass 2: fill builders ───────────────────────────────────────────── + for l in 0..src_meta.n_layers { + let src_layer_dir = src_index_dir.join(format!("layer_{l}")); + let unitigs_path = src_layer_dir.join("unitigs.bin"); + if !unitigs_path.exists() { continue; } + + let reader = UnitigFileReader::open(&unitigs_path)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; + + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + let row = src_data.lookup(kmer, n_genomes); + if !passes_all(filters, &row, n_genomes) { continue; } + if let Some(slot) = dst_mphf.find(kmer) { + for (col, &value) in row.iter().enumerate() { + builders[col].set_val(slot, value); + } + } + } + } + + // ── Close builders, write metadata ──────────────────────────────────── + for b in builders { b.close()?; } + write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?; + + PartitionMeta { n_layers: 1 }.save(&dst_index_dir).map_err(olm_to_sk)?; + + Ok(()) + } +} -- 2.52.0 From c8e591fc78e2a9148f331e273e7aab3062f69fb6 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 12:17:32 +0200 Subject: [PATCH 10/11] feat: add superkmer CLI setup and partition bit handling This commit introduces CLI argument parsing for the `superkmer` command via a new `SuperkmerArgs` struct. It also adds a `partitions_to_bits` utility to compute the minimum bit width for partition encoding, enforcing a 1-bit floor. Finally, the index configuration automatically rounds the partition count up to the nearest power of two to ensure compatibility with bitmask-based indexing operations. --- src/obikmer/src/cli.rs | 12 +++++++++--- src/obikmer/src/cmd/index.rs | 9 +++++++-- src/obikmer/src/cmd/superkmer.rs | 4 ++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/obikmer/src/cli.rs b/src/obikmer/src/cli.rs index 76ed52f..1b79fa0 100644 --- a/src/obikmer/src/cli.rs +++ b/src/obikmer/src/cli.rs @@ -29,9 +29,9 @@ pub struct CommonArgs { #[arg(long, default_value_t = 6)] pub level_max: usize, - /// Number of bits to encode partitions (allows up to 2^partition_bits partitions) - #[arg(short, long, default_value_t = 8)] - pub partition_bits: usize, + /// Number of partitions (rounded up to the next power of 2) + #[arg(short, long, default_value_t = 256)] + pub partitions: usize, /// Number of worker threads #[arg( @@ -44,6 +44,12 @@ pub struct CommonArgs { pub threads: usize, } +/// Smallest `b` such that `2^b >= n` (i.e. `n.next_power_of_two().ilog2()`). +/// Minimum 1 (degenerate n=0 or n=1 → 1 partition). +pub fn partitions_to_bits(n: usize) -> usize { + n.max(1).next_power_of_two().trailing_zeros() as usize +} + impl CommonArgs { pub fn seqfile_paths(&self) -> obiread::PathIter { let paths = self.inputs.iter().map(PathBuf::from).collect(); diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index ccfa487..38bc722 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -6,7 +6,7 @@ use obikseq::{set_k, set_m}; use obisys::Reporter; use tracing::info; -use crate::cli::CommonArgs; +use crate::cli::{CommonArgs, partitions_to_bits}; use crate::steps::scatter; #[derive(Args)] @@ -62,10 +62,15 @@ pub fn run(args: IndexArgs) { std::process::exit(1); }); } + let n_bits = partitions_to_bits(args.common.partitions); + let effective = 1usize << n_bits; + if effective != args.common.partitions { + info!("partitions: {} → {} (next power of 2)", args.common.partitions, effective); + } let config = IndexConfig { kmer_size: args.common.kmer_size, minimizer_size: args.common.minimizer_size, - n_bits: args.common.partition_bits, + n_bits, with_counts: args.with_counts, }; KmerIndex::create(&output, config, args.label.clone(), false).unwrap_or_else(|e| { diff --git a/src/obikmer/src/cmd/superkmer.rs b/src/obikmer/src/cmd/superkmer.rs index 2ea1060..881d4a6 100644 --- a/src/obikmer/src/cmd/superkmer.rs +++ b/src/obikmer/src/cmd/superkmer.rs @@ -5,7 +5,7 @@ use clap::Args; use obifastwrite::write_scatter; use obikseq::{RoutableSuperKmer, set_k, set_m}; -use crate::cli::{CommonArgs, PipelineData, open_chunks}; +use crate::cli::{CommonArgs, PipelineData, open_chunks, partitions_to_bits}; #[derive(Args)] pub struct SuperkmerArgs { @@ -38,7 +38,7 @@ pub fn run(args: SuperkmerArgs) { let m = args.common.minimizer_size; let theta = args.common.theta; let level_max = args.common.level_max; - let partition_bits = args.common.partition_bits; + let partition_bits = partitions_to_bits(args.common.partitions); let n_workers = args.common.threads.max(1); set_k(k); -- 2.52.0 From 13599dd444e972cbb3b0f21d10b01f1bd9cbe976 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 21 May 2026 13:23:05 +0200 Subject: [PATCH 11/11] feat: Implement query subcommand for sequence-to-genome mapping This change introduces the `query` CLI command and its supporting infrastructure for sequence-to-genome mapping and k-mer matching. It adds a `QueryLayer` abstraction backed by MPHF and persistent matrices, exposes the index partition for direct querying, and implements `Hash`/`Eq` for `RoutableSuperKmer`. The command ingests sequence batches, deduplicates superkmers, routes them to index partitions for parallel exact or 1-mismatch matching, and outputs results as FASTA records annotated with JSON metadata. Includes `serde_json` dependency addition, module exports, and documentation updates. --- TODO.md | 20 +- docmd/architecture/query.md | 111 ++++++++++ src/Cargo.lock | 1 + src/obikindex/src/index.rs | 5 + src/obikmer/Cargo.toml | 1 + src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/query.rs | 281 ++++++++++++++++++++++++ src/obikmer/src/main.rs | 3 + src/obikpartitionner/src/lib.rs | 1 + src/obikpartitionner/src/query_layer.rs | 120 ++++++++++ src/obikseq/src/routable.rs | 14 ++ src/obiread/src/lib.rs | 1 + src/obiread/src/record.rs | 222 +++++++++++++++++++ 13 files changed, 762 insertions(+), 19 deletions(-) create mode 100644 docmd/architecture/query.md create mode 100644 src/obikmer/src/cmd/query.rs create mode 100644 src/obikpartitionner/src/query_layer.rs create mode 100644 src/obiread/src/record.rs diff --git a/TODO.md b/TODO.md index e6ba5d0..a874150 100644 --- a/TODO.md +++ b/TODO.md @@ -1,26 +1,8 @@ -## Chose à vérifier suite à la commande index - -- il faudrait lister les fichier qui vont être indexés -- partition.meta ne devrait plus exister -- les spectrums globaux devrait etre identifier par génome - - regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome -- les spectrum patiels ont-ils vocation à être conserver ? -- l'étape de déreplication dure quasiment autant de temps que le comptage mais ne laisse aucune trace de progression à l'utilisateur - ## commandes à ajouter -- filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés - - quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum - - aggregate : aggrege toutes les colonnes d'une matrice d'index en une seule colonne. - query : scan un fichier de sequences et retourne pour chaque sequence quels kmer sont présents dans l'index et dans quel genomes - -- distance : calcule la matrice de distance entre les genomes - - proposer une option pour chaque distance à calculer - - un possibité de récuperer la matrice des kmer communs - - un possibité de calculer l'arbre nj - - les matrices sont sauvegardées en CSV - - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche + --detail et --mismatch à implementer - status : affiche le statut de l'index diff --git a/docmd/architecture/query.md b/docmd/architecture/query.md new file mode 100644 index 0000000..6a11a11 --- /dev/null +++ b/docmd/architecture/query.md @@ -0,0 +1,111 @@ +# Query system + +## Goal + +Given a set of query sequences, determine for each sequence how many of its k-mers are found in the index and, for each indexed genome, how many k-mers match. The query system is the foundation for read classification and sequence-to-genome mapping. + +--- + +## Input + +- Query sequences in FASTA or FASTQ format (gzip supported, streaming stdin supported). +- Sequences shorter than k bases are silently skipped. +- Non-ACGT characters are handled by the superkmer decomposition layer: they act as hard breaks, producing shorter superkmers (identical to the behaviour at indexing time). + +--- + +## Algorithm + +The query follows the same superkmer-based partitioning strategy used at indexing time. + +``` +for each query sequence: + decompose into superkmers (non-ACGT breaks, same minimiser scheme as indexing) + for each superkmer: + route to partition p via minimiser hash + for each kmer in the superkmer: + lookup kmer in partition p (MPHF → evidence check → matrix row) + accumulate result into per-sequence accumulators + emit annotated sequence +``` + +Parallelism is **per sequence**: each worker thread handles all partitions of one sequence independently. This avoids cross-thread coordination when merging partial results and keeps memory usage proportional to the number of concurrent sequences rather than to the number of partitions. + +--- + +## Exact vs. approximate matching + +### Exact (default) + +Standard MPHF lookup followed by evidence check. O(1) per k-mer. + +### 1-mismatch (`--mismatch` flag) + +For each k-mer of the query, generate all `3·k` single-substitution variants. Each variant is canonicalised and looked up independently in the index. If one or more variants are found, their per-genome rows are **summed** into the result for that k-mer position. + +- If a k-mer matches exactly AND one of its variants also matches (distinct k-mers in the index), both contributions are accumulated. +- Exact and approximate matches are tracked separately in the output (see annotation schema below). +- The superkmer routing optimisation is **not** used in 1-mismatch mode: each variant is looked up directly via its own minimiser. +- Cost: up to `3·k` MPHF probes per k-mer position vs. 1 in exact mode. + +--- + +## Output format + +Output sequences are written in **OBITools4 format**: the original sequence with a JSON annotation map in the title line. + +``` +>read_id {"kmer_total":150,"kmer_found":59,...} +ATCGATCG... +``` + +Genome order in all list-valued annotations follows the genome order recorded in `index.meta`. + +--- + +## Annotation schema + +### Summary mode (default) + +| Key | Type | Condition | Semantics | +|---|---|---|---| +| `kmer_total` | int | always | total k-mers in the (masked) sequence | +| `kmer_found` | int | always | k-mers with at least one match (exact or approx) | +| `kmer_missing` | int | `--count-missing` | k-mers absent from the index | +| `kmer_match` | list[int] | always | per-genome matched k-mer count (exact + approx) | +| `kmer_match_exact` | list[int] | `--mismatch` | per-genome exact match count | +| `kmer_match_approx` | list[int] | `--mismatch` | per-genome approx match count | +| `count_match` | list[int] | count index | per-genome sum of index counts for matched k-mers | + +`kmer_match[i]` is the number of k-mer positions in the query that contribute at least one match to genome i. In 1-mismatch mode, a single k-mer position can contribute to multiple genomes if several of its variants are present in the index. + +`count_match[i]` sums raw index counts across all matched k-mer positions for genome i. Only meaningful for count indexes. + +### Detail mode (`--detail`) + +All summary keys, plus per-position coverage vectors — one list per genome, length `len(sequence) − k + 1`: + +| Key | Type | Condition | Semantics | +|---|---|---|---| +| `cov_` | list[int] | `--detail` | coverage at each k-mer position for genome i; raw count (count index) or 0/1 (presence index); 0 if absent | +| `cov_exact_` | list[int] | `--detail` + `--mismatch` | exact-match contribution per position | +| `cov_approx_` | list[int] | `--detail` + `--mismatch` | approx-match contribution per position | + +Genome indices in key names are 0-based integers matching the `index.meta` genome order. Genome labels are not used as key names to avoid issues with special characters in long or complex genome identifiers. + +--- + +## CLI + +``` +obikmer query -i [--summary | --detail] [--mismatch] [--count-missing] +``` + +`--summary` is the default; `--detail` implies `--summary` (all summary keys are always present). + +--- + +## Future work + +- **Read classification** (`--classify`): assign each read to the genome with the highest `kmer_match` score; emit as a single annotation key. +- **Whitelist / blacklist filtering**: accept or reject sequences based on whether their k-mer match score for a designated set of genomes exceeds a threshold. diff --git a/src/Cargo.lock b/src/Cargo.lock index 1de3042..a07c18c 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1514,6 +1514,7 @@ dependencies = [ "obisys", "pprof", "rayon", + "serde_json", "speedytree", "tracing", "tracing-subscriber", diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index 16c1154..07792a4 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -185,6 +185,11 @@ impl KmerIndex { Ok(()) } + /// Borrow the inner partition for direct superkmer-level queries. + pub fn partition(&self) -> &KmerPartition { + &self.partition + } + /// Path to the unitigs file for partition `part`, layer `layer`. pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf { self.partition.part_dir(part) diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index fffb28b..053bc97 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -19,6 +19,7 @@ obisys = { path = "../obisys" } obiskio = { path = "../obiskio" } obikindex = { path = "../obikindex" } clap = { version = "4", features = ["derive"] } +serde_json = "1" kodama = "0.2" speedytree = "0.1" rayon = "1" diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index ac4057e..b71a789 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -2,6 +2,7 @@ pub mod distance; pub mod dump; pub mod index; pub mod merge; +pub mod query; pub mod rebuild; pub mod superkmer; pub mod unitig; diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs new file mode 100644 index 0000000..2067067 --- /dev/null +++ b/src/obikmer/src/cmd/query.rs @@ -0,0 +1,281 @@ +use std::collections::HashMap; +use std::io::{self, BufWriter, Write}; +use std::path::PathBuf; + +use clap::Args; +use obikindex::KmerIndex; +use obiread::record::{SeqRecord, parse_chunk}; +use obiread::chunk::read_sequence_chunks; +use obikseq::{RoutableSuperKmer, set_k, set_m}; +use obiskbuilder::SuperKmerIter; +use tracing::info; + +// ── CLI ─────────────────────────────────────────────────────────────────────── + +#[derive(Args)] +pub struct QueryArgs { + /// Index directory + pub index: PathBuf, + + /// Input sequences (FASTA/FASTQ, optionally gzip-compressed) + #[arg(num_args = 1..)] + pub inputs: Vec, + + /// Also report per-position coverage vectors (cov_ per genome) + #[arg(long)] + pub detail: bool, + + /// Enable 1-mismatch approximate matching + #[arg(long)] + pub mismatch: bool, + + /// Count k-mers absent from the index (adds kmer_missing annotation) + #[arg(long)] + pub count_missing: bool, + + /// Report per-genome presence (0/1) instead of raw counts + #[arg(long)] + pub force_presence: bool, + + /// Minimum accumulated match count to declare a genome present (implies --force-presence) + #[arg(long, default_value_t = 1)] + pub presence_threshold: u32, + + /// Number of worker threads + #[arg( + short = 'T', + long, + default_value_t = std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) + )] + pub threads: usize, +} + +// ── SKDesc — one occurrence of a superkmer in the batch ─────────────────────── + +/// Describes one occurrence of a superkmer in the query batch. +pub struct SKDesc { + /// Index of the source sequence within the batch. + pub seq_idx: u32, + /// Kmer offset of the first kmer of this superkmer within its sequence. + /// Computed as the cumulative number of kmers emitted before this superkmer + /// in the same sequence. Used for `--detail` coverage vectors. + pub kmer_offset: u32, +} + +// ── QueryBatch ──────────────────────────────────────────────────────────────── + +/// A batch of query sequences with their superkmers deduplicated. +/// +/// Each unique `RoutableSuperKmer` maps to all the (seq_idx, kmer_offset) +/// positions it occupies across the batch. The superkmer is queried once +/// per partition; the result is broadcast to every SKDesc entry. +pub struct QueryBatch { + /// Sequence ids in batch order. + pub ids: Vec, + /// Raw sequence bytes (for output), in batch order. + pub seqs: Vec>, + /// Per-sequence total kmer count (kmer_count + kmer_missing). + pub n_kmers: Vec, + /// Deduplicated superkmer map. + pub map: HashMap>, +} + +impl QueryBatch { + /// Build a batch from a vec of parsed sequence records. + pub fn from_records( + records: Vec, + k: usize, + level_max: usize, + theta: f64, + ) -> Self { + let mut ids = Vec::with_capacity(records.len()); + let mut seqs = Vec::with_capacity(records.len()); + let mut n_kmers = Vec::with_capacity(records.len()); + let mut map: HashMap> = HashMap::new(); + + for (seq_idx, record) in records.into_iter().enumerate() { + let mut kmer_offset = 0u32; + + for rsk in SuperKmerIter::new(&record.normalized, k, level_max, theta) { + let n = (rsk.seql() - k + 1) as u32; + map.entry(rsk) + .or_default() + .push(SKDesc { seq_idx: seq_idx as u32, kmer_offset }); + kmer_offset += n; + } + + ids.push(record.id); + seqs.push(record.sequence); + n_kmers.push(kmer_offset); + } + + Self { ids, seqs, n_kmers, map } + } + + /// Split the superkmer map by partition index. + /// + /// Returns a vec of length `n_partitions`; each slot holds the RSK refs + /// whose minimizer routes to that partition. + pub fn split_by_partition(&self, n_partitions: usize) -> Vec> { + let mask = (n_partitions as u64) - 1; + let mut by_part: Vec> = vec![Vec::new(); n_partitions]; + for rsk in self.map.keys() { + let part = (rsk.minimizer().seq_hash() & mask) as usize; + by_part[part].push(rsk); + } + by_part + } +} + +// ── Per-sequence accumulator ────────────────────────────────────────────────── + +struct SeqAcc { + kmer_count: u32, + kmer_missing: u32, + /// Per-genome accumulated count (count mode) or presence sum (presence mode). + genome_totals: Vec, +} + +impl SeqAcc { + fn new(n_genomes: usize) -> Self { + Self { + kmer_count: 0, + kmer_missing: 0, + genome_totals: vec![0u32; n_genomes], + } + } +} + +// ── Entry point ─────────────────────────────────────────────────────────────── + +pub fn run(args: QueryArgs) { + let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + set_k(idx.kmer_size()); + set_m(idx.minimizer_size()); + + let k = idx.kmer_size(); + let n_genomes = idx.meta().genomes.len(); + let n_partitions = idx.n_partitions(); + let with_counts = idx.meta().config.with_counts; + + info!( + "query: k={k}, {} genome(s), with_counts={with_counts}, mismatch={}, detail={}", + n_genomes, args.mismatch, args.detail + ); + + if args.mismatch { + eprintln!("warning: --mismatch not yet implemented, ignored"); + } + if args.detail { + eprintln!("warning: --detail not yet implemented, ignored"); + } + + let paths: Vec = args.inputs.iter().map(PathBuf::from).collect(); + let mut out = BufWriter::new(io::stdout()); + + for path in &paths { + let chunks = read_sequence_chunks(path.to_str().unwrap_or("")) + .unwrap_or_else(|e| { + eprintln!("error opening {}: {e}", path.display()); + std::process::exit(1); + }); + + for chunk_result in chunks { + let chunk = chunk_result.unwrap_or_else(|e| { + eprintln!("read error: {e}"); + std::process::exit(1); + }); + + let records = parse_chunk(&chunk, k); + if records.is_empty() { + continue; + } + + let batch = QueryBatch::from_records(records, k, 6, 0.7); + let n_seqs = batch.ids.len(); + + let mut accs: Vec = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); + + let by_part = batch.split_by_partition(n_partitions); + + for (part_idx, part_sks) in by_part.iter().enumerate() { + if part_sks.is_empty() { + continue; + } + + let kmer_results = idx + .partition() + .query_partition(part_idx, part_sks, k, n_genomes, with_counts) + .unwrap_or_else(|e| { + eprintln!("query error on partition {part_idx}: {e}"); + std::process::exit(1); + }); + + let presence = args.force_presence || !with_counts; + let threshold = args.presence_threshold; + + for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) { + let descs = batch.map.get(*rsk).expect("rsk must be in map"); + for desc in descs { + let acc = &mut accs[desc.seq_idx as usize]; + for hit in sk_kmer_results.iter() { + match hit { + None => acc.kmer_missing += 1, + Some(row) => { + acc.kmer_count += 1; + for (g, &v) in row.iter().enumerate() { + acc.genome_totals[g] += if presence { + u32::from(v >= threshold) + } else { + v + }; + } + } + } + } + } + } + } + + emit_batch(&batch, &accs, idx.meta(), args.count_missing, &mut out); + } + } +} + +// ── Output ──────────────────────────────────────────────────────────────────── + +fn emit_batch( + batch: &QueryBatch, + accs: &[SeqAcc], + meta: &obikindex::meta::IndexMeta, + count_missing: bool, + out: &mut impl Write, +) { + for (seq_idx, (id, seq)) in batch.ids.iter().zip(batch.seqs.iter()).enumerate() { + let acc = &accs[seq_idx]; + let mut ann = serde_json::Map::new(); + + ann.insert("kmer_count".into(), acc.kmer_count.into()); + if count_missing { + ann.insert("kmer_missing".into(), acc.kmer_missing.into()); + } + let mut match_map = serde_json::Map::new(); + for (g, label) in meta.genomes.iter().enumerate() { + match_map.insert(label.clone(), acc.genome_totals[g].into()); + } + ann.insert("kmer_strict_matches".into(), match_map.into()); + + let ann_str = serde_json::to_string(&ann).unwrap_or_else(|_| "{}".to_string()); + + // OBITools4 FASTA format: >id {"key":value,...} + let _ = writeln!(out, ">{id} {ann_str}"); + let _ = out.write_all(seq); + let _ = out.write_all(b"\n"); + } +} diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index bae4cdf..1b8f833 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -22,6 +22,8 @@ enum Commands { Merge(cmd::merge::MergeArgs), /// Filter and compact an existing index into a new single-layer index Rebuild(cmd::rebuild::RebuildArgs), + /// Query an index with sequences and annotate matches + Query(cmd::query::QueryArgs), /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) Dump(cmd::dump::DumpArgs), /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees @@ -54,6 +56,7 @@ fn main() { Commands::Merge(args) => cmd::merge::run(args), Commands::Dump(args) => cmd::dump::run(args), Commands::Rebuild(args) => cmd::rebuild::run(args), + Commands::Query(args) => cmd::query::run(args), Commands::Distance(args) => cmd::distance::run(args), Commands::Unitig(args) => cmd::unitig::run(args), } diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 48d72c1..49a81fc 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -5,6 +5,7 @@ mod index_layer; mod kmer_sort; mod merge_layer; mod partition; +mod query_layer; mod rebuild_layer; pub use filter::KmerFilter; diff --git a/src/obikpartitionner/src/query_layer.rs b/src/obikpartitionner/src/query_layer.rs new file mode 100644 index 0000000..f18a0d5 --- /dev/null +++ b/src/obikpartitionner/src/query_layer.rs @@ -0,0 +1,120 @@ +use std::path::Path; + +use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix}; +use obikseq::{CanonicalKmer, RoutableSuperKmer}; +use obiskio::{SKError, SKResult}; +use obilayeredmap::{MphfLayer, OLMError}; +use obilayeredmap::meta::PartitionMeta; + +use crate::partition::KmerPartition; + +const INDEX_SUBDIR: &str = "index"; + +fn olm_to_sk(e: OLMError) -> SKError { + match e { + OLMError::Io(io_err) => SKError::Io(io_err), + other => SKError::InvalidData { context: "query", detail: other.to_string() }, + } +} + +// ── per-layer query handle ──────────────────────────────────────────────────── + +enum QueryLayer { + /// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome. + SetOnly(MphfLayer), + Presence(MphfLayer, PersistentBitMatrix), + Count(MphfLayer, PersistentCompactIntMatrix), +} + +impl QueryLayer { + fn open(layer_dir: &Path, with_counts: bool) -> SKResult { + let presence_dir = layer_dir.join("presence"); + let counts_dir = layer_dir.join("counts"); + + if with_counts && counts_dir.exists() { + let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; + Ok(QueryLayer::Count(mphf, mat)) + } else if presence_dir.exists() { + let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; + Ok(QueryLayer::Presence(mphf, mat)) + } else if counts_dir.exists() { + // presence query on a count index — return counts as-is + let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; + Ok(QueryLayer::Count(mphf, mat)) + } else { + let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + Ok(QueryLayer::SetOnly(mphf)) + } + } + + /// Return `Some(per-genome row)` if `kmer` is indexed in this layer, else `None`. + fn find(&self, kmer: CanonicalKmer, n_genomes: usize) -> Option> { + match self { + QueryLayer::SetOnly(mphf) => { + mphf.find(kmer) + .map(|_| vec![1u32; n_genomes].into_boxed_slice()) + } + QueryLayer::Presence(mphf, mat) => { + mphf.find(kmer) + .map(|slot| mat.row(slot).iter().map(|&b| b as u32).collect()) + } + QueryLayer::Count(mphf, mat) => { + mphf.find(kmer).map(|slot| mat.row(slot)) + } + } + } +} + +// ── KmerPartition::query_partition ─────────────────────────────────────────── + +impl KmerPartition { + /// Query a single partition for a slice of (already-routed) super-kmers. + /// + /// Returns one entry per input super-kmer; each entry is a `Vec` with one + /// `Option>` per k-mer inside that super-kmer: + /// - `None` — k-mer absent from the index + /// - `Some(row)` — per-genome count (count index) or 0/1 (presence index) + /// + /// All `superkmers` must belong to this partition (same minimizer bucket). + pub fn query_partition( + &self, + part_idx: usize, + superkmers: &[&RoutableSuperKmer], + k: usize, + n_genomes: usize, + with_counts: bool, + ) -> SKResult>>>> { + if superkmers.is_empty() { + return Ok(Vec::new()); + } + + let index_dir = self.part_dir(part_idx).join(INDEX_SUBDIR); + + if !index_dir.exists() { + return Ok(superkmers + .iter() + .map(|rsk| vec![None; rsk.seql() - k + 1]) + .collect()); + } + + let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?; + let layers: Vec = (0..meta.n_layers) + .map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts)) + .collect::>()?; + + Ok(superkmers + .iter() + .map(|rsk| { + rsk.superkmer() + .iter_canonical_kmers() + .map(|kmer| { + layers.iter().find_map(|layer| layer.find(kmer, n_genomes)) + }) + .collect() + }) + .collect()) + } +} diff --git a/src/obikseq/src/routable.rs b/src/obikseq/src/routable.rs index a4b2338..411fcb0 100644 --- a/src/obikseq/src/routable.rs +++ b/src/obikseq/src/routable.rs @@ -71,6 +71,20 @@ impl RoutableSuperKmer { } } +impl PartialEq for RoutableSuperKmer { + fn eq(&self, other: &Self) -> bool { + self.superkmer == other.superkmer + } +} + +impl Eq for RoutableSuperKmer {} + +impl std::hash::Hash for RoutableSuperKmer { + fn hash(&self, state: &mut H) { + self.superkmer.hash(state); + } +} + impl Sequence for RoutableSuperKmer { type Canonical = RoutableSuperKmer; diff --git a/src/obiread/src/lib.rs b/src/obiread/src/lib.rs index a07d250..771d839 100644 --- a/src/obiread/src/lib.rs +++ b/src/obiread/src/lib.rs @@ -12,6 +12,7 @@ mod mimetype; pub mod normalize; mod path_iterator; pub mod peakreader; +pub mod record; pub mod xopen; pub use chunk::{SeqChunkIter, fasta_chunks, fastq_chunks, diff --git a/src/obiread/src/record.rs b/src/obiread/src/record.rs new file mode 100644 index 0000000..fb5e141 --- /dev/null +++ b/src/obiread/src/record.rs @@ -0,0 +1,222 @@ +//! Per-sequence record parser for FASTA and FASTQ chunks. +//! +//! Same automaton structure as `normalize.rs` — only the actions differ: +//! instead of writing into a single flat rope, we accumulate per-sequence +//! data (id, raw ASCII, normalised ACGT\x00 rope). + +use obikrope::{ForwardCursor, Rope, RopeCursor}; + +/// One sequence record extracted from a FASTA or FASTQ chunk. +pub struct SeqRecord { + /// Sequence identifier (everything before the first space in the header). + pub id: String, + /// Raw sequence bytes, newlines stripped, non-ACGT characters preserved. + /// Reproduced verbatim in query output. + pub sequence: Vec, + /// Per-sequence normalised rope: uppercase ACGT segments of length ≥ k + /// separated by `\x00`. Ready for `SuperKmerIter`. + pub normalized: Rope, +} + +/// Parse all records from a FASTA or FASTQ chunk rope. +/// Returns an empty vec if the rope carries no recognised mime type. +pub fn parse_chunk(rope: &Rope, k: usize) -> Vec { + let cursor = rope.fw_cursor(); + match rope.mime_type() { + Some("text/fasta") => parse_fasta(cursor, k), + Some("text/fastq") => parse_fastq(cursor, k), + _ => vec![], + } +} + +// ── Shared state accumulated while scanning one sequence ────────────────────── + +struct RecordBuilder { + id: String, + sequence: Vec, // raw ASCII, no newlines + norm: Vec, // ACGT\x00 segments being built + seg_start: usize, // index in norm where current segment started + k: usize, +} + +impl RecordBuilder { + fn new(k: usize) -> Self { + Self { id: String::new(), sequence: Vec::new(), norm: Vec::new(), seg_start: 0, k } + } + + fn reset(&mut self, id: String) { + self.id = id; + self.sequence.clear(); + self.norm.clear(); + self.seg_start = 0; + } + + /// Push one accepted ACGT byte. + fn push_acgt(&mut self, b: u8) { + self.sequence.push(b); + self.norm.push(b); + } + + /// Push one non-ACGT byte to the raw sequence only (not to the norm buffer). + fn push_raw(&mut self, b: u8) { + self.sequence.push(b); + } + + /// Close the current ACGT segment (same logic as `end_segment` in normalize.rs). + fn end_segment(&mut self) { + if self.norm.len() - self.seg_start >= self.k { + self.norm.push(0x00); + self.seg_start = self.norm.len(); + } else { + self.norm.truncate(self.seg_start); + } + } + + /// Consume into a SeqRecord. Closes any open segment first. + fn finish(mut self) -> Option { + self.end_segment(); + if self.id.is_empty() { + return None; + } + let mut rope = Rope::new(None); + if !self.norm.is_empty() { + rope.push(self.norm); + } + Some(SeqRecord { id: self.id, sequence: self.sequence, normalized: rope }) + } +} + +// ── FASTA automaton ─────────────────────────────────────────────────────────── + +fn parse_fasta(cursor: ForwardCursor<'_>, k: usize) -> Vec { + let mut records: Vec = Vec::new(); + let mut builder = RecordBuilder::new(k); + + // skip up to (and including) the first '>' + loop { + match cursor.read_next().ok() { + None => return records, + Some(b'>') => break, + Some(_) => {} + } + } + + // read first id — read_id already consumes the full header line + builder.id = read_id(&cursor); + + loop { + match cursor.read_next().ok() { + None => { + // EOF — close final segment and emit + if let Some(rec) = builder.finish() { + records.push(rec); + } + return records; + } + Some(b'\n') | Some(b'\r') => { + // peek: next non-empty char determines if new record starts + match cursor.read_ahead(1).ok() { + Some(b'>') => { + // end of current record + builder.end_segment(); + if let Some(rec) = builder.finish() { + records.push(rec); + } + cursor.read_next().ok(); // consume '>' + let id = read_id(&cursor); // already consumes header line + builder = RecordBuilder::new(k); + builder.reset(id); + } + None => { + builder.end_segment(); + if let Some(rec) = builder.finish() { + records.push(rec); + } + return records; + } + Some(_) => {} // continuation line — do nothing + } + } + Some(b) => { + let upper = b & !0x20u8; + if matches!(upper, b'A' | b'C' | b'G' | b'T') { + builder.push_acgt(upper); + } else { + builder.push_raw(b); + builder.end_segment(); + } + } + } + } +} + +// ── FASTQ automaton ─────────────────────────────────────────────────────────── + +fn parse_fastq(cursor: ForwardCursor<'_>, k: usize) -> Vec { + let mut records: Vec = Vec::new(); + + loop { + // find '@' + loop { + match cursor.read_next().ok() { + None => return records, + Some(b'@') => break, + Some(_) => {} + } + } + + let mut builder = RecordBuilder::new(k); + builder.id = read_id(&cursor); // already consumes the full header line + + // sequence line — stop at newline, non-ACGT breaks segment + loop { + match cursor.read_next().ok() { + None | Some(b'\n') | Some(b'\r') => { + builder.end_segment(); + break; + } + Some(b) => { + let upper = b & !0x20u8; + if matches!(upper, b'A' | b'C' | b'G' | b'T') { + builder.push_acgt(upper); + } else { + builder.push_raw(b); + builder.end_segment(); + } + } + } + } + + skip_line(&cursor); // '+' line + skip_line(&cursor); // quality line + + if let Some(rec) = builder.finish() { + records.push(rec); + } + } +} + +// ── Helpers ─────────────────────────────────────────────────────────────────── + +fn read_id(cursor: &ForwardCursor<'_>) -> String { + let mut id = Vec::new(); + loop { + match cursor.read_next().ok() { + None | Some(b'\n') | Some(b'\r') => break, + Some(b' ') | Some(b'\t') => { + skip_line(cursor); + break; + } + Some(b) => id.push(b), + } + } + String::from_utf8_lossy(&id).into_owned() +} + +fn skip_line(cursor: &ForwardCursor<'_>) { + while let Some(c) = cursor.read_next().ok() { + if c == b'\n' { + return; + } + } +} -- 2.52.0