feat: add merge command to consolidate k-mer indexes

Introduces a new `merge` CLI subcommand and underlying implementation to consolidate multiple pre-indexed k-mer indexes into a single output. Adds `append_column` methods to persistent bit and int matrices to enable incremental genome column expansion without rebuilding the MPHF. Includes new error variants for index readiness and configuration mismatches, adds a `--force` flag to the index command, and updates documentation and navigation structure accordingly.
This commit is contained in:
Eric Coissac
2026-05-21 05:53:55 +02:00
parent bfa436ad15
commit e1d59fde54
17 changed files with 799 additions and 8 deletions
+2
View File
@@ -1,5 +1,7 @@
mod index_layer;
mod kmer_sort;
mod merge_layer;
mod partition;
pub use merge_layer::MergeMode;
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
+294
View File
@@ -0,0 +1,294 @@
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use obidebruinj::GraphDeBruijn;
use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
PersistentBitVecBuilder,
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
PersistentCompactIntVecBuilder};
use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{Layer, LayeredMap, MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta;
use crate::partition::KmerPartition;
// ── MergeMode ─────────────────────────────────────────────────────────────────
/// Kind of per-genome data written into the matrices during a merge.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MergeMode {
    /// Columns are bit vectors stored in a layer's `presence` directory.
    Presence,
    /// Columns are compact integer vectors stored in a layer's `counts` directory.
    Count,
}
// ── ColBuilder — enum dispatch to avoid trait-object boxing issues ─────────────
/// One matrix column under construction.
///
/// A plain enum is used to dispatch between the two builder types instead of
/// a boxed trait object.
enum ColBuilder {
    /// Bit-vector column, created in [`MergeMode::Presence`].
    Bit(PersistentBitVecBuilder),
    /// Compact integer column, created in [`MergeMode::Count`].
    Int(PersistentCompactIntVecBuilder),
}
impl ColBuilder {
    /// Store `value` at `slot`.
    ///
    /// A bit column records whether `value` is non-zero; an integer column
    /// records `value` itself.
    fn set_val(&mut self, slot: usize, value: u32) {
        match self {
            Self::Bit(bits) => bits.set(slot, value != 0),
            Self::Int(ints) => ints.set(slot, value),
        }
    }

    /// Consume the builder and close the underlying column, reporting an I/O
    /// failure as [`SKError::Io`].
    fn close(self) -> SKResult<()> {
        let closed = match self {
            Self::Bit(bits) => bits.close(),
            Self::Int(ints) => ints.close(),
        };
        closed.map_err(SKError::Io)
    }
}
// ── helpers ───────────────────────────────────────────────────────────────────
/// Name of the sub-directory of a partition directory that holds its layered index.
const INDEX_SUBDIR: &str = "index";
/// Convert a layered-map error into an [`SKError`].
///
/// I/O errors keep their payload; every other variant is rendered to text and
/// reported as `InvalidData` with the `"merge"` context.
fn olm_to_sk(e: OLMError) -> SKError {
    match e {
        OLMError::Io(io_err) => SKError::Io(io_err),
        non_io => {
            let detail = non_io.to_string();
            SKError::InvalidData { context: "merge", detail }
        }
    }
}
/// Path of bit-vector column number `col` inside the matrix directory `dir`.
///
/// The file name is `col_` followed by the column number zero-padded to at
/// least six digits, with the `.pbiv` extension.
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{col:06}.pbiv");
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
/// Path of compact-integer column number `col` inside the matrix directory `dir`.
///
/// The file name is `col_` followed by the column number zero-padded to at
/// least six digits, with the `.pciv` extension.
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{col:06}.pciv");
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
/// Write (or overwrite) `meta.json` in the matrix directory `dir`.
///
/// The file holds a single JSON object with the number of slots `n` and the
/// number of columns `n_cols`, followed by a newline.
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
    let meta_file = dir.join("meta.json");
    let payload = format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n");
    fs::write(meta_file, payload)
}
// ── KmerPartition::merge_partition ────────────────────────────────────────────
impl KmerPartition {
/// Merge the kmers of `sources` into partition `i` of this (destination) index.
///
/// # Parameters
/// - `i`: partition number; the same number is used for `self` and for every
///   source.
/// - `sources`: source partitions. Source number `g` (position in the slice)
///   is written to matrix column `n_dst_genomes + g`.
/// - `mode`: selects bit columns in the `presence` directory or integer
///   columns in the `counts` directory of each layer.
/// - `n_dst_genomes`: the number of genome columns already in the dst
///   matrices (1 after copying source_0, more for subsequent merges).
///
/// # Algorithm
/// Two passes over the source unitig files:
/// 1. Classify each source kmer as dst-hit or new → build de Bruijn graph
///    of new kmers → write unitigs → build MPHF for the new layer.
/// 2. Iterate source kmers again → fill per-genome column builders
///    (memory-mapped) → close → update matrix metadata.
///
/// # Returns
/// `Ok(())` on success. Also returns `Ok(())` without touching anything when
/// the destination partition has no `index` sub-directory.
///
/// # Errors
/// Any I/O failure, and any error reported by the layered map, the unitig
/// reader/writer or the column builders, is returned as an `SKError`
/// (layered-map errors are converted by `olm_to_sk`).
///
/// NOTE(review): the steps below write to disk one after the other; nothing
/// visible here rolls back earlier steps if a later one fails.
pub fn merge_partition(
&self,
i: usize,
sources: &[&KmerPartition],
mode: MergeMode,
n_dst_genomes: usize,
) -> SKResult<()> {
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
// NOTE(review): when the destination has no index for this partition the
// sources are not read at all, so their kmers for partition `i` are not
// merged — confirm this is intended.
if !dst_index_dir.exists() {
return Ok(());
}
let dst_map = LayeredMap::<()>::open(&dst_index_dir).map_err(olm_to_sk)?;
let n_dst_layers = dst_map.n_layers();
let n_src = sources.len();
// First merge in presence mode: init presence matrices on existing layers
// (all slots true — every kmer in a layer belongs to genome_0).
if n_dst_genomes == 1 && mode == MergeMode::Presence {
for l in 0..n_dst_layers {
let layer_dir = dst_index_dir.join(format!("layer_{l}"));
Layer::<()>::init_presence_matrix(&layer_dir, dst_map.layer(l).n())
.map_err(olm_to_sk)?;
}
}
// ── Pass 1: classify kmers, build new-kmer de Bruijn graph ───────────
let mut g = GraphDeBruijn::new();
let mut any_new = false;
for src in sources.iter() {
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
// A source without an index for this partition contributes no kmers.
if !src_index_dir.exists() { continue; }
let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
for l in 0..src_meta.n_layers {
let unitigs_path = src_index_dir
.join(format!("layer_{l}")).join("unitigs.bin");
let reader = UnitigFileReader::open(&unitigs_path)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
// Only kmers absent from every destination layer go into the graph.
if dst_map.query(kmer).is_none() {
g.push(kmer);
any_new = true;
}
}
}
}
// Build new layer from de Bruijn graph if there are new kmers.
// The new layer is appended after the existing ones (index n_dst_layers).
let new_layer_idx = n_dst_layers;
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
if any_new {
g.compute_degrees();
fs::create_dir_all(&new_layer_dir)?;
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
for unitig in g.iter_unitig() {
uw.write(&unitig)?;
}
uw.close()?;
Layer::<()>::build(&new_layer_dir).map_err(olm_to_sk)?;
}
// The graph is no longer needed once the unitigs are written; release it
// before the column builders are created.
drop(g);
let new_mphf = if any_new {
Some(MphfLayer::open(&new_layer_dir).map_err(olm_to_sk)?)
} else {
None
};
// Number of slots in the new layer (0 when no new layer was built).
let n_new = new_mphf.as_ref().map_or(0, |m| m.n());
// ── Prepare matrix directories for the new layer ──────────────────────
// Absent columns (dst genomes) are written now via append_column (all-zero/false).
// Source-genome columns are created as mutable builders for pass 2.
let mut new_src_builders: Vec<ColBuilder> = if any_new {
let data_dir = match mode {
MergeMode::Presence => new_layer_dir.join("presence"),
MergeMode::Count => new_layer_dir.join("counts"),
};
fs::create_dir_all(&data_dir)?;
// Bootstrap meta with 0 cols.
match mode {
MergeMode::Presence => {
PersistentBitMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
// New kmers are absent from every genome already in dst.
for _ in 0..n_dst_genomes {
PersistentBitMatrix::append_column(&data_dir, |_| false)
.map_err(SKError::Io)?;
}
// One builder per source, at column n_dst_genomes + g.
(0..n_src).map(|g| -> SKResult<ColBuilder> {
let b = PersistentBitVecBuilder::new(
n_new, &col_path_bit(&data_dir, n_dst_genomes + g))?;
Ok(ColBuilder::Bit(b))
}).collect::<SKResult<_>>()?
}
MergeMode::Count => {
PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
// New kmers have count 0 in every genome already in dst.
for _ in 0..n_dst_genomes {
PersistentCompactIntMatrix::append_column(&data_dir, |_| 0)
.map_err(SKError::Io)?;
}
// One builder per source, at column n_dst_genomes + g.
(0..n_src).map(|g| -> SKResult<ColBuilder> {
let b = PersistentCompactIntVecBuilder::new(
n_new, &col_path_int(&data_dir, n_dst_genomes + g))?;
Ok(ColBuilder::Int(b))
}).collect::<SKResult<_>>()?
}
}
} else {
vec![]
};
// Builders for existing layers: one per (layer, src_genome).
// Invariant: existing layers already have exactly n_dst_genomes columns.
// New source columns go at positions n_dst_genomes .. n_dst_genomes+n_src-1.
// Indexed as exist_builders[layer][src_genome].
let mut exist_builders: Vec<Vec<ColBuilder>> = (0..n_dst_layers)
.map(|l| {
let layer_dir = dst_index_dir.join(format!("layer_{l}"));
let n = dst_map.layer(l).n();
(0..n_src).map(|src_g| -> SKResult<ColBuilder> {
match mode {
MergeMode::Presence => {
let data_dir = layer_dir.join("presence");
let b = PersistentBitVecBuilder::new(
n, &col_path_bit(&data_dir, n_dst_genomes + src_g))?;
Ok(ColBuilder::Bit(b))
}
MergeMode::Count => {
let data_dir = layer_dir.join("counts");
let b = PersistentCompactIntVecBuilder::new(
n, &col_path_int(&data_dir, n_dst_genomes + src_g))?;
Ok(ColBuilder::Int(b))
}
}
}).collect::<SKResult<_>>()
})
.collect::<SKResult<_>>()?;
// ── Pass 2: fill builders ─────────────────────────────────────────────
for (src_g, src) in sources.iter().enumerate() {
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
// The builders of a skipped source are never written to; they are still
// closed below, so the source keeps a column in every layer.
if !src_index_dir.exists() { continue; }
let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let reader = UnitigFileReader::open(&src_layer_dir.join("unitigs.bin"))?;
// Open source MPHF + count matrix for count mode.
// Stays None in presence mode, or when the source layer has no `counts` directory.
let src_count_data: Option<(MphfLayer, PersistentCompactIntMatrix)> =
if mode == MergeMode::Count {
let counts_dir = src_layer_dir.join("counts");
if counts_dir.exists() {
let mphf = MphfLayer::open(&src_layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir)
.map_err(SKError::Io)?;
Some((mphf, mat))
} else {
None
}
} else {
None
};
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
// Value to store: the source count when available, otherwise 1.
// NOTE(review): only column 0 of the source count matrix is read —
// presumably each source holds a single genome; confirm.
// NOTE(review): a kmer the source MPHF cannot find falls back to 1
// instead of raising an error — confirm this is intended.
let value: u32 = match &src_count_data {
Some((mphf, counts)) => {
mphf.find(kmer).map(|s| counts.col(0).get(s)).unwrap_or(1)
}
None => 1,
};
// Kmers already in dst go to the column of the layer that holds them;
// all others go to the new layer, at the slot given by its MPHF.
if let Some((dst_layer, hit)) = dst_map.query(kmer) {
exist_builders[dst_layer][src_g].set_val(hit.slot, value);
} else if let Some(ref mphf) = new_mphf {
if let Some(slot) = mphf.find(kmer) {
new_src_builders[src_g].set_val(slot, value);
}
}
}
}
}
// ── Close builders and update metadata ────────────────────────────────
for (l, builders) in exist_builders.into_iter().enumerate() {
let layer_dir = dst_index_dir.join(format!("layer_{l}"));
for b in builders { b.close()?; }
// Update the matrix meta to reflect the n_src new columns.
let n = dst_map.layer(l).n();
let data_dir = match mode {
MergeMode::Presence => layer_dir.join("presence"),
MergeMode::Count => layer_dir.join("counts"),
};
write_matrix_meta(&data_dir, n, n_dst_genomes + n_src).map_err(SKError::Io)?;
}
for b in new_src_builders { b.close()?; }
// new layer matrix meta was already written by append_column calls above
// with n_dst_genomes cols; update to n_dst_genomes + n_src.
if any_new {
let data_dir = match mode {
MergeMode::Presence => new_layer_dir.join("presence"),
MergeMode::Count => new_layer_dir.join("counts"),
};
write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src).map_err(SKError::Io)?;
// Register the new layer in the partition metadata last, once its
// matrix files and metadata are on disk.
let mut part_meta = PartitionMeta::load(&dst_index_dir).map_err(olm_to_sk)?;
part_meta.n_layers = new_layer_idx + 1;
part_meta.save(&dst_index_dir).map_err(olm_to_sk)?;
}
Ok(())
}
}