feat: add merge command to consolidate k-mer indexes

Introduces a new `merge` CLI subcommand and underlying implementation to consolidate multiple pre-indexed k-mer indexes into a single output. Adds `append_column` methods to persistent bit and int matrices to enable incremental genome column expansion without rebuilding the MPHF. Includes new error variants for index readiness and configuration mismatches, adds a `--force` flag to the index command, and updates documentation and navigation structure accordingly.
This commit is contained in:
Eric Coissac
2026-05-21 05:53:55 +02:00
parent bfa436ad15
commit e1d59fde54
17 changed files with 799 additions and 8 deletions
+39
View File
@@ -1,4 +1,5 @@
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use obicompactvec::{
@@ -83,6 +84,22 @@ impl Layer<()> {
/// Delegate construction of this layer to `MphfLayer::build` for `out_dir`.
///
/// The callback handed to `MphfLayer::build` ignores both of its arguments
/// and always returns `Ok(())`.
///
/// NOTE(review): the meaning of the returned `usize` is defined by
/// `MphfLayer::build` (presumably the number of indexed kmers) — confirm.
pub fn build(out_dir: &Path) -> OLMResult<usize> {
MphfLayer::build(out_dir, &mut |_, _| Ok(()))
}
/// Initialise the presence matrix of a set-membership layer (first merge).
///
/// A single column is written under `layer_dir/PRESENCE_DIR` (the directory
/// is created if missing) and each of its `n_kmers` slots is set to `true`:
/// every kmer of this layer belongs to genome_0, so genome_0 is present at
/// every slot.
///
/// # Errors
///
/// Any I/O error raised while creating the directory, building the matrix,
/// or closing the column/builder is returned wrapped in `OLMError::Io`.
pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> {
    let target_dir = layer_dir.join(PRESENCE_DIR);
    fs::create_dir_all(&target_dir).map_err(OLMError::Io)?;

    let mut builder =
        PersistentBitMatrixBuilder::new(n_kmers, &target_dir).map_err(OLMError::Io)?;
    let mut genome0_col = builder.add_col().map_err(OLMError::Io)?;

    // Mark genome_0 as present at every slot of the layer.
    (0..n_kmers).for_each(|slot| {
        genome0_col.set(slot, true);
    });

    // The column is closed before the builder that produced it.
    genome0_col.close().map_err(OLMError::Io)?;
    builder.close().map_err(OLMError::Io)
}
}
// ── Mode 2 — count matrix (1 column per layer) ────────────────────────────────
@@ -111,9 +128,31 @@ impl Layer<PersistentCompactIntMatrix> {
}
}
// ── Mode 2 — count matrix column append ──────────────────────────────────────
impl Layer<PersistentCompactIntMatrix> {
    /// Add one more genome column to the count matrix stored in this layer.
    ///
    /// The matrix is looked up under `layer_dir/COUNTS_DIR` and the work is
    /// delegated to `PersistentCompactIntMatrix::append_column`, which
    /// receives `value_of` unchanged.
    ///
    /// NOTE(review): `value_of` is presumably called with a kmer slot index
    /// and returns the count for that slot — confirm against `append_column`.
    ///
    /// # Errors
    ///
    /// An I/O error from `append_column` is returned wrapped in
    /// `OLMError::Io`.
    pub fn append_genome_column(
        layer_dir: &Path,
        value_of: impl Fn(usize) -> u32,
    ) -> OLMResult<()> {
        let counts_dir = layer_dir.join(COUNTS_DIR);
        let appended = PersistentCompactIntMatrix::append_column(&counts_dir, value_of);
        appended.map_err(OLMError::Io)
    }
}
// ── Mode 3 — presence/absence matrix (1 column per genome) ───────────────────
impl Layer<PersistentBitMatrix> {
/// Add one more genome column to the presence matrix stored in this layer.
///
/// The matrix is looked up under `layer_dir/PRESENCE_DIR` and the work is
/// delegated to `PersistentBitMatrix::append_column`, which receives
/// `value_of` unchanged.
///
/// NOTE(review): `value_of` is presumably called with a kmer slot index and
/// returns whether the genome is present there — confirm against
/// `append_column`.
///
/// # Errors
///
/// An I/O error from `append_column` is returned wrapped in `OLMError::Io`.
pub fn append_genome_column(
    layer_dir: &Path,
    value_of: impl Fn(usize) -> bool,
) -> OLMResult<()> {
    let presence_dir = layer_dir.join(PRESENCE_DIR);
    let appended = PersistentBitMatrix::append_column(&presence_dir, value_of);
    appended.map_err(OLMError::Io)
}
pub fn build_presence(
out_dir: &Path,
n_genomes: usize,