feat: add merge command to consolidate k-mer indexes

Introduces a new `merge` CLI subcommand and underlying implementation to consolidate multiple pre-indexed k-mer indexes into a single output. Adds `append_column` methods to persistent bit and int matrices to enable incremental genome column expansion without rebuilding the MPHF. Includes new error variants for index readiness and configuration mismatches, adds a `--force` flag to the index command, and updates documentation and navigation structure accordingly.
This commit is contained in:
Eric Coissac
2026-05-21 05:53:55 +02:00
parent bfa436ad15
commit e1d59fde54
17 changed files with 799 additions and 8 deletions
+13 -3
View File
@@ -8,6 +8,12 @@ pub enum OKIError {
Io(io::Error),
Json(serde_json::Error),
Partition(SKError),
/// Source index is not in `Indexed` state.
NotIndexed(std::path::PathBuf),
/// Source indexes have incompatible configurations (k, m, n_bits).
IncompatibleConfig,
/// Count mode requested but a source index lacks count data.
MismatchedMode,
}
pub type OKIResult<T> = Result<T, OKIError>;
@@ -15,9 +21,12 @@ pub type OKIResult<T> = Result<T, OKIError>;
impl fmt::Display for OKIError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
OKIError::Io(e) => write!(f, "I/O error: {e}"),
OKIError::Json(e) => write!(f, "JSON error: {e}"),
OKIError::Partition(e) => write!(f, "partition error: {e}"),
OKIError::Io(e) => write!(f, "I/O error: {e}"),
OKIError::Json(e) => write!(f, "JSON error: {e}"),
OKIError::Partition(e) => write!(f, "partition error: {e}"),
OKIError::NotIndexed(p) => write!(f, "index not fully built: {}", p.display()),
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
}
}
}
@@ -28,6 +37,7 @@ impl std::error::Error for OKIError {
OKIError::Io(e) => Some(e),
OKIError::Json(e) => Some(e),
OKIError::Partition(e) => Some(e),
_ => None,
}
}
}
+3 -3
View File
@@ -15,9 +15,9 @@ use crate::meta::{IndexConfig, IndexMeta};
use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
pub struct KmerIndex {
root_path: PathBuf,
meta: IndexMeta,
partition: KmerPartition,
pub(crate) root_path: PathBuf,
pub(crate) meta: IndexMeta,
pub(crate) partition: KmerPartition,
}
impl KmerIndex {
+2
View File
@@ -2,8 +2,10 @@ pub mod error;
pub mod meta;
pub mod state;
mod index;
mod merge;
pub use error::{OKIError, OKIResult};
pub use index::KmerIndex;
pub use merge::MergeMode;
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
+131
View File
@@ -0,0 +1,131 @@
use std::fs;
use std::io;
use std::path::Path;
use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::IndexMeta;
use crate::state::IndexState;
pub use obikpartitionner::MergeMode;
impl KmerIndex {
/// Merge `sources` into a new index at `output`.
///
/// All sources must be in `Indexed` state and share the same `kmer_size`,
/// `minimizer_size`, and `n_partitions`. Count mode additionally requires
/// every source to have `with_counts = true`.
///
/// The first source is copied to `output`, then each subsequent source is
/// merged partition-by-partition in parallel.
pub fn merge<P: AsRef<Path>>(
output: P,
sources: &[&KmerIndex],
mode: MergeMode,
force: bool,
) -> OKIResult<Self> {
let output = output.as_ref();
if sources.is_empty() {
return Err(OKIError::Io(io::Error::new(
io::ErrorKind::InvalidInput,
"merge requires at least one source index",
)));
}
// ── Validate ──────────────────────────────────────────────────────────
let ref0 = sources[0];
for src in sources {
if src.state() != IndexState::Indexed {
return Err(OKIError::NotIndexed(src.root_path.clone()));
}
if src.kmer_size() != ref0.kmer_size()
|| src.minimizer_size() != ref0.minimizer_size()
|| src.n_partitions() != ref0.n_partitions()
{
return Err(OKIError::IncompatibleConfig);
}
if mode == MergeMode::Count && !src.meta.config.with_counts {
return Err(OKIError::MismatchedMode);
}
}
// ── Prepare output directory ──────────────────────────────────────────
if output.exists() {
if force {
fs::remove_dir_all(output)?;
} else {
return Err(OKIError::Io(io::Error::new(
io::ErrorKind::AlreadyExists,
format!("{}: output directory already exists", output.display()),
)));
}
}
// ── Bootstrap: copy first source to output ────────────────────────────
info!("copying {} → {}", sources[0].root_path.display(), output.display());
copy_dir_all(&sources[0].root_path, output)?;
// Rewrite index.meta with all genome labels.
let all_genomes: Vec<String> = sources
.iter()
.flat_map(|s| s.meta.genomes.iter().cloned())
.collect();
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
meta.genomes = all_genomes;
meta.write(output)?;
// Open the destination index.
let dst = KmerIndex::open(output)?;
let n_partitions = dst.n_partitions();
// ── Merge each subsequent source partition-by-partition ───────────────
let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
if !remaining_sources.is_empty() {
let mut rep = Reporter::new();
let t = Stage::start("merge_partitions");
let dst_partition = &dst.partition;
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let srcs: Vec<&obikpartitionner::KmerPartition> =
remaining_sources.iter().map(|s| &s.partition).collect();
// n_dst_genomes = 1 (copied from source_0 only)
dst_partition.merge_partition(i, &srcs, mode, 1).err()
})
.collect();
if let Some(e) = errors.into_iter().next() {
return Err(OKIError::Partition(e));
}
rep.push(t.stop());
}
// Re-open to get the updated state.
KmerIndex::open(output)
}
}
// ── Directory copy ────────────────────────────────────────────────────────────
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
fs::create_dir_all(dst)?;
for entry in fs::read_dir(src)? {
let entry = entry?;
let src_path = entry.path();
let dst_path = dst.join(entry.file_name());
if src_path.is_dir() {
copy_dir_all(&src_path, &dst_path)?;
} else {
fs::copy(&src_path, &dst_path)?;
}
}
Ok(())
}