feat: add merge command to consolidate k-mer indexes
Introduces a new `merge` CLI subcommand and underlying implementation to consolidate multiple pre-indexed k-mer indexes into a single output. Adds `append_column` methods to persistent bit and int matrices to enable incremental genome column expansion without rebuilding the MPHF. Includes new error variants for index readiness and configuration mismatches, adds a `--force` flag to the index command, and updates documentation and navigation structure accordingly.
This commit is contained in:
@@ -0,0 +1,131 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use obisys::{Reporter, Stage};
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
use crate::meta::IndexMeta;
|
||||
use crate::state::IndexState;
|
||||
|
||||
pub use obikpartitionner::MergeMode;
|
||||
|
||||
impl KmerIndex {
|
||||
/// Merge `sources` into a new index at `output`.
|
||||
///
|
||||
/// All sources must be in `Indexed` state and share the same `kmer_size`,
|
||||
/// `minimizer_size`, and `n_partitions`. Count mode additionally requires
|
||||
/// every source to have `with_counts = true`.
|
||||
///
|
||||
/// The first source is copied to `output`, then each subsequent source is
|
||||
/// merged partition-by-partition in parallel.
|
||||
pub fn merge<P: AsRef<Path>>(
|
||||
output: P,
|
||||
sources: &[&KmerIndex],
|
||||
mode: MergeMode,
|
||||
force: bool,
|
||||
) -> OKIResult<Self> {
|
||||
let output = output.as_ref();
|
||||
|
||||
if sources.is_empty() {
|
||||
return Err(OKIError::Io(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"merge requires at least one source index",
|
||||
)));
|
||||
}
|
||||
|
||||
// ── Validate ──────────────────────────────────────────────────────────
|
||||
let ref0 = sources[0];
|
||||
for src in sources {
|
||||
if src.state() != IndexState::Indexed {
|
||||
return Err(OKIError::NotIndexed(src.root_path.clone()));
|
||||
}
|
||||
if src.kmer_size() != ref0.kmer_size()
|
||||
|| src.minimizer_size() != ref0.minimizer_size()
|
||||
|| src.n_partitions() != ref0.n_partitions()
|
||||
{
|
||||
return Err(OKIError::IncompatibleConfig);
|
||||
}
|
||||
if mode == MergeMode::Count && !src.meta.config.with_counts {
|
||||
return Err(OKIError::MismatchedMode);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Prepare output directory ──────────────────────────────────────────
|
||||
if output.exists() {
|
||||
if force {
|
||||
fs::remove_dir_all(output)?;
|
||||
} else {
|
||||
return Err(OKIError::Io(io::Error::new(
|
||||
io::ErrorKind::AlreadyExists,
|
||||
format!("{}: output directory already exists", output.display()),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// ── Bootstrap: copy first source to output ────────────────────────────
|
||||
info!("copying {} → {}", sources[0].root_path.display(), output.display());
|
||||
copy_dir_all(&sources[0].root_path, output)?;
|
||||
|
||||
// Rewrite index.meta with all genome labels.
|
||||
let all_genomes: Vec<String> = sources
|
||||
.iter()
|
||||
.flat_map(|s| s.meta.genomes.iter().cloned())
|
||||
.collect();
|
||||
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
||||
meta.genomes = all_genomes;
|
||||
meta.write(output)?;
|
||||
|
||||
// Open the destination index.
|
||||
let dst = KmerIndex::open(output)?;
|
||||
let n_partitions = dst.n_partitions();
|
||||
|
||||
// ── Merge each subsequent source partition-by-partition ───────────────
|
||||
let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
|
||||
if !remaining_sources.is_empty() {
|
||||
let mut rep = Reporter::new();
|
||||
let t = Stage::start("merge_partitions");
|
||||
|
||||
let dst_partition = &dst.partition;
|
||||
|
||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let srcs: Vec<&obikpartitionner::KmerPartition> =
|
||||
remaining_sources.iter().map(|s| &s.partition).collect();
|
||||
// n_dst_genomes = 1 (copied from source_0 only)
|
||||
dst_partition.merge_partition(i, &srcs, mode, 1).err()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(e) = errors.into_iter().next() {
|
||||
return Err(OKIError::Partition(e));
|
||||
}
|
||||
|
||||
rep.push(t.stop());
|
||||
}
|
||||
|
||||
// Re-open to get the updated state.
|
||||
KmerIndex::open(output)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Directory copy ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Recursively copy the directory tree rooted at `src` into `dst`.
///
/// `dst` (and any missing parent) is created before `src` is read. Every entry
/// of `src` whose path is a directory is copied recursively; every other entry
/// is copied with `fs::copy` under the same file name.
///
/// # Errors
///
/// Returns the first I/O error encountered; entries copied before the failure
/// are left in place.
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
    fs::create_dir_all(dst)?;

    fs::read_dir(src)?.try_for_each(|item| {
        let item = item?;
        let from = item.path();
        let to = dst.join(item.file_name());

        if from.is_dir() {
            copy_dir_all(&from, &to)
        } else {
            // `fs::copy` reports the number of bytes written; only success matters here.
            fs::copy(&from, &to).map(|_| ())
        }
    })
}
|
||||
Reference in New Issue
Block a user