feat: add merge command to consolidate k-mer indexes
Introduces a new `merge` CLI subcommand and underlying implementation to consolidate multiple pre-indexed k-mer indexes into a single output. Adds `append_column` methods to persistent bit and int matrices to enable incremental genome column expansion without rebuilding the MPHF. Includes new error variants for index readiness and configuration mismatches, adds a `--force` flag to the index command, and updates documentation and navigation structure accordingly.
This commit is contained in:
@@ -8,6 +8,12 @@ pub enum OKIError {
|
||||
Io(io::Error),
|
||||
Json(serde_json::Error),
|
||||
Partition(SKError),
|
||||
/// Source index is not in `Indexed` state.
|
||||
NotIndexed(std::path::PathBuf),
|
||||
/// Source indexes have incompatible configurations (k, m, n_bits).
|
||||
IncompatibleConfig,
|
||||
/// Count mode requested but a source index lacks count data.
|
||||
MismatchedMode,
|
||||
}
|
||||
|
||||
pub type OKIResult<T> = Result<T, OKIError>;
|
||||
@@ -15,9 +21,12 @@ pub type OKIResult<T> = Result<T, OKIError>;
|
||||
impl fmt::Display for OKIError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
OKIError::Io(e) => write!(f, "I/O error: {e}"),
|
||||
OKIError::Json(e) => write!(f, "JSON error: {e}"),
|
||||
OKIError::Partition(e) => write!(f, "partition error: {e}"),
|
||||
OKIError::Io(e) => write!(f, "I/O error: {e}"),
|
||||
OKIError::Json(e) => write!(f, "JSON error: {e}"),
|
||||
OKIError::Partition(e) => write!(f, "partition error: {e}"),
|
||||
OKIError::NotIndexed(p) => write!(f, "index not fully built: {}", p.display()),
|
||||
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
|
||||
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -28,6 +37,7 @@ impl std::error::Error for OKIError {
|
||||
OKIError::Io(e) => Some(e),
|
||||
OKIError::Json(e) => Some(e),
|
||||
OKIError::Partition(e) => Some(e),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,9 +15,9 @@ use crate::meta::{IndexConfig, IndexMeta};
|
||||
use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||
|
||||
pub struct KmerIndex {
|
||||
root_path: PathBuf,
|
||||
meta: IndexMeta,
|
||||
partition: KmerPartition,
|
||||
pub(crate) root_path: PathBuf,
|
||||
pub(crate) meta: IndexMeta,
|
||||
pub(crate) partition: KmerPartition,
|
||||
}
|
||||
|
||||
impl KmerIndex {
|
||||
|
||||
@@ -2,8 +2,10 @@ pub mod error;
|
||||
pub mod meta;
|
||||
pub mod state;
|
||||
mod index;
|
||||
mod merge;
|
||||
|
||||
pub use error::{OKIError, OKIResult};
|
||||
pub use index::KmerIndex;
|
||||
pub use merge::MergeMode;
|
||||
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
|
||||
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use obisys::{Reporter, Stage};
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
use crate::meta::IndexMeta;
|
||||
use crate::state::IndexState;
|
||||
|
||||
pub use obikpartitionner::MergeMode;
|
||||
|
||||
impl KmerIndex {
|
||||
/// Merge `sources` into a new index at `output`.
|
||||
///
|
||||
/// All sources must be in `Indexed` state and share the same `kmer_size`,
|
||||
/// `minimizer_size`, and `n_partitions`. Count mode additionally requires
|
||||
/// every source to have `with_counts = true`.
|
||||
///
|
||||
/// The first source is copied to `output`, then each subsequent source is
|
||||
/// merged partition-by-partition in parallel.
|
||||
pub fn merge<P: AsRef<Path>>(
|
||||
output: P,
|
||||
sources: &[&KmerIndex],
|
||||
mode: MergeMode,
|
||||
force: bool,
|
||||
) -> OKIResult<Self> {
|
||||
let output = output.as_ref();
|
||||
|
||||
if sources.is_empty() {
|
||||
return Err(OKIError::Io(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"merge requires at least one source index",
|
||||
)));
|
||||
}
|
||||
|
||||
// ── Validate ──────────────────────────────────────────────────────────
|
||||
let ref0 = sources[0];
|
||||
for src in sources {
|
||||
if src.state() != IndexState::Indexed {
|
||||
return Err(OKIError::NotIndexed(src.root_path.clone()));
|
||||
}
|
||||
if src.kmer_size() != ref0.kmer_size()
|
||||
|| src.minimizer_size() != ref0.minimizer_size()
|
||||
|| src.n_partitions() != ref0.n_partitions()
|
||||
{
|
||||
return Err(OKIError::IncompatibleConfig);
|
||||
}
|
||||
if mode == MergeMode::Count && !src.meta.config.with_counts {
|
||||
return Err(OKIError::MismatchedMode);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Prepare output directory ──────────────────────────────────────────
|
||||
if output.exists() {
|
||||
if force {
|
||||
fs::remove_dir_all(output)?;
|
||||
} else {
|
||||
return Err(OKIError::Io(io::Error::new(
|
||||
io::ErrorKind::AlreadyExists,
|
||||
format!("{}: output directory already exists", output.display()),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// ── Bootstrap: copy first source to output ────────────────────────────
|
||||
info!("copying {} → {}", sources[0].root_path.display(), output.display());
|
||||
copy_dir_all(&sources[0].root_path, output)?;
|
||||
|
||||
// Rewrite index.meta with all genome labels.
|
||||
let all_genomes: Vec<String> = sources
|
||||
.iter()
|
||||
.flat_map(|s| s.meta.genomes.iter().cloned())
|
||||
.collect();
|
||||
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
||||
meta.genomes = all_genomes;
|
||||
meta.write(output)?;
|
||||
|
||||
// Open the destination index.
|
||||
let dst = KmerIndex::open(output)?;
|
||||
let n_partitions = dst.n_partitions();
|
||||
|
||||
// ── Merge each subsequent source partition-by-partition ───────────────
|
||||
let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
|
||||
if !remaining_sources.is_empty() {
|
||||
let mut rep = Reporter::new();
|
||||
let t = Stage::start("merge_partitions");
|
||||
|
||||
let dst_partition = &dst.partition;
|
||||
|
||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let srcs: Vec<&obikpartitionner::KmerPartition> =
|
||||
remaining_sources.iter().map(|s| &s.partition).collect();
|
||||
// n_dst_genomes = 1 (copied from source_0 only)
|
||||
dst_partition.merge_partition(i, &srcs, mode, 1).err()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(e) = errors.into_iter().next() {
|
||||
return Err(OKIError::Partition(e));
|
||||
}
|
||||
|
||||
rep.push(t.stop());
|
||||
}
|
||||
|
||||
// Re-open to get the updated state.
|
||||
KmerIndex::open(output)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Directory copy ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Recursively copy the directory tree rooted at `src` into `dst`.
///
/// `dst` (and any missing parents) is created first. Every entry of `src` is
/// then visited: directories are copied recursively, anything else is copied
/// with [`fs::copy`]. The first I/O error aborts the copy and is returned;
/// entries already copied are left in place.
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
    fs::create_dir_all(dst)?;
    fs::read_dir(src)?.try_for_each(|item| {
        let item = item?;
        let from = item.path();
        let to = dst.join(item.file_name());
        if from.is_dir() {
            copy_dir_all(&from, &to)
        } else {
            // `fs::copy` reports the number of bytes copied; it is not needed.
            fs::copy(&from, &to).map(|_| ())
        }
    })
}
|
||||
Reference in New Issue
Block a user