feat: add k-mer index rebuild and compaction feature

This commit introduces a new `rebuild` CLI subcommand that reconstructs an existing multi-layer k-mer index into a compact, single-layer index. It implements a configurable filtering pipeline supporting min/max genome fraction/count and total count thresholds, parallel partition processing via `rayon`, and CLI progress tracking. The change also restructures module declarations across `obikindex` and `obikpartitionner` to integrate the new rebuild and layer-handling logic.
This commit is contained in:
Eric Coissac
2026-05-21 12:11:55 +02:00
parent 3fa1dbf8cc
commit d9aa211b8f
9 changed files with 530 additions and 3 deletions
+1
View File
@@ -5,6 +5,7 @@ mod distance;
mod dump;
mod index;
mod merge;
mod rebuild;
pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput};
+116
View File
@@ -0,0 +1,116 @@
use std::fs;
use std::io;
use std::path::Path;
use std::time::Duration;
use indicatif::{ProgressBar, ProgressStyle};
use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::IndexMeta;
use crate::state::{IndexState, SENTINEL_INDEXED};
impl KmerIndex {
    /// Rebuild `src` into a new compact single-layer index at `output`.
    ///
    /// Only k-mers whose per-genome row passes every filter in `filters` are
    /// written. If `filters` is empty every k-mer is kept (pure compaction).
    ///
    /// `mode` controls whether the output stores counts or presence/absence.
    /// A count source may be rebuilt in presence mode; a presence source
    /// cannot be rebuilt in count mode.
    ///
    /// When `output` already exists it is removed first if `force` is set;
    /// otherwise the call fails. The removal is refused when the source index
    /// is located at or below `output`, since it would delete the data that
    /// is about to be read.
    ///
    /// The elapsed time of the rebuild stage is pushed to `rep` on success.
    ///
    /// # Errors
    ///
    /// * [`OKIError::NotIndexed`] if `src` is not in the `Indexed` state.
    /// * [`OKIError::InvalidInput`] if count mode is requested from a
    ///   presence-only source, or if a forced overwrite of `output` would
    ///   remove the source index.
    /// * [`OKIError::Io`] if `output` exists and `force` is `false`, or on
    ///   any filesystem failure.
    /// * [`OKIError::Partition`] if rebuilding a partition fails. The sentinel
    ///   file is not written in that case, so the partially written output is
    ///   never marked as indexed.
    pub fn rebuild<P: AsRef<Path>>(
        output: P,
        src: &KmerIndex,
        filters: &[Box<dyn KmerFilter>],
        mode: MergeMode,
        force: bool,
        rep: &mut Reporter,
    ) -> OKIResult<Self> {
        let output = output.as_ref();

        if src.state() != IndexState::Indexed {
            return Err(OKIError::NotIndexed(src.root_path.clone()));
        }
        if mode == MergeMode::Count && !src.meta.config.with_counts {
            return Err(OKIError::InvalidInput(
                "cannot rebuild in count mode from a presence-only source index".into(),
            ));
        }

        if output.exists() {
            if !force {
                return Err(OKIError::Io(io::Error::new(
                    io::ErrorKind::AlreadyExists,
                    format!("{}: output directory already exists", output.display()),
                )));
            }
            // Removing `output` must never take the source index with it:
            // the partitions below are read from `src` after this point.
            if is_same_or_nested(output, Path::new(&src.root_path)) {
                return Err(OKIError::InvalidInput(
                    format!(
                        "{}: refusing to overwrite output, it contains the source index",
                        output.display()
                    )
                    .into(),
                ));
            }
            fs::remove_dir_all(output)?;
        }

        // ── Create output directory + metadata ────────────────────────────────
        fs::create_dir_all(output)?;
        let mut meta = IndexMeta::new(src.meta.config.clone());
        meta.config.with_counts = mode == MergeMode::Count;
        meta.genomes = src.meta.genomes.clone();
        meta.write(output)?;

        let n_genomes = src.meta.genomes.len();
        let n_partitions = src.partition.n_partitions();

        // ── Create an empty destination KmerPartition ─────────────────────────
        // Create the partitions/ subdirectory so KmerPartition::open_with_config works.
        fs::create_dir_all(output.join(obikpartitionner::PARTITIONS_SUBDIR))?;
        let dst_partition = KmerPartition::open_with_config(
            output,
            meta.config.kmer_size,
            meta.config.minimizer_size,
            meta.config.n_bits,
        )?;

        info!(
            "rebuild: {} partition(s), {} genome(s), mode={:?}",
            n_partitions, n_genomes, mode,
        );

        let t = Stage::start("rebuild");
        let pb = ProgressBar::new(n_partitions as u64).with_style(
            ProgressStyle::with_template("rebuild — [{bar:20}] {pos}/{len} | {msg}")
                .expect("static progress bar template is valid")
                .progress_chars("=> "),
        );
        pb.enable_steady_tick(Duration::from_millis(100));

        let src_partition = &src.partition;
        // `try_for_each` stops scheduling further partitions as soon as one
        // fails, instead of processing all of them and discarding the work.
        let outcome = (0..n_partitions).into_par_iter().try_for_each(|i| {
            let result = dst_partition
                .rebuild_partition(src_partition, i, filters, mode, n_genomes)
                .map(drop);
            pb.inc(1);
            result
        });
        pb.finish_and_clear();
        outcome.map_err(OKIError::Partition)?;

        rep.push(t.stop());

        // Write SENTINEL_INDEXED — output is ready to use.
        fs::File::create(output.join(SENTINEL_INDEXED))?;
        KmerIndex::open(output)
    }
}

/// Returns `true` when `inner` is located at or below `outer`.
///
/// Both paths are canonicalised so that relative paths and symlinks compare
/// correctly; if either cannot be canonicalised the comparison falls back to
/// the paths as given.
fn is_same_or_nested(outer: &Path, inner: &Path) -> bool {
    match (fs::canonicalize(outer), fs::canonicalize(inner)) {
        (Ok(outer), Ok(inner)) => inner.starts_with(outer),
        _ => inner.starts_with(outer),
    }
}