feat: add k-mer index rebuild and compaction feature
This commit introduces a new `rebuild` CLI subcommand that reconstructs an existing multi-layer k-mer index into a compact, single-layer index. It implements a configurable filtering pipeline supporting min/max genome fraction/count and total count thresholds, parallel partition processing via `rayon`, and CLI progress tracking. The change also restructures module declarations across `obikindex` and `obikpartitionner` to integrate the new rebuild and layer-handling logic.
This commit is contained in:
@@ -5,6 +5,7 @@ mod distance;
|
||||
mod dump;
|
||||
mod index;
|
||||
mod merge;
|
||||
mod rebuild;
|
||||
|
||||
pub use error::{OKIError, OKIResult};
|
||||
pub use distance::{DistanceMetric, DistanceOutput};
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
use std::time::Duration;
|
||||
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
|
||||
use obisys::{Reporter, Stage};
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
use crate::meta::IndexMeta;
|
||||
use crate::state::{IndexState, SENTINEL_INDEXED};
|
||||
|
||||
impl KmerIndex {
|
||||
/// Rebuild `src` into a new compact single-layer index at `output`.
|
||||
///
|
||||
/// Only k-mers whose per-genome row passes every filter in `filters` are
|
||||
/// written. If `filters` is empty every k-mer is kept (pure compaction).
|
||||
///
|
||||
/// `mode` controls whether the output stores counts or presence/absence.
|
||||
/// A count source may be rebuilt in presence mode; a presence source
|
||||
/// cannot be rebuilt in count mode.
|
||||
pub fn rebuild<P: AsRef<Path>>(
|
||||
output: P,
|
||||
src: &KmerIndex,
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
mode: MergeMode,
|
||||
force: bool,
|
||||
rep: &mut Reporter,
|
||||
) -> OKIResult<Self> {
|
||||
let output = output.as_ref();
|
||||
|
||||
if src.state() != IndexState::Indexed {
|
||||
return Err(OKIError::NotIndexed(src.root_path.clone()));
|
||||
}
|
||||
|
||||
if mode == MergeMode::Count && !src.meta.config.with_counts {
|
||||
return Err(OKIError::InvalidInput(
|
||||
"cannot rebuild in count mode from a presence-only source index".into(),
|
||||
));
|
||||
}
|
||||
|
||||
if output.exists() {
|
||||
if force {
|
||||
fs::remove_dir_all(output)?;
|
||||
} else {
|
||||
return Err(OKIError::Io(io::Error::new(
|
||||
io::ErrorKind::AlreadyExists,
|
||||
format!("{}: output directory already exists", output.display()),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// ── Create output directory + metadata ────────────────────────────────
|
||||
fs::create_dir_all(output)?;
|
||||
let mut meta = IndexMeta::new(src.meta.config.clone());
|
||||
meta.config.with_counts = mode == MergeMode::Count;
|
||||
meta.genomes = src.meta.genomes.clone();
|
||||
meta.write(output)?;
|
||||
|
||||
let n_genomes = src.meta.genomes.len();
|
||||
let n_partitions = src.partition.n_partitions();
|
||||
|
||||
// ── Create an empty destination KmerPartition ─────────────────────────
|
||||
// Create the partitions/ subdirectory so KmerPartition::open_with_config works.
|
||||
fs::create_dir_all(output.join(obikpartitionner::PARTITIONS_SUBDIR))?;
|
||||
let dst_partition = KmerPartition::open_with_config(
|
||||
output,
|
||||
meta.config.kmer_size,
|
||||
meta.config.minimizer_size,
|
||||
meta.config.n_bits,
|
||||
)?;
|
||||
|
||||
info!(
|
||||
"rebuild: {} partition(s), {} genome(s), mode={:?}",
|
||||
n_partitions, n_genomes, mode,
|
||||
);
|
||||
|
||||
let t = Stage::start("rebuild");
|
||||
let pb = ProgressBar::new(n_partitions as u64).with_style(
|
||||
ProgressStyle::with_template("rebuild — [{bar:20}] {pos}/{len} | {msg}")
|
||||
.unwrap()
|
||||
.progress_chars("=> "),
|
||||
);
|
||||
pb.enable_steady_tick(Duration::from_millis(100));
|
||||
|
||||
let src_partition = &src.partition;
|
||||
|
||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let result = dst_partition
|
||||
.rebuild_partition(src_partition, i, filters, mode, n_genomes)
|
||||
.err();
|
||||
pb.inc(1);
|
||||
result
|
||||
})
|
||||
.collect();
|
||||
|
||||
pb.finish_and_clear();
|
||||
|
||||
if let Some(e) = errors.into_iter().next() {
|
||||
return Err(OKIError::Partition(e));
|
||||
}
|
||||
|
||||
rep.push(t.stop());
|
||||
|
||||
// Write SENTINEL_INDEXED — output is ready to use.
|
||||
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
||||
|
||||
KmerIndex::open(output)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user