feat: add configurable block sizes and in-place reindex command

Propagate the configurable block size (`block_bits`) through index and layer construction to control unitig chunking and tune the memory/performance trade-off. Introduce an in-place `reindex` command and library method to convert indexes between exact and approximate evidence formats. Add validation to reject merging indexes with mismatched evidence types, and update parallel k-mer counting to use `AtomicUsize` for thread-safe aggregation. Includes CLI argument parsing, metadata persistence, and updated tests.
This commit is contained in:
Eric Coissac
2026-05-23 12:50:03 +02:00
parent 876bc0127f
commit bc51cd9861
21 changed files with 318 additions and 51 deletions
+4 -1
View File
@@ -18,6 +18,8 @@ pub enum OKIError {
DuplicateGenomeLabel(String),
/// Operation not valid for this index configuration.
InvalidInput(String),
/// Sources mix exact and approximate evidence, or use incompatible approx parameters.
IncompatibleEvidence(String),
}
pub type OKIResult<T> = Result<T, OKIError>;
@@ -32,7 +34,8 @@ impl fmt::Display for OKIError {
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
OKIError::IncompatibleEvidence(m) => write!(f, "incompatible evidence: {m}"),
}
}
}
+2 -1
View File
@@ -145,6 +145,7 @@ impl KmerIndex {
let t = Stage::start("index");
let with_counts = self.meta.config.with_counts;
let evidence = self.meta.config.evidence.clone();
let block_bits = self.meta.config.block_bits;
let total_kmers = AtomicUsize::new(0);
let pb = Arc::new(Mutex::new(
@@ -154,7 +155,7 @@ impl KmerIndex {
));
(0..n).into_par_iter().for_each(|i| {
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence) {
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits) {
Ok(0) => {}
Ok(n_kmers) => {
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
+1
View File
@@ -6,6 +6,7 @@ mod dump;
mod index;
mod merge;
mod rebuild;
mod reindex;
pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput};
+36 -1
View File
@@ -9,6 +9,8 @@ use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
use obilayeredmap::EvidenceKind;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::{GenomeInfo, IndexMeta};
@@ -61,6 +63,9 @@ impl KmerIndex {
}
}
// ── Validate evidence compatibility ───────────────────────────────────
let evidence = validate_evidence_compat(sources)?;
// ── Compute final genome labels (rename duplicates if requested) ───────
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
@@ -91,6 +96,7 @@ impl KmerIndex {
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
meta.genomes = all_genomes;
meta.config.with_counts = mode == MergeMode::Count;
meta.config.evidence = evidence;
meta.write(output)?;
// In presence/absence mode, purge counts/ directories inherited from
@@ -134,13 +140,14 @@ impl KmerIndex {
let pb = partition_bar(n_partitions as u64);
let dst_partition = &dst.partition;
let block_bits = dst.meta.config.block_bits;
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err();
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err();
pb.inc(1);
result
})
@@ -258,6 +265,34 @@ fn partition_bar(n: u64) -> ProgressBar {
pb
}
/// Check that all sources share the same evidence kind.
///
/// Rules:
/// - all `Exact` → OK, returns `Exact`
/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }`
/// - mixed exact/approx or different approx params → `IncompatibleEvidence`
/// - empty `sources` → `InvalidInput` (there is no reference evidence to compare against)
///
/// The first source is the reference: every other source is compared to it,
/// and the error names the first source that disagrees.
fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult<EvidenceKind> {
    // `split_first` avoids the panic that indexing `sources[0]` would cause on an empty slice.
    let (first, rest) = match sources.split_first() {
        Some(parts) => parts,
        None => {
            return Err(OKIError::InvalidInput(
                "no source index to validate".to_string(),
            ));
        }
    };
    let ref_ev = &first.meta.config.evidence;

    for src in rest {
        let ev = &src.meta.config.evidence;
        let compat = match (ref_ev, ev) {
            (EvidenceKind::Exact, EvidenceKind::Exact) => true,
            (
                EvidenceKind::Approx { b: b1, z: z1 },
                EvidenceKind::Approx { b: b2, z: z2 },
            ) => b1 == b2 && z1 == z2,
            _ => false,
        };
        if !compat {
            // The trailing `\` removes the newline AND the next line's leading
            // whitespace, so the separator must be written before it.
            return Err(OKIError::IncompatibleEvidence(format!(
                "source {:?} has evidence {:?}, expected {:?}; \
                 convert all sources to the same evidence kind first \
                 (use the `reindex` command)",
                src.root_path.display(),
                ev,
                ref_ev,
            )));
        }
    }

    Ok(ref_ev.clone())
}
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
fs::create_dir_all(dst)?;
for entry in fs::read_dir(src)? {
+5
View File
@@ -31,6 +31,11 @@ pub struct IndexConfig {
pub with_counts: bool,
#[serde(default)]
pub evidence: EvidenceKind,
/// Block size for the unitig index as a power-of-two exponent.
/// The `.idx` block covers 2^block_bits consecutive unitigs.
/// 0 = one entry per unitig (O(1) access, largest `.idx`).
#[serde(default)]
pub block_bits: u8,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
+2 -1
View File
@@ -88,12 +88,13 @@ impl KmerIndex {
pb.enable_steady_tick(Duration::from_millis(100));
let src_partition = &src.partition;
let block_bits = meta.config.block_bits;
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let result = dst_partition
.rebuild_partition(src_partition, i, filters, mode, n_genomes)
.rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits)
.err();
pb.inc(1);
result
+126
View File
@@ -0,0 +1,126 @@
use std::fs;
use std::path::Path;
use std::time::Duration;
use indicatif::{ProgressBar, ProgressStyle};
use obilayeredmap::{EvidenceKind, layer::Layer};
use obilayeredmap::meta::PartitionMeta;
use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::state::IndexState;
/// File name of the exact-evidence bundle inside a layer directory;
/// deleted when a layer is converted to `Approx`.
const EVIDENCE_FILE: &str = "evidence.bin";
/// File name of the approximate-evidence fingerprints inside a layer directory;
/// deleted when a layer is converted to `Exact`.
const FINGERPRINT_FILE: &str = "fingerprint.bin";
/// File name of the unitig block index inside a layer directory;
/// deleted together with `evidence.bin` when a layer is converted to `Approx`.
const UNITIG_IDX_FILE: &str = "unitigs.bin.idx";
/// Wrap a layered-map error as [`OKIError::InvalidInput`], keeping only its display text.
fn olm_to_oki(e: obilayeredmap::OLMError) -> OKIError {
    let message = format!("{e}");
    OKIError::InvalidInput(message)
}
impl KmerIndex {
    /// Convert every layer's evidence bundle to `target` in-place.
    ///
    /// - `Exact`  → builds `evidence.bin` + `unitigs.bin.idx`, removes `fingerprint.bin`
    /// - `Approx` → builds `fingerprint.bin`, removes `evidence.bin` + `unitigs.bin.idx`
    ///
    /// The MPHF (`mphf.bin`) and unitigs (`unitigs.bin`) are never touched.
    /// `index.meta` is updated with the new evidence kind on success; `block_bits`
    /// is stored in the metadata only when `target` is `Exact`.
    ///
    /// Partitions are processed in parallel; a partition whose `index` directory
    /// does not exist is skipped.
    ///
    /// # Errors
    ///
    /// - [`OKIError::NotIndexed`] if the index is not in the `Indexed` state.
    /// - [`OKIError::InvalidInput`] if at least one partition fails. The message
    ///   carries the first failure and the number of other failed partitions.
    ///   In that case the metadata is left untouched, but partitions that
    ///   succeeded have already been rewritten on disk.
    /// - Whatever error writing the metadata returns.
    pub fn reindex(
        &mut self,
        target: EvidenceKind,
        block_bits: u8,
        rep: &mut Reporter,
    ) -> OKIResult<()> {
        if self.state() != IndexState::Indexed {
            return Err(OKIError::NotIndexed(self.root_path.clone()));
        }

        let n = self.partition.n_partitions();
        info!(
            "reindex {} partition(s): {:?} → {:?}",
            n, self.meta.config.evidence, target,
        );

        let t = Stage::start("reindex");
        let pb = ProgressBar::new(n as u64).with_style(
            ProgressStyle::with_template(
                "reindex — [{bar:20}] {pos}/{len} | {msg}",
            )
            .expect("static progress template is valid")
            .tick_strings(&["","","","","","","","","",""]),
        );
        pb.enable_steady_tick(Duration::from_millis(80));

        // Every partition is attempted even if another one fails; failures are
        // gathered as human-readable strings tagged with the partition number.
        let errors: Vec<String> = (0..n)
            .into_par_iter()
            .filter_map(|i| {
                let res = reindex_partition(
                    &self.partition.part_dir(i).join("index"),
                    &target,
                    block_bits,
                );
                pb.inc(1);
                res.err().map(|e| format!("partition {i}: {e}"))
            })
            .collect();

        pb.finish_and_clear();

        // Report the first failure in full and say how many more there were,
        // instead of silently dropping every error but the first.
        if let Some((first, rest)) = errors.split_first() {
            let message = if rest.is_empty() {
                first.clone()
            } else {
                format!("{first} (and {} more partition(s) failed)", rest.len())
            };
            return Err(OKIError::InvalidInput(message));
        }

        self.meta.config.evidence = target;
        if matches!(self.meta.config.evidence, EvidenceKind::Exact) {
            self.meta.config.block_bits = block_bits;
        }
        self.meta.write(&self.root_path)?;

        rep.push(t.stop());
        Ok(())
    }
}
/// Reindex each layer found under one partition's index directory.
///
/// A missing `index_dir` is not an error: nothing is converted and `Ok(())`
/// is returned. Otherwise the layer count comes from the partition metadata
/// and the `layer_<i>` sub-directories are converted in increasing order,
/// stopping at the first failure.
fn reindex_partition(index_dir: &Path, target: &EvidenceKind, block_bits: u8) -> OKIResult<()> {
    if index_dir.exists() {
        let partition_meta = PartitionMeta::load(index_dir).map_err(olm_to_oki)?;
        (0..partition_meta.n_layers).try_for_each(|idx| {
            let dir = index_dir.join(format!("layer_{idx}"));
            reindex_layer(&dir, target, block_bits)
        })
    } else {
        Ok(())
    }
}
fn reindex_layer(layer_dir: &Path, target: &EvidenceKind, block_bits: u8) -> OKIResult<()> {
Layer::<()>::build_evidence(layer_dir, target, block_bits).map_err(olm_to_oki)?;
remove_stale_evidence(layer_dir, target)
}
/// Delete the evidence files in `layer_dir` that do not belong to `target`.
///
/// Removal is best-effort (see `remove_if_exists`), so this currently always
/// returns `Ok(())`.
fn remove_stale_evidence(layer_dir: &Path, target: &EvidenceKind) -> OKIResult<()> {
    // Files made obsolete by the conversion, in the order they are removed.
    let stale: &[&str] = match target {
        // fingerprint.bin is no longer valid
        EvidenceKind::Exact => &[FINGERPRINT_FILE],
        // exact bundle is no longer valid
        EvidenceKind::Approx { .. } => &[EVIDENCE_FILE, UNITIG_IDX_FILE],
    };
    for name in stale {
        remove_if_exists(&layer_dir.join(name));
    }
    Ok(())
}
/// Best-effort file removal.
///
/// A file that is already absent is ignored silently; any other failure is
/// reported as a warning on stderr and otherwise ignored.
fn remove_if_exists(path: &Path) {
    match fs::remove_file(path) {
        Ok(()) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
        Err(e) => eprintln!("warning: could not remove {}: {e}", path.display()),
    }
}