feat: add configurable block sizes and in-place reindex command

Propagate a configurable block size (`block_bits`) through index and layer construction to control unitig chunking and tune the memory/performance trade-off. Introduce an in-place `reindex` command and library method to convert indexes between exact and approximate evidence formats. Add validation to reject merging indexes with mismatched evidence types, and update parallel k-mer counting to use `AtomicUsize` for thread-safe aggregation. Includes CLI argument parsing, metadata persistence, and updated tests.
This commit is contained in:
Eric Coissac
2026-05-23 12:50:03 +02:00
parent 876bc0127f
commit bc51cd9861
21 changed files with 318 additions and 51 deletions
+36 -1
View File
@@ -9,6 +9,8 @@ use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
use obilayeredmap::EvidenceKind;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::{GenomeInfo, IndexMeta};
@@ -61,6 +63,9 @@ impl KmerIndex {
}
}
// ── Validate evidence compatibility ───────────────────────────────────
let evidence = validate_evidence_compat(sources)?;
// ── Compute final genome labels (rename duplicates if requested) ───────
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
@@ -91,6 +96,7 @@ impl KmerIndex {
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
meta.genomes = all_genomes;
meta.config.with_counts = mode == MergeMode::Count;
meta.config.evidence = evidence;
meta.write(output)?;
// In presence/absence mode, purge counts/ directories inherited from
@@ -134,13 +140,14 @@ impl KmerIndex {
let pb = partition_bar(n_partitions as u64);
let dst_partition = &dst.partition;
let block_bits = dst.meta.config.block_bits;
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err();
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err();
pb.inc(1);
result
})
@@ -258,6 +265,34 @@ fn partition_bar(n: u64) -> ProgressBar {
pb
}
/// Check that all sources share the same evidence kind.
///
/// The first source is taken as the reference; every other source is
/// compared against it.
///
/// Rules:
/// - all `Exact` → OK, returns `Exact`
/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }`
/// - mixed exact/approx or different approx params → `IncompatibleEvidence`
///
/// # Errors
///
/// Returns `OKIError::IncompatibleEvidence` when `sources` is empty (there is
/// no reference evidence kind to return) or when any source's evidence kind
/// differs from that of the first source.
fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult<EvidenceKind> {
    // Split off the reference source without indexing, so an empty slice
    // yields an error instead of a panic.
    let (first, rest) = match sources.split_first() {
        Some(parts) => parts,
        None => {
            return Err(OKIError::IncompatibleEvidence(
                "no source index provided; cannot determine the evidence kind".to_string(),
            ));
        }
    };

    let ref_ev = &first.meta.config.evidence;
    for src in rest {
        let ev = &src.meta.config.evidence;
        let compat = match (ref_ev, ev) {
            (EvidenceKind::Exact, EvidenceKind::Exact) => true,
            // Approximate evidence is only compatible when both parameters match.
            (
                EvidenceKind::Approx { b: b1, z: z1 },
                EvidenceKind::Approx { b: b2, z: z2 },
            ) => b1 == b2 && z1 == z2,
            // Any other pairing mixes evidence kinds.
            _ => false,
        };
        if !compat {
            // NOTE: a trailing `\` in a string literal swallows the newline and
            // the next line's leading whitespace, so the separator must be
            // written before the backslash.
            return Err(OKIError::IncompatibleEvidence(format!(
                "source {:?} has evidence {:?}, expected {:?}; \
                 convert all sources to the same evidence kind first \
                 (use the `reindex` command)",
                src.root_path.display(),
                ev,
                ref_ev,
            )));
        }
    }
    Ok(ref_ev.clone())
}
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
fs::create_dir_all(dst)?;
for entry in fs::read_dir(src)? {