feat: enhance merge label resolution, debug dump, and layer metadata

This commit enhances the CLI and index pipelines by introducing `--force-presence` to normalize output to binary values, `--debug` to expose partition and layer metadata, and `--rename-duplicates` to automatically disambiguate duplicate genome labels. It updates the partitioner and index layers to auto-discover layers, persist `meta.json` for single-genome builds, and fix per-source column offsets during merging. A `DuplicateGenomeLabel` error variant is also added, and in presence/absence mode the stale `counts/` directories copied from the first source are removed from the merged index.
This commit is contained in:
Eric Coissac
2026-05-21 08:12:02 +02:00
parent 1a1f95e59d
commit 11182005a2
8 changed files with 347 additions and 94 deletions
+31 -14
View File
@@ -14,13 +14,17 @@ impl KmerIndex {
///
/// The caller must have set the global kmer length (`obikseq::set_k`) before
/// calling this method.
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool) -> OKIResult<()> {
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> {
let genomes = &self.meta.genomes;
let genomes = &self.meta.genomes;
let use_counts = self.meta.config.with_counts && !force_presence;
let n_genomes = genomes.len().max(1);
let kmer_size = self.kmer_size();
// ── Header ────────────────────────────────────────────────────────────
if debug {
write!(out, "partition,layer,")?;
}
write!(out, "kmer")?;
for g in genomes {
write!(out, ",{g}")?;
@@ -30,18 +34,31 @@ impl KmerIndex {
// ── Rows ──────────────────────────────────────────────────────────────
let n = self.n_partitions();
for i in 0..n {
self.partition
.iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
let seq = String::from_utf8(kmer.to_ascii())
.unwrap_or_else(|_| "?".repeat(self.kmer_size()));
// write is infallible inside a closure — propagate via a flag if needed
let _ = write!(out, "{seq}");
for &v in row.iter() {
let _ = write!(out, ",{v}");
}
let _ = writeln!(out);
})
.map_err(OKIError::Partition)?;
if debug {
self.partition
.iter_partition_kmers_located(i, use_counts, n_genomes, |part, layer, kmer, row| {
let seq = String::from_utf8(kmer.to_ascii())
.unwrap_or_else(|_| "?".repeat(kmer_size));
let _ = write!(out, "{part},{layer},{seq}");
for &v in row.iter() {
let _ = write!(out, ",{v}");
}
let _ = writeln!(out);
})
.map_err(OKIError::Partition)?;
} else {
self.partition
.iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
let seq = String::from_utf8(kmer.to_ascii())
.unwrap_or_else(|_| "?".repeat(kmer_size));
let _ = write!(out, "{seq}");
for &v in row.iter() {
let _ = write!(out, ",{v}");
}
let _ = writeln!(out);
})
.map_err(OKIError::Partition)?;
}
}
out.flush()?;
+4 -1
View File
@@ -14,6 +14,8 @@ pub enum OKIError {
IncompatibleConfig,
/// Count mode requested but a source index lacks count data.
MismatchedMode,
/// Two or more sources share the same genome label.
DuplicateGenomeLabel(String),
}
pub type OKIResult<T> = Result<T, OKIError>;
@@ -27,6 +29,7 @@ impl fmt::Display for OKIError {
OKIError::NotIndexed(p) => write!(f, "index not fully built: {}", p.display()),
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
}
}
}
@@ -37,7 +40,7 @@ impl std::error::Error for OKIError {
OKIError::Io(e) => Some(e),
OKIError::Json(e) => Some(e),
OKIError::Partition(e) => Some(e),
_ => None,
_ => None, // IncompatibleConfig, MismatchedMode, DuplicateGenomeLabel
}
}
}
+104 -13
View File
@@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::fs;
use std::io;
use std::path::Path;
@@ -20,13 +21,16 @@ impl KmerIndex {
/// `minimizer_size`, and `n_partitions`. Count mode additionally requires
/// every source to have `with_counts = true`.
///
/// The first source is copied to `output`, then each subsequent source is
/// merged partition-by-partition in parallel.
/// Genome labels must be unique across all sources. If `rename_duplicates`
/// is true, repeated labels are disambiguated by appending `.1`, `.2`, …
/// to the second and subsequent occurrences. Otherwise a
/// `DuplicateGenomeLabel` error is returned on the first conflict.
pub fn merge<P: AsRef<Path>>(
output: P,
sources: &[&KmerIndex],
mode: MergeMode,
force: bool,
rename_duplicates: bool,
) -> OKIResult<Self> {
let output = output.as_ref();
@@ -37,7 +41,7 @@ impl KmerIndex {
)));
}
// ── Validate ──────────────────────────────────────────────────────────
// ── Validate config compatibility ─────────────────────────────────────
let ref0 = sources[0];
for src in sources {
if src.state() != IndexState::Indexed {
@@ -54,6 +58,9 @@ impl KmerIndex {
}
}
// ── Compute final genome labels (rename duplicates if requested) ───────
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
// ── Prepare output directory ──────────────────────────────────────────
if output.exists() {
if force {
@@ -70,18 +77,32 @@ impl KmerIndex {
info!("copying {} → {}", sources[0].root_path.display(), output.display());
copy_dir_all(&sources[0].root_path, output)?;
// Rewrite index.meta with all genome labels.
let all_genomes: Vec<String> = sources
.iter()
.flat_map(|s| s.meta.genomes.iter().cloned())
.collect();
// Rewrite index.meta with final genome labels and the effective mode.
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
meta.genomes = all_genomes;
meta.config.with_counts = mode == MergeMode::Count;
meta.write(output)?;
// In presence/absence mode, purge counts/ directories inherited from
// source_0 — they are stale data from the source's count index.
if mode == MergeMode::Presence {
remove_dirs_named(output, "counts")?;
}
// Rebuild spectrums/ from all sources using the (possibly renamed) labels.
// Drop the spectrums/ that were copied from source_0 and rebuild from scratch.
let spectrums_dir = output.join("spectrums");
if spectrums_dir.exists() {
fs::remove_dir_all(&spectrums_dir)?;
}
for (src, new_labels) in sources.iter().zip(&source_labels) {
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
}
// Open the destination index.
let dst = KmerIndex::open(output)?;
let n_partitions = dst.n_partitions();
let n_dst_genomes = sources[0].meta.genomes.len();
// ── Merge each subsequent source partition-by-partition ───────────────
let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
@@ -94,10 +115,9 @@ impl KmerIndex {
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let srcs: Vec<&obikpartitionner::KmerPartition> =
remaining_sources.iter().map(|s| &s.partition).collect();
// n_dst_genomes = 1 (copied from source_0 only)
dst_partition.merge_partition(i, &srcs, mode, 1).err()
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err()
})
.collect();
@@ -113,7 +133,78 @@ impl KmerIndex {
}
}
// ── Directory copy ────────────────────────────────────────────────────────────
// ── Helpers ───────────────────────────────────────────────────────────────────
/// Compute the final genome label lists for all sources.
///
/// Returns `(per_source_labels, all_genomes_flat)`: one label list per source
/// (same order and length as that source's `meta.genomes`) and the
/// concatenation of those lists in source order.
///
/// The first occurrence of a label keeps its original name. When
/// `rename_duplicates` is true, each later occurrence receives a numeric
/// suffix (`.1`, `.2`, …). A suffix that would reproduce a label already
/// present in one of the sources is skipped, so the returned labels are
/// always pairwise distinct.
///
/// # Errors
///
/// Returns `OKIError::DuplicateGenomeLabel` on the first repeated label when
/// `rename_duplicates` is false.
fn compute_labels(
    sources: &[&KmerIndex],
    rename_duplicates: bool,
) -> OKIResult<(Vec<Vec<String>>, Vec<String>)> {
    // Every label exactly as it appears in the sources. Generated names must
    // avoid these: otherwise renaming a second "A" to "A.1" would clash with a
    // genuine "A.1" coming from another genome.
    let originals: std::collections::HashSet<&str> = sources
        .iter()
        .flat_map(|src| src.meta.genomes.iter().map(String::as_str))
        .collect();

    // label → next numeric suffix to try; 0 means "not seen yet".
    let mut next_suffix: HashMap<&str, usize> = HashMap::with_capacity(originals.len());
    let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());

    for src in sources {
        let mut labels = Vec::with_capacity(src.meta.genomes.len());
        for label in &src.meta.genomes {
            let suffix = next_suffix.entry(label.as_str()).or_insert(0);
            let new_label = if *suffix == 0 {
                // First occurrence: keep the name, later ones start at `.1`.
                *suffix = 1;
                label.clone()
            } else if rename_duplicates {
                // Candidates built from distinct base labels can never be
                // equal (the text after the last '.' is the numeric suffix),
                // and the suffix only grows for a given base, so checking
                // against the original labels is enough to guarantee
                // uniqueness.
                loop {
                    let candidate = format!("{label}.{suffix}");
                    *suffix += 1;
                    if !originals.contains(candidate.as_str()) {
                        break candidate;
                    }
                }
            } else {
                return Err(OKIError::DuplicateGenomeLabel(label.clone()));
            };
            labels.push(new_label);
        }
        source_labels.push(labels);
    }

    let all_genomes: Vec<String> = source_labels.iter().flatten().cloned().collect();
    Ok((source_labels, all_genomes))
}
/// Copy per-genome spectrum files between two index roots, renaming on the way.
///
/// For each position `i`, `src_root/spectrums/{old_labels[i]}.json` is copied
/// to `dst_root/spectrums/{new_labels[i]}.json`. The destination `spectrums/`
/// directory is created if needed. A source file that does not exist is
/// skipped without error. Labels are paired with `zip`, so extra entries in
/// the longer slice are ignored.
///
/// # Errors
///
/// Propagates the first I/O error from creating the destination directory or
/// copying a file.
fn copy_spectrums(
    src_root: &Path,
    dst_root: &Path,
    old_labels: &[String],
    new_labels: &[String],
) -> io::Result<()> {
    let origin_dir = src_root.join("spectrums");
    let target_dir = dst_root.join("spectrums");
    fs::create_dir_all(&target_dir)?;

    old_labels
        .iter()
        .zip(new_labels)
        .map(|(from, to)| {
            (
                origin_dir.join(format!("{from}.json")),
                target_dir.join(format!("{to}.json")),
            )
        })
        // Not every genome necessarily has a spectrum on disk.
        .filter(|(from_path, _)| from_path.exists())
        .try_for_each(|(from_path, to_path)| fs::copy(from_path, to_path).map(drop))
}
/// Delete every directory called `name` found anywhere below `root`.
///
/// The tree is walked depth-first. A directory whose final path component
/// equals `name` is removed with all of its contents and is not descended
/// into; any other directory is searched recursively. Plain files are left
/// untouched, whatever their name.
///
/// # Errors
///
/// Propagates the first I/O error from listing a directory or removing one.
fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> {
    for entry in fs::read_dir(root)? {
        let candidate = entry?.path();
        if !candidate.is_dir() {
            continue;
        }

        // Names that are not valid UTF-8 can never equal `name`.
        let is_target = candidate
            .file_name()
            .and_then(|leaf| leaf.to_str())
            .map_or(false, |leaf| leaf == name);

        if is_target {
            fs::remove_dir_all(&candidate)?;
        } else {
            remove_dirs_named(&candidate, name)?;
        }
    }
    Ok(())
}
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
fs::create_dir_all(dst)?;