feat: enhance merge label resolution, debug dump, and layer metadata
This commit enhances the CLI and index pipelines by introducing `--force-presence` to normalize output to binary values, `--debug` to expose partition and layer metadata, and `--rename-duplicates` to automatically disambiguate overlapping genome labels. It updates the partitioner and index layers to auto-discover layers, persist `meta.json` for single-genome builds, and fix per-source column offsets during merging. A `DuplicateGenomeLabel` error variant is also added, and stale `counts/` directories inherited from the first source are now purged when merging in presence/absence mode.
This commit is contained in:
+31
-14
@@ -14,13 +14,17 @@ impl KmerIndex {
|
||||
///
|
||||
/// The caller must have set the global kmer length (`obikseq::set_k`) before
|
||||
/// calling this method.
|
||||
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool) -> OKIResult<()> {
|
||||
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> {
|
||||
|
||||
let genomes = &self.meta.genomes;
|
||||
let genomes = &self.meta.genomes;
|
||||
let use_counts = self.meta.config.with_counts && !force_presence;
|
||||
let n_genomes = genomes.len().max(1);
|
||||
let kmer_size = self.kmer_size();
|
||||
|
||||
// ── Header ────────────────────────────────────────────────────────────
|
||||
if debug {
|
||||
write!(out, "partition,layer,")?;
|
||||
}
|
||||
write!(out, "kmer")?;
|
||||
for g in genomes {
|
||||
write!(out, ",{g}")?;
|
||||
@@ -30,18 +34,31 @@ impl KmerIndex {
|
||||
// ── Rows ──────────────────────────────────────────────────────────────
|
||||
let n = self.n_partitions();
|
||||
for i in 0..n {
|
||||
self.partition
|
||||
.iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
|
||||
let seq = String::from_utf8(kmer.to_ascii())
|
||||
.unwrap_or_else(|_| "?".repeat(self.kmer_size()));
|
||||
// write is infallible inside a closure — propagate via a flag if needed
|
||||
let _ = write!(out, "{seq}");
|
||||
for &v in row.iter() {
|
||||
let _ = write!(out, ",{v}");
|
||||
}
|
||||
let _ = writeln!(out);
|
||||
})
|
||||
.map_err(OKIError::Partition)?;
|
||||
if debug {
|
||||
self.partition
|
||||
.iter_partition_kmers_located(i, use_counts, n_genomes, |part, layer, kmer, row| {
|
||||
let seq = String::from_utf8(kmer.to_ascii())
|
||||
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
||||
let _ = write!(out, "{part},{layer},{seq}");
|
||||
for &v in row.iter() {
|
||||
let _ = write!(out, ",{v}");
|
||||
}
|
||||
let _ = writeln!(out);
|
||||
})
|
||||
.map_err(OKIError::Partition)?;
|
||||
} else {
|
||||
self.partition
|
||||
.iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
|
||||
let seq = String::from_utf8(kmer.to_ascii())
|
||||
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
||||
let _ = write!(out, "{seq}");
|
||||
for &v in row.iter() {
|
||||
let _ = write!(out, ",{v}");
|
||||
}
|
||||
let _ = writeln!(out);
|
||||
})
|
||||
.map_err(OKIError::Partition)?;
|
||||
}
|
||||
}
|
||||
|
||||
out.flush()?;
|
||||
|
||||
@@ -14,6 +14,8 @@ pub enum OKIError {
|
||||
IncompatibleConfig,
|
||||
/// Count mode requested but a source index lacks count data.
|
||||
MismatchedMode,
|
||||
/// Two or more sources share the same genome label.
|
||||
DuplicateGenomeLabel(String),
|
||||
}
|
||||
|
||||
pub type OKIResult<T> = Result<T, OKIError>;
|
||||
@@ -27,6 +29,7 @@ impl fmt::Display for OKIError {
|
||||
OKIError::NotIndexed(p) => write!(f, "index not fully built: {}", p.display()),
|
||||
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
|
||||
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
|
||||
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -37,7 +40,7 @@ impl std::error::Error for OKIError {
|
||||
OKIError::Io(e) => Some(e),
|
||||
OKIError::Json(e) => Some(e),
|
||||
OKIError::Partition(e) => Some(e),
|
||||
_ => None,
|
||||
_ => None, // IncompatibleConfig, MismatchedMode, DuplicateGenomeLabel
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+104
-13
@@ -1,3 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
@@ -20,13 +21,16 @@ impl KmerIndex {
|
||||
/// `minimizer_size`, and `n_partitions`. Count mode additionally requires
|
||||
/// every source to have `with_counts = true`.
|
||||
///
|
||||
/// The first source is copied to `output`, then each subsequent source is
|
||||
/// merged partition-by-partition in parallel.
|
||||
/// Genome labels must be unique across all sources. If `rename_duplicates`
|
||||
/// is true, repeated labels are disambiguated by appending `.1`, `.2`, …
|
||||
/// to the second and subsequent occurrences. Otherwise a
|
||||
/// `DuplicateGenomeLabel` error is returned on the first conflict.
|
||||
pub fn merge<P: AsRef<Path>>(
|
||||
output: P,
|
||||
sources: &[&KmerIndex],
|
||||
mode: MergeMode,
|
||||
force: bool,
|
||||
rename_duplicates: bool,
|
||||
) -> OKIResult<Self> {
|
||||
let output = output.as_ref();
|
||||
|
||||
@@ -37,7 +41,7 @@ impl KmerIndex {
|
||||
)));
|
||||
}
|
||||
|
||||
// ── Validate ──────────────────────────────────────────────────────────
|
||||
// ── Validate config compatibility ─────────────────────────────────────
|
||||
let ref0 = sources[0];
|
||||
for src in sources {
|
||||
if src.state() != IndexState::Indexed {
|
||||
@@ -54,6 +58,9 @@ impl KmerIndex {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Compute final genome labels (rename duplicates if requested) ───────
|
||||
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
|
||||
|
||||
// ── Prepare output directory ──────────────────────────────────────────
|
||||
if output.exists() {
|
||||
if force {
|
||||
@@ -70,18 +77,32 @@ impl KmerIndex {
|
||||
info!("copying {} → {}", sources[0].root_path.display(), output.display());
|
||||
copy_dir_all(&sources[0].root_path, output)?;
|
||||
|
||||
// Rewrite index.meta with all genome labels.
|
||||
let all_genomes: Vec<String> = sources
|
||||
.iter()
|
||||
.flat_map(|s| s.meta.genomes.iter().cloned())
|
||||
.collect();
|
||||
// Rewrite index.meta with final genome labels and the effective mode.
|
||||
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
||||
meta.genomes = all_genomes;
|
||||
meta.config.with_counts = mode == MergeMode::Count;
|
||||
meta.write(output)?;
|
||||
|
||||
// In presence/absence mode, purge counts/ directories inherited from
|
||||
// source_0 — they are stale data from the source's count index.
|
||||
if mode == MergeMode::Presence {
|
||||
remove_dirs_named(output, "counts")?;
|
||||
}
|
||||
|
||||
// Rebuild spectrums/ from all sources using the (possibly renamed) labels.
|
||||
// Drop the spectrums/ that were copied from source_0 and rebuild from scratch.
|
||||
let spectrums_dir = output.join("spectrums");
|
||||
if spectrums_dir.exists() {
|
||||
fs::remove_dir_all(&spectrums_dir)?;
|
||||
}
|
||||
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
||||
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
|
||||
}
|
||||
|
||||
// Open the destination index.
|
||||
let dst = KmerIndex::open(output)?;
|
||||
let n_partitions = dst.n_partitions();
|
||||
let n_dst_genomes = sources[0].meta.genomes.len();
|
||||
|
||||
// ── Merge each subsequent source partition-by-partition ───────────────
|
||||
let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
|
||||
@@ -94,10 +115,9 @@ impl KmerIndex {
|
||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let srcs: Vec<&obikpartitionner::KmerPartition> =
|
||||
remaining_sources.iter().map(|s| &s.partition).collect();
|
||||
// n_dst_genomes = 1 (copied from source_0 only)
|
||||
dst_partition.merge_partition(i, &srcs, mode, 1).err()
|
||||
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
||||
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
||||
dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err()
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -113,7 +133,78 @@ impl KmerIndex {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Directory copy ────────────────────────────────────────────────────────────
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Compute the final genome label lists for all sources.
|
||||
///
|
||||
/// Returns `(per_source_labels, all_genomes_flat)`.
|
||||
/// The first occurrence of a label keeps the original name. Subsequent
|
||||
/// occurrences receive `.1`, `.2`, … suffixes when `rename_duplicates` is true,
|
||||
/// or trigger a `DuplicateGenomeLabel` error otherwise.
|
||||
fn compute_labels(
|
||||
sources: &[&KmerIndex],
|
||||
rename_duplicates: bool,
|
||||
) -> OKIResult<(Vec<Vec<String>>, Vec<String>)> {
|
||||
let mut seen: HashMap<String, usize> = HashMap::new();
|
||||
let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());
|
||||
let mut all_genomes: Vec<String> = Vec::new();
|
||||
|
||||
for src in sources {
|
||||
let mut labels = Vec::with_capacity(src.meta.genomes.len());
|
||||
for label in &src.meta.genomes {
|
||||
let count = seen.entry(label.clone()).or_insert(0);
|
||||
let new_label = if *count == 0 {
|
||||
label.clone()
|
||||
} else if rename_duplicates {
|
||||
format!("{label}.{count}")
|
||||
} else {
|
||||
return Err(OKIError::DuplicateGenomeLabel(label.clone()));
|
||||
};
|
||||
*count += 1;
|
||||
labels.push(new_label.clone());
|
||||
all_genomes.push(new_label);
|
||||
}
|
||||
source_labels.push(labels);
|
||||
}
|
||||
|
||||
Ok((source_labels, all_genomes))
|
||||
}
|
||||
|
||||
/// Copy per-genome spectrum JSON files from `src_root/spectrums/` into
/// `dst_root/spectrums/`, renaming them on the way.
///
/// `old_labels[i]` names the source file (`<old>.json`) and `new_labels[i]`
/// names the destination file (`<new>.json`). The destination directory is
/// created if needed. Source files that do not exist are skipped silently;
/// any other I/O failure is returned to the caller.
fn copy_spectrums(
    src_root: &Path,
    dst_root: &Path,
    old_labels: &[String],
    new_labels: &[String],
) -> io::Result<()> {
    let target_dir = dst_root.join("spectrums");
    fs::create_dir_all(&target_dir)?;

    let origin_dir = src_root.join("spectrums");
    old_labels
        .iter()
        .zip(new_labels)
        .try_for_each(|(from, to)| {
            let origin = origin_dir.join(format!("{from}.json"));
            if !origin.exists() {
                // Nothing recorded for this genome in the source index.
                return Ok(());
            }
            fs::copy(&origin, target_dir.join(format!("{to}.json"))).map(|_| ())
        })
}
|
||||
|
||||
/// Walk the tree below `root` and delete every directory whose file name is
/// exactly `name`, together with its contents.
///
/// Directories with a different name are descended into; non-directory
/// entries are left untouched even if they carry the same name. The first
/// I/O error encountered aborts the walk and is returned.
fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> {
    for item in fs::read_dir(root)? {
        let candidate = item?.path();
        if !candidate.is_dir() {
            continue;
        }

        let is_target = matches!(
            candidate.file_name().and_then(|f| f.to_str()),
            Some(found) if found == name
        );
        if is_target {
            fs::remove_dir_all(&candidate)?;
        } else {
            remove_dirs_named(&candidate, name)?;
        }
    }
    Ok(())
}
|
||||
|
||||
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
||||
fs::create_dir_all(dst)?;
|
||||
|
||||
Reference in New Issue
Block a user