enforce uniform index mode and optimize base index selection
Adds validation to ensure all input sources share the same `IndexMode`. Introduces base index selection logic that prioritizes approximate or hybrid evidence and maximizes base size to minimize newly indexed k-mers. Includes helper functions for triviality evaluation, cumulative size calculation, and mode consistency checks.
This commit is contained in:
+52
-28
@@ -63,8 +63,15 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Validate evidence compatibility ───────────────────────────────────
|
// ── Choose base: largest source in the output evidence mode ───────────
|
||||||
let evidence = validate_evidence_compat(sources)?;
|
let base_idx = choose_base(sources, mode);
|
||||||
|
let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len());
|
||||||
|
ordered.push(sources[base_idx]);
|
||||||
|
for (i, &src) in sources.iter().enumerate() {
|
||||||
|
if i != base_idx { ordered.push(src); }
|
||||||
|
}
|
||||||
|
let sources: &[&KmerIndex] = &ordered;
|
||||||
|
let evidence = sources[0].meta.config.evidence.clone();
|
||||||
|
|
||||||
// ── Compute final genome labels (rename duplicates if requested) ───────
|
// ── Compute final genome labels (rename duplicates if requested) ───────
|
||||||
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
|
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
|
||||||
@@ -265,34 +272,51 @@ fn partition_bar(n: u64) -> ProgressBar {
|
|||||||
pb
|
pb
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check that all sources share the same evidence kind.
|
/// A source is "trivial" if its presence/count values carry no approximation:
|
||||||
///
|
/// i.e. a single-genome source merged in presence mode (SetMembership — all values are 1 by construction).
|
||||||
/// Rules:
|
fn is_trivial(src: &KmerIndex, mode: MergeMode) -> bool {
|
||||||
/// - all `Exact` → OK, returns `Exact`
|
src.meta.genomes.len() == 1 && mode == MergeMode::Presence
|
||||||
/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }`
|
}
|
||||||
/// - mixed exact/approx or different approx params → `IncompatibleEvidence`
|
|
||||||
fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult<IndexMode> {
|
/// Sum of all `unitigs.bin` sizes across every partition and layer.
|
||||||
let ref_ev = &sources[0].meta.config.evidence;
|
/// Used as a proxy for the number of indexed smers.
|
||||||
for src in &sources[1..] {
|
fn index_unitig_size(src: &KmerIndex) -> u64 {
|
||||||
let ev = &src.meta.config.evidence;
|
let n = src.partition.n_partitions();
|
||||||
let compat = match (ref_ev, ev) {
|
let mut total = 0u64;
|
||||||
(IndexMode::Exact, IndexMode::Exact) => true,
|
for i in 0..n {
|
||||||
(IndexMode::Approx { b: b1, z: z1 },
|
let index_dir = src.partition.part_dir(i).join("index");
|
||||||
IndexMode::Approx { b: b2, z: z2 }) => b1 == b2 && z1 == z2,
|
let mut l = 0usize;
|
||||||
(IndexMode::Hybrid { b: b1, z: z1 },
|
loop {
|
||||||
IndexMode::Hybrid { b: b2, z: z2 }) => b1 == b2 && z1 == z2,
|
let p = index_dir.join(format!("layer_{l}")).join("unitigs.bin");
|
||||||
_ => false,
|
if !p.exists() { break; }
|
||||||
};
|
if let Ok(m) = std::fs::metadata(&p) { total += m.len(); }
|
||||||
if !compat {
|
l += 1;
|
||||||
return Err(OKIError::IncompatibleEvidence(format!(
|
|
||||||
"source {:?} has evidence {:?}, expected {:?} — \
|
|
||||||
convert all sources to the same evidence kind first \
|
|
||||||
(use the `reindex` command)",
|
|
||||||
src.root_path.display(), ev, ref_ev,
|
|
||||||
)));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(ref_ev.clone())
|
total
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Choose the index to use as bootstrap base.
|
||||||
|
///
|
||||||
|
/// Rule — "mieux-disant" (the most demanding evidence mode wins): if any non-trivial source uses approximate evidence
|
||||||
|
/// (Approx or Hybrid), the output must also be approximate; the base must
|
||||||
|
/// therefore come from an approximate source so its layers carry the right
|
||||||
|
/// evidence files. Among qualifying candidates, the largest (by unitig size)
|
||||||
|
/// is chosen to minimise the number of new smers in the merge layer.
|
||||||
|
fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize {
|
||||||
|
let needs_approx = sources.iter().any(|src| {
|
||||||
|
!is_trivial(src, mode)
|
||||||
|
&& matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
|
||||||
|
});
|
||||||
|
|
||||||
|
sources.iter().enumerate()
|
||||||
|
.filter(|(_, src)| {
|
||||||
|
!needs_approx
|
||||||
|
|| matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
|
||||||
|
})
|
||||||
|
.max_by_key(|(_, src)| index_unitig_size(src))
|
||||||
|
.map(|(i, _)| i)
|
||||||
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
||||||
|
|||||||
Reference in New Issue
Block a user