diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index e0744c4..68d8db7 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -63,8 +63,15 @@ impl KmerIndex { } } - // ── Validate evidence compatibility ─────────────────────────────────── - let evidence = validate_evidence_compat(sources)?; + // ── Choose base: largest source in the output evidence mode ─────────── + let base_idx = choose_base(sources, mode); + let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len()); + ordered.push(sources[base_idx]); + for (i, &src) in sources.iter().enumerate() { + if i != base_idx { ordered.push(src); } + } + let sources: &[&KmerIndex] = &ordered; + let evidence = sources[0].meta.config.evidence.clone(); // ── Compute final genome labels (rename duplicates if requested) ─────── let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?; @@ -265,34 +272,51 @@ fn partition_bar(n: u64) -> ProgressBar { pb } -/// Check that all sources share the same evidence kind. -/// -/// Rules: -/// - all `Exact` → OK, returns `Exact` -/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }` -/// - mixed exact/approx or different approx params → `IncompatibleEvidence` -fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult { - let ref_ev = &sources[0].meta.config.evidence; - for src in &sources[1..] 
{ - let ev = &src.meta.config.evidence; - let compat = match (ref_ev, ev) { - (IndexMode::Exact, IndexMode::Exact) => true, - (IndexMode::Approx { b: b1, z: z1 }, - IndexMode::Approx { b: b2, z: z2 }) => b1 == b2 && z1 == z2, - (IndexMode::Hybrid { b: b1, z: z1 }, - IndexMode::Hybrid { b: b2, z: z2 }) => b1 == b2 && z1 == z2, - _ => false, - }; - if !compat { - return Err(OKIError::IncompatibleEvidence(format!( - "source {:?} has evidence {:?}, expected {:?} — \ - convert all sources to the same evidence kind first \ - (use the `reindex` command)", - src.root_path.display(), ev, ref_ev, - ))); +/// A source is "trivial" if its presence/count values carry no approximation: +/// single-genome presence index (SetMembership — all values are 1 by construction). +/// +/// Returns `true` only when `src.meta.genomes` holds exactly one entry and `mode` is +/// `MergeMode::Presence`. Note that `mode` is the merge mode passed in by the caller, +/// not a property read from `src`. +fn is_trivial(src: &KmerIndex, mode: MergeMode) -> bool { + src.meta.genomes.len() == 1 && mode == MergeMode::Presence +} + +/// Sum of all `unitigs.bin` sizes (in bytes) across every partition and layer. +/// Used as a proxy for the number of indexed smers. +/// +/// For each partition, files are probed at `<part_dir>/index/layer_<l>/unitigs.bin` for +/// `l = 0, 1, 2, ...`; probing stops at the first layer whose file does not exist. A file +/// that exists but whose metadata cannot be read is skipped without reporting an error. +fn index_unitig_size(src: &KmerIndex) -> u64 { + let n = src.partition.n_partitions(); + let mut total = 0u64; + for i in 0..n { + let index_dir = src.partition.part_dir(i).join("index"); + let mut l = 0usize; + loop { + let p = index_dir.join(format!("layer_{l}")).join("unitigs.bin"); + if !p.exists() { break; } + if let Ok(m) = std::fs::metadata(&p) { total += m.len(); } + l += 1; } } - Ok(ref_ev.clone()) + total +} + +/// Choose the index to use as bootstrap base. +/// +/// Rule ("mieux-disant", i.e. best-offer selection): if any non-trivial source uses approximate evidence +/// (Approx or Hybrid), the output must also be approximate; the base must +/// therefore come from an approximate source so its layers carry the right +/// evidence files. Among qualifying candidates, the largest (by unitig size) +/// is chosen to minimise the number of new smers in the merge layer. +/// +/// Returns the position of the chosen source within `sources`. When no non-trivial +/// source is approximate, every source is a candidate. +/// +/// # Panics +/// +/// Panics if `sources` is empty, because the final `unwrap` then has no candidate. 
+fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize { + let needs_approx = sources.iter().any(|src| { + !is_trivial(src, mode) + && matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. }) + }); + + sources.iter().enumerate() + .filter(|(_, src)| { + !needs_approx + || matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. }) + }) + .max_by_key(|(_, src)| index_unitig_size(src)) + .map(|(i, _)| i) + .unwrap() } fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {