enforce uniform index mode and optimize base index selection

Adds validation to ensure all input sources share the same `IndexMode`. Introduces base index selection logic: when any non-trivial source uses approximate or hybrid evidence, the base is taken from the approximate or hybrid sources, and among the eligible sources the largest one is chosen to minimize the number of newly indexed k-mers. Includes helper functions for triviality evaluation, cumulative size calculation, and mode consistency checks.
2026-06-01 14:36:28 +02:00
parent 0350ca855b
commit add6d7f873
1 changed files with 52 additions and 28 deletions
@@ -63,8 +63,15 @@ impl KmerIndex {
            }
        }

-        // ── Validate evidence compatibility ───────────────────────────────────
-        let evidence = validate_evidence_compat(sources)?;
+        // ── Choose base: largest source in the output evidence mode ───────────
+        let base_idx = choose_base(sources, mode);
+        let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len());
+        ordered.push(sources[base_idx]);
+        for (i, &src) in sources.iter().enumerate() {
+            if i != base_idx { ordered.push(src); }
+        }
+        let sources: &[&KmerIndex] = &ordered;
+        let evidence = sources[0].meta.config.evidence.clone();

        // ── Compute final genome labels (rename duplicates if requested) ───────
        let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
@@ -265,34 +272,51 @@ fn partition_bar(n: u64) -> ProgressBar {
    pb
 }

-/// Check that all sources share the same evidence kind.
+/// A source is "trivial" when its presence/count values cannot carry any approximation:
+/// a single-genome source merged in `MergeMode::Presence` (SetMembership — all values are 1 by construction).
+fn is_trivial(src: &KmerIndex, mode: MergeMode) -> bool {
+    src.meta.genomes.len() == 1 && mode == MergeMode::Presence
+}
+
+/// Sum of the `unitigs.bin` file sizes (in bytes) over every partition, walking layers from `layer_0` until one is missing.
+/// Used as a proxy for the number of indexed smers.
+fn index_unitig_size(src: &KmerIndex) -> u64 {
+    let n = src.partition.n_partitions();
+    let mut total = 0u64;
+    for i in 0..n {
+        let index_dir = src.partition.part_dir(i).join("index");
+        let mut l = 0usize;
+        loop {
+            let p = index_dir.join(format!("layer_{l}")).join("unitigs.bin");
+            if !p.exists() { break; }
+            if let Ok(m) = std::fs::metadata(&p) { total += m.len(); }
+            l += 1;
+        }
+    }
+    total
+}
+
+/// Choose the index to use as bootstrap base.
 ///
-/// Rules:
-/// - all `Exact`                        → OK, returns `Exact`
-/// - all `Approx { b, z }` same params  → OK, returns `Approx { b, z }`
-/// - mixed exact/approx or different approx params → `IncompatibleEvidence`
-fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult<IndexMode> {
-    let ref_ev = &sources[0].meta.config.evidence;
-    for src in &sources[1..] {
-        let ev = &src.meta.config.evidence;
-        let compat = match (ref_ev, ev) {
-            (IndexMode::Exact, IndexMode::Exact) => true,
-            (IndexMode::Approx { b: b1, z: z1 },
-             IndexMode::Approx { b: b2, z: z2 }) => b1 == b2 && z1 == z2,
-            (IndexMode::Hybrid { b: b1, z: z1 },
-             IndexMode::Hybrid { b: b2, z: z2 }) => b1 == b2 && z1 == z2,
-            _ => false,
-        };
-        if !compat {
-            return Err(OKIError::IncompatibleEvidence(format!(
-                "source {:?} has evidence {:?}, expected {:?} — \
-                 convert all sources to the same evidence kind first \
-                 (use the `reindex` command)",
-                src.root_path.display(), ev, ref_ev,
-            )));
-        }
-    }
-    Ok(ref_ev.clone())
+/// Rule — the most demanding evidence wins ("mieux-disant"): if any non-trivial source uses approximate evidence
+/// (Approx or Hybrid), the output must also be approximate; the base must
+/// therefore come from an approximate source so its layers carry the right
+/// evidence files.  Among qualifying candidates, the largest (by unitig size)
+/// is chosen to minimise the number of new smers in the merge layer.
+fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize {
+    let needs_approx = sources.iter().any(|src| {
+        !is_trivial(src, mode)
+            && matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
+    });
+
+    sources.iter().enumerate()
+        .filter(|(_, src)| {
+            !needs_approx
+                || matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
+        })
+        .max_by_key(|(_, src)| index_unitig_size(src))
+        .map(|(i, _)| i)
+        .unwrap()
 }

 fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {