From c5bcb7b8fa26dcc0388416005f8a4df8603ec592 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 20 May 2026 21:01:16 +0200
Subject: [PATCH 01/11] feat: introduce layered MPHF indexing and partition
 metadata

Refactors obikindex and obikpartitionner to delegate index construction to a new layered MPHF implementation. Adds resume-safe building with abundance filtering and count persistence, while introducing a PartitionMeta struct for JSON configuration persistence. Updates OKIError to wrap layer-specific errors, replaces single-path extraction with full path collection and logging, and registers new internal dependencies across the workspace.
---
 TODO.md                                 |   3 +
 src/Cargo.lock                          |   9 +-
 src/obikindex/Cargo.toml                |   9 +-
 src/obikindex/src/error.rs              |   8 --
 src/obikindex/src/index.rs              | 171 +++++-------------------
 src/obikmer/src/cmd/index.rs            |   7 +-
 src/obikpartitionner/Cargo.toml         |   6 +-
 src/obikpartitionner/src/index_layer.rs | 135 +++++++++++++++++++
 src/obikpartitionner/src/lib.rs         |   1 +
 src/obikpartitionner/src/partition.rs   |  73 ++--------
 10 files changed, 193 insertions(+), 229 deletions(-)
 create mode 100644 src/obikpartitionner/src/index_layer.rs
diff --git a/TODO.md b/TODO.md
index fdb0998..b67b7c3 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,5 +1,6 @@
 ## Chose à vérifier suite à la commande index
 
+- il faudrait lister les fichier qui vont être indexés
 - partition.meta ne devrait plus exister
 - les spectrums globaux devrait etre identifier par génome
   - regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome
@@ -26,3 +27,5 @@
   - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche
 
 - dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence.
+
+- status : affiche le statut de l'index
diff --git a/src/Cargo.lock b/src/Cargo.lock
index 858c7c1..4c1929b 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -1452,17 +1452,10 @@ dependencies = [
 name = "obikindex"
 version = "0.1.0"
 dependencies = [
- "cacheline-ef",
- "epserde",
  "indicatif",
- "obicompactvec",
- "obidebruinj",
  "obikpartitionner",
- "obikseq",
- "obilayeredmap",
  "obiskio",
  "obisys",
- "ptr_hash",
  "rayon",
  "serde",
  "serde_json",
@@ -1501,8 +1494,10 @@ dependencies = [
  "memmap2",
  "niffler 3.0.0",
  "obicompactvec",
+ "obidebruinj",
  "obikrope",
  "obikseq",
+ "obilayeredmap",
  "obiread",
  "obiskbuilder",
  "obiskio",
diff --git a/src/obikindex/Cargo.toml b/src/obikindex/Cargo.toml
index caa12d6..bf4ebd6 100644
--- a/src/obikindex/Cargo.toml
+++ b/src/obikindex/Cargo.toml
@@ -5,15 +5,8 @@ edition = "2024"
 
 [dependencies]
 obikpartitionner = { path = "../obikpartitionner" }
-obikseq          = { path = "../obikseq" }
-obisys           = { path = "../obisys" }
 obiskio          = { path = "../obiskio" }
-obidebruinj      = { path = "../obidebruinj" }
-obilayeredmap    = { path = "../obilayeredmap" }
-obicompactvec    = { path = "../obicompactvec" }
-cacheline-ef     = "1.1"
-epserde          = "0.8"
-ptr_hash         = "1.1"
+obisys           = { path = "../obisys" }
 rayon            = "1"
 serde            = { version = "1", features = ["derive"] }
 serde_json       = "1"
diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs
index 55b2a79..df191ed 100644
--- a/src/obikindex/src/error.rs
+++ b/src/obikindex/src/error.rs
@@ -2,14 +2,12 @@ use std::fmt;
 use std::io;
 
 use obiskio::SKError;
-use obilayeredmap::OLMError;
 
 #[derive(Debug)]
 pub enum OKIError {
     Io(io::Error),
     Json(serde_json::Error),
     Partition(SKError),
-    Layer(OLMError),
 }
 
 pub type OKIResult<T> = Result<T, OKIError>;
@@ -20,7 +18,6 @@ impl fmt::Display for OKIError {
             OKIError::Io(e)        => write!(f, "I/O error: {e}"),
             OKIError::Json(e)      => write!(f, "JSON error: {e}"),
             OKIError::Partition(e) => write!(f, "partition error: {e}"),
-            OKIError::Layer(e)     => write!(f, "layer error: {e}"),
         }
     }
 }
@@ -31,7 +28,6 @@ impl std::error::Error for OKIError {
             OKIError::Io(e)        => Some(e),
             OKIError::Json(e)      => Some(e),
             OKIError::Partition(e) => Some(e),
-            OKIError::Layer(e)     => Some(e),
         }
     }
 }
@@ -47,7 +43,3 @@ impl From<serde_json::Error> for OKIError {
 impl From<SKError> for OKIError {
     fn from(e: SKError) -> Self { OKIError::Partition(e) }
 }
-
-impl From<OLMError> for OKIError {
-    fn from(e: OLMError) -> Self { OKIError::Layer(e) }
-}
diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs
index 37b30c1..4e6bd7c 100644
--- a/src/obikindex/src/index.rs
+++ b/src/obikindex/src/index.rs
@@ -3,16 +3,9 @@ use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
 
-use cacheline_ef::{CachelineEf, CachelineEfVec};
-use epserde::prelude::*;
 use indicatif::{ProgressBar, ProgressStyle};
-use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
-use obidebruinj::GraphDeBruijn;
 use obikpartitionner::KmerPartition;
-use obilayeredmap::layer::Layer;
-use obiskio::{SKFileMeta, SKFileReader};
 use obisys::{Reporter, Stage};
-use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
 use rayon::prelude::*;
 use tracing::info;
 
@@ -20,8 +13,6 @@ use crate::error::{OKIError, OKIResult};
 use crate::meta::{IndexConfig, IndexMeta};
 use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED};
 
-type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
-
 pub struct KmerIndex {
     root_path: PathBuf,
     meta: IndexMeta,
@@ -59,7 +50,12 @@ impl KmerIndex {
     pub fn open<P: AsRef<Path>>(path: P) -> OKIResult<Self> {
         let root_path = path.as_ref().to_owned();
         let meta = IndexMeta::read(&root_path).map_err(OKIError::Io)?;
-        let partition = KmerPartition::open(&root_path)?;
+        let partition = KmerPartition::open_with_config(
+            &root_path,
+            meta.config.kmer_size,
+            meta.config.minimizer_size,
+            meta.config.n_bits,
+        )?;
         Ok(Self { root_path, meta, partition })
     }
 
@@ -87,13 +83,10 @@ impl KmerIndex {
     /// Mark scatter as complete and write `scatter.done`.
     ///
     /// If no genome label was set at creation time, one is derived from
-    /// `first_scatter_path` (filename stripped of all extensions).
-    /// If `first_scatter_path` is also `None`, the label defaults to `"unknown"`.
-    pub fn mark_scattered(&mut self, first_scatter_path: Option<&Path>) -> OKIResult<()> {
+    /// the index root directory name (stripped of all extensions).
+    pub fn mark_scattered(&mut self) -> OKIResult<()> {
         if self.meta.genomes.is_empty() {
-            let label = first_scatter_path
-                .map(label_from_path)
-                .unwrap_or_else(|| "unknown".to_string());
+            let label = label_from_path(&self.root_path);
             self.meta.genomes.push(label);
             self.meta.write(&self.root_path)?;
         }
@@ -116,20 +109,9 @@ impl KmerIndex {
         Ok(())
     }
 
-    /// Build the layered MPHF index for all partitions.
-    ///
-    /// Default mode (`config.with_counts = false`): set membership only.
-    /// With counts: count matrix per kmer.
+    /// Build the layered MPHF index for all partitions in parallel.
     ///
     /// Writes `index.done` upon completion.
-    /// Path to the unitigs file for partition `part`, layer `layer`.
-    pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf {
-        self.partition.part_dir(part)
-            .join("index")
-            .join(format!("layer_{layer}"))
-            .join("unitigs.bin")
-    }
-
     pub fn build_layers(
         &self,
         min_ab: u32,
@@ -140,12 +122,8 @@ impl KmerIndex {
         let n = self.partition.n_partitions();
         let t = Stage::start("index");
         let with_counts = self.meta.config.with_counts;
-        let filter_active = min_ab > 1 || max_ab.is_some();
-        let need_counts = filter_active || with_counts;
         let total_kmers = AtomicUsize::new(0);
 
-        let partition = &self.partition;
-
         let pb = Arc::new(Mutex::new(
             ProgressBar::new(n as u64).with_style(
                 ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(),
@@ -153,101 +131,19 @@ impl KmerIndex {
         ));
 
         (0..n).into_par_iter().for_each(|i| {
-            let part_dir = partition.part_dir(i);
-            let dedup_path = part_dir.join("dereplicated.skmer.zst");
-            if !dedup_path.exists() {
-                return;
-            }
-
-            let layer_dir = part_dir.join("index").join("layer_0");
-            if layer_dir.join("mphf.bin").exists() {
-                return;
-            }
-
-            let mphf1_opt: Option<Mphf> = if need_counts {
-                let p = part_dir.join("mphf1.bin");
-                p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
-            } else {
-                None
-            };
-
-            let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
-                let p = part_dir.join("counts1.bin");
-                p.exists()
-                    .then(|| PersistentCompactIntVec::open(&p).ok())
-                    .flatten()
-            } else {
-                None
-            };
-
-            let mut g = GraphDeBruijn::new();
-            let mut reader = SKFileReader::open(&dedup_path).unwrap_or_else(|e| {
-                eprintln!("error opening {}: {e}", dedup_path.display());
-                std::process::exit(1);
-            });
-            for sk in reader.iter() {
-                for kmer in sk.iter_canonical_kmers() {
-                    let accept = if filter_active {
-                        match (&mphf1_opt, &counts1_opt) {
-                            (Some(mphf), Some(counts)) => {
-                                let ab = counts.get(mphf.index(&kmer.raw()));
-                                ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
-                            }
-                            _ => true,
-                        }
-                    } else {
-                        true
-                    };
-                    if accept {
-                        g.push(kmer);
-                    }
+            match self.partition.build_index_layer(i, min_ab, max_ab, with_counts) {
+                Ok(0) => {}
+                Ok(n_kmers) => {
+                    total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
+                    let pb = pb.lock().unwrap();
+                    pb.inc(1);
+                    pb.set_message(format!("{i}: {n_kmers} kmers"));
+                }
+                Err(e) => {
+                    eprintln!("error building layer for partition {i}: {e}");
+                    std::process::exit(1);
                 }
             }
-
-            let n_kmers = g.len();
-            total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
-            g.compute_degrees();
-
-            fs::create_dir_all(&layer_dir).unwrap_or_else(|e| {
-                eprintln!("error creating {}: {e}", layer_dir.display());
-                std::process::exit(1);
-            });
-            let mut uw = Layer::<()>::unitig_writer(&layer_dir).unwrap_or_else(|e| {
-                eprintln!("error creating unitig writer (partition {i}): {e}");
-                std::process::exit(1);
-            });
-            for unitig in g.iter_unitig() {
-                uw.write(&unitig).unwrap_or_else(|e| {
-                    eprintln!("error writing unitig (partition {i}): {e}");
-                    std::process::exit(1);
-                });
-            }
-            uw.close().unwrap_or_else(|e| {
-                eprintln!("error closing unitig writer (partition {i}): {e}");
-                std::process::exit(1);
-            });
-
-            if with_counts {
-                Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
-                    match (&mphf1_opt, &counts1_opt) {
-                        (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
-                        _ => 1,
-                    }
-                })
-                .unwrap_or_else(|e| {
-                    eprintln!("error building count layer (partition {i}): {e}");
-                    std::process::exit(1);
-                });
-            } else {
-                Layer::<()>::build(&layer_dir).unwrap_or_else(|e| {
-                    eprintln!("error building set layer (partition {i}): {e}");
-                    std::process::exit(1);
-                });
-            }
-
-            let pb = pb.lock().unwrap();
-            pb.inc(1);
-            pb.set_message(format!("{i}: {n_kmers} kmers"));
         });
 
         pb.lock().unwrap().finish_and_clear();
@@ -258,13 +154,7 @@ impl KmerIndex {
 
         if !keep_intermediate {
             for i in 0..n {
-                let part_dir = partition.part_dir(i);
-                remove_if_exists(&part_dir.join("dereplicated.skmer.zst"));
-                remove_if_exists(&SKFileMeta::sidecar_path(
-                    &part_dir.join("dereplicated.skmer.zst"),
-                ));
-                remove_if_exists(&part_dir.join("mphf1.bin"));
-                remove_if_exists(&part_dir.join("counts1.bin"));
+                self.partition.remove_build_artifacts(i);
             }
         }
 
@@ -272,9 +162,16 @@ impl KmerIndex {
         rep.push(t.stop());
         Ok(())
     }
+
+    /// Path to the unitigs file for partition `part`, layer `layer`.
+    pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf {
+        self.partition.part_dir(part)
+            .join("index")
+            .join(format!("layer_{layer}"))
+            .join("unitigs.bin")
+    }
 }
 
-/// Derive a genome label from a file path: filename stripped of all extensions.
 fn label_from_path(path: &Path) -> String {
     let name = path
         .file_name()
@@ -291,11 +188,3 @@ fn label_from_path(path: &Path) -> String {
 fn touch(path: &Path) -> Result<(), std::io::Error> {
     fs::File::create(path).map(|_| ())
 }
-
-fn remove_if_exists(path: &Path) {
-    if let Err(e) = fs::remove_file(path) {
-        if e.kind() != std::io::ErrorKind::NotFound {
-            eprintln!("warning: could not remove {}: {e}", path.display());
-        }
-    }
-}
diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs
index 40349e1..089402c 100644
--- a/src/obikmer/src/cmd/index.rs
+++ b/src/obikmer/src/cmd/index.rs
@@ -72,7 +72,10 @@ pub fn run(args: IndexArgs) {
 
     // ── Stage 1: scatter ─────────────────────────────────────────────────────
     if idx.state() < IndexState::Scattered {
-        let first_path = args.common.inputs.first().map(PathBuf::from);
+        let paths: Vec<_> = args.common.seqfile_paths().collect();
+        for path in &paths {
+            info!("indexing: {}", path.display());
+        }
         let k         = idx.kmer_size();
         let level_max = args.common.level_max;
         let theta     = args.common.theta;
@@ -80,7 +83,7 @@ pub fn run(args: IndexArgs) {
 
         scatter(idx.partition_mut(), args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep);
 
-        idx.mark_scattered(first_path.as_deref()).unwrap_or_else(|e| {
+        idx.mark_scattered().unwrap_or_else(|e| {
             eprintln!("error marking scatter done: {e}");
             std::process::exit(1);
         });
diff --git a/src/obikpartitionner/Cargo.toml b/src/obikpartitionner/Cargo.toml
index 8e94ff2..d62fdf4 100644
--- a/src/obikpartitionner/Cargo.toml
+++ b/src/obikpartitionner/Cargo.toml
@@ -13,8 +13,10 @@ obikrope  = { path = "../obikrope" }
 [dependencies]
 niffler = "3.0.0"
 remove_dir_all = "0.8"
-obikseq = { path = "../obikseq" }
-obiskio = { path = "../obiskio" }
+obikseq      = { path = "../obikseq" }
+obiskio      = { path = "../obiskio" }
+obidebruinj  = { path = "../obidebruinj" }
+obilayeredmap = { path = "../obilayeredmap" }
 rayon = "1"
 sysinfo = "0.33"
 serde = { version = "1", features = ["derive"] }
diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs
new file mode 100644
index 0000000..a6d873a
--- /dev/null
+++ b/src/obikpartitionner/src/index_layer.rs
@@ -0,0 +1,135 @@
+use std::fs;
+use std::io;
+
+use cacheline_ef::{CachelineEf, CachelineEfVec};
+use epserde::prelude::*;
+use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
+use obidebruinj::GraphDeBruijn;
+use obilayeredmap::{OLMError, layer::Layer};
+use obiskio::{SKError, SKFileMeta, SKFileReader};
+use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
+
+use crate::partition::KmerPartition;
+
+type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
+
+fn olm_to_sk(e: OLMError) -> SKError {
+    match e {
+        OLMError::Io(io_err) => SKError::Io(io_err),
+        other => SKError::InvalidData { context: "layer build", detail: other.to_string() },
+    }
+}
+
+fn remove_if_exists(path: &std::path::Path) {
+    if let Err(e) = fs::remove_file(path) {
+        if e.kind() != io::ErrorKind::NotFound {
+            eprintln!("warning: could not remove {}: {e}", path.display());
+        }
+    }
+}
+
+impl KmerPartition {
+    /// Build the layered MPHF index for partition `i`.
+    ///
+    /// Returns the number of canonical k-mers indexed, or 0 if the partition
+    /// has no data or its layer was already built (resume-safe).
+    ///
+    /// Abundance filtering is applied when `min_ab > 1` or `max_ab.is_some()`,
+    /// using `mphf1.bin` + `counts1.bin` if they exist.
+    /// Count payload is stored iff `with_counts` is true.
+    pub fn build_index_layer(
+        &self,
+        i: usize,
+        min_ab: u32,
+        max_ab: Option<u32>,
+        with_counts: bool,
+    ) -> Result<usize, SKError> {
+        let part_dir = self.part_dir(i);
+        let dedup_path = part_dir.join("dereplicated.skmer.zst");
+        if !dedup_path.exists() {
+            return Ok(0);
+        }
+
+        let layer_dir = part_dir.join("index").join("layer_0");
+        if layer_dir.join("mphf.bin").exists() {
+            return Ok(0);
+        }
+
+        let filter_active = min_ab > 1 || max_ab.is_some();
+        let need_counts = filter_active || with_counts;
+
+        let mphf1_opt: Option<Mphf> = if need_counts {
+            let p = part_dir.join("mphf1.bin");
+            p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
+        } else {
+            None
+        };
+
+        let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
+            let p = part_dir.join("counts1.bin");
+            p.exists()
+                .then(|| PersistentCompactIntVec::open(&p).ok())
+                .flatten()
+        } else {
+            None
+        };
+
+        let mut g = GraphDeBruijn::new();
+        let mut reader = SKFileReader::open(&dedup_path)?;
+        for sk in reader.iter() {
+            for kmer in sk.iter_canonical_kmers() {
+                let accept = if filter_active {
+                    match (&mphf1_opt, &counts1_opt) {
+                        (Some(mphf), Some(counts)) => {
+                            let ab = counts.get(mphf.index(&kmer.raw()));
+                            ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
+                        }
+                        _ => true,
+                    }
+                } else {
+                    true
+                };
+                if accept {
+                    g.push(kmer);
+                }
+            }
+        }
+
+        let n_kmers = g.len();
+        g.compute_degrees();
+
+        fs::create_dir_all(&layer_dir)?;
+
+        let mut uw = Layer::<()>::unitig_writer(&layer_dir).map_err(olm_to_sk)?;
+        for unitig in g.iter_unitig() {
+            uw.write(&unitig)?;
+        }
+        uw.close()?;
+
+        if with_counts {
+            Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
+                match (&mphf1_opt, &counts1_opt) {
+                    (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
+                    _ => 1,
+                }
+            })
+            .map_err(olm_to_sk)?;
+        } else {
+            Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
+        }
+
+        Ok(n_kmers)
+    }
+
+    /// Remove intermediate build artifacts for partition `i`.
+    ///
+    /// Deletes `dereplicated.skmer.zst` (+ sidecar), `mphf1.bin`, `counts1.bin`.
+    pub fn remove_build_artifacts(&self, i: usize) {
+        let part_dir = self.part_dir(i);
+        let dedup = part_dir.join("dereplicated.skmer.zst");
+        remove_if_exists(&SKFileMeta::sidecar_path(&dedup));
+        remove_if_exists(&dedup);
+        remove_if_exists(&part_dir.join("mphf1.bin"));
+        remove_if_exists(&part_dir.join("counts1.bin"));
+    }
+}
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index 09fc147..bd7934f 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -1,3 +1,4 @@
+mod index_layer;
 mod kmer_sort;
 mod partition;
 
diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs
index a638f93..5740aa2 100644
--- a/src/obikpartitionner/src/partition.rs
+++ b/src/obikpartitionner/src/partition.rs
@@ -18,7 +18,6 @@ use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
 use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
 use rayon::prelude::*;
 use remove_dir_all::remove_dir_all;
-use serde::{Deserialize, Serialize};
 use sysinfo::System;
 
 use niffler::Level;
@@ -28,18 +27,9 @@ use crate::kmer_sort::{chunk_size_from_ram, sort_unique_kmers};
 
 type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
 
-const META_FILENAME: &str = "partition.meta";
 const SK_EXT: &str = "skmer.zst";
 pub const PARTITIONS_SUBDIR: &str = "partitions";
 
-#[derive(Serialize, Deserialize)]
-struct PartitionMeta {
-    n_bits: usize,
-    kmer_size: usize,
-    minimizer_size: usize,
-    level: u32,
-}
-
 pub struct KmerPartition {
     root_path: PathBuf,
     n_partitions: usize,
@@ -98,11 +88,15 @@ impl KmerPartition {
             level,
             closed: false,
         };
-        partition.write_meta(n_bits)?;
         Ok(partition)
     }
 
-    pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
+    pub fn open_with_config<P: AsRef<Path>>(
+        path: P,
+        kmer_size: usize,
+        minimizer_size: usize,
+        n_bits: usize,
+    ) -> SKResult<Self> {
         let root_path = path.as_ref().to_owned();
         if !root_path.exists() {
             return Err(io::Error::new(
@@ -111,22 +105,17 @@ impl KmerPartition {
             )
             .into());
         }
-        let meta_path = root_path.join(META_FILENAME);
-        let meta: PartitionMeta =
-            serde_json::from_reader(fs::File::open(&meta_path)?).map_err(io::Error::other)?;
-
-        let level = level_from_u32(meta.level);
-        let n_partitions = 1usize << meta.n_bits;
+        let n_partitions = 1usize << n_bits;
         let writers = (0..n_partitions).map(|_| None).collect();
         Ok(Self {
             root_path,
             n_partitions,
-            partitions_mask: (1u64 << meta.n_bits) - 1,
-            kmer_size: meta.kmer_size,
-            minimizer_size: meta.minimizer_size,
+            partitions_mask: (1u64 << n_bits) - 1,
+            kmer_size,
+            minimizer_size,
             writers,
-            level,
-            closed: true, // read-only: writing is not allowed on an opened partition
+            level: Level::One,
+            closed: true,
         })
     }
 
@@ -339,18 +328,6 @@ impl KmerPartition {
         }
     }
 
-    fn write_meta(&self, n_bits: usize) -> SKResult<()> {
-        let meta = PartitionMeta {
-            n_bits,
-            kmer_size: self.kmer_size,
-            minimizer_size: self.minimizer_size,
-            level: u32::from(self.level),
-        };
-        let f = fs::File::create(self.root_path.join(META_FILENAME))?;
-        serde_json::to_writer_pretty(f, &meta).map_err(|e| io::Error::other(e))?;
-        Ok(())
-    }
-
     fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> {
         if self.writers[partition].is_none() {
             let dir = self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{:05}", partition));
@@ -411,32 +388,6 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize {
     n.next_power_of_two() as usize
 }
 
-fn level_from_u32(n: u32) -> Level {
-    match n {
-        0 => Level::Zero,
-        1 => Level::One,
-        2 => Level::Two,
-        3 => Level::Three,
-        4 => Level::Four,
-        5 => Level::Five,
-        6 => Level::Six,
-        7 => Level::Seven,
-        8 => Level::Eight,
-        9 => Level::Nine,
-        10 => Level::Ten,
-        11 => Level::Eleven,
-        12 => Level::Twelve,
-        13 => Level::Thirteen,
-        14 => Level::Fourteen,
-        15 => Level::Fifteen,
-        16 => Level::Sixteen,
-        17 => Level::Seventeen,
-        18 => Level::Eighteen,
-        19 => Level::Nineteen,
-        20 => Level::Twenty,
-        _ => Level::TwentyOne,
-    }
-}
 
 /// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
 const MAX_SK_COUNT: u64 = (1 << 24) - 1;
-- 
2.52.0


From 7d1b62ddf3828b0a7aa566ec960b01f96b6cc87e Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 20 May 2026 21:06:27 +0200
Subject: [PATCH 02/11] refactor: replace single spectrum file with
 per-partition outputs

Replace the single `kmer_spectrum_raw.json` output with per-partition JSON files in a `spectrums/` directory. Add a `keep_intermediate` parameter to control intermediate file cleanup, and introduce a `write_spectrum` helper for serialization. Update the completion sentinel to `count.done` and align state documentation accordingly.
---
 src/obikindex/src/index.rs            | 34 ++++++++++++++++++----
 src/obikindex/src/state.rs            |  4 +--
 src/obikmer/src/cmd/index.rs          |  2 +-
 src/obikpartitionner/src/lib.rs       |  2 +-
 src/obikpartitionner/src/partition.rs | 41 ++++++++++++++-------------
 5 files changed, 53 insertions(+), 30 deletions(-)

diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs
index 4e6bd7c..c5ba267 100644
--- a/src/obikindex/src/index.rs
+++ b/src/obikindex/src/index.rs
@@ -1,17 +1,18 @@
+use std::collections::BTreeMap;
 use std::fs;
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
 
 use indicatif::{ProgressBar, ProgressStyle};
-use obikpartitionner::KmerPartition;
+use obikpartitionner::{KmerPartition, KmerSpectrum};
 use obisys::{Reporter, Stage};
 use rayon::prelude::*;
 use tracing::info;
 
 use crate::error::{OKIError, OKIResult};
 use crate::meta::{IndexConfig, IndexMeta};
-use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED};
+use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
 
 pub struct KmerIndex {
     root_path: PathBuf,
@@ -96,16 +97,37 @@ impl KmerIndex {
 
     /// Dereplicate all partitions then compute kmer counts.
     ///
-    /// Writes `kmer_spectrum_raw.json` at the index root upon completion
-    /// (this file doubles as the `Counted` sentinel).
-    pub fn dereplicate_and_count(&self, rep: &mut Reporter) -> OKIResult<()> {
+    /// Writes `spectrums/{label}.json` and touches `count.done` upon completion.
+    /// Per-partition spectrum files are removed unless `keep_intermediate` is true.
+    pub fn dereplicate_and_count(&self, keep_intermediate: bool, rep: &mut Reporter) -> OKIResult<()> {
         let t = Stage::start("dereplicate");
         self.partition.dereplicate()?;
         rep.push(t.stop());
 
         let t = Stage::start("count_kmer");
-        self.partition.count_kmer()?;
+        let spectrum = self.partition.count_kmer(keep_intermediate)?;
         rep.push(t.stop());
+
+        self.write_spectrum(&spectrum)?;
+        touch(&self.root_path.join(SENTINEL_COUNTED))?;
+        Ok(())
+    }
+
+    fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> {
+        let label = self.meta.genomes.first().map(String::as_str).unwrap_or("unknown");
+        let spectrums_dir = self.root_path.join("spectrums");
+        fs::create_dir_all(&spectrums_dir)?;
+        let path = spectrums_dir.join(format!("{label}.json"));
+        let spectrum_map: BTreeMap<String, u64> = sp.counts
+            .iter()
+            .map(|(&c, &f)| (format!("{c:010}"), f))
+            .collect();
+        let f = fs::File::create(&path)?;
+        serde_json::to_writer_pretty(
+            f,
+            &serde_json::json!({ "f0": sp.f0, "f1": sp.f1, "spectrum": spectrum_map }),
+        )
+        .map_err(OKIError::Json)?;
         Ok(())
     }
 
diff --git a/src/obikindex/src/state.rs b/src/obikindex/src/state.rs
index a32d4d8..4db50ce 100644
--- a/src/obikindex/src/state.rs
+++ b/src/obikindex/src/state.rs
@@ -3,7 +3,7 @@ use std::path::Path;
 use crate::meta::META_FILENAME;
 
 pub const SENTINEL_SCATTERED: &str = "scatter.done";
-pub const SENTINEL_COUNTED:   &str = "kmer_spectrum_raw.json";
+pub const SENTINEL_COUNTED:   &str = "count.done";
 pub const SENTINEL_INDEXED:   &str = "index.done";
 
 /// Progression state of a `KmerIndex`.
@@ -17,7 +17,7 @@ pub enum IndexState {
     Empty,
     /// `scatter.done` sentinel present — all super-kmers have been routed.
     Scattered,
-    /// `kmer_spectrum_raw.json` present — dereplicate + count complete.
+    /// `count.done` sentinel present — dereplicate + count complete.
     Counted,
     /// `index.done` sentinel present — layered MPHF index fully built.
     Indexed,
diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs
index 089402c..9f9bad2 100644
--- a/src/obikmer/src/cmd/index.rs
+++ b/src/obikmer/src/cmd/index.rs
@@ -93,7 +93,7 @@ pub fn run(args: IndexArgs) {
 
     // ── Stage 2: dereplicate + count ─────────────────────────────────────────
     if idx.state() < IndexState::Counted {
-        idx.dereplicate_and_count(&mut rep).unwrap_or_else(|e| {
+        idx.dereplicate_and_count(args.keep_intermediate, &mut rep).unwrap_or_else(|e| {
             eprintln!("error: {e}");
             std::process::exit(1);
         });
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index bd7934f..2e94e56 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -2,4 +2,4 @@ mod index_layer;
 mod kmer_sort;
 mod partition;
 
-pub use partition::{KmerPartition, PARTITIONS_SUBDIR};
+pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs
index 5740aa2..c668309 100644
--- a/src/obikpartitionner/src/partition.rs
+++ b/src/obikpartitionner/src/partition.rs
@@ -27,6 +27,12 @@ use crate::kmer_sort::{chunk_size_from_ram, sort_unique_kmers};
 
 type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
 
+pub struct KmerSpectrum {
+    pub f0:     u64,
+    pub f1:     u64,
+    pub counts: BTreeMap<u32, u64>,
+}
+
 const SK_EXT: &str = "skmer.zst";
 pub const PARTITIONS_SUBDIR: &str = "partitions";
 
@@ -238,11 +244,13 @@ impl KmerPartition {
     /// 3. Writes a flat binary count file (`counts1.bin`, one `u32` per slot,
     ///    memory-mapped) accumulating kmer abundances from the superkmer counts.
     /// 4. Persists the MPHF to `mphf1.bin` for downstream use.
-    /// 5. Writes a global `kmer_spectrum_raw.json` at the partition root.
+    ///
+    /// Returns the aggregated `KmerSpectrum`. Per-partition spectrum files are
+    /// deleted after aggregation unless `keep_partial` is true.
     ///
     /// Partitions are processed in parallel via Rayon (one task per thread).
     /// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously.
-    pub fn count_kmer(&self) -> SKResult<()> {
+    pub fn count_kmer(&self, keep_partial: bool) -> SKResult<KmerSpectrum> {
         let sys = System::new_all();
         let available = match sys.available_memory() {
             0 => sys.total_memory() / 2,
@@ -282,10 +290,10 @@ impl KmerPartition {
             r?;
         }
 
-        // Aggregate per-partition spectra into a global one at the root.
-        let mut global_spectrum: BTreeMap<u32, u64> = BTreeMap::new();
-        let mut global_f0: u64 = 0;
-        let mut global_f1: u64 = 0;
+        // Aggregate per-partition spectra.
+        let mut counts: BTreeMap<u32, u64> = BTreeMap::new();
+        let mut f0: u64 = 0;
+        let mut f1: u64 = 0;
 
         for i in 0..self.n_partitions {
             let path = self.part_dir(i).join("kmer_spectrum_raw.json");
@@ -294,28 +302,21 @@ impl KmerPartition {
             }
             let v: serde_json::Value =
                 serde_json::from_str(&fs::read_to_string(&path)?).map_err(io::Error::other)?;
-            global_f0 += v["f0"].as_u64().unwrap_or(0);
-            global_f1 += v["f1"].as_u64().unwrap_or(0);
+            f0 += v["f0"].as_u64().unwrap_or(0);
+            f1 += v["f1"].as_u64().unwrap_or(0);
             if let Some(obj) = v["spectrum"].as_object() {
                 for (c_str, freq) in obj {
                     if let (Ok(c), Some(f)) = (c_str.parse::<u32>(), freq.as_u64()) {
-                        *global_spectrum.entry(c).or_insert(0) += f;
+                        *counts.entry(c).or_insert(0) += f;
                     }
                 }
             }
+            if !keep_partial {
+                let _ = fs::remove_file(&path);
+            }
         }
 
-        let global_spectrum_map: BTreeMap<String, u64> = global_spectrum
-            .iter()
-            .map(|(&c, &f)| (format!("{c:010}"), f))
-            .collect();
-        serde_json::to_writer_pretty(
-            fs::File::create(self.root_path.join("kmer_spectrum_raw.json"))?,
-            &serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }),
-        )
-        .map_err(io::Error::other)?;
-
-        Ok(())
+        Ok(KmerSpectrum { f0, f1, counts })
     }
 
     // ── private ───────────────────────────────────────────────────────────────
-- 
2.52.0


From bfa436ad155966e055990de98e55426819dfe181 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 20 May 2026 21:18:19 +0200
Subject: [PATCH 03/11] feat: add merge operation specs and partition progress
 bar

Added implementation specifications for the `merge` operation, detailing parallel partition processing, I/O paths, and kmer matrix aggregation across multiple indexes. Integrated an `indicatif` progress bar into the `rayon` parallel loop to monitor processing position, throughput, ETA, and recent partition duration.
---
 TODO.md                               | 11 +++++++++++
 src/obikpartitionner/src/partition.rs | 17 ++++++++++++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/TODO.md b/TODO.md
index b67b7c3..038bd71 100644
--- a/TODO.md
+++ b/TODO.md
@@ -11,6 +11,17 @@
 
 - merge : pour construire un index à partir d'index existants
   - deux modes : count et presence/absence. count exige que tous les index mergés soient déjà en mode count. mode presence/absence par defaut. Si passage de mode count à mode presence/absence, par defaut presence = count >= 1. Possibilité de spécifier un seuil personnalisé.
+  - le merge doit se faire en parallèle sur chaque partition
+  - en entrée : une liste de chemins vers les index à fusionner
+  - en sortie : un nouvel index fusionné (option -o <output_index>)
+  - j'imagine comme algo: 
+    - on copie le premier index dans le nouvel index
+    - on ajoute a chaque partition une matrice de count ou de presence s'il n'y en avait pas déjà.
+    - si besoin, on cree la colone 0 de la matrice de count ou de presence pour le genome courant
+    - on parcourt les partitions et les index à fusionner en parallèle
+    - pour chaque partition, on ajoute les kmer présents dans les index à fusionner au nouvel index
+      - si le kmer est déjà présent dans le nouvel index on ajoute le compte ou la presence du kmer dans la matrice de count ou de presence
+      - sinon, on ajoute le kmer dans une nouvelle layer
 
 - filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés
   - quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum
diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs
index c668309..ea7f810 100644
--- a/src/obikpartitionner/src/partition.rs
+++ b/src/obikpartitionner/src/partition.rs
@@ -219,19 +219,34 @@ impl KmerPartition {
         let n_threads = rayon::current_num_threads().max(1) as u64;
         let available_per_thread = available / n_threads;
 
+        let pb = ProgressBar::new(self.n_partitions as u64);
+        pb.set_style(
+            ProgressStyle::with_template(
+                "dereplicating [{bar:40}] {pos}/{len} ({percent}%)  {per_sec}  eta {eta}  {msg}",
+            )
+            .unwrap()
+            .progress_chars("█▌░"),
+        );
+
         let results: Vec<SKResult<()>> = (0..self.n_partitions)
             .into_par_iter()
             .map(|i| {
                 let dir = self.part_dir(i);
                 if !dir.exists() {
+                    pb.inc(1);
                     return Ok(());
                 }
                 let raw_path = dir.join(format!("raw.{SK_EXT}"));
+                let t = Instant::now();
                 let n_buckets = optimal_buckets(&raw_path, available_per_thread);
-                dereplicate_partition(&dir, level, n_buckets)
+                let result = dereplicate_partition(&dir, level, n_buckets);
+                pb.set_message(format!("last {:.0}ms", t.elapsed().as_millis()));
+                pb.inc(1);
+                result
             })
             .collect();
 
+        pb.finish_and_clear();
         for r in results {
             r?;
         }
-- 
2.52.0


From e1d59fde54b735eb799d4cb141a2f489482d99a9 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 05:53:55 +0200
Subject: [PATCH 04/11] feat: add merge command to consolidate k-mer indexes

Introduces a new `merge` CLI subcommand and underlying implementation to consolidate multiple pre-indexed k-mer indexes into a single output. Adds `append_column` methods to persistent bit and int matrices to enable incremental genome column expansion without rebuilding the MPHF. Includes new error variants for index readiness and configuration mismatches, adds a `--force` flag to the index command, and updates documentation and navigation structure accordingly.
---
 docmd/architecture/index_architecture.md |  27 +++
 docmd/implementation/merge.md            | 133 ++++++++++
 docmd/implementation/obilayeredmap.md    |  54 +++++
 mkdocs.yml                               |   1 +
 src/obicompactvec/src/bitmatrix.rs       |  20 ++
 src/obicompactvec/src/intmatrix.rs       |  20 ++
 src/obikindex/src/error.rs               |  16 +-
 src/obikindex/src/index.rs               |   6 +-
 src/obikindex/src/lib.rs                 |   2 +
 src/obikindex/src/merge.rs               | 131 ++++++++++
 src/obikmer/src/cmd/index.rs             |  11 +-
 src/obikmer/src/cmd/merge.rs             |  47 ++++
 src/obikmer/src/cmd/mod.rs               |   1 +
 src/obikmer/src/main.rs                  |   3 +
 src/obikpartitionner/src/lib.rs          |   2 +
 src/obikpartitionner/src/merge_layer.rs  | 294 +++++++++++++++++++++++
 src/obilayeredmap/src/layer.rs           |  39 +++
 17 files changed, 799 insertions(+), 8 deletions(-)
 create mode 100644 docmd/implementation/merge.md
 create mode 100644 src/obikindex/src/merge.rs
 create mode 100644 src/obikmer/src/cmd/merge.rs
 create mode 100644 src/obikpartitionner/src/merge_layer.rs

diff --git a/docmd/architecture/index_architecture.md b/docmd/architecture/index_architecture.md
index cbc1083..0915a9c 100644
--- a/docmd/architecture/index_architecture.md
+++ b/docmd/architecture/index_architecture.md
@@ -327,3 +327,30 @@ Other derivations: threshold a count matrix → binary presence matrix; union tw
 2. Replace `LayerData` trait with the `DataStore` / `ColumnWeights` / `CountPartials` / `BitPartials` system.
 3. Rewire `LayeredMap` to hold `LayeredStore<PersistentCompactIntMatrix>` (or bit variant) alongside the MPHF layers.
 4. Implement `PartitionedIndex` using `LayeredStore<LayeredStore<S>>` for data and parallel dispatch for queries.
+
+---
+
+## Multi-genome column invariant
+
+### The invariant
+
+After any merge operation, **every layer in every partition has exactly `n_genomes` columns**, where `n_genomes` is the current total genome count recorded in `index.meta`. This applies to both `PersistentCompactIntMatrix` (Count mode) and `PersistentBitMatrix` (Presence mode).
+
+### How it is maintained
+
+The invariant is established and preserved by three coordinated operations:
+
+**1. Existing layers — column append.**
+When merging source genome G into an existing index with `n_existing_genomes` genomes, one column is appended to every existing layer via `append_genome_column`. Slots that contain a kmer present in source G receive its count or `true`; all other slots receive 0 or `false`. After this step, every pre-existing layer has `n_existing_genomes + 1` columns.
+
+**2. New layers — absent columns prepended.**
+If source G introduces kmers not found in any existing layer, a new layer is created for those kmers. Before appending source G's own column, `n_existing_genomes` absent columns (all-zero or all-false) are prepended — one per genome already in the index. This ensures the new layer starts at the same column count as every other layer in the partition immediately after creation.
+
+**3. First merge, Presence mode — `init_presence_matrix`.**
+The initial single-genome index has no `presence/` directory (presence is implicit: every kmer in the index is present in genome 0). On the first merge, before appending any column for source 1, `Layer<()>::init_presence_matrix` creates `presence/col_000000.pbiv` set entirely to `true` for each existing layer. This retroactively materialises genome 0's presence column, bringing the column count from 0 to 1 so that the subsequent append for source 1 raises it to 2.
+
+### Why the invariant is required
+
+The `LayeredStore` aggregation traits (`col_weights`, `partial_bray`, `partial_jaccard`, etc.) sum contributions across all `(partition, layer)` pairs without any shape check. If one layer had fewer columns than others, its contribution would silently produce a malformed result — wrong column sums, wrong distance matrices, and incorrect genome-level statistics.
+
+The invariant is the precondition that makes the progressive aggregation principle correct: each level can blindly sum matrices from the level below because all matrices have the same shape.
diff --git a/docmd/implementation/merge.md b/docmd/implementation/merge.md
new file mode 100644
index 0000000..8a51609
--- /dev/null
+++ b/docmd/implementation/merge.md
@@ -0,0 +1,133 @@
+# Merge command
+
+## Purpose
+
+`obikmer merge` combines multiple existing kmer indexes into a single index. The result contains all kmers from all sources, with per-genome presence/absence or count data for every genome across every layer.
+
+---
+
+## Modes
+
+```rust
+pub enum MergeMode { Presence, Count }
+```
+
+Default mode is `Presence`. `Count` mode requires **all** source indexes to have `with_counts=true`; mixing count and non-count sources is rejected at validation.
+
+| Mode | Column type | Constraint |
+|---|---|---|
+| `Presence` | `PersistentBitMatrix` (one bit per genome per slot) | none |
+| `Count` | `PersistentCompactIntMatrix` (one u32 per genome per slot) | all sources `with_counts=true` |
+
+---
+
+## Input / output constraints
+
+All source indexes must satisfy:
+
+- `IndexState::Indexed` (fully built — `index.done` sentinel present)
+- Same `kmer_size`, `minimizer_size`, `n_bits`
+- If `Count` mode: all sources must have `with_counts=true`
+
+`--force`: if the output directory already exists, it is deleted before the merge begins.
+
+---
+
+## Algorithm
+
+### 1. Validation
+
+Check all sources against the constraints above. Abort on any mismatch.
+
+### 2. Bootstrap output from first source
+
+Recursive file copy of `sources[0]` → `output`. This establishes the partition layout, all existing MPHFs, unitigs, and evidence files. The first source's genome is genome 0 in the output.
+
+### 3. For each subsequent source (parallel across partitions)
+
+For each partition, process one source at a time sequentially:
+
+**a. Classify kmers**
+
+Iterate all kmers in the source partition (via `UnitigFileReader` + canonical kmer iteration). For each kmer, probe the destination `LayeredMap<()>`:
+
+- **Hit** `(dst_layer, slot)`: record `(dst_layer, slot, value)` where `value` is the source count (Count mode) or `1` (Presence mode).
+- **Miss**: add kmer to a `GraphDeBruijn` accumulator; record its value in a `HashMap<CanonicalKmer, Vec<u32>>`.
+
+**b. Extend existing layers**
+
+For each existing layer in the destination partition, call `append_genome_column` once per source genome being merged. Slots that received a hit are populated with their recorded value; all other slots receive 0 (absent).
+
+If this is the **first merge** and operating in Presence mode, call `Layer<()>::init_presence_matrix` before appending any source column. This creates `presence/` with `col_000000.pbiv` set all-true (genome 0 is present in every slot of this layer).
+
+**c. Build new layer for new kmers**
+
+If the `GraphDeBruijn` accumulator is non-empty (misses exist), write compact unitigs from the de Bruijn graph, then call the appropriate `Layer::build` variant. Before appending the source's own column, prepend `n_existing_genomes` absent columns (value 0) — one per genome already in the index — so the column count invariant holds immediately after layer creation.
+
+**d. Update partition metadata**
+
+Write updated `meta.json` with the incremented `n_layers` if a new layer was added.
+
+### 4. Update index metadata
+
+Append the merged source's genome label(s) to `index.meta.genomes`. Write updated `index.meta`.
+
+---
+
+## Column count invariant
+
+After any merge, **every layer in every partition has exactly `n_genomes` columns**, where `n_genomes` is the total genome count in the index at that point.
+
+This is maintained by three mechanisms:
+
+1. **Existing layers**: one column appended per source genome (`append_genome_column`).
+2. **New layers created during merge**: `n_existing_genomes` absent columns prepended before the source's own column.
+3. **First merge, Presence mode**: `init_presence_matrix` retroactively creates `presence/col_0` all-true for genome 0 before any source column is appended.
+
+The invariant is a precondition of the `LayeredStore` aggregation traits: `col_weights()` and all partial distance methods assume every inner store has the same column count.
+
+---
+
+## New layer construction
+
+All kmers absent from the destination index — across **all** sources being merged — accumulate into a **single** `GraphDeBruijn` per partition. One new layer is built from this graph, not one layer per source. This keeps the layer count bounded and preserves compact unitig representation.
+
+De Bruijn reconstruction ensures that overlapping k-1 suffixes/prefixes from different source kmers are merged into minimal unitigs before MPHF construction.
+
+---
+
+## On-disk impact
+
+After merging `G` genomes (1 existing + G-1 new sources):
+
+```
+partitions/
+  part_00000/
+    index/
+      meta.json              ← n_layers updated if new layer added
+      layer_0/
+        mphf.bin             ← unchanged (MPHF immutable)
+        unitigs.bin          ← unchanged
+        evidence.bin         ← unchanged
+        presence/            ← created on first merge (Presence mode)
+          meta.json            {"n": N, "n_cols": G}
+          col_000000.pbiv      ← all-true (genome 0)
+          col_000001.pbiv      ← source 1 presence
+          ...
+        counts/              ← extended (Count mode)
+          meta.json            {"n": N, "n_cols": G}
+          col_000000.pciv      ← genome 0 counts (from original build)
+          col_000001.pciv      ← source 1 counts
+          ...
+      layer_1/               ← new layer (if new kmers found)
+        mphf.bin
+        unitigs.bin
+        evidence.bin
+        presence/ or counts/
+          meta.json            {"n": N1, "n_cols": G}
+          col_000000.pbiv      ← all-false (genome 0 absent from this layer)
+          ...
+          col_000001.pbiv      ← source 1 presence in this new layer
+```
+
+The `index.meta` genome list grows from 1 entry to G entries after all sources are merged.
diff --git a/docmd/implementation/obilayeredmap.md b/docmd/implementation/obilayeredmap.md
index cec43a2..d5ad7d2 100644
--- a/docmd/implementation/obilayeredmap.md
+++ b/docmd/implementation/obilayeredmap.md
@@ -255,3 +255,57 @@ Each partition's new layer is built independently; the operation is fully parall
 | `obicompactvec` | payload types + aggregation traits |
 | `rayon 1` | parallel MPHF construction pass |
 | `ndarray 0.16` | aggregation output arrays |
+
+---
+
+## Column append and merge support
+
+These methods extend existing layers with new genome columns without touching the MPHF. They are the building blocks of the `merge` command.
+
+### Matrix column append
+
+```rust
+impl PersistentCompactIntMatrix {
+    pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> OLMResult<()>
+}
+
+impl PersistentBitMatrix {
+    pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> OLMResult<()>
+}
+```
+
+Both methods write a new column file (`col_NNNNNN.pciv` / `col_NNNNNN.pbiv`) and update `meta.json` to increment `n_cols`. The `value_of` closure is called once per slot (indexed 0..n) to populate the column. The matrix `n` (row count) is read from the existing `meta.json` and must not change.
+
+### Presence matrix initialisation
+
+```rust
+impl Layer<()> {
+    pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()>
+}
+```
+
+Called on the first merge of a Presence-mode index. Creates the `presence/` subdirectory with `meta.json {"n": n_kmers, "n_cols": 1}` and `col_000000.pbiv` set entirely to `true`. This retroactively records that genome 0 (the original source) is present in every slot of this layer, satisfying the column count invariant before any new-source column is appended.
+
+### Layer-level genome column append
+
+```rust
+impl Layer<PersistentBitMatrix> {
+    pub fn append_genome_column(
+        layer_dir: &Path,
+        value_of: impl Fn(usize) -> bool,
+    ) -> OLMResult<()>
+}
+
+impl Layer<PersistentCompactIntMatrix> {
+    pub fn append_genome_column(
+        layer_dir: &Path,
+        value_of: impl Fn(usize) -> u32,
+    ) -> OLMResult<()>
+}
+```
+
+These delegate directly to the corresponding `PersistentBitMatrix::append_column` / `PersistentCompactIntMatrix::append_column`. They are typed at the `Layer` level to make call sites mode-aware without exposing the inner matrix path construction.
+
+### Why the MPHF is never rebuilt
+
+The MPHF (`mphf.bin`, `evidence.bin`, `unitigs.bin`) is built once from the kmer set of a layer and is immutable for the lifetime of that layer. Adding a genome column does not change the kmer set — it only adds a new data column indexed by the same slot numbers. Rebuilding the MPHF would require re-running the full construction pipeline (two sequential passes over unitigs, parallel ptr_hash construction) and would invalidate any open memory maps. Column append avoids all of this: the only disk writes are one new `.pciv`/`.pbiv` file and a single `meta.json` update. Kmers absent from a given layer are represented as zero (count) or false (presence) values in the new column — no structural change to the layer is required.
diff --git a/mkdocs.yml b/mkdocs.yml
index af78836..9f2ea83 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -47,6 +47,7 @@ nav:
       - obilayeredmap crate: implementation/obilayeredmap.md
       - PersistentCompactIntVec: implementation/persistent_compact_int_vec.md
       - PersistentBitVec: implementation/persistent_bit_vec.md
+      - Merge command: implementation/merge.md
   - Architecture:
       - Sequences: architecture/sequences/invariant.md
       - Kmer index: architecture/index_architecture.md
diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index df823d1..b7305ef 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -101,6 +101,26 @@ impl PersistentBitMatrix {
     }
 }
 
+// ── Column append ─────────────────────────────────────────────────────────────
+
+impl PersistentBitMatrix {
+    /// Append a new column to an existing matrix on disk.
+    ///
+    /// Reads `meta.json` to obtain `n` and the current column count, writes
+    /// `col_{n_cols:06}.pbiv` filled by `value_of(slot)`, then increments
+    /// `n_cols` in `meta.json`.
+    pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> {
+        let mut meta = MatrixMeta::load(dir)?;
+        let mut b = PersistentBitVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?;
+        for slot in 0..meta.n {
+            b.set(slot, value_of(slot));
+        }
+        b.close()?;
+        meta.n_cols += 1;
+        meta.save(dir)
+    }
+}
+
 fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
     (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
 }
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index c77a135..a9aae75 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -203,6 +203,26 @@ where
     m
 }
 
+// ── Column append ─────────────────────────────────────────────────────────────
+
+impl PersistentCompactIntMatrix {
+    /// Append a new column to an existing matrix on disk.
+    ///
+    /// Reads `meta.json` to obtain `n` and the current column count, writes
+    /// `col_{n_cols:06}.pciv` filled by `value_of(slot)`, then increments
+    /// `n_cols` in `meta.json`.
+    pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
+        let mut meta = MatrixMeta::load(dir)?;
+        let mut b = PersistentCompactIntVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?;
+        for slot in 0..meta.n {
+            b.set(slot, value_of(slot));
+        }
+        b.close()?;
+        meta.n_cols += 1;
+        meta.save(dir)
+    }
+}
+
 // ── Trait impls ───────────────────────────────────────────────────────────────
 
 use crate::traits::{ColumnWeights, CountPartials};
diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs
index df191ed..74082fe 100644
--- a/src/obikindex/src/error.rs
+++ b/src/obikindex/src/error.rs
@@ -8,6 +8,12 @@ pub enum OKIError {
     Io(io::Error),
     Json(serde_json::Error),
     Partition(SKError),
+    /// Source index is not in `Indexed` state.
+    NotIndexed(std::path::PathBuf),
+    /// Source indexes have incompatible configurations (k, m, n_bits).
+    IncompatibleConfig,
+    /// Count mode requested but a source index lacks count data.
+    MismatchedMode,
 }
 
 pub type OKIResult<T> = Result<T, OKIError>;
@@ -15,9 +21,12 @@ pub type OKIResult<T> = Result<T, OKIError>;
 impl fmt::Display for OKIError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            OKIError::Io(e)        => write!(f, "I/O error: {e}"),
-            OKIError::Json(e)      => write!(f, "JSON error: {e}"),
-            OKIError::Partition(e) => write!(f, "partition error: {e}"),
+            OKIError::Io(e)            => write!(f, "I/O error: {e}"),
+            OKIError::Json(e)          => write!(f, "JSON error: {e}"),
+            OKIError::Partition(e)     => write!(f, "partition error: {e}"),
+            OKIError::NotIndexed(p)    => write!(f, "index not fully built: {}", p.display()),
+            OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
+            OKIError::MismatchedMode   => write!(f, "count mode requires all sources to have with_counts=true"),
         }
     }
 }
@@ -28,6 +37,7 @@ impl std::error::Error for OKIError {
             OKIError::Io(e)        => Some(e),
             OKIError::Json(e)      => Some(e),
             OKIError::Partition(e) => Some(e),
+            _                      => None,
         }
     }
 }
diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs
index c5ba267..16c1154 100644
--- a/src/obikindex/src/index.rs
+++ b/src/obikindex/src/index.rs
@@ -15,9 +15,9 @@ use crate::meta::{IndexConfig, IndexMeta};
 use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
 
 pub struct KmerIndex {
-    root_path: PathBuf,
-    meta: IndexMeta,
-    partition: KmerPartition,
+    pub(crate) root_path: PathBuf,
+    pub(crate) meta: IndexMeta,
+    pub(crate) partition: KmerPartition,
 }
 
 impl KmerIndex {
diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs
index 91a728a..4dcf4ff 100644
--- a/src/obikindex/src/lib.rs
+++ b/src/obikindex/src/lib.rs
@@ -2,8 +2,10 @@ pub mod error;
 pub mod meta;
 pub mod state;
 mod index;
+mod merge;
 
 pub use error::{OKIError, OKIResult};
 pub use index::KmerIndex;
+pub use merge::MergeMode;
 pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
 pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs
new file mode 100644
index 0000000..be71a02
--- /dev/null
+++ b/src/obikindex/src/merge.rs
@@ -0,0 +1,131 @@
+use std::fs;
+use std::io;
+use std::path::Path;
+
+use obisys::{Reporter, Stage};
+use rayon::prelude::*;
+use tracing::info;
+
+use crate::error::{OKIError, OKIResult};
+use crate::index::KmerIndex;
+use crate::meta::IndexMeta;
+use crate::state::IndexState;
+
+pub use obikpartitionner::MergeMode;
+
+impl KmerIndex {
+    /// Merge `sources` into a new index at `output`.
+    ///
+    /// All sources must be in `Indexed` state and share the same `kmer_size`,
+    /// `minimizer_size`, and `n_partitions`. Count mode additionally requires
+    /// every source to have `with_counts = true`.
+    ///
+    /// The first source is copied to `output`, then each subsequent source is
+    /// merged partition-by-partition in parallel.
+    pub fn merge<P: AsRef<Path>>(
+        output: P,
+        sources: &[&KmerIndex],
+        mode: MergeMode,
+        force: bool,
+    ) -> OKIResult<Self> {
+        let output = output.as_ref();
+
+        if sources.is_empty() {
+            return Err(OKIError::Io(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "merge requires at least one source index",
+            )));
+        }
+
+        // ── Validate ──────────────────────────────────────────────────────────
+        let ref0 = sources[0];
+        for src in sources {
+            if src.state() != IndexState::Indexed {
+                return Err(OKIError::NotIndexed(src.root_path.clone()));
+            }
+            if src.kmer_size()      != ref0.kmer_size()
+            || src.minimizer_size() != ref0.minimizer_size()
+            || src.n_partitions()   != ref0.n_partitions()
+            {
+                return Err(OKIError::IncompatibleConfig);
+            }
+            if mode == MergeMode::Count && !src.meta.config.with_counts {
+                return Err(OKIError::MismatchedMode);
+            }
+        }
+
+        // ── Prepare output directory ──────────────────────────────────────────
+        if output.exists() {
+            if force {
+                fs::remove_dir_all(output)?;
+            } else {
+                return Err(OKIError::Io(io::Error::new(
+                    io::ErrorKind::AlreadyExists,
+                    format!("{}: output directory already exists", output.display()),
+                )));
+            }
+        }
+
+        // ── Bootstrap: copy first source to output ────────────────────────────
+        info!("copying {} → {}", sources[0].root_path.display(), output.display());
+        copy_dir_all(&sources[0].root_path, output)?;
+
+        // Rewrite index.meta with all genome labels.
+        let all_genomes: Vec<String> = sources
+            .iter()
+            .flat_map(|s| s.meta.genomes.iter().cloned())
+            .collect();
+        let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
+        meta.genomes = all_genomes;
+        meta.write(output)?;
+
+        // Open the destination index.
+        let dst = KmerIndex::open(output)?;
+        let n_partitions = dst.n_partitions();
+
+        // ── Merge each subsequent source partition-by-partition ───────────────
+        let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
+        if !remaining_sources.is_empty() {
+            let mut rep = Reporter::new();
+            let t = Stage::start("merge_partitions");
+
+            let dst_partition = &dst.partition;
+
+            let errors: Vec<obiskio::SKError> = (0..n_partitions)
+                .into_par_iter()
+                .filter_map(|i| {
+                    let srcs: Vec<&obikpartitionner::KmerPartition> =
+                        remaining_sources.iter().map(|s| &s.partition).collect();
+                    // n_dst_genomes = 1 (copied from source_0 only)
+                    dst_partition.merge_partition(i, &srcs, mode, 1).err()
+                })
+                .collect();
+
+            if let Some(e) = errors.into_iter().next() {
+                return Err(OKIError::Partition(e));
+            }
+
+            rep.push(t.stop());
+        }
+
+        // Re-open to get the updated state.
+        KmerIndex::open(output)
+    }
+}
+
+// ── Directory copy ────────────────────────────────────────────────────────────
+
+fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
+    fs::create_dir_all(dst)?;
+    for entry in fs::read_dir(src)? {
+        let entry = entry?;
+        let src_path = entry.path();
+        let dst_path = dst.join(entry.file_name());
+        if src_path.is_dir() {
+            copy_dir_all(&src_path, &dst_path)?;
+        } else {
+            fs::copy(&src_path, &dst_path)?;
+        }
+    }
+    Ok(())
+}
diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs
index 9f9bad2..ccfa487 100644
--- a/src/obikmer/src/cmd/index.rs
+++ b/src/obikmer/src/cmd/index.rs
@@ -48,20 +48,27 @@ pub fn run(args: IndexArgs) {
     let mut rep = Reporter::new();
 
     // ── Open or create the index ─────────────────────────────────────────────
-    let mut idx = if KmerIndex::exists(&output) {
+    let mut idx = if KmerIndex::exists(&output) && !args.force {
         info!("resuming from existing index at {}", output.display());
         KmerIndex::open(&output).unwrap_or_else(|e| {
             eprintln!("error opening index: {e}");
             std::process::exit(1);
         })
     } else {
+        if args.force && KmerIndex::exists(&output) {
+            info!("--force: removing existing index at {}", output.display());
+            std::fs::remove_dir_all(&output).unwrap_or_else(|e| {
+                eprintln!("error removing existing index: {e}");
+                std::process::exit(1);
+            });
+        }
         let config = IndexConfig {
             kmer_size:      args.common.kmer_size,
             minimizer_size: args.common.minimizer_size,
             n_bits:         args.common.partition_bits,
             with_counts:    args.with_counts,
         };
-        KmerIndex::create(&output, config, args.label.clone(), args.force).unwrap_or_else(|e| {
+        KmerIndex::create(&output, config, args.label.clone(), false).unwrap_or_else(|e| {
             eprintln!("error creating index: {e}");
             std::process::exit(1);
         })
diff --git a/src/obikmer/src/cmd/merge.rs b/src/obikmer/src/cmd/merge.rs
new file mode 100644
index 0000000..0109617
--- /dev/null
+++ b/src/obikmer/src/cmd/merge.rs
@@ -0,0 +1,47 @@
+use std::path::PathBuf;
+
+use clap::Args;
+use obikindex::{KmerIndex, MergeMode};
+use obisys::Reporter;
+use tracing::info;
+
+#[derive(Args)]
+pub struct MergeArgs {
+    /// Source index directories to merge
+    #[arg(required = true)]
+    pub sources: Vec<PathBuf>,
+
+    /// Output index directory
+    #[arg(short, long)]
+    pub output: PathBuf,
+
+    /// Overwrite output directory if it already exists
+    #[arg(long, default_value_t = false)]
+    pub force: bool,
+
+    /// Merge in count mode (requires all sources to have with_counts=true)
+    #[arg(long, default_value_t = false)]
+    pub with_counts: bool,
+}
+
+pub fn run(args: MergeArgs) {
+    let mode = if args.with_counts { MergeMode::Count } else { MergeMode::Presence };
+
+    let sources: Vec<KmerIndex> = args.sources.iter().map(|p| {
+        info!("opening source index: {}", p.display());
+        KmerIndex::open(p).unwrap_or_else(|e| {
+            eprintln!("error opening source index {}: {e}", p.display());
+            std::process::exit(1);
+        })
+    }).collect();
+
+    let source_refs: Vec<&KmerIndex> = sources.iter().collect();
+
+    info!("merging {} indexes → {}", sources.len(), args.output.display());
+    let rep = Reporter::new();
+    KmerIndex::merge(&args.output, &source_refs, mode, args.force).unwrap_or_else(|e| {
+        eprintln!("error merging: {e}");
+        std::process::exit(1);
+    });
+    rep.print();
+}
diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs
index 1d2188e..aaccdbf 100644
--- a/src/obikmer/src/cmd/mod.rs
+++ b/src/obikmer/src/cmd/mod.rs
@@ -1,3 +1,4 @@
 pub mod index;
+pub mod merge;
 pub mod superkmer;
 pub mod unitig;
diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs
index 470da38..52c5953 100644
--- a/src/obikmer/src/main.rs
+++ b/src/obikmer/src/main.rs
@@ -18,6 +18,8 @@ enum Commands {
     Superkmer(cmd::superkmer::SuperkmerArgs),
     /// Build the complete genome index (scatter → dereplicate → count → layered MPHF)
     Index(cmd::index::IndexArgs),
+    /// Merge multiple built indexes into one
+    Merge(cmd::merge::MergeArgs),
     /// Dump unitigs from a built index to stdout (debug)
     Unitig(cmd::unitig::UnitigArgs),
 }
@@ -43,6 +45,7 @@ fn main() {
     match cli.command {
         Commands::Superkmer(args) => cmd::superkmer::run(args),
         Commands::Index(args)     => cmd::index::run(args),
+        Commands::Merge(args)     => cmd::merge::run(args),
         Commands::Unitig(args)    => cmd::unitig::run(args),
     }
 
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index 2e94e56..672c118 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -1,5 +1,7 @@
 mod index_layer;
 mod kmer_sort;
+mod merge_layer;
 mod partition;
 
+pub use merge_layer::MergeMode;
 pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs
new file mode 100644
index 0000000..7d77c45
--- /dev/null
+++ b/src/obikpartitionner/src/merge_layer.rs
@@ -0,0 +1,294 @@
+use std::fs;
+use std::io;
+use std::path::{Path, PathBuf};
+
+use obidebruinj::GraphDeBruijn;
+use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
+                    PersistentBitVecBuilder,
+                    PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
+                    PersistentCompactIntVecBuilder};
+use obiskio::{SKError, SKResult, UnitigFileReader};
+use obilayeredmap::{Layer, LayeredMap, MphfLayer, OLMError};
+use obilayeredmap::meta::PartitionMeta;
+
+use crate::partition::KmerPartition;
+
+// ── MergeMode ─────────────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum MergeMode { Presence, Count }
+
+// ── ColBuilder — enum dispatch to avoid trait-object boxing issues ─────────────
+
+enum ColBuilder {
+    Bit(PersistentBitVecBuilder),
+    Int(PersistentCompactIntVecBuilder),
+}
+
+impl ColBuilder {
+    fn set_val(&mut self, slot: usize, value: u32) {
+        match self {
+            ColBuilder::Bit(b) => b.set(slot, value > 0),
+            ColBuilder::Int(b) => b.set(slot, value),
+        }
+    }
+
+    fn close(self) -> SKResult<()> {
+        match self {
+            ColBuilder::Bit(b)  => b.close().map_err(SKError::Io),
+            ColBuilder::Int(b)  => b.close().map_err(SKError::Io),
+        }
+    }
+}
+
+// ── helpers ───────────────────────────────────────────────────────────────────
+
+const INDEX_SUBDIR: &str = "index";
+
+fn olm_to_sk(e: OLMError) -> SKError {
+    match e {
+        OLMError::Io(e) => SKError::Io(e),
+        other => SKError::InvalidData { context: "merge", detail: other.to_string() },
+    }
+}
+
+fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
+    dir.join(format!("col_{col:06}.pbiv"))
+}
+
+fn col_path_int(dir: &Path, col: usize) -> PathBuf {
+    dir.join(format!("col_{col:06}.pciv"))
+}
+
+fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
+    fs::write(dir.join("meta.json"), format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"))
+}
+
+// ── KmerPartition::merge_partition ────────────────────────────────────────────
+
+impl KmerPartition {
+    /// Merge `sources` into destination partition `i`.
+    ///
+    /// `n_dst_genomes` is the number of genome columns already in the dst
+    /// matrices (1 after copying source_0, more for subsequent merges).
+    ///
+    /// Two-pass algorithm:
+    /// 1. Classify each source kmer as dst-hit or new → build de Bruijn graph
+    ///    of new kmers → write unitigs → build MPHF for the new layer.
+    /// 2. Iterate source kmers again → fill per-genome column builders
+    ///    (memory-mapped) → close → update matrix metadata.
+    pub fn merge_partition(
+        &self,
+        i: usize,
+        sources: &[&KmerPartition],
+        mode: MergeMode,
+        n_dst_genomes: usize,
+    ) -> SKResult<()> {
+        let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
+        if !dst_index_dir.exists() {
+            return Ok(());
+        }
+
+        let dst_map = LayeredMap::<()>::open(&dst_index_dir).map_err(olm_to_sk)?;
+        let n_dst_layers = dst_map.n_layers();
+        let n_src = sources.len();
+
+        // First merge in presence mode: init presence matrices on existing layers
+        // (all slots true — every kmer in a layer belongs to genome_0).
+        if n_dst_genomes == 1 && mode == MergeMode::Presence {
+            for l in 0..n_dst_layers {
+                let layer_dir = dst_index_dir.join(format!("layer_{l}"));
+                Layer::<()>::init_presence_matrix(&layer_dir, dst_map.layer(l).n())
+                    .map_err(olm_to_sk)?;
+            }
+        }
+
+        // ── Pass 1: classify kmers, build new-kmer de Bruijn graph ───────────
+        let mut g = GraphDeBruijn::new();
+        let mut any_new = false;
+
+        for src in sources.iter() {
+            let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
+            if !src_index_dir.exists() { continue; }
+            let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
+
+            for l in 0..src_meta.n_layers {
+                let unitigs_path = src_index_dir
+                    .join(format!("layer_{l}")).join("unitigs.bin");
+                let reader = UnitigFileReader::open(&unitigs_path)?;
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    if dst_map.query(kmer).is_none() {
+                        g.push(kmer);
+                        any_new = true;
+                    }
+                }
+            }
+        }
+
+        // Build new layer from de Bruijn graph if there are new kmers.
+        let new_layer_idx = n_dst_layers;
+        let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
+
+        if any_new {
+            g.compute_degrees();
+            fs::create_dir_all(&new_layer_dir)?;
+            let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
+            for unitig in g.iter_unitig() {
+                uw.write(&unitig)?;
+            }
+            uw.close()?;
+            Layer::<()>::build(&new_layer_dir).map_err(olm_to_sk)?;
+        }
+        drop(g);
+
+        let new_mphf = if any_new {
+            Some(MphfLayer::open(&new_layer_dir).map_err(olm_to_sk)?)
+        } else {
+            None
+        };
+        let n_new = new_mphf.as_ref().map_or(0, |m| m.n());
+
+        // ── Prepare matrix directories for the new layer ──────────────────────
+        // Absent columns (dst genomes) are written now via append_column (all-zero/false).
+        // Source-genome columns are created as mutable builders for pass 2.
+        let mut new_src_builders: Vec<ColBuilder> = if any_new {
+            let data_dir = match mode {
+                MergeMode::Presence => new_layer_dir.join("presence"),
+                MergeMode::Count    => new_layer_dir.join("counts"),
+            };
+            fs::create_dir_all(&data_dir)?;
+            // Bootstrap meta with 0 cols.
+            match mode {
+                MergeMode::Presence => {
+                    PersistentBitMatrixBuilder::new(n_new, &data_dir)
+                        .map_err(SKError::Io)?.close().map_err(SKError::Io)?;
+                    for _ in 0..n_dst_genomes {
+                        PersistentBitMatrix::append_column(&data_dir, |_| false)
+                            .map_err(SKError::Io)?;
+                    }
+                    (0..n_src).map(|g| -> SKResult<ColBuilder> {
+                        let b = PersistentBitVecBuilder::new(
+                            n_new, &col_path_bit(&data_dir, n_dst_genomes + g))?;
+                        Ok(ColBuilder::Bit(b))
+                    }).collect::<SKResult<_>>()?
+                }
+                MergeMode::Count => {
+                    PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
+                        .map_err(SKError::Io)?.close().map_err(SKError::Io)?;
+                    for _ in 0..n_dst_genomes {
+                        PersistentCompactIntMatrix::append_column(&data_dir, |_| 0)
+                            .map_err(SKError::Io)?;
+                    }
+                    (0..n_src).map(|g| -> SKResult<ColBuilder> {
+                        let b = PersistentCompactIntVecBuilder::new(
+                            n_new, &col_path_int(&data_dir, n_dst_genomes + g))?;
+                        Ok(ColBuilder::Int(b))
+                    }).collect::<SKResult<_>>()?
+                }
+            }
+        } else {
+            vec![]
+        };
+
+        // Builders for existing layers: one per (layer, src_genome).
+        // Invariant: existing layers already have exactly n_dst_genomes columns.
+        // New source columns go at positions n_dst_genomes .. n_dst_genomes+n_src-1.
+        let mut exist_builders: Vec<Vec<ColBuilder>> = (0..n_dst_layers)
+            .map(|l| {
+                let layer_dir = dst_index_dir.join(format!("layer_{l}"));
+                let n = dst_map.layer(l).n();
+                (0..n_src).map(|src_g| -> SKResult<ColBuilder> {
+                    match mode {
+                        MergeMode::Presence => {
+                            let data_dir = layer_dir.join("presence");
+                            let b = PersistentBitVecBuilder::new(
+                                n, &col_path_bit(&data_dir, n_dst_genomes + src_g))?;
+                            Ok(ColBuilder::Bit(b))
+                        }
+                        MergeMode::Count => {
+                            let data_dir = layer_dir.join("counts");
+                            let b = PersistentCompactIntVecBuilder::new(
+                                n, &col_path_int(&data_dir, n_dst_genomes + src_g))?;
+                            Ok(ColBuilder::Int(b))
+                        }
+                    }
+                }).collect::<SKResult<_>>()
+            })
+            .collect::<SKResult<_>>()?;
+
+        // ── Pass 2: fill builders ─────────────────────────────────────────────
+        for (src_g, src) in sources.iter().enumerate() {
+            let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
+            if !src_index_dir.exists() { continue; }
+            let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
+
+            for l in 0..src_meta.n_layers {
+                let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
+                let reader = UnitigFileReader::open(&src_layer_dir.join("unitigs.bin"))?;
+
+                // Open source MPHF + count matrix for count mode.
+                let src_count_data: Option<(MphfLayer, PersistentCompactIntMatrix)> =
+                    if mode == MergeMode::Count {
+                        let counts_dir = src_layer_dir.join("counts");
+                        if counts_dir.exists() {
+                            let mphf = MphfLayer::open(&src_layer_dir).map_err(olm_to_sk)?;
+                            let mat  = PersistentCompactIntMatrix::open(&counts_dir)
+                                .map_err(SKError::Io)?;
+                            Some((mphf, mat))
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    };
+
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    let value: u32 = match &src_count_data {
+                        Some((mphf, counts)) => {
+                            mphf.find(kmer).map(|s| counts.col(0).get(s)).unwrap_or(1)
+                        }
+                        None => 1,
+                    };
+
+                    if let Some((dst_layer, hit)) = dst_map.query(kmer) {
+                        exist_builders[dst_layer][src_g].set_val(hit.slot, value);
+                    } else if let Some(ref mphf) = new_mphf {
+                        if let Some(slot) = mphf.find(kmer) {
+                            new_src_builders[src_g].set_val(slot, value);
+                        }
+                    }
+                }
+            }
+        }
+
+        // ── Close builders and update metadata ────────────────────────────────
+        for (l, builders) in exist_builders.into_iter().enumerate() {
+            let layer_dir = dst_index_dir.join(format!("layer_{l}"));
+            for b in builders { b.close()?; }
+            // Update the matrix meta to reflect the n_src new columns.
+            let n = dst_map.layer(l).n();
+            let data_dir = match mode {
+                MergeMode::Presence => layer_dir.join("presence"),
+                MergeMode::Count    => layer_dir.join("counts"),
+            };
+            write_matrix_meta(&data_dir, n, n_dst_genomes + n_src).map_err(SKError::Io)?;
+        }
+
+        for b in new_src_builders { b.close()?; }
+        // new layer matrix meta was already written by append_column calls above
+        // with n_dst_genomes cols; update to n_dst_genomes + n_src.
+        if any_new {
+            let data_dir = match mode {
+                MergeMode::Presence => new_layer_dir.join("presence"),
+                MergeMode::Count    => new_layer_dir.join("counts"),
+            };
+            write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src).map_err(SKError::Io)?;
+
+            let mut part_meta = PartitionMeta::load(&dst_index_dir).map_err(olm_to_sk)?;
+            part_meta.n_layers = new_layer_idx + 1;
+            part_meta.save(&dst_index_dir).map_err(olm_to_sk)?;
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs
index 166a072..9f41675 100644
--- a/src/obilayeredmap/src/layer.rs
+++ b/src/obilayeredmap/src/layer.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::fs;
 use std::path::Path;
 
 use obicompactvec::{
@@ -83,6 +84,22 @@ impl Layer<()> {
     pub fn build(out_dir: &Path) -> OLMResult<usize> {
         MphfLayer::build(out_dir, &mut |_, _| Ok(()))
     }
+
+    /// Create a presence matrix for a set-membership layer (first merge).
+    ///
+    /// All `n_kmers` slots are set to `true`: every kmer in this layer belongs
+    /// to genome_0, so genome_0 is present at every slot.
+    pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> {
+        let presence_dir = layer_dir.join(PRESENCE_DIR);
+        fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
+        let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?;
+        let mut col = mb.add_col().map_err(OLMError::Io)?;
+        for slot in 0..n_kmers {
+            col.set(slot, true);
+        }
+        col.close().map_err(OLMError::Io)?;
+        mb.close().map_err(OLMError::Io)
+    }
 }
 
 // ── Mode 2 — count matrix (1 column per layer) ────────────────────────────────
@@ -111,9 +128,31 @@ impl Layer<PersistentCompactIntMatrix> {
     }
 }
 
+// ── Mode 2 — count matrix column append ──────────────────────────────────────
+
+impl Layer<PersistentCompactIntMatrix> {
+    /// Append a genome column to an existing count matrix.
+    pub fn append_genome_column(
+        layer_dir: &Path,
+        value_of: impl Fn(usize) -> u32,
+    ) -> OLMResult<()> {
+        PersistentCompactIntMatrix::append_column(&layer_dir.join(COUNTS_DIR), value_of)
+            .map_err(OLMError::Io)
+    }
+}
+
 // ── Mode 3 — presence/absence matrix (1 column per genome) ───────────────────
 
 impl Layer<PersistentBitMatrix> {
+    /// Append a genome column to an existing presence matrix.
+    pub fn append_genome_column(
+        layer_dir: &Path,
+        value_of: impl Fn(usize) -> bool,
+    ) -> OLMResult<()> {
+        PersistentBitMatrix::append_column(&layer_dir.join(PRESENCE_DIR), value_of)
+            .map_err(OLMError::Io)
+    }
+
     pub fn build_presence(
         out_dir: &Path,
         n_genomes: usize,
-- 
2.52.0


From 1a1f95e59d3b129b0cfe0fc286793cd696d2c16f Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 06:02:27 +0200
Subject: [PATCH 05/11] feat: add CLI command to export indexed k-mers to CSV

This change introduces a new `dump` subcommand that exports all indexed k-mers to a CSV stream. The implementation spans multiple crates, adding core export logic to `obikindex` and partition iteration to `obikpartitionner`. The command supports a `--force-presence` flag to output binary presence/absence data instead of stored counts, and includes necessary module registrations and structural updates across the codebase.
---
 src/obikindex/src/dump.rs              | 50 +++++++++++++++++
 src/obikindex/src/lib.rs               |  1 +
 src/obikmer/src/cmd/dump.rs            | 39 +++++++++++++
 src/obikmer/src/cmd/mod.rs             |  1 +
 src/obikmer/src/main.rs                |  3 +
 src/obikpartitionner/src/dump_layer.rs | 77 ++++++++++++++++++++++++++
 src/obikpartitionner/src/lib.rs        |  1 +
 7 files changed, 172 insertions(+)
 create mode 100644 src/obikindex/src/dump.rs
 create mode 100644 src/obikmer/src/cmd/dump.rs
 create mode 100644 src/obikpartitionner/src/dump_layer.rs

diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs
new file mode 100644
index 0000000..31e5ed3
--- /dev/null
+++ b/src/obikindex/src/dump.rs
@@ -0,0 +1,50 @@
+use std::io::Write;
+
+use crate::error::{OKIError, OKIResult};
+use crate::index::KmerIndex;
+
+impl KmerIndex {
+    /// Write a CSV table of all indexed kmers to `out`.
+    ///
+    /// Columns: `kmer`, then one column per genome (in index order).
+    /// Values are counts (u32) when `use_counts = true`, otherwise 0/1.
+    ///
+    /// `force_presence` overrides `with_counts`: even if the index stores counts,
+    /// the output uses 0/1 presence columns.
+    ///
+    /// The caller must have set the global kmer length (`obikseq::set_k`) before
+    /// calling this method.
+    pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool) -> OKIResult<()> {
+
+        let genomes  = &self.meta.genomes;
+        let use_counts = self.meta.config.with_counts && !force_presence;
+        let n_genomes  = genomes.len().max(1);
+
+        // ── Header ────────────────────────────────────────────────────────────
+        write!(out, "kmer")?;
+        for g in genomes {
+            write!(out, ",{g}")?;
+        }
+        writeln!(out)?;
+
+        // ── Rows ──────────────────────────────────────────────────────────────
+        let n = self.n_partitions();
+        for i in 0..n {
+            self.partition
+                .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
+                    let seq = String::from_utf8(kmer.to_ascii())
+                        .unwrap_or_else(|_| "?".repeat(self.kmer_size()));
+                    // write is infallible inside a closure — propagate via a flag if needed
+                    let _ = write!(out, "{seq}");
+                    for &v in row.iter() {
+                        let _ = write!(out, ",{v}");
+                    }
+                    let _ = writeln!(out);
+                })
+                .map_err(OKIError::Partition)?;
+        }
+
+        out.flush()?;
+        Ok(())
+    }
+}
diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs
index 4dcf4ff..c4a66f5 100644
--- a/src/obikindex/src/lib.rs
+++ b/src/obikindex/src/lib.rs
@@ -1,6 +1,7 @@
 pub mod error;
 pub mod meta;
 pub mod state;
+mod dump;
 mod index;
 mod merge;
 
diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs
new file mode 100644
index 0000000..8775699
--- /dev/null
+++ b/src/obikmer/src/cmd/dump.rs
@@ -0,0 +1,39 @@
+use std::io::{self, BufWriter};
+use std::path::PathBuf;
+
+use clap::Args;
+use obikindex::KmerIndex;
+use obikseq::set_k;
+use tracing::info;
+
+#[derive(Args)]
+pub struct DumpArgs {
+    /// Index directory to dump
+    pub index: PathBuf,
+
+    /// Output presence/absence (0/1) even if the index stores counts
+    #[arg(long, default_value_t = false)]
+    pub force_presence: bool,
+}
+
+pub fn run(args: DumpArgs) {
+    let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
+        eprintln!("error opening index: {e}");
+        std::process::exit(1);
+    });
+
+    set_k(idx.kmer_size());
+    info!(
+        "dumping {} partitions, {} genome(s)",
+        idx.n_partitions(),
+        idx.meta().genomes.len()
+    );
+
+    let stdout = io::stdout();
+    let mut out = BufWriter::new(stdout.lock());
+
+    idx.dump(&mut out, args.force_presence).unwrap_or_else(|e| {
+        eprintln!("dump error: {e}");
+        std::process::exit(1);
+    });
+}
diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs
index aaccdbf..85d8603 100644
--- a/src/obikmer/src/cmd/mod.rs
+++ b/src/obikmer/src/cmd/mod.rs
@@ -1,3 +1,4 @@
+pub mod dump;
 pub mod index;
 pub mod merge;
 pub mod superkmer;
diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs
index 52c5953..1a1f8bc 100644
--- a/src/obikmer/src/main.rs
+++ b/src/obikmer/src/main.rs
@@ -20,6 +20,8 @@ enum Commands {
     Index(cmd::index::IndexArgs),
     /// Merge multiple built indexes into one
     Merge(cmd::merge::MergeArgs),
+    /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
+    Dump(cmd::dump::DumpArgs),
     /// Dump unitigs from a built index to stdout (debug)
     Unitig(cmd::unitig::UnitigArgs),
 }
@@ -46,6 +48,7 @@ fn main() {
         Commands::Superkmer(args) => cmd::superkmer::run(args),
         Commands::Index(args)     => cmd::index::run(args),
         Commands::Merge(args)     => cmd::merge::run(args),
+        Commands::Dump(args)      => cmd::dump::run(args),
         Commands::Unitig(args)    => cmd::unitig::run(args),
     }
 
diff --git a/src/obikpartitionner/src/dump_layer.rs b/src/obikpartitionner/src/dump_layer.rs
new file mode 100644
index 0000000..71bcc93
--- /dev/null
+++ b/src/obikpartitionner/src/dump_layer.rs
@@ -0,0 +1,77 @@
+use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
+use obikseq::CanonicalKmer;
+use obiskio::{SKError, SKResult, UnitigFileReader};
+use obilayeredmap::OLMError;
+use obilayeredmap::meta::PartitionMeta;
+use obilayeredmap::MphfLayer;
+
+use crate::partition::KmerPartition;
+
+const INDEX_SUBDIR: &str = "index";
+
+fn olm_to_sk(e: OLMError) -> SKError {
+    match e {
+        OLMError::Io(e) => SKError::Io(e),
+        other => SKError::InvalidData { context: "dump", detail: other.to_string() },
+    }
+}
+
+impl KmerPartition {
+    /// Iterate all indexed kmers in partition `part`, calling `cb(kmer, row)` for each.
+    ///
+    /// `use_counts = true`  → reads count columns (u32 values per genome).
+    /// `use_counts = false` → reads presence columns, converted to 0/1 u32.
+    ///
+    /// If no data matrix exists for a layer (pure set-membership, single genome),
+    /// a row of `n_genomes` ones is emitted for every kmer in that layer.
+    pub fn iter_partition_kmers(
+        &self,
+        part: usize,
+        use_counts: bool,
+        n_genomes: usize,
+        mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
+    ) -> SKResult<()> {
+        let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
+        if !index_dir.exists() {
+            return Ok(());
+        }
+
+        let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
+
+        for l in 0..meta.n_layers {
+            let layer_dir = index_dir.join(format!("layer_{l}"));
+            let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?;
+            let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?;
+
+            let counts_dir  = layer_dir.join("counts");
+            let presence_dir = layer_dir.join("presence");
+
+            if use_counts && counts_dir.exists() {
+                let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    if let Some(slot) = mphf.find(kmer) {
+                        cb(kmer, mat.row(slot));
+                    }
+                }
+            } else if !use_counts && presence_dir.exists() {
+                let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    if let Some(slot) = mphf.find(kmer) {
+                        let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
+                        cb(kmer, row);
+                    }
+                }
+            } else {
+                // No data matrix: pure set-membership layer, all kmers belong to every genome.
+                let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    if mphf.find(kmer).is_some() {
+                        cb(kmer, all_present.clone());
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index 672c118..3b098a2 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -1,3 +1,4 @@
+mod dump_layer;
 mod index_layer;
 mod kmer_sort;
 mod merge_layer;
-- 
2.52.0


From 11182005a2d1b9100e6f7cceb048675d0eaa7a61 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 08:12:02 +0200
Subject: [PATCH 06/11] feat: enhance merge label resolution, debug dump, and
 layer metadata

This commit enhances the CLI and index pipelines by introducing `--force-presence` to normalize output to binary values, `--debug` to expose partition and layer metadata, and `--rename-duplicates` to automatically disambiguate overlapping genome labels. It updates the partitioner and index layers to auto-discover layers, persist `meta.json` for single-genome builds, and fix per-source column offsets during merging. A `DuplicateGenomeLabel` error variant is also added, and stale directories are properly managed in presence/absence mode.
---
 src/obikindex/src/dump.rs               |  45 +++++--
 src/obikindex/src/error.rs              |   5 +-
 src/obikindex/src/merge.rs              | 117 ++++++++++++++--
 src/obikmer/src/cmd/dump.rs             |   6 +-
 src/obikmer/src/cmd/merge.rs            |  28 +++-
 src/obikpartitionner/src/dump_layer.rs  |  64 ++++++++-
 src/obikpartitionner/src/index_layer.rs |   6 +
 src/obikpartitionner/src/merge_layer.rs | 170 ++++++++++++++++--------
 8 files changed, 347 insertions(+), 94 deletions(-)

diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs
index 31e5ed3..d1fb8db 100644
--- a/src/obikindex/src/dump.rs
+++ b/src/obikindex/src/dump.rs
@@ -14,13 +14,17 @@ impl KmerIndex {
     ///
     /// The caller must have set the global kmer length (`obikseq::set_k`) before
     /// calling this method.
-    pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool) -> OKIResult<()> {
+    pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> {
 
-        let genomes  = &self.meta.genomes;
+        let genomes    = &self.meta.genomes;
         let use_counts = self.meta.config.with_counts && !force_presence;
         let n_genomes  = genomes.len().max(1);
+        let kmer_size  = self.kmer_size();
 
         // ── Header ────────────────────────────────────────────────────────────
+        if debug {
+            write!(out, "partition,layer,")?;
+        }
         write!(out, "kmer")?;
         for g in genomes {
             write!(out, ",{g}")?;
@@ -30,18 +34,31 @@ impl KmerIndex {
         // ── Rows ──────────────────────────────────────────────────────────────
         let n = self.n_partitions();
         for i in 0..n {
-            self.partition
-                .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
-                    let seq = String::from_utf8(kmer.to_ascii())
-                        .unwrap_or_else(|_| "?".repeat(self.kmer_size()));
-                    // write is infallible inside a closure — propagate via a flag if needed
-                    let _ = write!(out, "{seq}");
-                    for &v in row.iter() {
-                        let _ = write!(out, ",{v}");
-                    }
-                    let _ = writeln!(out);
-                })
-                .map_err(OKIError::Partition)?;
+            if debug {
+                self.partition
+                    .iter_partition_kmers_located(i, use_counts, n_genomes, |part, layer, kmer, row| {
+                        let seq = String::from_utf8(kmer.to_ascii())
+                            .unwrap_or_else(|_| "?".repeat(kmer_size));
+                        let _ = write!(out, "{part},{layer},{seq}");
+                        for &v in row.iter() {
+                            let _ = write!(out, ",{v}");
+                        }
+                        let _ = writeln!(out);
+                    })
+                    .map_err(OKIError::Partition)?;
+            } else {
+                self.partition
+                    .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
+                        let seq = String::from_utf8(kmer.to_ascii())
+                            .unwrap_or_else(|_| "?".repeat(kmer_size));
+                        let _ = write!(out, "{seq}");
+                        for &v in row.iter() {
+                            let _ = write!(out, ",{v}");
+                        }
+                        let _ = writeln!(out);
+                    })
+                    .map_err(OKIError::Partition)?;
+            }
         }
 
         out.flush()?;
diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs
index 74082fe..66f1abc 100644
--- a/src/obikindex/src/error.rs
+++ b/src/obikindex/src/error.rs
@@ -14,6 +14,8 @@ pub enum OKIError {
     IncompatibleConfig,
     /// Count mode requested but a source index lacks count data.
     MismatchedMode,
+    /// Two or more sources share the same genome label.
+    DuplicateGenomeLabel(String),
 }
 
 pub type OKIResult<T> = Result<T, OKIError>;
@@ -27,6 +29,7 @@ impl fmt::Display for OKIError {
             OKIError::NotIndexed(p)    => write!(f, "index not fully built: {}", p.display()),
             OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
             OKIError::MismatchedMode   => write!(f, "count mode requires all sources to have with_counts=true"),
+            OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
         }
     }
 }
@@ -37,7 +40,7 @@ impl std::error::Error for OKIError {
             OKIError::Io(e)        => Some(e),
             OKIError::Json(e)      => Some(e),
             OKIError::Partition(e) => Some(e),
-            _                      => None,
+            _                      => None, // IncompatibleConfig, MismatchedMode, DuplicateGenomeLabel
         }
     }
 }
diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs
index be71a02..305d344 100644
--- a/src/obikindex/src/merge.rs
+++ b/src/obikindex/src/merge.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::fs;
 use std::io;
 use std::path::Path;
@@ -20,13 +21,16 @@ impl KmerIndex {
     /// `minimizer_size`, and `n_partitions`. Count mode additionally requires
     /// every source to have `with_counts = true`.
     ///
-    /// The first source is copied to `output`, then each subsequent source is
-    /// merged partition-by-partition in parallel.
+    /// Genome labels must be unique across all sources. If `rename_duplicates`
+    /// is true, repeated labels are disambiguated by appending `.1`, `.2`, …
+    /// to the second and subsequent occurrences. Otherwise a
+    /// `DuplicateGenomeLabel` error is returned on the first conflict.
     pub fn merge<P: AsRef<Path>>(
         output: P,
         sources: &[&KmerIndex],
         mode: MergeMode,
         force: bool,
+        rename_duplicates: bool,
     ) -> OKIResult<Self> {
         let output = output.as_ref();
 
@@ -37,7 +41,7 @@ impl KmerIndex {
             )));
         }
 
-        // ── Validate ──────────────────────────────────────────────────────────
+        // ── Validate config compatibility ─────────────────────────────────────
         let ref0 = sources[0];
         for src in sources {
             if src.state() != IndexState::Indexed {
@@ -54,6 +58,9 @@ impl KmerIndex {
             }
         }
 
+        // ── Compute final genome labels (rename duplicates if requested) ───────
+        let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
+
         // ── Prepare output directory ──────────────────────────────────────────
         if output.exists() {
             if force {
@@ -70,18 +77,32 @@ impl KmerIndex {
         info!("copying {} → {}", sources[0].root_path.display(), output.display());
         copy_dir_all(&sources[0].root_path, output)?;
 
-        // Rewrite index.meta with all genome labels.
-        let all_genomes: Vec<String> = sources
-            .iter()
-            .flat_map(|s| s.meta.genomes.iter().cloned())
-            .collect();
+        // Rewrite index.meta with final genome labels and the effective mode.
         let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
         meta.genomes = all_genomes;
+        meta.config.with_counts = mode == MergeMode::Count;
         meta.write(output)?;
 
+        // In presence/absence mode, purge counts/ directories inherited from
+        // source_0 — they are stale data from the source's count index.
+        if mode == MergeMode::Presence {
+            remove_dirs_named(output, "counts")?;
+        }
+
+        // Rebuild spectrums/ from all sources using the (possibly renamed) labels.
+        // Drop the spectrums/ that were copied from source_0 and rebuild from scratch.
+        let spectrums_dir = output.join("spectrums");
+        if spectrums_dir.exists() {
+            fs::remove_dir_all(&spectrums_dir)?;
+        }
+        for (src, new_labels) in sources.iter().zip(&source_labels) {
+            copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
+        }
+
         // Open the destination index.
         let dst = KmerIndex::open(output)?;
         let n_partitions = dst.n_partitions();
+        let n_dst_genomes = sources[0].meta.genomes.len();
 
         // ── Merge each subsequent source partition-by-partition ───────────────
         let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
@@ -94,10 +115,9 @@ impl KmerIndex {
             let errors: Vec<obiskio::SKError> = (0..n_partitions)
                 .into_par_iter()
                 .filter_map(|i| {
-                    let srcs: Vec<&obikpartitionner::KmerPartition> =
-                        remaining_sources.iter().map(|s| &s.partition).collect();
-                    // n_dst_genomes = 1 (copied from source_0 only)
-                    dst_partition.merge_partition(i, &srcs, mode, 1).err()
+                    let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
+                        remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
+                    dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err()
                 })
                 .collect();
 
@@ -113,7 +133,78 @@ impl KmerIndex {
     }
 }
 
-// ── Directory copy ────────────────────────────────────────────────────────────
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/// Compute the final genome label lists for all sources.
+///
+/// Returns `(per_source_labels, all_genomes_flat)`.
+/// The first occurrence of a label keeps the original name. Subsequent
+/// occurrences receive `.1`, `.2`, … suffixes when `rename_duplicates` is true,
+/// or trigger a `DuplicateGenomeLabel` error otherwise.
+fn compute_labels(
+    sources: &[&KmerIndex],
+    rename_duplicates: bool,
+) -> OKIResult<(Vec<Vec<String>>, Vec<String>)> {
+    let mut seen: HashMap<String, usize> = HashMap::new();
+    let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());
+    let mut all_genomes: Vec<String> = Vec::new();
+
+    for src in sources {
+        let mut labels = Vec::with_capacity(src.meta.genomes.len());
+        for label in &src.meta.genomes {
+            let count = seen.entry(label.clone()).or_insert(0);
+            let new_label = if *count == 0 {
+                label.clone()
+            } else if rename_duplicates {
+                format!("{label}.{count}")
+            } else {
+                return Err(OKIError::DuplicateGenomeLabel(label.clone()));
+            };
+            *count += 1;
+            labels.push(new_label.clone());
+            all_genomes.push(new_label);
+        }
+        source_labels.push(labels);
+    }
+
+    Ok((source_labels, all_genomes))
+}
+
+/// Copy spectrum JSON files from `src_root/spectrums/` to `dst_root/spectrums/`,
+/// mapping each `old_labels[i]` filename to `new_labels[i]`.
+fn copy_spectrums(
+    src_root: &Path,
+    dst_root: &Path,
+    old_labels: &[String],
+    new_labels: &[String],
+) -> io::Result<()> {
+    let src_dir = src_root.join("spectrums");
+    let dst_dir = dst_root.join("spectrums");
+    fs::create_dir_all(&dst_dir)?;
+    for (old, new) in old_labels.iter().zip(new_labels.iter()) {
+        let src_file = src_dir.join(format!("{old}.json"));
+        if src_file.exists() {
+            fs::copy(&src_file, dst_dir.join(format!("{new}.json")))?;
+        }
+    }
+    Ok(())
+}
+
+/// Recursively remove every directory named `name` under `root`.
+fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> {
+    for entry in fs::read_dir(root)? {
+        let entry = entry?;
+        let path = entry.path();
+        if path.is_dir() {
+            if path.file_name().and_then(|n| n.to_str()) == Some(name) {
+                fs::remove_dir_all(&path)?;
+            } else {
+                remove_dirs_named(&path, name)?;
+            }
+        }
+    }
+    Ok(())
+}
 
 fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
     fs::create_dir_all(dst)?;
diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs
index 8775699..f7c324c 100644
--- a/src/obikmer/src/cmd/dump.rs
+++ b/src/obikmer/src/cmd/dump.rs
@@ -14,6 +14,10 @@ pub struct DumpArgs {
     /// Output presence/absence (0/1) even if the index stores counts
     #[arg(long, default_value_t = false)]
     pub force_presence: bool,
+
+    /// Prepend partition and layer columns to each row
+    #[arg(long, default_value_t = false)]
+    pub debug: bool,
 }
 
 pub fn run(args: DumpArgs) {
@@ -32,7 +36,7 @@ pub fn run(args: DumpArgs) {
     let stdout = io::stdout();
     let mut out = BufWriter::new(stdout.lock());
 
-    idx.dump(&mut out, args.force_presence).unwrap_or_else(|e| {
+    idx.dump(&mut out, args.force_presence, args.debug).unwrap_or_else(|e| {
         eprintln!("dump error: {e}");
         std::process::exit(1);
     });
diff --git a/src/obikmer/src/cmd/merge.rs b/src/obikmer/src/cmd/merge.rs
index 0109617..0b18ac8 100644
--- a/src/obikmer/src/cmd/merge.rs
+++ b/src/obikmer/src/cmd/merge.rs
@@ -2,6 +2,7 @@ use std::path::PathBuf;
 
 use clap::Args;
 use obikindex::{KmerIndex, MergeMode};
+use obikseq::{set_k, set_m};
 use obisys::Reporter;
 use tracing::info;
 
@@ -19,14 +20,16 @@ pub struct MergeArgs {
     #[arg(long, default_value_t = false)]
     pub force: bool,
 
-    /// Merge in count mode (requires all sources to have with_counts=true)
+    /// Force presence/absence mode even if all sources have count data
     #[arg(long, default_value_t = false)]
-    pub with_counts: bool,
+    pub force_presence: bool,
+
+    /// Disambiguate duplicate genome labels by appending .1, .2, … instead of erroring
+    #[arg(long, default_value_t = false)]
+    pub rename_duplicates: bool,
 }
 
 pub fn run(args: MergeArgs) {
-    let mode = if args.with_counts { MergeMode::Count } else { MergeMode::Presence };
-
     let sources: Vec<KmerIndex> = args.sources.iter().map(|p| {
         info!("opening source index: {}", p.display());
         KmerIndex::open(p).unwrap_or_else(|e| {
@@ -35,11 +38,26 @@ pub fn run(args: MergeArgs) {
         })
     }).collect();
 
+    // Auto-detect mode: count if all sources have count data, presence otherwise.
+    // --force-presence overrides to presence regardless.
+    let all_have_counts = sources.iter().all(|s| s.meta().config.with_counts);
+    let mode = if !args.force_presence && all_have_counts {
+        MergeMode::Count
+    } else {
+        MergeMode::Presence
+    };
+    info!(
+        "merge mode: {}",
+        if mode == MergeMode::Count { "count" } else { "presence/absence" }
+    );
+
     let source_refs: Vec<&KmerIndex> = sources.iter().collect();
 
+    set_k(sources[0].kmer_size());
+    set_m(sources[0].minimizer_size());
     info!("merging {} indexes → {}", sources.len(), args.output.display());
     let rep = Reporter::new();
-    KmerIndex::merge(&args.output, &source_refs, mode, args.force).unwrap_or_else(|e| {
+    KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates).unwrap_or_else(|e| {
         eprintln!("error merging: {e}");
         std::process::exit(1);
     });
diff --git a/src/obikpartitionner/src/dump_layer.rs b/src/obikpartitionner/src/dump_layer.rs
index 71bcc93..eaf8231 100644
--- a/src/obikpartitionner/src/dump_layer.rs
+++ b/src/obikpartitionner/src/dump_layer.rs
@@ -2,7 +2,6 @@ use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
 use obikseq::CanonicalKmer;
 use obiskio::{SKError, SKResult, UnitigFileReader};
 use obilayeredmap::OLMError;
-use obilayeredmap::meta::PartitionMeta;
 use obilayeredmap::MphfLayer;
 
 use crate::partition::KmerPartition;
@@ -36,10 +35,14 @@ impl KmerPartition {
             return Ok(());
         }
 
-        let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
-
-        for l in 0..meta.n_layers {
+        // Discover layers by probing layer_0, layer_1, … until one is absent.
+        // PartitionMeta (meta.json) is only created by the merge path, not by
+        // the initial single-genome build, so we cannot rely on it here.
+        let mut l = 0;
+        loop {
             let layer_dir = index_dir.join(format!("layer_{l}"));
+            if !layer_dir.exists() { break; }
+            l += 1;
             let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?;
             let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?;
 
@@ -74,4 +77,57 @@ impl KmerPartition {
 
         Ok(())
     }
+
+    /// Like [`iter_partition_kmers`] but the callback also receives the layer index,
+    /// enabling debug output that identifies where each kmer was stored.
+    pub fn iter_partition_kmers_located(
+        &self,
+        part: usize,
+        use_counts: bool,
+        n_genomes: usize,
+        mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>),
+    ) -> SKResult<()> {
+        let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
+        if !index_dir.exists() {
+            return Ok(());
+        }
+
+        let mut layer = 0;
+        loop {
+            let layer_dir = index_dir.join(format!("layer_{layer}"));
+            if !layer_dir.exists() { break; }
+            let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?;
+            let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?;
+
+            let counts_dir   = layer_dir.join("counts");
+            let presence_dir = layer_dir.join("presence");
+
+            if use_counts && counts_dir.exists() {
+                let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    if let Some(slot) = mphf.find(kmer) {
+                        cb(part, layer, kmer, mat.row(slot));
+                    }
+                }
+            } else if !use_counts && presence_dir.exists() {
+                let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    if let Some(slot) = mphf.find(kmer) {
+                        let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
+                        cb(part, layer, kmer, row);
+                    }
+                }
+            } else {
+                let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
+                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                    if mphf.find(kmer).is_some() {
+                        cb(part, layer, kmer, all_present.clone());
+                    }
+                }
+            }
+            layer += 1;
+        }
+
+        Ok(())
+    }
 }
diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs
index a6d873a..a6401b5 100644
--- a/src/obikpartitionner/src/index_layer.rs
+++ b/src/obikpartitionner/src/index_layer.rs
@@ -6,6 +6,7 @@ use epserde::prelude::*;
 use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
 use obidebruinj::GraphDeBruijn;
 use obilayeredmap::{OLMError, layer::Layer};
+use obilayeredmap::meta::PartitionMeta;
 use obiskio::{SKError, SKFileMeta, SKFileReader};
 use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
 
@@ -118,6 +119,11 @@ impl KmerPartition {
             Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
         }
 
+        // Write meta.json in the index/ directory so LayeredMap::open works
+        // (e.g. for subsequent merge operations).
+        let index_dir = layer_dir.parent().expect("layer_dir has a parent");
+        PartitionMeta { n_layers: 1 }.save(index_dir).map_err(olm_to_sk)?;
+
         Ok(n_kmers)
     }
 
diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs
index 7d77c45..c933966 100644
--- a/src/obikpartitionner/src/merge_layer.rs
+++ b/src/obikpartitionner/src/merge_layer.rs
@@ -7,6 +7,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
                     PersistentBitVecBuilder,
                     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
                     PersistentCompactIntVecBuilder};
+use obikseq::CanonicalKmer;
 use obiskio::{SKError, SKResult, UnitigFileReader};
 use obilayeredmap::{Layer, LayeredMap, MphfLayer, OLMError};
 use obilayeredmap::meta::PartitionMeta;
@@ -41,10 +42,88 @@ impl ColBuilder {
     }
 }
 
+// ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
+
+enum SrcLayerData {
+    /// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
+    SetMembership,
+    Presence(MphfLayer, PersistentBitMatrix),
+    Count(MphfLayer, PersistentCompactIntMatrix),
+}
+
+impl SrcLayerData {
+    fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> {
+        let presence_dir = layer_dir.join("presence");
+        let counts_dir   = layer_dir.join("counts");
+        match mode {
+            MergeMode::Presence => {
+                if presence_dir.exists() {
+                    let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
+                    let mat  = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
+                    Ok(SrcLayerData::Presence(mphf, mat))
+                } else if counts_dir.exists() {
+                    // Source is a count index; treat count > 0 as present via ColBuilder::Bit.
+                    let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
+                    let mat  = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
+                    Ok(SrcLayerData::Count(mphf, mat))
+                } else {
+                    Ok(SrcLayerData::SetMembership)
+                }
+            }
+            MergeMode::Count => {
+                if counts_dir.exists() {
+                    let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
+                    let mat  = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
+                    Ok(SrcLayerData::Count(mphf, mat))
+                } else {
+                    Ok(SrcLayerData::SetMembership)
+                }
+            }
+        }
+    }
+
+    /// Return one value per source genome for `kmer`.
+    fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
+        match self {
+            SrcLayerData::SetMembership => vec![1u32; n_genomes],
+            SrcLayerData::Presence(mphf, mat) => {
+                if let Some(slot) = mphf.find(kmer) {
+                    mat.row(slot).iter().map(|&b| b as u32).collect()
+                } else {
+                    vec![0u32; n_genomes]
+                }
+            }
+            SrcLayerData::Count(mphf, mat) => {
+                if let Some(slot) = mphf.find(kmer) {
+                    mat.row(slot).iter().copied().collect()
+                } else {
+                    vec![0u32; n_genomes]
+                }
+            }
+        }
+    }
+}
+
 // ── helpers ───────────────────────────────────────────────────────────────────
 
 const INDEX_SUBDIR: &str = "index";
 
+/// Load PartitionMeta, or recover it by probing layer directories.
+/// Indexes built before meta.json was introduced lack the file.
+fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
+    match PartitionMeta::load(dir) {
+        Ok(m) => Ok(m),
+        Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
+            let mut n = 0usize;
+            while dir.join(format!("layer_{n}")).exists() { n += 1; }
+            let m = PartitionMeta { n_layers: n };
+            m.save(dir).map_err(olm_to_sk)?;
+            Ok(m)
+        }
+        Err(e) => Err(olm_to_sk(e)),
+    }
+}
+
 fn olm_to_sk(e: OLMError) -> SKError {
     match e {
         OLMError::Io(e) => SKError::Io(e),
@@ -69,18 +148,17 @@ fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
 impl KmerPartition {
     /// Merge `sources` into destination partition `i`.
     ///
-    /// `n_dst_genomes` is the number of genome columns already in the dst
-    /// matrices (1 after copying source_0, more for subsequent merges).
+    /// Each entry in `sources` is `(partition, n_genomes)` where `n_genomes` is
+    /// the number of genome columns that source contributes. A merged index
+    /// contributes more than one. The total new columns added to the destination
+    /// is `sum(n_genomes)`.
     ///
-    /// Two-pass algorithm:
-    /// 1. Classify each source kmer as dst-hit or new → build de Bruijn graph
-    ///    of new kmers → write unitigs → build MPHF for the new layer.
-    /// 2. Iterate source kmers again → fill per-genome column builders
-    ///    (memory-mapped) → close → update matrix metadata.
+    /// `n_dst_genomes` is the number of genome columns already in the destination
+    /// matrices (copied from source_0 before this call).
     pub fn merge_partition(
         &self,
         i: usize,
-        sources: &[&KmerPartition],
+        sources: &[(&KmerPartition, usize)],
         mode: MergeMode,
         n_dst_genomes: usize,
     ) -> SKResult<()> {
@@ -89,12 +167,13 @@ impl KmerPartition {
             return Ok(());
         }
 
+        load_meta(&dst_index_dir)?;  // ensure meta.json exists before LayeredMap::open
         let dst_map = LayeredMap::<()>::open(&dst_index_dir).map_err(olm_to_sk)?;
         let n_dst_layers = dst_map.n_layers();
-        let n_src = sources.len();
+        let n_src_total: usize = sources.iter().map(|(_, n)| *n).sum();
 
         // First merge in presence mode: init presence matrices on existing layers
-        // (all slots true — every kmer in a layer belongs to genome_0).
+        // (all slots true — every kmer in those layers belongs to genome_0).
         if n_dst_genomes == 1 && mode == MergeMode::Presence {
             for l in 0..n_dst_layers {
                 let layer_dir = dst_index_dir.join(format!("layer_{l}"));
@@ -107,10 +186,10 @@ impl KmerPartition {
         let mut g = GraphDeBruijn::new();
         let mut any_new = false;
 
-        for src in sources.iter() {
+        for (src, _) in sources.iter() {
             let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
             if !src_index_dir.exists() { continue; }
-            let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
+            let src_meta = load_meta(&src_index_dir)?;
 
             for l in 0..src_meta.n_layers {
                 let unitigs_path = src_index_dir
@@ -149,7 +228,7 @@ impl KmerPartition {
         let n_new = new_mphf.as_ref().map_or(0, |m| m.n());
 
         // ── Prepare matrix directories for the new layer ──────────────────────
-        // Absent columns (dst genomes) are written now via append_column (all-zero/false).
+        // Absent columns (dst genomes) are written via append_column (all-zero/false).
         // Source-genome columns are created as mutable builders for pass 2.
         let mut new_src_builders: Vec<ColBuilder> = if any_new {
             let data_dir = match mode {
@@ -157,7 +236,6 @@ impl KmerPartition {
                 MergeMode::Count    => new_layer_dir.join("counts"),
             };
             fs::create_dir_all(&data_dir)?;
-            // Bootstrap meta with 0 cols.
             match mode {
                 MergeMode::Presence => {
                     PersistentBitMatrixBuilder::new(n_new, &data_dir)
@@ -166,7 +244,7 @@ impl KmerPartition {
                         PersistentBitMatrix::append_column(&data_dir, |_| false)
                             .map_err(SKError::Io)?;
                     }
-                    (0..n_src).map(|g| -> SKResult<ColBuilder> {
+                    (0..n_src_total).map(|g| -> SKResult<ColBuilder> {
                         let b = PersistentBitVecBuilder::new(
                             n_new, &col_path_bit(&data_dir, n_dst_genomes + g))?;
                         Ok(ColBuilder::Bit(b))
@@ -179,7 +257,7 @@ impl KmerPartition {
                         PersistentCompactIntMatrix::append_column(&data_dir, |_| 0)
                             .map_err(SKError::Io)?;
                     }
-                    (0..n_src).map(|g| -> SKResult<ColBuilder> {
+                    (0..n_src_total).map(|g| -> SKResult<ColBuilder> {
                         let b = PersistentCompactIntVecBuilder::new(
                             n_new, &col_path_int(&data_dir, n_dst_genomes + g))?;
                         Ok(ColBuilder::Int(b))
@@ -190,14 +268,13 @@ impl KmerPartition {
             vec![]
         };
 
-        // Builders for existing layers: one per (layer, src_genome).
-        // Invariant: existing layers already have exactly n_dst_genomes columns.
-        // New source columns go at positions n_dst_genomes .. n_dst_genomes+n_src-1.
+        // Builders for existing layers: n_src_total per layer.
+        // Columns n_dst_genomes .. n_dst_genomes + n_src_total - 1.
         let mut exist_builders: Vec<Vec<ColBuilder>> = (0..n_dst_layers)
             .map(|l| {
                 let layer_dir = dst_index_dir.join(format!("layer_{l}"));
                 let n = dst_map.layer(l).n();
-                (0..n_src).map(|src_g| -> SKResult<ColBuilder> {
+                (0..n_src_total).map(|src_g| -> SKResult<ColBuilder> {
                     match mode {
                         MergeMode::Presence => {
                             let data_dir = layer_dir.join("presence");
@@ -217,72 +294,53 @@ impl KmerPartition {
             .collect::<SKResult<_>>()?;
 
         // ── Pass 2: fill builders ─────────────────────────────────────────────
-        for (src_g, src) in sources.iter().enumerate() {
+        let mut col_offset = 0usize;
+        for (src, src_n) in sources.iter() {
             let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
-            if !src_index_dir.exists() { continue; }
-            let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
+            if !src_index_dir.exists() { col_offset += src_n; continue; }
+            let src_meta = load_meta(&src_index_dir)?;
 
             for l in 0..src_meta.n_layers {
                 let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
                 let reader = UnitigFileReader::open(&src_layer_dir.join("unitigs.bin"))?;
-
-                // Open source MPHF + count matrix for count mode.
-                let src_count_data: Option<(MphfLayer, PersistentCompactIntMatrix)> =
-                    if mode == MergeMode::Count {
-                        let counts_dir = src_layer_dir.join("counts");
-                        if counts_dir.exists() {
-                            let mphf = MphfLayer::open(&src_layer_dir).map_err(olm_to_sk)?;
-                            let mat  = PersistentCompactIntMatrix::open(&counts_dir)
-                                .map_err(SKError::Io)?;
-                            Some((mphf, mat))
-                        } else {
-                            None
-                        }
-                    } else {
-                        None
-                    };
+                let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
 
                 for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
-                    let value: u32 = match &src_count_data {
-                        Some((mphf, counts)) => {
-                            mphf.find(kmer).map(|s| counts.col(0).get(s)).unwrap_or(1)
-                        }
-                        None => 1,
-                    };
-
-                    if let Some((dst_layer, hit)) = dst_map.query(kmer) {
-                        exist_builders[dst_layer][src_g].set_val(hit.slot, value);
-                    } else if let Some(ref mphf) = new_mphf {
-                        if let Some(slot) = mphf.find(kmer) {
-                            new_src_builders[src_g].set_val(slot, value);
+                    let values = src_data.lookup(kmer, *src_n);
+                    for (g, &value) in values.iter().enumerate() {
+                        let builder_idx = col_offset + g;
+                        if let Some((dst_layer, hit)) = dst_map.query(kmer) {
+                            exist_builders[dst_layer][builder_idx].set_val(hit.slot, value);
+                        } else if let Some(ref mphf) = new_mphf {
+                            if let Some(slot) = mphf.find(kmer) {
+                                new_src_builders[builder_idx].set_val(slot, value);
+                            }
                         }
                     }
                 }
             }
+            col_offset += src_n;
         }
 
         // ── Close builders and update metadata ────────────────────────────────
         for (l, builders) in exist_builders.into_iter().enumerate() {
             let layer_dir = dst_index_dir.join(format!("layer_{l}"));
             for b in builders { b.close()?; }
-            // Update the matrix meta to reflect the n_src new columns.
             let n = dst_map.layer(l).n();
             let data_dir = match mode {
                 MergeMode::Presence => layer_dir.join("presence"),
                 MergeMode::Count    => layer_dir.join("counts"),
             };
-            write_matrix_meta(&data_dir, n, n_dst_genomes + n_src).map_err(SKError::Io)?;
+            write_matrix_meta(&data_dir, n, n_dst_genomes + n_src_total).map_err(SKError::Io)?;
         }
 
         for b in new_src_builders { b.close()?; }
-        // new layer matrix meta was already written by append_column calls above
-        // with n_dst_genomes cols; update to n_dst_genomes + n_src.
         if any_new {
             let data_dir = match mode {
                 MergeMode::Presence => new_layer_dir.join("presence"),
                 MergeMode::Count    => new_layer_dir.join("counts"),
             };
-            write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src).map_err(SKError::Io)?;
+            write_matrix_meta(&data_dir, n_new, n_dst_genomes + n_src_total).map_err(SKError::Io)?;
 
             let mut part_meta = PartitionMeta::load(&dst_index_dir).map_err(olm_to_sk)?;
             part_meta.n_layers = new_layer_idx + 1;
-- 
2.52.0


From 9e1d6f2f25615428e1f8af50ae848609516eda62 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 11:16:00 +0200
Subject: [PATCH 07/11] feat: implement partition-based merge command for k-mer
 indices

Implements a new `merge` command that aggregates k-mer counts and presence/absence matrices from multiple source indices using a parallelized, partition-based algorithm. Adds CLI progress bars and execution timing across the bootstrap, spectrum rebuild, and merge phases. Updates logging to report the aggregate genome count and introduces a bounds check in the perfect hash layer to safely return `None` for unknown k-mers, preventing out-of-bounds access in downstream operations.
---
 TODO.md                             | 14 --------
 src/obikindex/src/merge.rs          | 56 +++++++++++++++++++++++++++--
 src/obikmer/src/cmd/merge.rs        | 12 +++++--
 src/obilayeredmap/src/mphf_layer.rs |  2 ++
 4 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/TODO.md b/TODO.md
index 038bd71..592fc74 100644
--- a/TODO.md
+++ b/TODO.md
@@ -9,20 +9,6 @@
 
 ## commandes à ajouter
 
-- merge : pour construire un index à partir d'index existants
-  - deux modes : count et presence/absence. count exige que tous les index mergés soient déjà en mode count. mode presence/absence par defaut. Si passage de mode count à mode presence/absence, par defaut presence = count >= 1. Possibilité de spécifier un seuil personnalisé.
-  - le merge doit se faire en parallèle sur chaque partition
-  - en entrée : une liste de chemins vers les index à fusionner
-  - en sortie : un nouvel index fusionné (option -o <output_index>)
-  - j'imagine comme algo: 
-    - on copie le premier index dans le nouvel index
-    - on ajoute a chaque partition une matrice de count ou de presence s'il n'y en avait pas déjà.
-    - si besoin, on cree la colone 0 de la matrice de count ou de presence pour le genome courant
-    - on parcourt les partitions et les index à fusionner en parallèle
-    - pour chaque partition, on ajoute les kmer présents dans les index à fusionner au nouvel index
-      - si le kmer est déjà présent dans le nouvel index on ajoute le compte ou la presence du kmer dans la matrice de count ou de presence
-      - sinon, on ajoute le kmer dans une nouvelle layer
-
 - filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés
   - quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum
 
diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs
index 305d344..c31e3e5 100644
--- a/src/obikindex/src/merge.rs
+++ b/src/obikindex/src/merge.rs
@@ -2,7 +2,9 @@ use std::collections::HashMap;
 use std::fs;
 use std::io;
 use std::path::Path;
+use std::time::Duration;
 
+use indicatif::{ProgressBar, ProgressStyle};
 use obisys::{Reporter, Stage};
 use rayon::prelude::*;
 use tracing::info;
@@ -31,6 +33,7 @@ impl KmerIndex {
         mode: MergeMode,
         force: bool,
         rename_duplicates: bool,
+        rep: &mut Reporter,
     ) -> OKIResult<Self> {
         let output = output.as_ref();
 
@@ -74,7 +77,14 @@ impl KmerIndex {
         }
 
         // ── Bootstrap: copy first source to output ────────────────────────────
-        info!("copying {} → {}", sources[0].root_path.display(), output.display());
+        info!(
+            "bootstrap: copying {} → {} ({} genome(s))",
+            sources[0].root_path.display(),
+            output.display(),
+            sources[0].meta.genomes.len(),
+        );
+        let t = Stage::start("bootstrap");
+        let pb = spinner("bootstrap — copying index …");
         copy_dir_all(&sources[0].root_path, output)?;
 
         // Rewrite index.meta with final genome labels and the effective mode.
@@ -88,9 +98,14 @@ impl KmerIndex {
         if mode == MergeMode::Presence {
             remove_dirs_named(output, "counts")?;
         }
+        pb.finish_and_clear();
+        rep.push(t.stop());
 
         // Rebuild spectrums/ from all sources using the (possibly renamed) labels.
         // Drop the spectrums/ that were copied from source_0 and rebuild from scratch.
+        info!("rebuilding spectrums for {} source(s)", sources.len());
+        let t = Stage::start("spectrums");
+        let pb = spinner("spectrums — copying …");
         let spectrums_dir = output.join("spectrums");
         if spectrums_dir.exists() {
             fs::remove_dir_all(&spectrums_dir)?;
@@ -98,6 +113,8 @@ impl KmerIndex {
         for (src, new_labels) in sources.iter().zip(&source_labels) {
             copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
         }
+        pb.finish_and_clear();
+        rep.push(t.stop());
 
         // Open the destination index.
         let dst = KmerIndex::open(output)?;
@@ -107,8 +124,13 @@ impl KmerIndex {
         // ── Merge each subsequent source partition-by-partition ───────────────
         let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
         if !remaining_sources.is_empty() {
-            let mut rep = Reporter::new();
+            let n_src_genomes: usize = remaining_sources.iter().map(|s| s.meta.genomes.len()).sum();
+            info!(
+                "merging {} partition(s) × {} additional source genome(s) into {} destination genome(s)",
+                n_partitions, n_src_genomes, n_dst_genomes,
+            );
             let t = Stage::start("merge_partitions");
+            let pb = partition_bar(n_partitions as u64);
 
             let dst_partition = &dst.partition;
 
@@ -117,10 +139,13 @@ impl KmerIndex {
                 .filter_map(|i| {
                     let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
                         remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
-                    dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err()
+                    let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err();
+                    pb.inc(1);
+                    result
                 })
                 .collect();
 
+            pb.finish_and_clear();
             if let Some(e) = errors.into_iter().next() {
                 return Err(OKIError::Partition(e));
             }
@@ -206,6 +231,31 @@ fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> {
     Ok(())
 }
 
+fn spinner(msg: &'static str) -> ProgressBar {
+    let pb = ProgressBar::new_spinner();
+    pb.set_style(
+        ProgressStyle::with_template("{spinner} {msg}  {elapsed}")
+            .unwrap()
+            .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]),
+    );
+    pb.set_message(msg);
+    pb.enable_steady_tick(Duration::from_millis(100));
+    pb
+}
+
+fn partition_bar(n: u64) -> ProgressBar {
+    let pb = ProgressBar::new(n);
+    pb.set_style(
+        ProgressStyle::with_template(
+            "{spinner} merge — {bar:40.cyan/blue} {pos}/{len} partitions  {elapsed}",
+        )
+        .unwrap()
+        .tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]),
+    );
+    pb.enable_steady_tick(Duration::from_millis(100));
+    pb
+}
+
 fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
     fs::create_dir_all(dst)?;
     for entry in fs::read_dir(src)? {
diff --git a/src/obikmer/src/cmd/merge.rs b/src/obikmer/src/cmd/merge.rs
index 0b18ac8..3442273 100644
--- a/src/obikmer/src/cmd/merge.rs
+++ b/src/obikmer/src/cmd/merge.rs
@@ -55,9 +55,15 @@ pub fn run(args: MergeArgs) {
 
     set_k(sources[0].kmer_size());
     set_m(sources[0].minimizer_size());
-    info!("merging {} indexes → {}", sources.len(), args.output.display());
-    let rep = Reporter::new();
-    KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates).unwrap_or_else(|e| {
+
+    let n_genomes: usize = sources.iter().map(|s| s.meta().genomes.len()).sum();
+    info!(
+        "merging {} index(es), {} genome(s) total → {}",
+        sources.len(), n_genomes, args.output.display()
+    );
+
+    let mut rep = Reporter::new();
+    KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates, &mut rep).unwrap_or_else(|e| {
         eprintln!("error merging: {e}");
         std::process::exit(1);
     });
diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs
index 5fcb677..0844237 100644
--- a/src/obilayeredmap/src/mphf_layer.rs
+++ b/src/obilayeredmap/src/mphf_layer.rs
@@ -41,6 +41,8 @@ impl MphfLayer {
     #[inline]
     pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
         let slot = self.mphf.index(&kmer.raw());
+        // PtrHash guarantees slot < n only for its key set; arbitrary queries may exceed bounds.
+        if slot >= self.n { return None; }
         let (chunk_id, rank) = self.evidence.decode(slot);
         if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
             Some(slot)
-- 
2.52.0


From 3fa1dbf8cc91c34289edc2277fb2f54197441ca4 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 11:47:35 +0200
Subject: [PATCH 08/11] feat: add pairwise distance computation and
 phylogenetic trees

This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using configurable metrics (Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger). It optionally generates phylogenetic trees (NJ or UPGMA) in Newick format and outputs results as CSV. The implementation adds a robust distance computation backend that dynamically routes to optimized backends based on index configuration, supports parallel iteration, and gracefully handles missing data. Additionally, it adds a `dump` task for exporting k-mer to genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations.
---
 TODO.md                              |   2 -
 src/Cargo.lock                       | 100 ++++++++++++-
 src/obicompactvec/src/traits.rs      |   7 +
 src/obikindex/Cargo.toml             |   3 +
 src/obikindex/src/distance.rs        | 132 ++++++++++++++++
 src/obikindex/src/error.rs           |   3 +
 src/obikindex/src/lib.rs             |   2 +
 src/obikmer/Cargo.toml               |   2 +
 src/obikmer/src/cmd/distance.rs      | 216 +++++++++++++++++++++++++++
 src/obikmer/src/cmd/mod.rs           |   1 +
 src/obikmer/src/main.rs              |   3 +
 src/obikpartitionner/src/distance.rs |  47 ++++++
 src/obikpartitionner/src/lib.rs      |   1 +
 13 files changed, 512 insertions(+), 7 deletions(-)
 create mode 100644 src/obikindex/src/distance.rs
 create mode 100644 src/obikmer/src/cmd/distance.rs
 create mode 100644 src/obikpartitionner/src/distance.rs

diff --git a/TODO.md b/TODO.md
index 592fc74..e6ba5d0 100644
--- a/TODO.md
+++ b/TODO.md
@@ -23,6 +23,4 @@
   - les matrices sont sauvegardées en CSV
   - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche
 
-- dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence.
-
 - status : affiche le statut de l'index
diff --git a/src/Cargo.lock b/src/Cargo.lock
index 4c1929b..1de3042 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -176,6 +176,21 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "bit-set"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -625,6 +640,12 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "dtoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
+
 [[package]]
 name = "either"
 version = "1.15.0"
@@ -1102,6 +1123,15 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "kodama"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a44f3a71a44fbf49ce38152db7dc9adf959d4fe5c29344cd1858bdbda8d9091"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "lazy_static"
 version = "1.5.0"
@@ -1453,7 +1483,10 @@ name = "obikindex"
 version = "0.1.0"
 dependencies = [
  "indicatif",
+ "ndarray",
+ "obicompactvec",
  "obikpartitionner",
+ "obilayeredmap",
  "obiskio",
  "obisys",
  "rayon",
@@ -1468,6 +1501,7 @@ version = "0.1.0"
 dependencies = [
  "clap",
  "indicatif",
+ "kodama",
  "obifastwrite",
  "obikindex",
  "obikpartitionner",
@@ -1480,6 +1514,7 @@ dependencies = [
  "obisys",
  "pprof",
  "rayon",
+ "speedytree",
  "tracing",
  "tracing-subscriber",
 ]
@@ -1886,8 +1921,8 @@ dependencies = [
  "lazy_static",
  "log",
  "mem_dbg",
- "rand",
- "rand_chacha",
+ "rand 0.9.4",
+ "rand_chacha 0.9.0",
  "rayon",
  "rdst",
  "rustc-hash",
@@ -1918,14 +1953,35 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
 
+[[package]]
+name = "rand"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
+dependencies = [
+ "libc",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+]
+
 [[package]]
 name = "rand"
 version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
 dependencies = [
- "rand_chacha",
- "rand_core",
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.6.4",
 ]
 
 [[package]]
@@ -1935,7 +1991,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
 dependencies = [
  "ppv-lite86",
- "rand_core",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom 0.2.17",
 ]
 
 [[package]]
@@ -1973,6 +2038,12 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "rb_tree"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6724c78e033e1c4155b4d3f76593d09ad384739c6986c1addc2bd2f55b1aefe"
+
 [[package]]
 name = "rdst"
 version = "0.20.14"
@@ -2227,6 +2298,25 @@ version = "1.15.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
 
+[[package]]
+name = "speedytree"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f4522052445ce1b002c095d93e9dc545c326e9e2321c1315f1ab2a381c11666"
+dependencies = [
+ "bit-set",
+ "bit-vec",
+ "bitvec",
+ "clap",
+ "dtoa",
+ "fixedbitset",
+ "parking_lot",
+ "petgraph",
+ "rand 0.8.6",
+ "rayon",
+ "rb_tree",
+]
+
 [[package]]
 name = "stable_deref_trait"
 version = "1.2.1"
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index bc10097..7cccf48 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -80,6 +80,13 @@ pub trait CountPartials: ColumnWeights {
         let sq2 = std::f64::consts::SQRT_2;
         self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2)
     }
+
+    /// Euclidean distance in the Hellinger (√relative-frequency) space.
+    /// Equal to √2 × hellinger_dist — unnormalised variant.
+    fn hellinger_euclidean_dist_matrix(&self) -> Array2<f64> {
+        let global = self.col_weights();
+        self.partial_hellinger(&global).mapv(|v| v.sqrt())
+    }
 }
 
 /// Partial distance matrices for bit-based data (`PersistentBitMatrix`).
diff --git a/src/obikindex/Cargo.toml b/src/obikindex/Cargo.toml
index bf4ebd6..538cffb 100644
--- a/src/obikindex/Cargo.toml
+++ b/src/obikindex/Cargo.toml
@@ -7,6 +7,9 @@ edition = "2024"
 obikpartitionner = { path = "../obikpartitionner" }
 obiskio          = { path = "../obiskio" }
 obisys           = { path = "../obisys" }
+obicompactvec    = { path = "../obicompactvec" }
+obilayeredmap    = { path = "../obilayeredmap" }
+ndarray          = "0.16"
 rayon            = "1"
 serde            = { version = "1", features = ["derive"] }
 serde_json       = "1"
diff --git a/src/obikindex/src/distance.rs b/src/obikindex/src/distance.rs
new file mode 100644
index 0000000..a1c9cb0
--- /dev/null
+++ b/src/obikindex/src/distance.rs
@@ -0,0 +1,132 @@
+use ndarray::Array2;
+use obicompactvec::traits::{BitPartials, CountPartials};
+use obilayeredmap::LayeredStore;
+use rayon::prelude::*;
+
+use crate::error::{OKIError, OKIResult};
+use crate::index::KmerIndex;
+
+// ── Public API ────────────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DistanceMetric {
+    /// Jaccard distance on presence/absence data.
+    Jaccard,
+    /// Hamming distance (number of differing kmer positions) on presence/absence data.
+    Hamming,
+    /// Bray-Curtis dissimilarity on raw counts.
+    BrayCurtis,
+    /// Bray-Curtis dissimilarity normalised by per-genome total counts.
+    RelfreqBrayCurtis,
+    /// Euclidean distance on raw counts.
+    Euclidean,
+    /// Euclidean distance on relative frequencies.
+    RelfreqEuclidean,
+    /// Hellinger distance on counts.
+    Hellinger,
+    /// Euclidean distance in the Hellinger (√relative-frequency) space (unnormalised variant).
+    HellingerEuclidean,
+}
+
+pub struct DistanceOutput {
+    /// n×n pairwise distance matrix (genomes in index order).
+    pub matrix: Array2<f64>,
+    /// n×n shared-kmer count matrix (intersection), if requested.
+    pub shared_kmers: Option<Array2<u64>>,
+}
+
+impl DistanceMetric {
+    pub fn requires_counts(self) -> bool {
+        matches!(
+            self,
+            DistanceMetric::BrayCurtis
+            | DistanceMetric::RelfreqBrayCurtis
+            | DistanceMetric::Euclidean
+            | DistanceMetric::RelfreqEuclidean
+            | DistanceMetric::Hellinger
+            | DistanceMetric::HellingerEuclidean
+        )
+    }
+}
+
+// ── KmerIndex::distance ───────────────────────────────────────────────────────
+
+impl KmerIndex {
+    pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult<DistanceOutput> {
+        let n_genomes = self.meta.genomes.len();
+        if n_genomes < 2 {
+            return Err(OKIError::InvalidInput(
+                "distance requires at least 2 genomes in the index".into(),
+            ));
+        }
+
+        let use_counts = self.meta.config.with_counts;
+        if metric.requires_counts() && !use_counts {
+            return Err(OKIError::InvalidInput(format!(
+                "{metric:?} requires a count index (with_counts = true)"
+            )));
+        }
+
+        let n_parts = self.n_partitions();
+
+        if use_counts {
+            let stores: Vec<_> = (0..n_parts)
+                .into_par_iter()
+                .map(|i| self.partition.count_store(i).map_err(OKIError::Partition))
+                .collect::<OKIResult<_>>()?;
+            let global = LayeredStore::new(stores);
+
+            let matrix = match metric {
+                DistanceMetric::BrayCurtis        => CountPartials::bray_dist_matrix(&global),
+                DistanceMetric::RelfreqBrayCurtis => CountPartials::relfreq_bray_dist_matrix(&global),
+                DistanceMetric::Euclidean         => CountPartials::euclidean_dist_matrix(&global),
+                DistanceMetric::RelfreqEuclidean  => CountPartials::relfreq_euclidean_dist_matrix(&global),
+                DistanceMetric::Hellinger         => CountPartials::hellinger_dist_matrix(&global),
+                DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global),
+                // Jaccard on count data: threshold at 0 (present if count > 0)
+                DistanceMetric::Jaccard           => CountPartials::threshold_jaccard_dist_matrix(&global, 0),
+                DistanceMetric::Hamming           => {
+                    return Err(OKIError::InvalidInput(
+                        "Hamming is only available for presence/absence indexes".into(),
+                    ));
+                }
+            };
+
+            let shared = if shared_kmers {
+                let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0);
+                Some(inter)
+            } else {
+                None
+            };
+
+            Ok(DistanceOutput { matrix, shared_kmers: shared })
+        } else {
+            let stores: Vec<_> = (0..n_parts)
+                .into_par_iter()
+                .map(|i| self.partition.presence_store(i).map_err(OKIError::Partition))
+                .collect::<OKIResult<_>>()?;
+            let global = LayeredStore::new(stores);
+
+            let matrix = match metric {
+                DistanceMetric::Jaccard => BitPartials::jaccard_dist_matrix(&global),
+                DistanceMetric::Hamming => {
+                    BitPartials::hamming_dist_matrix(&global).mapv(|v| v as f64)
+                }
+                other => {
+                    return Err(OKIError::InvalidInput(format!(
+                        "{other:?} requires a count index; use --metric jaccard or --metric hamming"
+                    )));
+                }
+            };
+
+            let shared = if shared_kmers {
+                let (inter, _) = BitPartials::partial_jaccard(&global);
+                Some(inter)
+            } else {
+                None
+            };
+
+            Ok(DistanceOutput { matrix, shared_kmers: shared })
+        }
+    }
+}
diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs
index 66f1abc..96834cb 100644
--- a/src/obikindex/src/error.rs
+++ b/src/obikindex/src/error.rs
@@ -16,6 +16,8 @@ pub enum OKIError {
     MismatchedMode,
     /// Two or more sources share the same genome label.
     DuplicateGenomeLabel(String),
+    /// Operation not valid for this index configuration.
+    InvalidInput(String),
 }
 
 pub type OKIResult<T> = Result<T, OKIError>;
@@ -30,6 +32,7 @@ impl fmt::Display for OKIError {
             OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
             OKIError::MismatchedMode   => write!(f, "count mode requires all sources to have with_counts=true"),
             OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
+            OKIError::InvalidInput(m)         => write!(f, "invalid input: {m}"),
         }
     }
 }
diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs
index c4a66f5..d6a0908 100644
--- a/src/obikindex/src/lib.rs
+++ b/src/obikindex/src/lib.rs
@@ -1,11 +1,13 @@
 pub mod error;
 pub mod meta;
 pub mod state;
+mod distance;
 mod dump;
 mod index;
 mod merge;
 
 pub use error::{OKIError, OKIResult};
+pub use distance::{DistanceMetric, DistanceOutput};
 pub use index::KmerIndex;
 pub use merge::MergeMode;
 pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml
index f61a524..fffb28b 100644
--- a/src/obikmer/Cargo.toml
+++ b/src/obikmer/Cargo.toml
@@ -19,6 +19,8 @@ obisys        = { path = "../obisys" }
 obiskio       = { path = "../obiskio" }
 obikindex     = { path = "../obikindex" }
 clap          = { version = "4", features = ["derive"] }
+kodama        = "0.2"
+speedytree    = "0.1"
 rayon         = "1"
 indicatif     = "0.17"
 tracing       = "0.1.44"
diff --git a/src/obikmer/src/cmd/distance.rs b/src/obikmer/src/cmd/distance.rs
new file mode 100644
index 0000000..e8574ae
--- /dev/null
+++ b/src/obikmer/src/cmd/distance.rs
@@ -0,0 +1,216 @@
+use std::io::{self, BufWriter, Write};
+use std::path::PathBuf;
+
+use clap::Args;
+use kodama::{Method, linkage};
+use obikindex::{DistanceMetric, KmerIndex};
+use obikseq::{set_k, set_m};
+use speedytree::{DistanceMatrix, Hybrid, NeighborJoiningSolver, to_newick};
+use tracing::info;
+
+#[derive(clap::ValueEnum, Clone, Copy, Debug)]
+pub enum MetricArg {
+    Jaccard,
+    Hamming,
+    BrayCurtis,
+    #[value(name = "relfreq-bray-curtis")]
+    RelfreqBrayCurtis,
+    Euclidean,
+    #[value(name = "relfreq-euclidean")]
+    RelfreqEuclidean,
+    Hellinger,
+    #[value(name = "hellinger-euclidean")]
+    HellingerEuclidean,
+}
+
+impl From<MetricArg> for DistanceMetric {
+    fn from(m: MetricArg) -> Self {
+        match m {
+            MetricArg::Jaccard            => DistanceMetric::Jaccard,
+            MetricArg::Hamming            => DistanceMetric::Hamming,
+            MetricArg::BrayCurtis         => DistanceMetric::BrayCurtis,
+            MetricArg::RelfreqBrayCurtis  => DistanceMetric::RelfreqBrayCurtis,
+            MetricArg::Euclidean          => DistanceMetric::Euclidean,
+            MetricArg::RelfreqEuclidean   => DistanceMetric::RelfreqEuclidean,
+            MetricArg::Hellinger          => DistanceMetric::Hellinger,
+            MetricArg::HellingerEuclidean => DistanceMetric::HellingerEuclidean,
+        }
+    }
+}
+
+#[derive(Args)]
+pub struct DistanceArgs {
+    /// Index directory
+    pub index: PathBuf,
+
+    /// Distance metric to compute
+    #[arg(long, value_enum, default_value = "jaccard")]
+    pub metric: MetricArg,
+
+    /// Also output the shared-kmer count matrix (CSV)
+    #[arg(long)]
+    pub shared_kmers: bool,
+
+    /// Compute and write a Neighbor-Joining tree (Newick)
+    #[arg(long)]
+    pub nj: bool,
+
+    /// Compute and write a UPGMA tree (Newick)
+    #[arg(long)]
+    pub upgma: bool,
+
+    /// Output prefix: <prefix>_dist.csv, <prefix>_shared.csv,
+    /// <prefix>_nj.nwk, <prefix>_upgma.nwk.
+    /// If omitted, the distance matrix is written to stdout.
+    #[arg(short, long)]
+    pub output: Option<PathBuf>,
+}
+
+pub fn run(args: DistanceArgs) {
+    let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
+        eprintln!("error opening index: {e}");
+        std::process::exit(1);
+    });
+
+    set_k(idx.kmer_size());
+    set_m(idx.minimizer_size());
+
+    let genomes = idx.meta().genomes.clone();
+    let n = genomes.len();
+    info!(
+        "computing {:?} distances for {} genome(s)",
+        args.metric, n
+    );
+
+    let need_shared = args.shared_kmers || args.nj || args.upgma;
+    let result = idx
+        .distance(args.metric.into(), need_shared)
+        .unwrap_or_else(|e| {
+            eprintln!("error computing distances: {e}");
+            std::process::exit(1);
+        });
+
+    // ── Distance matrix → CSV ─────────────────────────────────────────────────
+    let write_dist_csv = |w: &mut dyn Write| {
+        write!(w, "genome").unwrap();
+        for g in &genomes { write!(w, ",{g}").unwrap(); }
+        writeln!(w).unwrap();
+        for (i, g) in genomes.iter().enumerate() {
+            write!(w, "{g}").unwrap();
+            for j in 0..n {
+                write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap();
+            }
+            writeln!(w).unwrap();
+        }
+    };
+
+    match &args.output {
+        Some(prefix) => {
+            let path = format!("{}_dist.csv", prefix.display());
+            let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| {
+                eprintln!("error creating {path}: {e}");
+                std::process::exit(1);
+            }));
+            write_dist_csv(&mut f);
+            info!("distance matrix → {path}");
+        }
+        None => {
+            let stdout = io::stdout();
+            let mut out = BufWriter::new(stdout.lock());
+            write_dist_csv(&mut out);
+        }
+    }
+
+    // ── Shared-kmer matrix → CSV ──────────────────────────────────────────────
+    if args.shared_kmers {
+        if let Some(shared) = &result.shared_kmers {
+            let path = args.output.as_ref()
+                .map(|p| format!("{}_shared.csv", p.display()))
+                .unwrap_or_else(|| "shared.csv".into());
+            let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| {
+                eprintln!("error creating {path}: {e}");
+                std::process::exit(1);
+            }));
+            write!(f, "genome").unwrap();
+            for g in &genomes { write!(f, ",{g}").unwrap(); }
+            writeln!(f).unwrap();
+            for (i, g) in genomes.iter().enumerate() {
+                write!(f, "{g}").unwrap();
+                for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); }
+                writeln!(f).unwrap();
+            }
+            info!("shared-kmer matrix → {path}");
+        }
+    }
+
+    // ── NJ tree via speedytree ────────────────────────────────────────────────
+    if args.nj {
+        let rows: Vec<Vec<f64>> = (0..n)
+            .map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect())
+            .collect();
+        let dm = DistanceMatrix::build(rows, genomes.clone()).unwrap_or_else(|e| {
+            eprintln!("error building distance matrix for NJ: {e}");
+            std::process::exit(1);
+        });
+        let tree = NeighborJoiningSolver::<Hybrid>::default(dm).solve().unwrap_or_else(|e| {
+            eprintln!("error computing NJ tree: {e}");
+            std::process::exit(1);
+        });
+        let newick = to_newick(&tree);
+        let path = args.output.as_ref()
+            .map(|p| format!("{}_nj.nwk", p.display()))
+            .unwrap_or_else(|| "nj.nwk".into());
+        std::fs::write(&path, &newick).unwrap_or_else(|e| {
+            eprintln!("error writing {path}: {e}");
+            std::process::exit(1);
+        });
+        info!("NJ tree → {path}");
+    }
+
+    // ── UPGMA tree via kodama ─────────────────────────────────────────────────
+    if args.upgma {
+        let mut condensed: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
+        for i in 0..n {
+            for j in (i + 1)..n {
+                condensed.push(result.matrix[[i, j]]);
+            }
+        }
+        let dendro = linkage(&mut condensed, n, Method::Average);
+        let newick = upgma_to_newick(&dendro, &genomes);
+        let path = args.output.as_ref()
+            .map(|p| format!("{}_upgma.nwk", p.display()))
+            .unwrap_or_else(|| "upgma.nwk".into());
+        std::fs::write(&path, &newick).unwrap_or_else(|e| {
+            eprintln!("error writing {path}: {e}");
+            std::process::exit(1);
+        });
+        info!("UPGMA tree → {path}");
+    }
+}
+
+// ── UPGMA Newick from kodama dendrogram ───────────────────────────────────────
+
+fn upgma_to_newick(dendro: &kodama::Dendrogram<f64>, names: &[String]) -> String {
+    let n = names.len();
+    // node_labels[i]: Newick subtree string for node i (leaves 0..n, internals n..)
+    let mut labels: Vec<String> = names.to_vec();
+    // height of each node: leaves = 0, internal = dissimilarity/2
+    let mut heights: Vec<f64> = vec![0.0; 2 * n - 1];
+
+    for (k, step) in dendro.steps().iter().enumerate() {
+        let new_node = n + k;
+        let h = step.dissimilarity / 2.0;
+        heights[new_node] = h;
+        let c1 = step.cluster1;
+        let c2 = step.cluster2;
+        let bl1 = (h - heights[c1]).max(0.0);
+        let bl2 = (h - heights[c2]).max(0.0);
+        labels.push(format!(
+            "({label1}:{bl1:.6},{label2}:{bl2:.6})",
+            label1 = labels[c1],
+            label2 = labels[c2],
+        ));
+    }
+
+    format!("{};", labels.last().unwrap())
+}
diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs
index 85d8603..ed66c65 100644
--- a/src/obikmer/src/cmd/mod.rs
+++ b/src/obikmer/src/cmd/mod.rs
@@ -1,3 +1,4 @@
+pub mod distance;
 pub mod dump;
 pub mod index;
 pub mod merge;
diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs
index 1a1f8bc..c88428b 100644
--- a/src/obikmer/src/main.rs
+++ b/src/obikmer/src/main.rs
@@ -22,6 +22,8 @@ enum Commands {
     Merge(cmd::merge::MergeArgs),
     /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
     Dump(cmd::dump::DumpArgs),
+    /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
+    Distance(cmd::distance::DistanceArgs),
     /// Dump unitigs from a built index to stdout (debug)
     Unitig(cmd::unitig::UnitigArgs),
 }
@@ -49,6 +51,7 @@ fn main() {
         Commands::Index(args)     => cmd::index::run(args),
         Commands::Merge(args)     => cmd::merge::run(args),
         Commands::Dump(args)      => cmd::dump::run(args),
+        Commands::Distance(args)  => cmd::distance::run(args),
         Commands::Unitig(args)    => cmd::unitig::run(args),
     }
 
diff --git a/src/obikpartitionner/src/distance.rs b/src/obikpartitionner/src/distance.rs
new file mode 100644
index 0000000..b5e6914
--- /dev/null
+++ b/src/obikpartitionner/src/distance.rs
@@ -0,0 +1,47 @@
+use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
+use obilayeredmap::LayeredStore;
+use obiskio::{SKError, SKResult};
+
+use crate::partition::KmerPartition;
+
+const INDEX_SUBDIR: &str = "index";
+
+fn probe_n_layers(index_dir: &std::path::Path) -> usize {
+    let mut n = 0;
+    while index_dir.join(format!("layer_{n}")).exists() { n += 1; }
+    n
+}
+
+impl KmerPartition {
+    /// Open all count matrices for partition `part`, one per layer.
+    /// Layers without a `counts/` directory are skipped.
+    pub fn count_store(&self, part: usize) -> SKResult<LayeredStore<PersistentCompactIntMatrix>> {
+        let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
+        if !index_dir.exists() {
+            return Ok(LayeredStore::new(vec![]));
+        }
+        let matrices = (0..probe_n_layers(&index_dir))
+            .filter_map(|l| {
+                let dir = index_dir.join(format!("layer_{l}")).join("counts");
+                dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io))
+            })
+            .collect::<SKResult<Vec<_>>>()?;
+        Ok(LayeredStore::new(matrices))
+    }
+
+    /// Open all presence matrices for partition `part`, one per layer.
+    /// Layers without a `presence/` directory are skipped.
+    pub fn presence_store(&self, part: usize) -> SKResult<LayeredStore<PersistentBitMatrix>> {
+        let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
+        if !index_dir.exists() {
+            return Ok(LayeredStore::new(vec![]));
+        }
+        let matrices = (0..probe_n_layers(&index_dir))
+            .filter_map(|l| {
+                let dir = index_dir.join(format!("layer_{l}")).join("presence");
+                dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io))
+            })
+            .collect::<SKResult<Vec<_>>>()?;
+        Ok(LayeredStore::new(matrices))
+    }
+}
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index 3b098a2..4b03fce 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -1,3 +1,4 @@
+mod distance;
 mod dump_layer;
 mod index_layer;
 mod kmer_sort;
-- 
2.52.0


From d9aa211b8fc79a71a9cdbab3b939df312725ed8a Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 12:11:55 +0200
Subject: [PATCH 09/11] feat: add k-mer index rebuild and compaction feature

This commit introduces a new `rebuild` CLI subcommand that reconstructs an existing multi-layer k-mer index into a compact, single-layer index. It implements a configurable filtering pipeline supporting min/max genome fraction/count and total count thresholds, parallel partition processing via `rayon`, and CLI progress tracking. The change also restructures module declarations across `obikindex` and `obikpartitionner` to integrate the new rebuild and layer-handling logic.
---
 src/obikindex/src/lib.rs                  |   1 +
 src/obikindex/src/rebuild.rs              | 116 ++++++++++++
 src/obikmer/src/cmd/mod.rs                |   1 +
 src/obikmer/src/cmd/rebuild.rs            | 111 ++++++++++++
 src/obikmer/src/main.rs                   |   3 +
 src/obikpartitionner/src/filter.rs        |  87 +++++++++
 src/obikpartitionner/src/lib.rs           |   3 +
 src/obikpartitionner/src/merge_layer.rs   |   6 +-
 src/obikpartitionner/src/rebuild_layer.rs | 205 ++++++++++++++++++++++
 9 files changed, 530 insertions(+), 3 deletions(-)
 create mode 100644 src/obikindex/src/rebuild.rs
 create mode 100644 src/obikmer/src/cmd/rebuild.rs
 create mode 100644 src/obikpartitionner/src/filter.rs
 create mode 100644 src/obikpartitionner/src/rebuild_layer.rs

diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs
index d6a0908..65df6c0 100644
--- a/src/obikindex/src/lib.rs
+++ b/src/obikindex/src/lib.rs
@@ -5,6 +5,7 @@ mod distance;
 mod dump;
 mod index;
 mod merge;
+mod rebuild;
 
 pub use error::{OKIError, OKIResult};
 pub use distance::{DistanceMetric, DistanceOutput};
diff --git a/src/obikindex/src/rebuild.rs b/src/obikindex/src/rebuild.rs
new file mode 100644
index 0000000..b360030
--- /dev/null
+++ b/src/obikindex/src/rebuild.rs
@@ -0,0 +1,116 @@
+use std::fs;
+use std::io;
+use std::path::Path;
+use std::time::Duration;
+
+use indicatif::{ProgressBar, ProgressStyle};
+use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
+use obisys::{Reporter, Stage};
+use rayon::prelude::*;
+use tracing::info;
+
+use crate::error::{OKIError, OKIResult};
+use crate::index::KmerIndex;
+use crate::meta::IndexMeta;
+use crate::state::{IndexState, SENTINEL_INDEXED};
+
+impl KmerIndex {
+    /// Rebuild `src` into a new compact single-layer index at `output`.
+    ///
+    /// Only k-mers whose per-genome row passes every filter in `filters` are
+    /// written. If `filters` is empty every k-mer is kept (pure compaction).
+    ///
+    /// `mode` controls whether the output stores counts or presence/absence.
+    /// A count source may be rebuilt in presence mode; a presence source
+    /// cannot be rebuilt in count mode.
+    pub fn rebuild<P: AsRef<Path>>(
+        output: P,
+        src: &KmerIndex,
+        filters: &[Box<dyn KmerFilter>],
+        mode: MergeMode,
+        force: bool,
+        rep: &mut Reporter,
+    ) -> OKIResult<Self> {
+        let output = output.as_ref();
+
+        if src.state() != IndexState::Indexed {
+            return Err(OKIError::NotIndexed(src.root_path.clone()));
+        }
+
+        if mode == MergeMode::Count && !src.meta.config.with_counts {
+            return Err(OKIError::InvalidInput(
+                "cannot rebuild in count mode from a presence-only source index".into(),
+            ));
+        }
+
+        if output.exists() {
+            if force {
+                fs::remove_dir_all(output)?;
+            } else {
+                return Err(OKIError::Io(io::Error::new(
+                    io::ErrorKind::AlreadyExists,
+                    format!("{}: output directory already exists", output.display()),
+                )));
+            }
+        }
+
+        // ── Create output directory + metadata ────────────────────────────────
+        fs::create_dir_all(output)?;
+        let mut meta = IndexMeta::new(src.meta.config.clone());
+        meta.config.with_counts = mode == MergeMode::Count;
+        meta.genomes = src.meta.genomes.clone();
+        meta.write(output)?;
+
+        let n_genomes = src.meta.genomes.len();
+        let n_partitions = src.partition.n_partitions();
+
+        // ── Create an empty destination KmerPartition ─────────────────────────
+        // Create the partitions/ subdirectory so KmerPartition::open_with_config works.
+        fs::create_dir_all(output.join(obikpartitionner::PARTITIONS_SUBDIR))?;
+        let dst_partition = KmerPartition::open_with_config(
+            output,
+            meta.config.kmer_size,
+            meta.config.minimizer_size,
+            meta.config.n_bits,
+        )?;
+
+        info!(
+            "rebuild: {} partition(s), {} genome(s), mode={:?}",
+            n_partitions, n_genomes, mode,
+        );
+
+        let t = Stage::start("rebuild");
+        let pb = ProgressBar::new(n_partitions as u64).with_style(
+            ProgressStyle::with_template("rebuild — [{bar:20}] {pos}/{len} | {msg}")
+                .unwrap()
+                .progress_chars("=> "),
+        );
+        pb.enable_steady_tick(Duration::from_millis(100));
+
+        let src_partition = &src.partition;
+
+        let errors: Vec<obiskio::SKError> = (0..n_partitions)
+            .into_par_iter()
+            .filter_map(|i| {
+                let result = dst_partition
+                    .rebuild_partition(src_partition, i, filters, mode, n_genomes)
+                    .err();
+                pb.inc(1);
+                result
+            })
+            .collect();
+
+        pb.finish_and_clear();
+
+        if let Some(e) = errors.into_iter().next() {
+            return Err(OKIError::Partition(e));
+        }
+
+        rep.push(t.stop());
+
+        // Write SENTINEL_INDEXED — output is ready to use.
+        fs::File::create(output.join(SENTINEL_INDEXED))?;
+
+        KmerIndex::open(output)
+    }
+}
diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs
index ed66c65..ac4057e 100644
--- a/src/obikmer/src/cmd/mod.rs
+++ b/src/obikmer/src/cmd/mod.rs
@@ -2,5 +2,6 @@ pub mod distance;
 pub mod dump;
 pub mod index;
 pub mod merge;
+pub mod rebuild;
 pub mod superkmer;
 pub mod unitig;
diff --git a/src/obikmer/src/cmd/rebuild.rs b/src/obikmer/src/cmd/rebuild.rs
new file mode 100644
index 0000000..f172811
--- /dev/null
+++ b/src/obikmer/src/cmd/rebuild.rs
@@ -0,0 +1,111 @@
+use std::path::PathBuf;
+
+use clap::Args;
+use obikindex::{KmerIndex, MergeMode};
+use obikpartitionner::filter::{
+    KmerFilter, MaxGenomeCount, MaxGenomeFraction, MaxTotalCount,
+    MinGenomeCount, MinGenomeFraction, MinTotalCount,
+};
+use obisys::Reporter;
+use obikseq::{set_k, set_m};
+use tracing::info;
+
+#[derive(Args)]
+pub struct RebuildArgs {
+    /// Source index directory
+    pub source: PathBuf,
+
+    /// Output index directory
+    #[arg(short, long)]
+    pub output: PathBuf,
+
+    /// Minimum fraction of genomes containing the k-mer [0.0–1.0]
+    #[arg(long)]
+    pub min_quorum_frac: Option<f64>,
+
+    /// Maximum fraction of genomes containing the k-mer [0.0–1.0]
+    #[arg(long)]
+    pub max_quorum_frac: Option<f64>,
+
+    /// Minimum number of genomes containing the k-mer
+    #[arg(long)]
+    pub min_quorum_count: Option<usize>,
+
+    /// Maximum number of genomes containing the k-mer
+    #[arg(long)]
+    pub max_quorum_count: Option<usize>,
+
+    /// Minimum total count across all genomes (count index only)
+    #[arg(long)]
+    pub min_total_count: Option<u32>,
+
+    /// Maximum total count across all genomes (count index only)
+    #[arg(long)]
+    pub max_total_count: Option<u32>,
+
+    /// Per-genome count threshold for presence in quorum filters (default 0)
+    #[arg(long, default_value = "0")]
+    pub presence_threshold: u32,
+
+    /// Output as presence/absence instead of counts
+    #[arg(long)]
+    pub presence: bool,
+
+    /// Overwrite existing output directory
+    #[arg(short, long)]
+    pub force: bool,
+}
+
+pub fn run(args: RebuildArgs) {
+    let src = KmerIndex::open(&args.source).unwrap_or_else(|e| {
+        eprintln!("error opening source index: {e}");
+        std::process::exit(1);
+    });
+
+    set_k(src.kmer_size());
+    set_m(src.minimizer_size());
+
+    let n_genomes = src.meta().genomes.len();
+    let mode = if args.presence || !src.meta().config.with_counts {
+        MergeMode::Presence
+    } else {
+        MergeMode::Count
+    };
+
+    info!(
+        "rebuild: {} genome(s), mode={:?}, source={}",
+        n_genomes, mode, args.source.display()
+    );
+
+    let pt = args.presence_threshold;
+    let mut filters: Vec<Box<dyn KmerFilter>> = Vec::new();
+
+    if let Some(v) = args.min_quorum_frac {
+        filters.push(Box::new(MinGenomeFraction { frac: v, threshold: pt }));
+    }
+    if let Some(v) = args.max_quorum_frac {
+        filters.push(Box::new(MaxGenomeFraction { frac: v, threshold: pt }));
+    }
+    if let Some(v) = args.min_quorum_count {
+        filters.push(Box::new(MinGenomeCount { count: v, threshold: pt }));
+    }
+    if let Some(v) = args.max_quorum_count {
+        filters.push(Box::new(MaxGenomeCount { count: v, threshold: pt }));
+    }
+    if let Some(v) = args.min_total_count {
+        filters.push(Box::new(MinTotalCount { total: v }));
+    }
+    if let Some(v) = args.max_total_count {
+        filters.push(Box::new(MaxTotalCount { total: v }));
+    }
+
+    let mut rep = Reporter::new();
+    KmerIndex::rebuild(&args.output, &src, &filters, mode, args.force, &mut rep)
+        .unwrap_or_else(|e| {
+            eprintln!("error rebuilding index: {e}");
+            std::process::exit(1);
+        });
+
+    rep.print();
+    info!("rebuilt index → {}", args.output.display());
+}
diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs
index c88428b..bae4cdf 100644
--- a/src/obikmer/src/main.rs
+++ b/src/obikmer/src/main.rs
@@ -20,6 +20,8 @@ enum Commands {
     Index(cmd::index::IndexArgs),
     /// Merge multiple built indexes into one
     Merge(cmd::merge::MergeArgs),
+    /// Filter and compact an existing index into a new single-layer index
+    Rebuild(cmd::rebuild::RebuildArgs),
     /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
     Dump(cmd::dump::DumpArgs),
     /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
@@ -51,6 +53,7 @@ fn main() {
         Commands::Index(args)     => cmd::index::run(args),
         Commands::Merge(args)     => cmd::merge::run(args),
         Commands::Dump(args)      => cmd::dump::run(args),
+        Commands::Rebuild(args)   => cmd::rebuild::run(args),
         Commands::Distance(args)  => cmd::distance::run(args),
         Commands::Unitig(args)    => cmd::unitig::run(args),
     }
diff --git a/src/obikpartitionner/src/filter.rs b/src/obikpartitionner/src/filter.rs
new file mode 100644
index 0000000..f35a3e8
--- /dev/null
+++ b/src/obikpartitionner/src/filter.rs
@@ -0,0 +1,87 @@
+/// Trait for kmer row filters used in the rebuild command.
+///
+/// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
+/// `n_genomes` equals `row.len()`.
+pub trait KmerFilter: Send + Sync {
+    fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
+}
+
+// ── Quorum filters ─────────────────────────────────────────────────────────────
+
+fn present_count(row: &[u32], threshold: u32) -> usize {
+    row.iter().filter(|&&v| v > threshold).count()
+}
+
+/// At least `frac` fraction of genomes contain this kmer (count > `threshold`).
+pub struct MinGenomeFraction {
+    pub frac: f64,
+    pub threshold: u32,
+}
+
+impl KmerFilter for MinGenomeFraction {
+    fn passes(&self, row: &[u32], n_genomes: usize) -> bool {
+        let p = present_count(row, self.threshold);
+        p as f64 / n_genomes as f64 >= self.frac
+    }
+}
+
+/// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
+pub struct MaxGenomeFraction {
+    pub frac: f64,
+    pub threshold: u32,
+}
+
+impl KmerFilter for MaxGenomeFraction {
+    fn passes(&self, row: &[u32], n_genomes: usize) -> bool {
+        let p = present_count(row, self.threshold);
+        p as f64 / n_genomes as f64 <= self.frac
+    }
+}
+
+/// At least `count` genomes contain this kmer (count > `threshold`).
+pub struct MinGenomeCount {
+    pub count: usize,
+    pub threshold: u32,
+}
+
+impl KmerFilter for MinGenomeCount {
+    fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
+        present_count(row, self.threshold) >= self.count
+    }
+}
+
+/// At most `count` genomes contain this kmer (count > `threshold`).
+pub struct MaxGenomeCount {
+    pub count: usize,
+    pub threshold: u32,
+}
+
+impl KmerFilter for MaxGenomeCount {
+    fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
+        present_count(row, self.threshold) <= self.count
+    }
+}
+
+// ── Total-count filters (count indexes only) ───────────────────────────────────
+
+/// Sum of counts across all genomes >= `total`.
+pub struct MinTotalCount {
+    pub total: u32,
+}
+
+impl KmerFilter for MinTotalCount {
+    fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
+        row.iter().sum::<u32>() >= self.total
+    }
+}
+
+/// Sum of counts across all genomes <= `total`.
+pub struct MaxTotalCount {
+    pub total: u32,
+}
+
+impl KmerFilter for MaxTotalCount {
+    fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
+        row.iter().sum::<u32>() <= self.total
+    }
+}
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index 4b03fce..48d72c1 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -1,9 +1,12 @@
+pub mod filter;
 mod distance;
 mod dump_layer;
 mod index_layer;
 mod kmer_sort;
 mod merge_layer;
 mod partition;
+mod rebuild_layer;
 
+pub use filter::KmerFilter;
 pub use merge_layer::MergeMode;
 pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs
index c933966..21785d0 100644
--- a/src/obikpartitionner/src/merge_layer.rs
+++ b/src/obikpartitionner/src/merge_layer.rs
@@ -44,7 +44,7 @@ impl ColBuilder {
 
 // ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
 
-enum SrcLayerData {
+pub(crate) enum SrcLayerData {
     /// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
     SetMembership,
     Presence(MphfLayer, PersistentBitMatrix),
@@ -52,7 +52,7 @@ enum SrcLayerData {
 }
 
 impl SrcLayerData {
-    fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> {
+    pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> {
         let presence_dir = layer_dir.join("presence");
         let counts_dir   = layer_dir.join("counts");
         match mode {
@@ -83,7 +83,7 @@ impl SrcLayerData {
     }
 
     /// Return one value per source genome for `kmer`.
-    fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
+    pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
         match self {
             SrcLayerData::SetMembership => vec![1u32; n_genomes],
             SrcLayerData::Presence(mphf, mat) => {
diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs
new file mode 100644
index 0000000..fa581f9
--- /dev/null
+++ b/src/obikpartitionner/src/rebuild_layer.rs
@@ -0,0 +1,205 @@
+use std::fs;
+use std::io;
+use std::path::{Path, PathBuf};
+
+use obicompactvec::{PersistentBitMatrixBuilder,
+                    PersistentBitVecBuilder,
+                    PersistentCompactIntMatrixBuilder,
+                    PersistentCompactIntVecBuilder};
+use obidebruinj::GraphDeBruijn;
+use obiskio::{SKError, SKResult, UnitigFileReader};
+use obilayeredmap::{Layer, MphfLayer, OLMError};
+use obilayeredmap::meta::PartitionMeta;
+
+use crate::filter::KmerFilter;
+use crate::merge_layer::{MergeMode, SrcLayerData};
+use crate::partition::KmerPartition;
+
+const INDEX_SUBDIR: &str = "index";
+
+fn olm_to_sk(e: OLMError) -> SKError {
+    match e {
+        OLMError::Io(e) => SKError::Io(e),
+        other => SKError::InvalidData { context: "rebuild", detail: other.to_string() },
+    }
+}
+
+fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
+    dir.join(format!("col_{col:06}.pbiv"))
+}
+
+fn col_path_int(dir: &Path, col: usize) -> PathBuf {
+    dir.join(format!("col_{col:06}.pciv"))
+}
+
+fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
+    fs::write(dir.join("meta.json"), format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"))
+}
+
+// ── ColBuilder ────────────────────────────────────────────────────────────────
+
+enum ColBuilder {
+    Bit(PersistentBitVecBuilder),
+    Int(PersistentCompactIntVecBuilder),
+}
+
+impl ColBuilder {
+    fn set_val(&mut self, slot: usize, value: u32) {
+        match self {
+            ColBuilder::Bit(b) => b.set(slot, value > 0),
+            ColBuilder::Int(b) => b.set(slot, value),
+        }
+    }
+
+    fn close(self) -> SKResult<()> {
+        match self {
+            ColBuilder::Bit(b)  => b.close().map_err(SKError::Io),
+            ColBuilder::Int(b)  => b.close().map_err(SKError::Io),
+        }
+    }
+}
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
+    match PartitionMeta::load(dir) {
+        Ok(m) => Ok(m),
+        Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
+            let mut n = 0usize;
+            while dir.join(format!("layer_{n}")).exists() { n += 1; }
+            let m = PartitionMeta { n_layers: n };
+            m.save(dir).map_err(olm_to_sk)?;
+            Ok(m)
+        }
+        Err(e) => Err(olm_to_sk(e)),
+    }
+}
+
+fn passes_all(filters: &[Box<dyn KmerFilter>], row: &[u32], n_genomes: usize) -> bool {
+    filters.iter().all(|f| f.passes(row, n_genomes))
+}
+
+// ── KmerPartition::rebuild_partition ─────────────────────────────────────────
+
+impl KmerPartition {
+    /// Rebuild partition `i` from `src` into `self` (an empty destination partition).
+    ///
+    /// Only k-mers whose per-genome row passes all `filters` are written.
+    /// The output is a single-layer index — regardless of how many layers the
+    /// source has.
+    ///
+    /// `n_genomes` is the number of genome columns in the source (and destination).
+    pub fn rebuild_partition(
+        &self,
+        src: &KmerPartition,
+        i: usize,
+        filters: &[Box<dyn KmerFilter>],
+        mode: MergeMode,
+        n_genomes: usize,
+    ) -> SKResult<()> {
+        let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
+        if !src_index_dir.exists() {
+            return Ok(());
+        }
+
+        let src_meta = load_meta(&src_index_dir)?;
+        if src_meta.n_layers == 0 {
+            return Ok(());
+        }
+
+        // ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
+        let mut g = GraphDeBruijn::new();
+
+        for l in 0..src_meta.n_layers {
+            let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
+            let unitigs_path = src_layer_dir.join("unitigs.bin");
+            if !unitigs_path.exists() { continue; }
+
+            let reader = UnitigFileReader::open(&unitigs_path)?;
+            let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
+
+            for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                let row = src_data.lookup(kmer, n_genomes);
+                if passes_all(filters, &row, n_genomes) {
+                    g.push(kmer);
+                }
+            }
+        }
+
+        if g.len() == 0 {
+            return Ok(());
+        }
+
+        let n_new = g.len();
+        g.compute_degrees();
+
+        // ── Build MPHF in dst layer_0 ─────────────────────────────────────────
+        let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
+        let dst_layer_dir = dst_index_dir.join("layer_0");
+        fs::create_dir_all(&dst_layer_dir)?;
+
+        let mut uw = Layer::<()>::unitig_writer(&dst_layer_dir).map_err(olm_to_sk)?;
+        for unitig in g.iter_unitig() {
+            uw.write(&unitig)?;
+        }
+        uw.close()?;
+        drop(g);
+
+        Layer::<()>::build(&dst_layer_dir).map_err(olm_to_sk)?;
+        let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?;
+
+        // ── Prepare matrix builders (one column per genome) ───────────────────
+        let data_dir = match mode {
+            MergeMode::Presence => dst_layer_dir.join("presence"),
+            MergeMode::Count    => dst_layer_dir.join("counts"),
+        };
+        fs::create_dir_all(&data_dir)?;
+
+        let mut builders: Vec<ColBuilder> = match mode {
+            MergeMode::Presence => {
+                PersistentBitMatrixBuilder::new(n_new, &data_dir)
+                    .map_err(SKError::Io)?.close().map_err(SKError::Io)?;
+                (0..n_genomes).map(|g| -> SKResult<ColBuilder> {
+                    let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
+                    Ok(ColBuilder::Bit(b))
+                }).collect::<SKResult<_>>()?
+            }
+            MergeMode::Count => {
+                PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
+                    .map_err(SKError::Io)?.close().map_err(SKError::Io)?;
+                (0..n_genomes).map(|g| -> SKResult<ColBuilder> {
+                    let b = PersistentCompactIntVecBuilder::new(n_new, &col_path_int(&data_dir, g))?;
+                    Ok(ColBuilder::Int(b))
+                }).collect::<SKResult<_>>()?
+            }
+        };
+
+        // ── Pass 2: fill builders ─────────────────────────────────────────────
+        for l in 0..src_meta.n_layers {
+            let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
+            let unitigs_path = src_layer_dir.join("unitigs.bin");
+            if !unitigs_path.exists() { continue; }
+
+            let reader = UnitigFileReader::open(&unitigs_path)?;
+            let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
+
+            for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+                let row = src_data.lookup(kmer, n_genomes);
+                if !passes_all(filters, &row, n_genomes) { continue; }
+                if let Some(slot) = dst_mphf.find(kmer) {
+                    for (col, &value) in row.iter().enumerate() {
+                        builders[col].set_val(slot, value);
+                    }
+                }
+            }
+        }
+
+        // ── Close builders, write metadata ────────────────────────────────────
+        for b in builders { b.close()?; }
+        write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
+
+        PartitionMeta { n_layers: 1 }.save(&dst_index_dir).map_err(olm_to_sk)?;
+
+        Ok(())
+    }
+}
-- 
2.52.0


From c8e591fc78e2a9148f331e273e7aab3062f69fb6 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 12:17:32 +0200
Subject: [PATCH 10/11] feat: add superkmer CLI setup and partition bit
 handling

This commit introduces CLI argument parsing for the `superkmer` command via a new `SuperkmerArgs` struct. It also adds a `partitions_to_bits` utility to compute the minimum bit width for partition encoding, enforcing a 1-bit floor. Finally, the index configuration automatically rounds the partition count up to the nearest power of two to ensure compatibility with bitmask-based indexing operations.
---
 src/obikmer/src/cli.rs           | 12 +++++++++---
 src/obikmer/src/cmd/index.rs     |  9 +++++++--
 src/obikmer/src/cmd/superkmer.rs |  4 ++--
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/obikmer/src/cli.rs b/src/obikmer/src/cli.rs
index 76ed52f..1b79fa0 100644
--- a/src/obikmer/src/cli.rs
+++ b/src/obikmer/src/cli.rs
@@ -29,9 +29,9 @@ pub struct CommonArgs {
     #[arg(long, default_value_t = 6)]
     pub level_max: usize,
 
-    /// Number of bits to encode partitions (allows up to 2^partition_bits partitions)
-    #[arg(short, long, default_value_t = 8)]
-    pub partition_bits: usize,
+    /// Number of partitions (rounded up to the next power of 2)
+    #[arg(short, long, default_value_t = 256)]
+    pub partitions: usize,
 
     /// Number of worker threads
     #[arg(
@@ -44,6 +44,12 @@ pub struct CommonArgs {
     pub threads: usize,
 }
 
+/// Smallest `b` such that `2^b >= n` (i.e. `n.next_power_of_two().ilog2()`).
+/// Minimum 1 (degenerate n=0 or n=1 → 1 partition).
+pub fn partitions_to_bits(n: usize) -> usize {
+    n.max(1).next_power_of_two().trailing_zeros() as usize
+}
+
 impl CommonArgs {
     pub fn seqfile_paths(&self) -> obiread::PathIter {
         let paths = self.inputs.iter().map(PathBuf::from).collect();
diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs
index ccfa487..38bc722 100644
--- a/src/obikmer/src/cmd/index.rs
+++ b/src/obikmer/src/cmd/index.rs
@@ -6,7 +6,7 @@ use obikseq::{set_k, set_m};
 use obisys::Reporter;
 use tracing::info;
 
-use crate::cli::CommonArgs;
+use crate::cli::{CommonArgs, partitions_to_bits};
 use crate::steps::scatter;
 
 #[derive(Args)]
@@ -62,10 +62,15 @@ pub fn run(args: IndexArgs) {
                 std::process::exit(1);
             });
         }
+        let n_bits = partitions_to_bits(args.common.partitions);
+        let effective = 1usize << n_bits;
+        if effective != args.common.partitions {
+            info!("partitions: {} → {} (next power of 2)", args.common.partitions, effective);
+        }
         let config = IndexConfig {
             kmer_size:      args.common.kmer_size,
             minimizer_size: args.common.minimizer_size,
-            n_bits:         args.common.partition_bits,
+            n_bits,
             with_counts:    args.with_counts,
         };
         KmerIndex::create(&output, config, args.label.clone(), false).unwrap_or_else(|e| {
diff --git a/src/obikmer/src/cmd/superkmer.rs b/src/obikmer/src/cmd/superkmer.rs
index 2ea1060..881d4a6 100644
--- a/src/obikmer/src/cmd/superkmer.rs
+++ b/src/obikmer/src/cmd/superkmer.rs
@@ -5,7 +5,7 @@ use clap::Args;
 use obifastwrite::write_scatter;
 use obikseq::{RoutableSuperKmer, set_k, set_m};
 
-use crate::cli::{CommonArgs, PipelineData, open_chunks};
+use crate::cli::{CommonArgs, PipelineData, open_chunks, partitions_to_bits};
 
 #[derive(Args)]
 pub struct SuperkmerArgs {
@@ -38,7 +38,7 @@ pub fn run(args: SuperkmerArgs) {
     let m = args.common.minimizer_size;
     let theta = args.common.theta;
     let level_max = args.common.level_max;
-    let partition_bits = args.common.partition_bits;
+    let partition_bits = partitions_to_bits(args.common.partitions);
     let n_workers = args.common.threads.max(1);
 
     set_k(k);
-- 
2.52.0


From 13599dd444e972cbb3b0f21d10b01f1bd9cbe976 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 21 May 2026 13:23:05 +0200
Subject: [PATCH 11/11] feat: Implement query subcommand for sequence-to-genome
 mapping

This change introduces the `query` CLI command and its supporting infrastructure for sequence-to-genome mapping and k-mer matching. It adds a `QueryLayer` abstraction backed by MPHF and persistent matrices, exposes the index partition for direct querying, and implements `Hash`/`Eq` for `RoutableSuperKmer`. The command ingests sequence batches, deduplicates superkmers, routes them to index partitions for parallel exact or 1-mismatch matching, and outputs results as FASTA records annotated with JSON metadata. Includes `serde_json` dependency addition, module exports, and documentation updates.
---
 TODO.md                                 |  20 +-
 docmd/architecture/query.md             | 111 ++++++++++
 src/Cargo.lock                          |   1 +
 src/obikindex/src/index.rs              |   5 +
 src/obikmer/Cargo.toml                  |   1 +
 src/obikmer/src/cmd/mod.rs              |   1 +
 src/obikmer/src/cmd/query.rs            | 281 ++++++++++++++++++++++++
 src/obikmer/src/main.rs                 |   3 +
 src/obikpartitionner/src/lib.rs         |   1 +
 src/obikpartitionner/src/query_layer.rs | 120 ++++++++++
 src/obikseq/src/routable.rs             |  14 ++
 src/obiread/src/lib.rs                  |   1 +
 src/obiread/src/record.rs               | 222 +++++++++++++++++++
 13 files changed, 762 insertions(+), 19 deletions(-)
 create mode 100644 docmd/architecture/query.md
 create mode 100644 src/obikmer/src/cmd/query.rs
 create mode 100644 src/obikpartitionner/src/query_layer.rs
 create mode 100644 src/obiread/src/record.rs

diff --git a/TODO.md b/TODO.md
index e6ba5d0..a874150 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,26 +1,8 @@
-## Chose à vérifier suite à la commande index
-
-- il faudrait lister les fichier qui vont être indexés
-- partition.meta ne devrait plus exister
-- les spectrums globaux devrait etre identifier par génome
-  - regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome
-- les spectrum patiels ont-ils vocation à être conserver ?
-- l'étape de déreplication dure quasiment autant de temps que le comptage mais ne laisse aucune trace de progression à l'utilisateur
-
 ## commandes à ajouter
 
-- filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés
-  - quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum
-
 - aggregate : aggrege toutes les colonnes d'une matrice d'index en une seule colonne.
 
 - query : scan un fichier de sequences et retourne pour chaque sequence quels kmer sont présents dans l'index et dans quel genomes
-
-- distance : calcule la matrice de distance entre les genomes
-  - proposer une option pour chaque distance à calculer
-  - un possibité de récuperer la matrice des kmer communs
-  - un possibité de calculer l'arbre nj 
-  - les matrices sont sauvegardées en CSV
-  - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche
+  --detail et --mismatch à implementer
 
 - status : affiche le statut de l'index
diff --git a/docmd/architecture/query.md b/docmd/architecture/query.md
new file mode 100644
index 0000000..6a11a11
--- /dev/null
+++ b/docmd/architecture/query.md
@@ -0,0 +1,111 @@
+# Query system
+
+## Goal
+
+Given a set of query sequences, determine for each sequence how many of its k-mers are found in the index and, for each indexed genome, how many k-mers match. The query system is the foundation for read classification and sequence-to-genome mapping.
+
+---
+
+## Input
+
+- Query sequences in FASTA or FASTQ format (gzip supported, streaming stdin supported).
+- Sequences shorter than k bases are silently skipped.
+- Non-ACGT characters are handled by the superkmer decomposition layer: they act as hard breaks, producing shorter superkmers (identical to the behaviour at indexing time).
+
+---
+
+## Algorithm
+
+The query follows the same superkmer-based partitioning strategy used at indexing time.
+
+```
+for each query sequence:
+    decompose into superkmers  (non-ACGT breaks, same minimiser scheme as indexing)
+    for each superkmer:
+        route to partition p via minimiser hash
+        for each kmer in the superkmer:
+            lookup kmer in partition p  (MPHF → evidence check → matrix row)
+            accumulate result into per-sequence accumulators
+    emit annotated sequence
+```
+
+Parallelism is **per sequence**: each worker thread handles all partitions of one sequence independently. This avoids cross-thread coordination when merging partial results and keeps memory usage proportional to the number of concurrent sequences rather than to the number of partitions.
+
+---
+
+## Exact vs. approximate matching
+
+### Exact (default)
+
+Standard MPHF lookup followed by evidence check. O(1) per k-mer.
+
+### 1-mismatch (`--mismatch` flag)
+
+For each k-mer of the query, generate all `3·k` single-substitution variants. Each variant is canonicalised and looked up independently in the index. If one or more variants are found, their per-genome rows are **summed** into the result for that k-mer position.
+
+- If a k-mer matches exactly AND one of its variants also matches (distinct k-mers in the index), both contributions are accumulated.
+- Exact and approximate matches are tracked separately in the output (see annotation schema below).
+- The superkmer routing optimisation is **not** used in 1-mismatch mode: each variant is looked up directly via its own minimiser.
+- Cost: up to `3·k` MPHF probes per k-mer position vs. 1 in exact mode.
+
+---
+
+## Output format
+
+Output sequences are written in **OBITools4 format**: the original sequence with a JSON annotation map in the title line.
+
+```
+>read_id {"kmer_total":150,"kmer_found":59,...}
+ATCGATCG...
+```
+
+Genome order in all list-valued annotations follows the genome order recorded in `index.meta`.
+
+---
+
+## Annotation schema
+
+### Summary mode (default)
+
+| Key | Type | Condition | Semantics |
+|---|---|---|---|
+| `kmer_total` | int | always | total k-mers in the (masked) sequence |
+| `kmer_found` | int | always | k-mers with at least one match (exact or approx) |
+| `kmer_missing` | int | `--count-missing` | k-mers absent from the index |
+| `kmer_match` | list[int] | always | per-genome matched k-mer count (exact + approx) |
+| `kmer_match_exact` | list[int] | `--mismatch` | per-genome exact match count |
+| `kmer_match_approx` | list[int] | `--mismatch` | per-genome approx match count |
+| `count_match` | list[int] | count index | per-genome sum of index counts for matched k-mers |
+
+`kmer_match[i]` is the number of k-mer positions in the query that contribute at least one match to genome i. In 1-mismatch mode, a single k-mer position can contribute to multiple genomes if several of its variants are present in the index.
+
+`count_match[i]` sums raw index counts across all matched k-mer positions for genome i. Only meaningful for count indexes.
+
+### Detail mode (`--detail`)
+
+All summary keys, plus per-position coverage vectors — one list per genome, length `len(sequence) − k + 1`:
+
+| Key | Type | Condition | Semantics |
+|---|---|---|---|
+| `cov_<i>` | list[int] | `--detail` | coverage at each k-mer position for genome i; raw count (count index) or 0/1 (presence index); 0 if absent |
+| `cov_exact_<i>` | list[int] | `--detail` + `--mismatch` | exact-match contribution per position |
+| `cov_approx_<i>` | list[int] | `--detail` + `--mismatch` | approx-match contribution per position |
+
+Genome indices in key names are 0-based integers matching the `index.meta` genome order. Genome labels are not used as key names to avoid issues with special characters in long or complex genome identifiers.
+
+---
+
+## CLI
+
+```
+obikmer query -i <index> [--summary | --detail] [--mismatch] [--count-missing] <query.fa>
+```
+
+`--summary` is the default; `--detail` implies `--summary` (all summary keys are always present).
+
+---
+
+## Future work
+
+- **Read classification** (`--classify`): assign each read to the genome with the highest `kmer_match` score; emit as a single annotation key.
+- **Whitelist / blacklist filtering**: accept or reject sequences based on whether their k-mer match score for a designated set of genomes exceeds a threshold.
diff --git a/src/Cargo.lock b/src/Cargo.lock
index 1de3042..a07c18c 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -1514,6 +1514,7 @@ dependencies = [
  "obisys",
  "pprof",
  "rayon",
+ "serde_json",
  "speedytree",
  "tracing",
  "tracing-subscriber",
diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs
index 16c1154..07792a4 100644
--- a/src/obikindex/src/index.rs
+++ b/src/obikindex/src/index.rs
@@ -185,6 +185,11 @@ impl KmerIndex {
         Ok(())
     }
 
+    /// Borrow the inner partition for direct superkmer-level queries.
+    pub fn partition(&self) -> &KmerPartition {
+        &self.partition
+    }
+
     /// Path to the unitigs file for partition `part`, layer `layer`.
     pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf {
         self.partition.part_dir(part)
diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml
index fffb28b..053bc97 100644
--- a/src/obikmer/Cargo.toml
+++ b/src/obikmer/Cargo.toml
@@ -19,6 +19,7 @@ obisys        = { path = "../obisys" }
 obiskio       = { path = "../obiskio" }
 obikindex     = { path = "../obikindex" }
 clap          = { version = "4", features = ["derive"] }
+serde_json    = "1"
 kodama        = "0.2"
 speedytree    = "0.1"
 rayon         = "1"
diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs
index ac4057e..b71a789 100644
--- a/src/obikmer/src/cmd/mod.rs
+++ b/src/obikmer/src/cmd/mod.rs
@@ -2,6 +2,7 @@ pub mod distance;
 pub mod dump;
 pub mod index;
 pub mod merge;
+pub mod query;
 pub mod rebuild;
 pub mod superkmer;
 pub mod unitig;
diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs
new file mode 100644
index 0000000..2067067
--- /dev/null
+++ b/src/obikmer/src/cmd/query.rs
@@ -0,0 +1,281 @@
+use std::collections::HashMap;
+use std::io::{self, BufWriter, Write};
+use std::path::PathBuf;
+
+use clap::Args;
+use obikindex::KmerIndex;
+use obiread::record::{SeqRecord, parse_chunk};
+use obiread::chunk::read_sequence_chunks;
+use obikseq::{RoutableSuperKmer, set_k, set_m};
+use obiskbuilder::SuperKmerIter;
+use tracing::info;
+
+// ── CLI ───────────────────────────────────────────────────────────────────────
+
+#[derive(Args)]
+pub struct QueryArgs {
+    /// Index directory
+    pub index: PathBuf,
+
+    /// Input sequences (FASTA/FASTQ, optionally gzip-compressed)
+    #[arg(num_args = 1..)]
+    pub inputs: Vec<String>,
+
+    /// Also report per-position coverage vectors (cov_<i> per genome)
+    #[arg(long)]
+    pub detail: bool,
+
+    /// Enable 1-mismatch approximate matching
+    #[arg(long)]
+    pub mismatch: bool,
+
+    /// Count k-mers absent from the index (adds kmer_missing annotation)
+    #[arg(long)]
+    pub count_missing: bool,
+
+    /// Report per-genome presence (0/1) instead of raw counts
+    #[arg(long)]
+    pub force_presence: bool,
+
+    /// Minimum accumulated match count to declare a genome present (implies --force-presence)
+    #[arg(long, default_value_t = 1)]
+    pub presence_threshold: u32,
+
+    /// Number of worker threads
+    #[arg(
+        short = 'T',
+        long,
+        default_value_t = std::thread::available_parallelism()
+            .map(|n| n.get())
+            .unwrap_or(1)
+    )]
+    pub threads: usize,
+}
+
+// ── SKDesc — one occurrence of a superkmer in the batch ───────────────────────
+
+/// Describes one occurrence of a superkmer in the query batch.
+pub struct SKDesc {
+    /// Index of the source sequence within the batch.
+    pub seq_idx: u32,
+    /// Kmer offset of the first kmer of this superkmer within its sequence.
+    /// Computed as the cumulative number of kmers emitted before this superkmer
+    /// in the same sequence. Used for `--detail` coverage vectors.
+    pub kmer_offset: u32,
+}
+
+// ── QueryBatch ────────────────────────────────────────────────────────────────
+
+/// A batch of query sequences with their superkmers deduplicated.
+///
+/// Each unique `RoutableSuperKmer` maps to all the (seq_idx, kmer_offset)
+/// positions it occupies across the batch. The superkmer is queried once
+/// per partition; the result is broadcast to every SKDesc entry.
+pub struct QueryBatch {
+    /// Sequence ids in batch order.
+    pub ids: Vec<String>,
+    /// Raw sequence bytes (for output), in batch order.
+    pub seqs: Vec<Vec<u8>>,
+    /// Per-sequence total kmer count (kmer_count + kmer_missing).
+    pub n_kmers: Vec<u32>,
+    /// Deduplicated superkmer map.
+    pub map: HashMap<RoutableSuperKmer, Vec<SKDesc>>,
+}
+
+impl QueryBatch {
+    /// Build a batch from a vec of parsed sequence records.
+    pub fn from_records(
+        records: Vec<SeqRecord>,
+        k: usize,
+        level_max: usize,
+        theta: f64,
+    ) -> Self {
+        let mut ids   = Vec::with_capacity(records.len());
+        let mut seqs  = Vec::with_capacity(records.len());
+        let mut n_kmers = Vec::with_capacity(records.len());
+        let mut map: HashMap<RoutableSuperKmer, Vec<SKDesc>> = HashMap::new();
+
+        for (seq_idx, record) in records.into_iter().enumerate() {
+            let mut kmer_offset = 0u32;
+
+            for rsk in SuperKmerIter::new(&record.normalized, k, level_max, theta) {
+                let n = (rsk.seql() - k + 1) as u32;
+                map.entry(rsk)
+                    .or_default()
+                    .push(SKDesc { seq_idx: seq_idx as u32, kmer_offset });
+                kmer_offset += n;
+            }
+
+            ids.push(record.id);
+            seqs.push(record.sequence);
+            n_kmers.push(kmer_offset);
+        }
+
+        Self { ids, seqs, n_kmers, map }
+    }
+
+    /// Split the superkmer map by partition index.
+    ///
+    /// Returns a vec of length `n_partitions`; each slot holds the RSK refs
+    /// whose minimizer routes to that partition.
+    pub fn split_by_partition(&self, n_partitions: usize) -> Vec<Vec<&RoutableSuperKmer>> {
+        let mask = (n_partitions as u64) - 1;
+        let mut by_part: Vec<Vec<&RoutableSuperKmer>> = vec![Vec::new(); n_partitions];
+        for rsk in self.map.keys() {
+            let part = (rsk.minimizer().seq_hash() & mask) as usize;
+            by_part[part].push(rsk);
+        }
+        by_part
+    }
+}
+
+// ── Per-sequence accumulator ──────────────────────────────────────────────────
+
+struct SeqAcc {
+    kmer_count:   u32,
+    kmer_missing: u32,
+    /// Per-genome accumulated count (count mode) or presence sum (presence mode).
+    genome_totals: Vec<u32>,
+}
+
+impl SeqAcc {
+    fn new(n_genomes: usize) -> Self {
+        Self {
+            kmer_count:   0,
+            kmer_missing: 0,
+            genome_totals: vec![0u32; n_genomes],
+        }
+    }
+}
+
+// ── Entry point ───────────────────────────────────────────────────────────────
+
+pub fn run(args: QueryArgs) {
+    let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
+        eprintln!("error opening index: {e}");
+        std::process::exit(1);
+    });
+
+    set_k(idx.kmer_size());
+    set_m(idx.minimizer_size());
+
+    let k            = idx.kmer_size();
+    let n_genomes    = idx.meta().genomes.len();
+    let n_partitions = idx.n_partitions();
+    let with_counts  = idx.meta().config.with_counts;
+
+    info!(
+        "query: k={k}, {} genome(s), with_counts={with_counts}, mismatch={}, detail={}",
+        n_genomes, args.mismatch, args.detail
+    );
+
+    if args.mismatch {
+        eprintln!("warning: --mismatch not yet implemented, ignored");
+    }
+    if args.detail {
+        eprintln!("warning: --detail not yet implemented, ignored");
+    }
+
+    let paths: Vec<PathBuf> = args.inputs.iter().map(PathBuf::from).collect();
+    let mut out = BufWriter::new(io::stdout());
+
+    for path in &paths {
+        let chunks = read_sequence_chunks(path.to_str().unwrap_or(""))
+            .unwrap_or_else(|e| {
+                eprintln!("error opening {}: {e}", path.display());
+                std::process::exit(1);
+            });
+
+        for chunk_result in chunks {
+            let chunk = chunk_result.unwrap_or_else(|e| {
+                eprintln!("read error: {e}");
+                std::process::exit(1);
+            });
+
+            let records = parse_chunk(&chunk, k);
+            if records.is_empty() {
+                continue;
+            }
+
+            let batch = QueryBatch::from_records(records, k, 6, 0.7);
+            let n_seqs = batch.ids.len();
+
+            let mut accs: Vec<SeqAcc> = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect();
+
+            let by_part = batch.split_by_partition(n_partitions);
+
+            for (part_idx, part_sks) in by_part.iter().enumerate() {
+                if part_sks.is_empty() {
+                    continue;
+                }
+
+                let kmer_results = idx
+                    .partition()
+                    .query_partition(part_idx, part_sks, k, n_genomes, with_counts)
+                    .unwrap_or_else(|e| {
+                        eprintln!("query error on partition {part_idx}: {e}");
+                        std::process::exit(1);
+                    });
+
+                let presence   = args.force_presence || !with_counts;
+                let threshold  = args.presence_threshold;
+
+                for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) {
+                    let descs = batch.map.get(*rsk).expect("rsk must be in map");
+                    for desc in descs {
+                        let acc = &mut accs[desc.seq_idx as usize];
+                        for hit in sk_kmer_results.iter() {
+                            match hit {
+                                None => acc.kmer_missing += 1,
+                                Some(row) => {
+                                    acc.kmer_count += 1;
+                                    for (g, &v) in row.iter().enumerate() {
+                                        acc.genome_totals[g] += if presence {
+                                            u32::from(v >= threshold)
+                                        } else {
+                                            v
+                                        };
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            emit_batch(&batch, &accs, idx.meta(), args.count_missing, &mut out);
+        }
+    }
+}
+
+// ── Output ────────────────────────────────────────────────────────────────────
+
+fn emit_batch(
+    batch:        &QueryBatch,
+    accs:         &[SeqAcc],
+    meta:         &obikindex::meta::IndexMeta,
+    count_missing: bool,
+    out:          &mut impl Write,
+) {
+    for (seq_idx, (id, seq)) in batch.ids.iter().zip(batch.seqs.iter()).enumerate() {
+        let acc = &accs[seq_idx];
+        let mut ann = serde_json::Map::new();
+
+        ann.insert("kmer_count".into(), acc.kmer_count.into());
+        if count_missing {
+            ann.insert("kmer_missing".into(), acc.kmer_missing.into());
+        }
+        let mut match_map = serde_json::Map::new();
+        for (g, label) in meta.genomes.iter().enumerate() {
+            match_map.insert(label.clone(), acc.genome_totals[g].into());
+        }
+        ann.insert("kmer_strict_matches".into(), match_map.into());
+
+        let ann_str = serde_json::to_string(&ann).unwrap_or_else(|_| "{}".to_string());
+
+        // OBITools4 FASTA format: >id {"key":value,...}
+        let _ = writeln!(out, ">{id} {ann_str}");
+        let _ = out.write_all(seq);
+        let _ = out.write_all(b"\n");
+    }
+}
diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs
index bae4cdf..1b8f833 100644
--- a/src/obikmer/src/main.rs
+++ b/src/obikmer/src/main.rs
@@ -22,6 +22,8 @@ enum Commands {
     Merge(cmd::merge::MergeArgs),
     /// Filter and compact an existing index into a new single-layer index
     Rebuild(cmd::rebuild::RebuildArgs),
+    /// Query an index with sequences and annotate matches
+    Query(cmd::query::QueryArgs),
     /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
     Dump(cmd::dump::DumpArgs),
     /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
@@ -54,6 +56,7 @@ fn main() {
         Commands::Merge(args)     => cmd::merge::run(args),
         Commands::Dump(args)      => cmd::dump::run(args),
         Commands::Rebuild(args)   => cmd::rebuild::run(args),
+        Commands::Query(args)     => cmd::query::run(args),
         Commands::Distance(args)  => cmd::distance::run(args),
         Commands::Unitig(args)    => cmd::unitig::run(args),
     }
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index 48d72c1..49a81fc 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -5,6 +5,7 @@ mod index_layer;
 mod kmer_sort;
 mod merge_layer;
 mod partition;
+mod query_layer;
 mod rebuild_layer;
 
 pub use filter::KmerFilter;
diff --git a/src/obikpartitionner/src/query_layer.rs b/src/obikpartitionner/src/query_layer.rs
new file mode 100644
index 0000000..f18a0d5
--- /dev/null
+++ b/src/obikpartitionner/src/query_layer.rs
@@ -0,0 +1,120 @@
+use std::path::Path;
+
+use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
+use obikseq::{CanonicalKmer, RoutableSuperKmer};
+use obiskio::{SKError, SKResult};
+use obilayeredmap::{MphfLayer, OLMError};
+use obilayeredmap::meta::PartitionMeta;
+
+use crate::partition::KmerPartition;
+
+const INDEX_SUBDIR: &str = "index";
+
+fn olm_to_sk(e: OLMError) -> SKError {
+    match e {
+        OLMError::Io(io_err) => SKError::Io(io_err),
+        other => SKError::InvalidData { context: "query", detail: other.to_string() },
+    }
+}
+
+// ── per-layer query handle ────────────────────────────────────────────────────
+
+enum QueryLayer {
+    /// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome.
+    SetOnly(MphfLayer),
+    Presence(MphfLayer, PersistentBitMatrix),
+    Count(MphfLayer, PersistentCompactIntMatrix),
+}
+
+impl QueryLayer {
+    fn open(layer_dir: &Path, with_counts: bool) -> SKResult<Self> {
+        let presence_dir = layer_dir.join("presence");
+        let counts_dir   = layer_dir.join("counts");
+
+        if with_counts && counts_dir.exists() {
+            let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
+            let mat  = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
+            Ok(QueryLayer::Count(mphf, mat))
+        } else if presence_dir.exists() {
+            let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
+            let mat  = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
+            Ok(QueryLayer::Presence(mphf, mat))
+        } else if counts_dir.exists() {
+            // presence query on a count index — return counts as-is
+            let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
+            let mat  = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
+            Ok(QueryLayer::Count(mphf, mat))
+        } else {
+            let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
+            Ok(QueryLayer::SetOnly(mphf))
+        }
+    }
+
+    /// Return `Some(per-genome row)` if `kmer` is indexed in this layer, else `None`.
+    fn find(&self, kmer: CanonicalKmer, n_genomes: usize) -> Option<Box<[u32]>> {
+        match self {
+            QueryLayer::SetOnly(mphf) => {
+                mphf.find(kmer)
+                    .map(|_| vec![1u32; n_genomes].into_boxed_slice())
+            }
+            QueryLayer::Presence(mphf, mat) => {
+                mphf.find(kmer)
+                    .map(|slot| mat.row(slot).iter().map(|&b| b as u32).collect())
+            }
+            QueryLayer::Count(mphf, mat) => {
+                mphf.find(kmer).map(|slot| mat.row(slot))
+            }
+        }
+    }
+}
+
+// ── KmerPartition::query_partition ───────────────────────────────────────────
+
+impl KmerPartition {
+    /// Query a single partition for a slice of (already-routed) super-kmers.
+    ///
+    /// Returns one entry per input super-kmer; each entry is a `Vec` with one
+    /// `Option<Box<[u32]>>` per k-mer inside that super-kmer:
+    /// - `None`        — k-mer absent from the index
+    /// - `Some(row)`   — per-genome count (count index) or 0/1 (presence index)
+    ///
+    /// All `superkmers` must belong to this partition (same minimizer bucket).
+    pub fn query_partition(
+        &self,
+        part_idx: usize,
+        superkmers: &[&RoutableSuperKmer],
+        k: usize,
+        n_genomes: usize,
+        with_counts: bool,
+    ) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
+        if superkmers.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let index_dir = self.part_dir(part_idx).join(INDEX_SUBDIR);
+
+        if !index_dir.exists() {
+            return Ok(superkmers
+                .iter()
+                .map(|rsk| vec![None; rsk.seql() - k + 1])
+                .collect());
+        }
+
+        let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
+        let layers: Vec<QueryLayer> = (0..meta.n_layers)
+            .map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts))
+            .collect::<SKResult<_>>()?;
+
+        Ok(superkmers
+            .iter()
+            .map(|rsk| {
+                rsk.superkmer()
+                    .iter_canonical_kmers()
+                    .map(|kmer| {
+                        layers.iter().find_map(|layer| layer.find(kmer, n_genomes))
+                    })
+                    .collect()
+            })
+            .collect())
+    }
+}
diff --git a/src/obikseq/src/routable.rs b/src/obikseq/src/routable.rs
index a4b2338..411fcb0 100644
--- a/src/obikseq/src/routable.rs
+++ b/src/obikseq/src/routable.rs
@@ -71,6 +71,20 @@ impl RoutableSuperKmer {
     }
 }
 
+impl PartialEq for RoutableSuperKmer {
+    fn eq(&self, other: &Self) -> bool {
+        self.superkmer == other.superkmer
+    }
+}
+
+impl Eq for RoutableSuperKmer {}
+
+impl std::hash::Hash for RoutableSuperKmer {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.superkmer.hash(state);
+    }
+}
+
 impl Sequence for RoutableSuperKmer {
     type Canonical = RoutableSuperKmer;
 
diff --git a/src/obiread/src/lib.rs b/src/obiread/src/lib.rs
index a07d250..771d839 100644
--- a/src/obiread/src/lib.rs
+++ b/src/obiread/src/lib.rs
@@ -12,6 +12,7 @@ mod mimetype;
 pub mod normalize;
 mod path_iterator;
 pub mod peakreader;
+pub mod record;
 pub mod xopen;
 
 pub use chunk::{SeqChunkIter, fasta_chunks, fastq_chunks,
diff --git a/src/obiread/src/record.rs b/src/obiread/src/record.rs
new file mode 100644
index 0000000..fb5e141
--- /dev/null
+++ b/src/obiread/src/record.rs
@@ -0,0 +1,222 @@
+//! Per-sequence record parser for FASTA and FASTQ chunks.
+//!
+//! Same automaton structure as `normalize.rs` — only the actions differ:
+//! instead of writing into a single flat rope, we accumulate per-sequence
+//! data (id, raw ASCII, normalised ACGT\x00 rope).
+
+use obikrope::{ForwardCursor, Rope, RopeCursor};
+
+/// One sequence record extracted from a FASTA or FASTQ chunk.
+pub struct SeqRecord {
+    /// Sequence identifier (everything before the first space in the header).
+    pub id: String,
+    /// Raw sequence bytes, newlines stripped, non-ACGT characters preserved.
+    /// Reproduced verbatim in query output.
+    pub sequence: Vec<u8>,
+    /// Per-sequence normalised rope: uppercase ACGT segments of length ≥ k
+    /// separated by `\x00`. Ready for `SuperKmerIter`.
+    pub normalized: Rope,
+}
+
+/// Parse all records from a FASTA or FASTQ chunk rope.
+/// Returns an empty vec if the rope carries no recognised mime type.
+pub fn parse_chunk(rope: &Rope, k: usize) -> Vec<SeqRecord> {
+    let cursor = rope.fw_cursor();
+    match rope.mime_type() {
+        Some("text/fasta") => parse_fasta(cursor, k),
+        Some("text/fastq") => parse_fastq(cursor, k),
+        _ => vec![],
+    }
+}
+
+// ── Shared state accumulated while scanning one sequence ──────────────────────
+
+struct RecordBuilder {
+    id:       String,
+    sequence: Vec<u8>,   // raw ASCII, no newlines
+    norm:     Vec<u8>,   // ACGT\x00 segments being built
+    seg_start: usize,    // index in norm where current segment started
+    k:        usize,
+}
+
+impl RecordBuilder {
+    fn new(k: usize) -> Self {
+        Self { id: String::new(), sequence: Vec::new(), norm: Vec::new(), seg_start: 0, k }
+    }
+
+    fn reset(&mut self, id: String) {
+        self.id = id;
+        self.sequence.clear();
+        self.norm.clear();
+        self.seg_start = 0;
+    }
+
+    /// Push one accepted ACGT byte.
+    fn push_acgt(&mut self, b: u8) {
+        self.sequence.push(b);
+        self.norm.push(b);
+    }
+
+    /// Push one non-ACGT byte to the raw sequence only (not to the norm buffer).
+    fn push_raw(&mut self, b: u8) {
+        self.sequence.push(b);
+    }
+
+    /// Close the current ACGT segment (same logic as `end_segment` in normalize.rs).
+    fn end_segment(&mut self) {
+        if self.norm.len() - self.seg_start >= self.k {
+            self.norm.push(0x00);
+            self.seg_start = self.norm.len();
+        } else {
+            self.norm.truncate(self.seg_start);
+        }
+    }
+
+    /// Consume into a SeqRecord. Closes any open segment first.
+    fn finish(mut self) -> Option<SeqRecord> {
+        self.end_segment();
+        if self.id.is_empty() {
+            return None;
+        }
+        let mut rope = Rope::new(None);
+        if !self.norm.is_empty() {
+            rope.push(self.norm);
+        }
+        Some(SeqRecord { id: self.id, sequence: self.sequence, normalized: rope })
+    }
+}
+
+// ── FASTA automaton ───────────────────────────────────────────────────────────
+
+fn parse_fasta(cursor: ForwardCursor<'_>, k: usize) -> Vec<SeqRecord> {
+    let mut records: Vec<SeqRecord> = Vec::new();
+    let mut builder = RecordBuilder::new(k);
+
+    // skip up to (and including) the first '>'
+    loop {
+        match cursor.read_next().ok() {
+            None => return records,
+            Some(b'>') => break,
+            Some(_) => {}
+        }
+    }
+
+    // read first id — read_id already consumes the full header line
+    builder.id = read_id(&cursor);
+
+    loop {
+        match cursor.read_next().ok() {
+            None => {
+                // EOF — close final segment and emit
+                if let Some(rec) = builder.finish() {
+                    records.push(rec);
+                }
+                return records;
+            }
+            Some(b'\n') | Some(b'\r') => {
+                // peek: next non-empty char determines if new record starts
+                match cursor.read_ahead(1).ok() {
+                    Some(b'>') => {
+                        // end of current record
+                        builder.end_segment();
+                        if let Some(rec) = builder.finish() {
+                            records.push(rec);
+                        }
+                        cursor.read_next().ok(); // consume '>'
+                        let id = read_id(&cursor); // already consumes header line
+                        builder = RecordBuilder::new(k);
+                        builder.reset(id);
+                    }
+                    None => {
+                        builder.end_segment();
+                        if let Some(rec) = builder.finish() {
+                            records.push(rec);
+                        }
+                        return records;
+                    }
+                    Some(_) => {} // continuation line — do nothing
+                }
+            }
+            Some(b) => {
+                let upper = b & !0x20u8;
+                if matches!(upper, b'A' | b'C' | b'G' | b'T') {
+                    builder.push_acgt(upper);
+                } else {
+                    builder.push_raw(b);
+                    builder.end_segment();
+                }
+            }
+        }
+    }
+}
+
+// ── FASTQ automaton ───────────────────────────────────────────────────────────
+
+fn parse_fastq(cursor: ForwardCursor<'_>, k: usize) -> Vec<SeqRecord> {
+    let mut records: Vec<SeqRecord> = Vec::new();
+
+    loop {
+        // find '@'
+        loop {
+            match cursor.read_next().ok() {
+                None => return records,
+                Some(b'@') => break,
+                Some(_) => {}
+            }
+        }
+
+        let mut builder = RecordBuilder::new(k);
+        builder.id = read_id(&cursor); // already consumes the full header line
+
+        // sequence line — stop at newline, non-ACGT breaks segment
+        loop {
+            match cursor.read_next().ok() {
+                None | Some(b'\n') | Some(b'\r') => {
+                    builder.end_segment();
+                    break;
+                }
+                Some(b) => {
+                    let upper = b & !0x20u8;
+                    if matches!(upper, b'A' | b'C' | b'G' | b'T') {
+                        builder.push_acgt(upper);
+                    } else {
+                        builder.push_raw(b);
+                        builder.end_segment();
+                    }
+                }
+            }
+        }
+
+        skip_line(&cursor); // '+' line
+        skip_line(&cursor); // quality line
+
+        if let Some(rec) = builder.finish() {
+            records.push(rec);
+        }
+    }
+}
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+fn read_id(cursor: &ForwardCursor<'_>) -> String {
+    let mut id = Vec::new();
+    loop {
+        match cursor.read_next().ok() {
+            None | Some(b'\n') | Some(b'\r') => break,
+            Some(b' ') | Some(b'\t') => {
+                skip_line(cursor);
+                break;
+            }
+            Some(b) => id.push(b),
+        }
+    }
+    String::from_utf8_lossy(&id).into_owned()
+}
+
+fn skip_line(cursor: &ForwardCursor<'_>) {
+    while let Some(c) = cursor.read_next().ok() {
+        if c == b'\n' {
+            return;
+        }
+    }
+}
-- 
2.52.0