From b6fcbc545f10ab5c924120edf46cf62bb55f9ee2 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Mon, 15 Jun 2026 15:45:04 +0200
Subject: [PATCH 01/24] refactor: replace rayon with NUMA-aware PartitionRunner

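Replaces `rayon` parallel iteration across the index, rebuild, reindex, and select modules with a custom `PartitionRunner`. The runner pre-spawns a flat set of workers, distributes them round-robin across NUMA nodes, and pins each worker to its node's CPUs; the previous per-node controller threads and adaptive worker activation are removed. Workers borrow the task closure through `std::thread::scope` and results are drained on the calling thread, so the `Mutex`-wrapped callback, the shared error slot, and the atomic counters are no longer needed. Error handling is simplified via `.map_err()` and the `?` operator, while progress bar updates are decoupled into dedicated `on_done` callbacks. Also makes matrix packing (`pack_bit_matrix`, `pack_compact_int_matrix`) stream column files into a temporary file that is renamed into place instead of loading every column into RAM, and updates the scratch path in the installation docs.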
---
 docmd/installation.md              |   4 +-
 src/obicompactvec/src/bitmatrix.rs |  49 ++++--
 src/obicompactvec/src/intmatrix.rs |  49 ++++--
 src/obikindex/src/index.rs         |  74 ++++-----
 src/obikindex/src/numa.rs          | 246 +++++++++--------------------
 src/obikindex/src/rebuild.rs       |  22 +--
 src/obikindex/src/reindex.rs       |  25 +--
 src/obikindex/src/select.rs        |  48 ++----
 8 files changed, 198 insertions(+), 319 deletions(-)
diff --git a/docmd/installation.md b/docmd/installation.md
index d9a5cda..ab4b934 100644
--- a/docmd/installation.md
+++ b/docmd/installation.md
@@ -60,13 +60,13 @@ HPC home directories are typically on a network filesystem (Lustre, NFS) optimis
 **Always redirect the build directory to a local scratch disk:**
 
 ```bash
-CARGO_TARGET_DIR=/scratch/local/$USER/cargo-target cargo build --release
+CARGO_TARGET_DIR=/scratch/$USER/cargo-target cargo build --release
 ```
 
 Adapt the path to the local scratch available on your cluster (`/var/tmp`, `/tmp`, `/scratch/local`, etc.). Once built, copy the binary to a permanent location:
 
 ```bash
-cp /scratch/local/$USER/cargo-target/release/obikmer ~/bin/
+cp /scratch/$USER/cargo-target/release/obikmer ~/bin/
 ```
 
 ## NUMA support
diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index 631db63..ca1b393 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -1,5 +1,5 @@
 use std::fs::{self, File};
-use std::io::{self, Write as _};
+use std::io::{self, BufWriter, Write as _};
 use std::path::{Path, PathBuf};
 
 use memmap2::Mmap;
@@ -230,30 +230,47 @@ impl PackedBitMatrix {
 
 /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
 pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
+    let packed_path = dir.join("matrix.pbmx");
+    if packed_path.exists() {
+        // Matrix complete; remove any leftover column files from a killed cleanup.
+        if let Ok(meta) = MatrixMeta::load(dir) {
+            for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
+            let _ = fs::remove_file(dir.join("meta.json"));
+        }
+        return Ok(());
+    }
+
     let meta = MatrixMeta::load(dir)?;
     let n_cols = meta.n_cols;
 
-    let col_files: Vec<Vec<u8>> = (0..n_cols)
-        .map(|c| fs::read(col_path(dir, c)))
+    // Compute offsets from file sizes — no column data loaded into RAM.
+    let col_sizes: Vec<u64> = (0..n_cols)
+        .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
         .collect::<io::Result<_>>()?;
 
-    let header_size = PBMX_HEADER + n_cols * 8;
+    let header_size = (PBMX_HEADER + n_cols * 8) as u64;
     let mut col_offset = header_size;
     let mut offsets = Vec::with_capacity(n_cols);
-    for data in &col_files {
-        offsets.push(col_offset as u64);
-        col_offset += data.len();
+    for &size in &col_sizes {
+        offsets.push(col_offset);
+        col_offset += size;
     }
 
-    let packed_path = dir.join("matrix.pbmx");
-    let mut file = File::create(&packed_path)?;
-    file.write_all(&PBMX_MAGIC)?;
-    file.write_all(&[0u8; 4])?;
-    file.write_all(&(meta.n as u64).to_le_bytes())?;
-    file.write_all(&(n_cols as u64).to_le_bytes())?;
-    for &off in &offsets { file.write_all(&off.to_le_bytes())?; }
-    for data in &col_files { file.write_all(data)?; }
-    drop(file);
+    // Write to a temp file; rename atomically so a killed process never leaves
+    // a truncated matrix.pbmx that would be mistaken for a complete file.
+    let tmp_path = dir.join("matrix.pbmx.tmp");
+    let mut out = BufWriter::new(File::create(&tmp_path)?);
+    out.write_all(&PBMX_MAGIC)?;
+    out.write_all(&[0u8; 4])?;
+    out.write_all(&(meta.n as u64).to_le_bytes())?;
+    out.write_all(&(n_cols as u64).to_le_bytes())?;
+    for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
+    for c in 0..n_cols {
+        io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
+    }
+    out.flush()?;
+    drop(out);
+    fs::rename(&tmp_path, &packed_path)?;
 
     for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
     fs::remove_file(dir.join("meta.json"))?;
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index 8db78da..b563335 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -1,6 +1,6 @@
 use std::cmp::Ordering;
 use std::fs::{self, File};
-use std::io::{self, Write as _};
+use std::io::{self, BufWriter, Write as _};
 use std::path::{Path, PathBuf};
 
 use memmap2::Mmap;
@@ -354,30 +354,47 @@ impl PackedCompactIntMatrix {
 
 /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
 pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
+    let packed_path = dir.join("matrix.pcmx");
+    if packed_path.exists() {
+        // Matrix complete; remove any leftover column files from a killed cleanup.
+        if let Ok(meta) = MatrixMeta::load(dir) {
+            for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
+            let _ = fs::remove_file(dir.join("meta.json"));
+        }
+        return Ok(());
+    }
+
     let meta = MatrixMeta::load(dir)?;
     let n_cols = meta.n_cols;
 
-    let col_files: Vec<Vec<u8>> = (0..n_cols)
-        .map(|c| fs::read(col_path(dir, c)))
+    // Compute offsets from file sizes — no column data loaded into RAM.
+    let col_sizes: Vec<u64> = (0..n_cols)
+        .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
         .collect::<io::Result<_>>()?;
 
-    let header_size = PCMX_HEADER + n_cols * 8;
+    let header_size = (PCMX_HEADER + n_cols * 8) as u64;
     let mut col_offset = header_size;
     let mut offsets = Vec::with_capacity(n_cols);
-    for data in &col_files {
-        offsets.push(col_offset as u64);
-        col_offset += data.len();
+    for &size in &col_sizes {
+        offsets.push(col_offset);
+        col_offset += size;
     }
 
-    let packed_path = dir.join("matrix.pcmx");
-    let mut file = File::create(&packed_path)?;
-    file.write_all(&PCMX_MAGIC)?;
-    file.write_all(&[0u8; 4])?;
-    file.write_all(&(meta.n as u64).to_le_bytes())?;
-    file.write_all(&(n_cols as u64).to_le_bytes())?;
-    for &off in &offsets { file.write_all(&off.to_le_bytes())?; }
-    for data in &col_files { file.write_all(data)?; }
-    drop(file);
+    // Write to a temp file; rename atomically so a killed process never leaves
+    // a truncated matrix.pcmx that would be mistaken for a complete file.
+    let tmp_path = dir.join("matrix.pcmx.tmp");
+    let mut out = BufWriter::new(File::create(&tmp_path)?);
+    out.write_all(&PCMX_MAGIC)?;
+    out.write_all(&[0u8; 4])?;
+    out.write_all(&(meta.n as u64).to_le_bytes())?;
+    out.write_all(&(n_cols as u64).to_le_bytes())?;
+    for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
+    for c in 0..n_cols {
+        io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
+    }
+    out.flush()?;
+    drop(out);
+    fs::rename(&tmp_path, &packed_path)?;
 
     for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
     fs::remove_file(dir.join("meta.json"))?;
diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs
index 2c58aed..f6b0889 100644
--- a/src/obikindex/src/index.rs
+++ b/src/obikindex/src/index.rs
@@ -1,8 +1,6 @@
 use std::collections::BTreeMap;
 use std::fs;
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::{Arc, Mutex};
 
 use obikpartitionner::{KmerPartition, KmerSpectrum};
 use obilayeredmap;
@@ -152,31 +150,25 @@ impl KmerIndex {
         let with_counts = self.meta.config.with_counts;
         let evidence = self.meta.config.evidence.clone();
         let block_bits = self.meta.config.block_bits;
-        let total_kmers = AtomicUsize::new(0);
+        let mut total_kmers: usize = 0;
+        let pb = progress_bar("index", n as u64, "partitions");
 
-        let pb = Arc::new(Mutex::new(progress_bar("index", n as u64, "partitions")));
-
-        (0..n).into_par_iter().for_each(|i| {
-            match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits) {
-                Ok(0) => {}
-                Ok(n_kmers) => {
-                    total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
-                    let pb = pb.lock().unwrap();
+        let order: Vec<usize> = (0..n).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits),
+            |i, n_kmers, _| {
+                if n_kmers > 0 {
+                    total_kmers += n_kmers;
                     pb.inc(1);
                     pb.set_message(format!("{i}: {n_kmers} kmers"));
                 }
-                Err(e) => {
-                    eprintln!("error building layer for partition {i}: {e}");
-                    std::process::exit(1);
-                }
-            }
-        });
+            },
+        ).map_err(OKIError::Partition)?;
 
-        pb.lock().unwrap().finish_and_clear();
-        info!(
-            "done — {} total kmers indexed",
-            total_kmers.load(Ordering::Relaxed)
-        );
+        pb.finish_and_clear();
+        info!("done — {} total kmers indexed", total_kmers);
 
         if !keep_intermediate {
             for i in 0..n {
@@ -211,35 +203,27 @@ impl KmerIndex {
         use obilayeredmap::meta::PartitionMeta;
 
         let n = self.n_partitions();
-        let errors: Vec<_> = (0..n)
-            .into_par_iter()
-            .filter_map(|i| {
+        let order: Vec<usize> = (0..n).collect();
+        let pb = progress_bar("pack", n as u64, "partitions");
+        crate::numa::PartitionRunner::new().run(
+            &order,
+            |i| -> OKIResult<()> {
                 let index_dir = self.partition.part_dir(i).join("index");
-                if !index_dir.exists() { return None; }
-                let meta = match PartitionMeta::load(&index_dir) {
-                    Ok(m) => m,
-                    Err(e) => return Some(OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))),
-                };
+                if !index_dir.exists() { return Ok(()); }
+                let meta = PartitionMeta::load(&index_dir)
+                    .map_err(|e| OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())))?;
                 for l in 0..meta.n_layers {
                     let layer_dir = index_dir.join(format!("layer_{l}"));
                     let presence_dir = layer_dir.join("presence");
                     let counts_dir   = layer_dir.join("counts");
-                    if presence_dir.exists() {
-                        if let Err(e) = pack_bit_matrix(&presence_dir) {
-                            return Some(OKIError::Io(e));
-                        }
-                    }
-                    if counts_dir.exists() {
-                        if let Err(e) = pack_compact_int_matrix(&counts_dir) {
-                            return Some(OKIError::Io(e));
-                        }
-                    }
+                    if presence_dir.exists() { pack_bit_matrix(&presence_dir).map_err(OKIError::Io)?; }
+                    if counts_dir.exists()   { pack_compact_int_matrix(&counts_dir).map_err(OKIError::Io)?; }
                 }
-                None
-            })
-            .collect();
-
-        if let Some(e) = errors.into_iter().next() { return Err(e); }
+                Ok(())
+            },
+            |_, _, _| { pb.inc(1); },
+        )?;
+        pb.finish_and_clear();
         Ok(())
     }
 
diff --git a/src/obikindex/src/numa.rs b/src/obikindex/src/numa.rs
index dde62b7..4c12013 100644
--- a/src/obikindex/src/numa.rs
+++ b/src/obikindex/src/numa.rs
@@ -1,4 +1,4 @@
-// NUMA-aware Rayon thread pools via hwlocality.
+// NUMA-aware partition runner via hwlocality.
 //
 // Detects NUMA topology using hwloc (cross-platform: Linux, macOS, etc.) and
 // builds one Rayon ThreadPool per NUMA node with threads pinned to that node's
@@ -10,15 +10,14 @@
 //   - the system has only one NUMA node (UMA, Apple Silicon, single-socket)
 //   - any per-node pool fails to build
 
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;
 use std::time::{Duration, Instant};
 
-use crossbeam_channel::{RecvTimeoutError, unbounded};
+use crossbeam_channel::unbounded;
 use hwlocality::Topology;
 use hwlocality::cpu::binding::CpuBindingFlags;
 use hwlocality::cpu::cpuset::CpuSet;
 use hwlocality::object::types::ObjectType;
-use obisys::CpuSample;
 use tracing::debug;
 
 // ── Public interface ──────────────────────────────────────────────────────────
@@ -104,27 +103,6 @@ fn build_pool(cpus: &[usize]) -> Option<rayon::ThreadPool> {
         .ok()
 }
 
-// ── Adaptive spawn heuristic ──────────────────────────────────────────────────
-//
-// First worker: spawn if CPU efficiency is below SPAWN_THRESHOLD (machine is
-// under-utilised). Subsequent workers: spawn only if the last worker raised
-// efficiency by at least the expected marginal gain (1/n_workers), with a
-// minimum floor to avoid spurious spawns from efficiency fluctuations.
-
-const SPAWN_THRESHOLD: f64 = 0.95;
-const MIN_MARGINAL_GAIN: f64 = 0.03;
-const SPAWN_POLL: Duration = Duration::from_secs(20);
-
-fn should_spawn_worker(n_workers: usize, eff: f64, eff_at_last_spawn: f64) -> bool {
-    if n_workers == 1 {
-        eff < SPAWN_THRESHOLD
-    } else {
-        let gain = eff - eff_at_last_spawn;
-        let expected = 1.0 / n_workers as f64;
-        gain >= (expected * 0.25).max(MIN_MARGINAL_GAIN)
-    }
-}
-
 // ── PartitionRunner ───────────────────────────────────────────────────────────
 
 struct NodeConfig {
@@ -135,42 +113,38 @@ struct NodeConfig {
 
 /// Generic NUMA-aware runner for partition-level parallel work.
 ///
-/// Encapsulates worker spawning, NUMA pinning, adaptive activation, and result
-/// collection.  UMA systems are handled as the degenerate case of a single node
-/// with no pinning.
+/// Workers are distributed round-robin across NUMA nodes and pinned to their
+/// node's CPUs.  UMA systems are the degenerate case: one node, no pinning.
 ///
-/// # Model
+/// # Termination
 ///
-/// One controller thread per NUMA node (one total on UMA).  Each controller
-/// manages up to `max_workers` dormant workers that drain a shared work queue.
-/// Workers are activated one at a time; a new worker is added when global CPU
-/// efficiency justifies it.  On NUMA all workers are activated immediately
-/// (memory bandwidth, not CPU count, is the bottleneck).
+/// Termination is driven entirely by channel closure:
+///
+/// ```text
+/// drop(part_tx)    → part_rx drains → workers exit → drop their result_tx
+/// drop(result_tx)  → result_rx closes → controller loop exits
+/// ```
+///
+/// No explicit counter or sentinel needed.
 pub struct PartitionRunner {
-    nodes:   Vec<NodeConfig>,
-    n_cores: usize,
+    nodes: Vec<NodeConfig>,
 }
 
 impl PartitionRunner {
-    /// Detect topology and build.  Falls back to a single-node UMA runner on
-    /// macOS, single-socket machines, or hwloc failure.
-    /// Total number of pre-spawned worker slots across all nodes.
+    /// Total worker slots across all nodes.
     pub fn max_workers(&self) -> usize {
         self.nodes.iter().map(|n| n.max_workers).sum()
     }
 
+    /// Detect topology and build.  Falls back to a single-node UMA runner on
+    /// macOS, single-socket machines, or hwloc failure.
     pub fn new() -> Self {
-        let n_cores = std::thread::available_parallelism()
-            .map(|n| n.get())
-            .unwrap_or(1);
-
         match build() {
             Some(ns) => {
                 let wpn = ns.workers_per_node();
                 debug!(
                     "PartitionRunner: NUMA mode — {} node(s) × {} worker(s)/node",
-                    ns.pools.len(),
-                    wpn,
+                    ns.pools.len(), wpn,
                 );
                 let nodes = ns.pools
                     .into_iter()
@@ -181,21 +155,20 @@ impl PartitionRunner {
                         max_workers: wpn,
                     })
                     .collect();
-                Self { nodes, n_cores }
+                Self { nodes }
             }
             None => {
+                let n_cores = std::thread::available_parallelism()
+                    .map(|n| n.get())
+                    .unwrap_or(1);
                 let max_workers = (n_cores / 2).max(1);
-                debug!(
-                    "PartitionRunner: UMA mode — adaptive up to {} worker(s)",
-                    max_workers,
-                );
+                debug!("PartitionRunner: UMA mode — {} worker(s)", max_workers);
                 Self {
                     nodes: vec![NodeConfig {
                         pool:        None,
                         cpu_ids:     vec![],
                         max_workers,
                     }],
-                    n_cores,
                 }
             }
         }
@@ -203,19 +176,17 @@ impl PartitionRunner {
 
     /// Run `f(i)` for every index in `order`.
     ///
-    /// `on_done(i, result, elapsed)` is called under an internal mutex as each
-    /// partition completes — suitable for progress bars, logging, and result
-    /// aggregation.  No `Send` or `Sync` bound is required on the callback.
-    ///
-    /// The work queue is shared across all NUMA nodes: any idle worker takes
-    /// the next available partition regardless of node, ensuring load balance.
+    /// Workers are spawned upfront and distributed round-robin across NUMA
+    /// nodes.  `on_done(i, value, elapsed)` is called on the thread that
+    /// called `run`, once for each partition whose `f(i)` returned `Ok` —
+    /// suitable for progress bars and result aggregation.
     ///
     /// Returns the first error produced by `f`, if any.
     pub fn run<F, R, E, C>(
         &self,
-        order:   &[usize],
-        f:       F,
-        on_done: C,
+        order:      &[usize],
+        f:          F,
+        mut on_done: C,
     ) -> Result<(), E>
     where
         F: Fn(usize) -> Result<R, E> + Send + Sync,
@@ -223,131 +194,56 @@ impl PartitionRunner {
         E: Send,
         C: FnMut(usize, R, Duration) + Send,
     {
-        let f        = Arc::new(f);
-        let on_done  = Arc::new(Mutex::new(on_done));
-        let first_err: Arc<Mutex<Option<E>>> = Arc::new(Mutex::new(None));
-
-        // Shared work queue — pre-loaded in caller-supplied order.
+        // Pre-load the work queue, then drop the sender so workers' part_rx
+        // iterators terminate when the queue is drained.
         let (part_tx, part_rx) = unbounded::<usize>();
-        for &i in order {
-            part_tx.send(i).ok();
-        }
+        for &i in order { part_tx.send(i).ok(); }
         drop(part_tx);
 
-        let n_cores = self.n_cores;
+        let (result_tx, result_rx) = unbounded::<(usize, Result<R, E>, Duration)>();
+        let n_nodes = self.nodes.len();
+        let f = &f; // shared borrow; F: Sync so concurrent calls are safe
+
+        let mut first_err: Option<E> = None;
 
         std::thread::scope(|s| {
-            for node in &self.nodes {
-                let f         = Arc::clone(&f);
-                let on_done   = Arc::clone(&on_done);
-                let first_err = Arc::clone(&first_err);
-                let part_rx   = part_rx.clone();
+            // Spawn all workers upfront, round-robin across NUMA nodes.
+            for w in 0..self.max_workers() {
+                let node    = &self.nodes[w % n_nodes];
+                let prx     = part_rx.clone();
+                let rtx     = result_tx.clone();
+                let pool    = node.pool.clone();
+                let cpu_ids = &node.cpu_ids;
 
                 s.spawn(move || {
-                    // Per-node result and activation channels.
-                    let (result_tx, result_rx) =
-                        unbounded::<(usize, Result<R, E>, Duration)>();
-                    let (activate_tx, activate_rx) = unbounded::<()>();
-
-                    std::thread::scope(|ws| {
-                        // Pre-spawn workers (all dormant until activated).
-                        for _ in 0..node.max_workers {
-                            let prx = part_rx.clone();
-                            let rtx = result_tx.clone();
-                            let arx = activate_rx.clone();
-                            let f   = Arc::clone(&f);
-                            let pool    = node.pool.clone();
-                            let cpu_ids = node.cpu_ids.clone();
-
-                            ws.spawn(move || {
-                                if !cpu_ids.is_empty() {
-                                    pin_current_thread(&cpu_ids);
-                                }
-                                if arx.recv().is_err() {
-                                    return; // never activated — exit cleanly
-                                }
-                                for i in &prx {
-                                    let t = Instant::now();
-                                    let r = match &pool {
-                                        Some(p) => p.install(|| f(i)),
-                                        None    => f(i),
-                                    };
-                                    rtx.send((i, r, t.elapsed())).ok();
-                                }
-                            });
-                        }
-                        // Drop the controller's copy: result_rx disconnects
-                        // once all worker copies are also dropped (workers done).
-                        drop(result_tx);
-
-                        // In NUMA mode activate all workers immediately;
-                        // in UMA mode activate one and grow adaptively.
-                        let numa_mode = node.pool.is_some();
-                        let initial   = if numa_mode { node.max_workers } else { 1 };
-                        for _ in 0..initial {
-                            activate_tx.send(()).ok();
-                        }
-                        let mut active_workers      = initial;
-                        let mut cpu_sample          = CpuSample::now();
-                        let mut eff_at_last_spawn   = 0.0f64;
-
-                        // Controller loop.
-                        loop {
-                            match result_rx.recv_timeout(SPAWN_POLL) {
-                                Ok((i, r, dur)) => {
-                                    match r {
-                                        Ok(v) => {
-                                            on_done.lock().unwrap()(i, v, dur);
-                                        }
-                                        Err(e) => {
-                                            let mut g = first_err.lock().unwrap();
-                                            if g.is_none() { *g = Some(e); }
-                                        }
-                                    }
-                                    if !numa_mode && active_workers < node.max_workers {
-                                        let eff = cpu_sample.cpu_efficiency(n_cores);
-                                        if should_spawn_worker(active_workers, eff, eff_at_last_spawn) {
-                                            debug!(
-                                                "activated worker {} — efficiency {:.0}%",
-                                                active_workers + 1,
-                                                eff * 100.0,
-                                            );
-                                            activate_tx.send(()).ok();
-                                            active_workers     += 1;
-                                            eff_at_last_spawn   = eff;
-                                            cpu_sample          = CpuSample::now();
-                                        }
-                                    }
-                                }
-                                Err(RecvTimeoutError::Timeout) => {
-                                    if !numa_mode && active_workers < node.max_workers {
-                                        let eff = cpu_sample.cpu_efficiency(n_cores);
-                                        if should_spawn_worker(active_workers, eff, eff_at_last_spawn) {
-                                            debug!(
-                                                "activated worker {} (poll) — efficiency {:.0}%",
-                                                active_workers + 1,
-                                                eff * 100.0,
-                                            );
-                                            activate_tx.send(()).ok();
-                                            active_workers     += 1;
-                                            eff_at_last_spawn   = eff;
-                                            cpu_sample          = CpuSample::now();
-                                        }
-                                    }
-                                }
-                                Err(RecvTimeoutError::Disconnected) => break,
-                            }
-                        }
-                        // Signal any dormant workers that were never activated
-                        // to exit (UMA mode where max_workers was never reached).
-                        drop(activate_tx);
-                    }); // ws: waits for all workers of this node
+                    if !cpu_ids.is_empty() { pin_current_thread(cpu_ids); }
+                    for i in &prx {
+                        let t = Instant::now();
+                        let r = match &pool {
+                            Some(p) => p.install(|| f(i)),
+                            None    => f(i),
+                        };
+                        rtx.send((i, r, t.elapsed())).ok();
+                    }
                 });
             }
-        }); // s: waits for all node controllers
 
-        let mut g = first_err.lock().unwrap();
-        match g.take() {
+            // Drop the controller's sender: result_rx closes once all worker
+            // rtx clones are dropped (i.e. all workers have exited).
+            drop(result_tx);
+
+            // Drain results concurrently with workers.  The for loop exits
+            // when result_rx is disconnected — at that point all workers are
+            // done and the scope join below is instantaneous.
+            for (i, r, dur) in &result_rx {
+                match r {
+                    Ok(v)  => on_done(i, v, dur),
+                    Err(e) => { if first_err.is_none() { first_err = Some(e); } }
+                }
+            }
+        });
+
+        match first_err {
             Some(e) => Err(e),
             None    => Ok(()),
         }
diff --git a/src/obikindex/src/rebuild.rs b/src/obikindex/src/rebuild.rs
index 6948209..b1a8b5c 100644
--- a/src/obikindex/src/rebuild.rs
+++ b/src/obikindex/src/rebuild.rs
@@ -4,7 +4,6 @@ use std::path::Path;
 
 use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
 use obisys::{Reporter, Stage, progress_bar};
-use rayon::prelude::*;
 use tracing::info;
 
 use crate::error::{OKIError, OKIResult};
@@ -83,23 +82,16 @@ impl KmerIndex {
         let src_partition = &src.partition;
         let block_bits = meta.config.block_bits;
 
-        let errors: Vec<obiskio::SKError> = (0..n_partitions)
-            .into_par_iter()
-            .filter_map(|i| {
-                let result = dst_partition
-                    .rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits)
-                    .err();
-                pb.inc(1);
-                result
-            })
-            .collect();
+        let order: Vec<usize> = (0..n_partitions).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| dst_partition.rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits),
+            |_, _, _| { pb.inc(1); },
+        ).map_err(OKIError::Partition)?;
 
         pb.finish_and_clear();
 
-        if let Some(e) = errors.into_iter().next() {
-            return Err(OKIError::Partition(e));
-        }
-
         rep.push(t.stop());
 
         // Write SENTINEL_INDEXED — output is ready to use.
diff --git a/src/obikindex/src/reindex.rs b/src/obikindex/src/reindex.rs
index 878d51a..db724b9 100644
--- a/src/obikindex/src/reindex.rs
+++ b/src/obikindex/src/reindex.rs
@@ -3,7 +3,6 @@ use std::path::Path;
 use obilayeredmap::{IndexMode, layer::Layer};
 use obilayeredmap::meta::PartitionMeta;
 use obisys::{Reporter, Stage, progress_bar};
-use rayon::prelude::*;
 use tracing::info;
 
 use crate::error::{OKIError, OKIResult};
@@ -45,25 +44,17 @@ impl KmerIndex {
         let t = Stage::start("reindex");
         let pb = progress_bar("reindex", n as u64, "partitions");
 
-        let errors: Vec<String> = (0..n)
-            .into_par_iter()
-            .filter_map(|i| {
-                let res = reindex_partition(
-                    &self.partition.part_dir(i).join("index"),
-                    &target,
-                    block_bits,
-                );
-                pb.inc(1);
-                res.err().map(|e| format!("partition {i}: {e}"))
-            })
-            .collect();
+        let order: Vec<usize> = (0..n).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| reindex_partition(&self.partition.part_dir(i).join("index"), &target, block_bits)
+                .map_err(|e| OKIError::InvalidInput(format!("partition {i}: {e}"))),
+            |_, _, _| { pb.inc(1); },
+        )?;
 
         pb.finish_and_clear();
 
-        if let Some(e) = errors.into_iter().next() {
-            return Err(OKIError::InvalidInput(e));
-        }
-
         self.meta.config.evidence = target;
         if matches!(self.meta.config.evidence, IndexMode::Exact) {
             self.meta.config.block_bits = block_bits;
diff --git a/src/obikindex/src/select.rs b/src/obikindex/src/select.rs
index 653c8ef..1db57bd 100644
--- a/src/obikindex/src/select.rs
+++ b/src/obikindex/src/select.rs
@@ -4,7 +4,6 @@ use std::path::Path;
 
 use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
 use obisys::{Stage, progress_bar};
-use rayon::prelude::*;
 use tracing::info;
 
 use crate::error::{OKIError, OKIResult};
@@ -72,25 +71,16 @@ impl KmerIndex {
         let pb  = progress_bar("select", n_partitions as u64, "partitions");
         let src_partition = &src.partition;
 
-        let errors: Vec<obiskio::SKError> = (0..n_partitions)
-            .into_par_iter()
-            .filter_map(|i| {
-                let result = dst_partition.select_partition(
-                    src_partition, i, specs,
-                    n_src_genomes, threshold, output_presence,
-                    false,
-                );
-                pb.inc(1);
-                result.err()
-            })
-            .collect();
+        let order: Vec<usize> = (0..n_partitions).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| dst_partition.select_partition(src_partition, i, specs, n_src_genomes, threshold, output_presence, false),
+            |_, _, _| { pb.inc(1); },
+        ).map_err(OKIError::Partition)?;
 
         pb.finish_and_clear();
 
-        if let Some(e) = errors.into_iter().next() {
-            return Err(OKIError::Partition(e));
-        }
-
         let _ = t.stop();
 
         fs::File::create(output.join(SENTINEL_INDEXED))?;
@@ -132,25 +122,17 @@ impl KmerIndex {
         let t  = Stage::start("select");
         let pb = progress_bar("select", n_partitions as u64, "partitions");
 
-        let errors: Vec<obiskio::SKError> = (0..n_partitions)
-            .into_par_iter()
-            .filter_map(|i| {
-                let result = self.partition.select_partition(
-                    &src_partition, i, specs,
-                    n_src_genomes, threshold, output_presence,
-                    true,
-                );
-                pb.inc(1);
-                result.err()
-            })
-            .collect();
+        let partition = &self.partition;
+        let order: Vec<usize> = (0..n_partitions).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| partition.select_partition(&src_partition, i, specs, n_src_genomes, threshold, output_presence, true),
+            |_, _, _| { pb.inc(1); },
+        ).map_err(OKIError::Partition)?;
 
         pb.finish_and_clear();
 
-        if let Some(e) = errors.into_iter().next() {
-            return Err(OKIError::Partition(e));
-        }
-
         let _ = t.stop();
 
         // Update index.meta with new genome list and with_counts flag.

From cde6457eea212706c1552b31844012aefdda12c9 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Tue, 16 Jun 2026 23:18:10 +0200
Subject: [PATCH 02/24] feat: add memory vectors, slice traits, and column
 extraction methods

Introduce `MemoryBitVec` and `MemoryIntVec` for efficient in-memory storage with hybrid compression and overflow handling. Implement `BitSlice`, `BitSliceMut`, `IntSlice`, and `IntSliceMut` traits across persistent and memory-backed types to enable generic slice operations and bitwise/arithmetic overloads. Add `col_persist` and `col_as_memory` methods to `BitMatrix` and `IntMatrix` for efficient column extraction. Align with the new single-pass rebuild architecture by supporting fast kmer filtering and matrix rebuilding. Includes comprehensive tests and profiling instrumentation for the packing phase.
---
 .DS_Store                                | Bin 10244 -> 10244 bytes
 docmd/architecture/rebuild_filter.md     | 105 +++++++
 src/obicompactvec/src/bitmatrix.rs       |  44 +++
 src/obicompactvec/src/bitvec.rs          |  74 ++---
 src/obicompactvec/src/builder.rs         |  76 ++---
 src/obicompactvec/src/intmatrix.rs       |  42 +++
 src/obicompactvec/src/lib.rs             |   6 +-
 src/obicompactvec/src/memoryintvec.rs    | 120 ++++++++
 src/obicompactvec/src/memoryvec.rs       | 138 +++++++++
 src/obicompactvec/src/reader.rs          |  11 +
 src/obicompactvec/src/tests/bitvec.rs    |   1 +
 src/obicompactvec/src/tests/memoryvec.rs | 359 +++++++++++++++++++++++
 src/obicompactvec/src/tests/mod.rs       |   3 +
 src/obicompactvec/src/traits.rs          | 209 ++++++++++++-
 src/obikindex/src/rebuild.rs             |   2 +
 15 files changed, 1120 insertions(+), 70 deletions(-)
 create mode 100644 docmd/architecture/rebuild_filter.md
 create mode 100644 src/obicompactvec/src/memoryintvec.rs
 create mode 100644 src/obicompactvec/src/memoryvec.rs
 create mode 100644 src/obicompactvec/src/tests/memoryvec.rs

diff --git a/.DS_Store b/.DS_Store
index 2b2a356960abc32c0146d68a62b511de3edb9e1e..5f0cbf7b1dbcb941bc5c3c51ebdd4b614453d7b7 100644
GIT binary patch
delta 1478
zcmeH{Uu+ar6voeYTj)%#uy?mx7H_v$s8T2^wpji~+R_G#Eq2#-fwtDdv@_KS?QFZV
zZLJC$3{N7&WD+$Pg8HBak|s@CKv6?tjFJ!^uqJ9^G~tQFM0qi>Cc@pF6%${0;>~%O
zd(U_7opVlpU*>q?c;byX_)Ow*0Ni^(oYppN*&3_dR9Ej!3j>=y5e(~mnN056f1ta^
zTU}P_T@DXz>$V3|PM7_Xbx?#RY-=(;nz2)kyWdK?YSs1W<&o&>M`|9c-@dc?$+nJ#
zfCz?4!<>7!SFXzj5!3a&wDLhIyT=FYv2i;yC8{(nuU4#*yw5Z;*3&7|9<nV{%xnJn
zHLIk2HkWBhTk35VF08GVJn5=#gi$!SA;x*~u<e+_EtIOQ<8t4yF<=d6g{k|6HgDnb
zz=YqkXfKF6!d!L_rbb3AC*u~hhDIs-b9MI`ZYDlzIjX0x3&mShuGf9RNL%qq$4VE}
zR;3;kfxMo(wA-{ZDL+{r47IB{dUD^}Wt#aZLLwND3EAoONtqoxM7gJ`sV2Wt)fMMT
zewTEN5v!`H)|bQna`z3=sDf&#ncAqIMrn#p(3^CY=4hTS(3kWL{Xjp{O}a&Y(4VwO
zf1w!55P(1s64h9b8f-v4vQ5~9X0#)LPIMuO=P&>h7KV_*I41ETrtm6W!|OPSQ+NyS
z;yt{N^Y{dx;xl}XuW%jTVgWzlH~fxUSY*X)30uknOt26uWoubAt7Y3+GizlDzkP<n
z7`;Tx?@pvP%uuG3-jQ&na$&S}&r@yfd;f9aL-%l@*HLCGQ(r23w1+>eDe7X$f^uAW
zw3@Hi6mPNA4`b1dnzEQ+`H<H|AJ>(UM3EoHqFZ%Mzz=rtCv;^gQS65m+-q2|BHMId
zh+U=Y^gaDTzbayXqX-%zsKh#K#5OdbQ6Xx<Zge1?#50Oij#EDl!N4GfF``(FB8`{v
z3XUR+V~W=aMJ&hbZJfqCID-%HAwI%6%wi4~a1kFXf?wbozQd2WF&l~Sl0XT$G)QT(
sm7g-?dWR#&O8)&nG?xD-Khc$ThEvY-3LIv}LYOV7TrB<{(|yK-JLJPqP5=M^

delta 1228
zcmeHGNlX+$6s=c+bZg1f$S^c89S4P=W*8P1K+w3MfWV+g1VnJK+b}9KI5Tm@V2obe
zO&v7gdeTVLAa2ov2RHO89!!jhMvZImih8hSIT<hB?L+_m@BiPw{`;@?*YB@C?twa3
zf6oF{Fy5!kDJ^n23+EP!tK5qQ!L2rxP14siHMg)<am|`44rjWB-&!L2wfwZ?v?<xs
z^NNbg7gki&t{ky&>*N$0W8#fAJ=qGEs*YH+^o&f0u8XglCfw=Cy^$Pemd;uYZ`iC2
z7`}}@)8IE1nJObWS-QS95?K;7<=aq{C_H1P&Q^zfyG-tlC%W?)TfN!mH+U%0MV8K*
z&Gg1C-VSpMH`G{JNh#CUY%_x)U%<~-M8RI`<oPzHH*E-Xb(#KfD4w}!v97m8`qp?u
zVNbW|muDJ|H(w@WEul_t(DZEgo546*Ezx#viQ<u>3d0u;M1ykTIypvj<adh<BRWKe
zmTu7-^hQNEQ?zk}5K)%m=vAjJERf%kM<F^XB1=gPSx<swH#tI1kpXgpJR*<D3-X$L
zBBNvsz<8*bfQhgn1KDt44vJBNQY?lCE3pa<XhkP{*n|*vU>A0y2M2Hvhj9c)aS~^6
z4(D+JmvJ5D4cx>n+{ZAU;u)UfE#BchK426-@DsmiB2A+jO{Y_-i{?=`Ev6-O9xbDF
z)I*!7mu{i~+8sMTMa+_QaT_;ur&LvjXtz0~j2z1=tEySeA2M=_tkh00aWFNnnPn?-
zdpvO@Dw|w3Ly>#q<72MN;Z~$lIE}ij&@oGudf?+?uFFxZDjYFq9xGR+GI&DF)tTUS
z`@i!3lDs7!$#?Qg@}7(oXvjr@WP1TBQ78G1u-%9@w8M)I80dl@-PkIb@4+7QVlVdL
zkYxTSj^PCQ(2vtNi;Kv)B<UZ(70LZ=+`(Nuz(YL7llT^1$sK&gm!UEH#H7LSdkiHx
Ve<X-EHmAID|GD}rS7OZm?KdS`9-sgK

diff --git a/docmd/architecture/rebuild_filter.md b/docmd/architecture/rebuild_filter.md
new file mode 100644
index 0000000..443aa75
--- /dev/null
+++ b/docmd/architecture/rebuild_filter.md
@@ -0,0 +1,105 @@
+# Rebuild / filter — column-first design
+
+## Problem with the current two-pass design
+
+`rebuild_partition` currently makes **two full passes** over source data:
+
+**Pass 1** — read unitigs → MPHF lookup (source) → read row (108 values) → apply filter → push kmer into `GraphDeBruijn`, **discard row**.
+
+**Pass 2** — read unitigs again → MPHF lookup again → read row again → for each passing kmer, look up slot in new MPHF → fill column builders.
+
+Both passes do random access into the source matrix: for each kmer, the MPHF returns a slot, then we read 108 values scattered across 108 column positions. This is cache-hostile even with a packed matrix (`.pbmx`), because the matrix is column-major: consecutive row reads jump across the file.
+
+## Memory budget
+
+The `keep` bitvector costs **1 bit per slot**. With 256 partitions and realistic kmer counts, each partition holds at most a few tens of millions of slots → a few MB per bitvector. Even in the absolute worst case (800 M slots), it costs only about 100 MB (800 M bits ÷ 8). This is negligible.
+
+The `slot_map` option (Option B, 8–16 bytes per slot) is heavier but still bounded: at 15 M slots and 8 bytes, that is 120 MB per partition, acceptable for a single worker.
+
+## Key observation
+
+**The filter operates on column values, not on kmers.** A filter like `--max-outgroup-count 0` only needs to know, for each slot, whether any outgroup column is non-zero. It does not need to know which kmer occupies that slot.
+
+This means filtering can be done as a **sequential column scan** that produces a `keep: BitVec[n_slots]` — no MPHF lookups, no kmer knowledge, perfectly cache-friendly.
+
+## Proposed single-scan design
+
+### Step 1 — column scan → `keep` bitvector
+
+```
+for each column c in source matrix:
+    read column c sequentially (one mmap range)
+    update keep[slot] according to filter contribution of column c
+```
+
+For `GroupQuorumFilter` with ingroup/outgroup:
+- ingroup columns: count presence per slot → `ingroup_count[slot]`
+- outgroup columns: `keep[slot] &= (value[slot] == 0)` (early-exit possible)
+
+Result: `keep: BitVec` of size `n_slots`, computed with purely sequential IO.
+
+### Step 2 — unitig scan → kept kmers + new MPHF
+
+```
+for each kmer in unitig files:
+    old_slot = old_MPHF(kmer)
+    if keep[old_slot]:
+        push kmer into new GraphDeBruijn
+        record (old_slot, kmer)   ← or just old_slot in order
+```
+
+Build new MPHF from `GraphDeBruijn` via `materialize_layer`.
+
+### Step 3 — fill new matrix
+
+Two sub-options:
+
+**Option A — from recorded (old_slot, kmer) pairs:**
+
+```
+for each (old_slot, kmer) in recorded list:
+    new_slot = new_MPHF(kmer)
+    for each column c:
+        new_matrix[new_slot, c] = old_matrix[old_slot, c]
+```
+
+Memory cost: `n_kept × (8 + 8)` bytes for `(old_slot: usize, kmer: CanonicalKmer)`.
+For species-specific filters, `n_kept` is small. For unfiltered rebuild, `n_kept = n_slots`.
+
+**Option B — column-by-column copy using old→new slot mapping:**
+
+Precompute `slot_map: Vec<Option<usize>>` of size `n_slots`:
+- For each kmer in unitig file: `slot_map[old_MPHF(kmer)] = Some(new_MPHF(kmer))`
+
+Then for each source column:
+```
+read source column sequentially
+for each slot where slot_map[slot] = Some(new_slot):
+    write value to new column at new_slot
+```
+
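+Memory cost: `n_slots × size_of::<Option<usize>>()` for the slot map — 16 bytes per source slot on 64-bit targets, or 8 bytes per slot if a sentinel such as `usize::MAX` replaces the `Option`.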
+IO pattern: sequential read of each source column → random write into new column builders.
+
+Option B avoids storing kmer values and works uniformly regardless of filter selectivity.
+
+## Comparison
+
+| | Current | Proposed |
+|---|---|---|
+| Disk reads | 2× unitigs + 2× random matrix | 1× columns (sequential) + 1× unitigs |
+| MPHF lookups (source) | 2× N_kmers | 1× N_kmers in step 2 (every unitig kmer is looked up before `keep` is tested); 0 during the step 1 column scan |
+| Cache behavior | poor (random row access) | good (sequential column scan) |
+| Extra memory | none | slot_map (option B) or (old_slot, kmer) list (option A) |
+
+## Files to modify
+
+- `src/obikpartitionner/src/rebuild_layer.rs` — `rebuild_partition` and `iter_src_layers`
+- Possibly `src/obicompactvec/` — add column iterator API if not already present
+- `src/obilayeredmap/` — check if per-column sequential access is exposed on `SrcLayerData`
+
+## Open questions
+
+- Does `SrcLayerData` expose per-column sequential iteration, or only `lookup(kmer, n_genomes)` random access?
+- For option B: are new column builders writable in random-slot order (i.e. `set_val(slot, value)` without sequential constraint)?
+- For `GroupQuorumFilter` specifically: can the filter be decomposed into independent per-column contributions, or does it need the full row?
diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index ca1b393..2dbc266 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -7,6 +7,8 @@ use ndarray::{Array1, Array2};
 use rayon::prelude::*;
 
 use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
+use crate::memoryvec::MemoryBitVec;
+use crate::traits::BitSliceMut;
 use crate::layer_meta::LayerMeta;
 use crate::meta::MatrixMeta;
 
@@ -154,6 +156,28 @@ impl PackedBitMatrix {
         &self.mmap[start..start + len]
     }
 
+    pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
+        PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
+    }
+
+    pub(crate) fn col_as_memory(&self, c: usize) -> MemoryBitVec {
+        let bytes = self.col_bytes(c);
+        let n = self.n_rows;
+        let n_words = n.div_ceil(64);
+        let mut words = vec![0u64; n_words];
+        let full = bytes.len() / 8;
+        for (i, chunk) in bytes[..full * 8].chunks_exact(8).enumerate() {
+            words[i] = u64::from_le_bytes(chunk.try_into().unwrap());
+        }
+        let rem = bytes.len() % 8;
+        if rem > 0 {
+            let mut last = [0u8; 8];
+            last[..rem].copy_from_slice(&bytes[full * 8..]);
+            words[full] = u64::from_le_bytes(last);
+        }
+        MemoryBitVec::from_words(words, n)
+    }
+
     fn count_ones_col(&self, c: usize) -> u64 {
         let bytes = self.col_bytes(c);
         let full = self.n_rows / 8;
@@ -343,6 +367,26 @@ impl PersistentBitMatrix {
         }
     }
 
+    pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
+        match self {
+            Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
+            Self::Packed(m)   => m.col_persist(c, path),
+            Self::Implicit { n_rows, .. } => {
+                let mut b = PersistentBitVecBuilder::new(*n_rows, path)?;
+                b.not();
+                Ok(b)
+            }
+        }
+    }
+
+    pub fn col_as_memory(&self, c: usize) -> MemoryBitVec {
+        match self {
+            Self::Columnar(m) => MemoryBitVec::from(m.col(c)),
+            Self::Packed(m)   => m.col_as_memory(c),
+            Self::Implicit { n_rows, .. } => MemoryBitVec::ones(*n_rows),
+        }
+    }
+
     pub fn row(&self, slot: usize) -> Box<[bool]> {
         match self {
             Self::Columnar(m)             => m.row(slot),
diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index cfc26aa..dc95512 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -188,6 +188,21 @@ impl PersistentBitVecBuilder {
         Ok(Self { mmap, n })
     }
 
+    /// Create a PBIV file from raw packed bit-bytes, zero-padding to the next word boundary.
+    /// `bytes` is `n.div_ceil(8)` bytes; `n` is the number of bits.
+    pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
+        let file_size = HEADER_SIZE + n_bytes_for_words(n);
+        let file = OpenOptions::new()
+            .read(true).write(true).create(true).truncate(true)
+            .open(path)?;
+        file.set_len(file_size as u64)?;
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        mmap[0..4].copy_from_slice(&MAGIC);
+        mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes());
+        mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()].copy_from_slice(bytes);
+        Ok(Self { mmap, n })
+    }
+
     pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
         fs::copy(source.path(), path)?;
         let file = OpenOptions::new().read(true).write(true).open(path)?;
@@ -217,6 +232,12 @@ impl PersistentBitVecBuilder {
         }
     }
 
+    fn data_words(&self) -> &[u64] {
+        let nw = n_words(self.n);
+        let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
+        unsafe { std::slice::from_raw_parts(ptr, nw) }
+    }
+
     // SAFETY: same alignment argument as PersistentBitVec::data_words.
     fn data_words_mut(&mut self) -> &mut [u64] {
         let nw = n_words(self.n);
@@ -224,41 +245,6 @@ impl PersistentBitVecBuilder {
         unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
     }
 
-    pub fn and(&mut self, other: &PersistentBitVec) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
-            *sw &= ow;
-        }
-    }
-
-    pub fn or(&mut self, other: &PersistentBitVec) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
-            *sw |= ow;
-        }
-    }
-
-    pub fn xor(&mut self, other: &PersistentBitVec) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
-            *sw ^= ow;
-        }
-    }
-
-    pub fn not(&mut self) {
-        let rem = self.n % 64;
-        let words = self.data_words_mut();
-        for w in words.iter_mut() {
-            *w ^= u64::MAX;
-        }
-        // Zero padding bits in the last word so count_ones / jaccard remain correct.
-        if rem != 0 {
-            if let Some(last) = words.last_mut() {
-                *last &= (1u64 << rem) - 1;
-            }
-        }
-    }
-
     /// Convert a count vector to a bit vector: bit set iff count >= threshold.
     /// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
     pub fn build_from_counts(
@@ -304,3 +290,21 @@ impl PersistentBitVecBuilder {
         self.mmap.flush()
     }
 }
+
+// ── BitSlice / BitSliceMut impls ──────────────────────────────────────────────
+
+use crate::traits::{BitSlice, BitSliceMut};
+
+impl BitSlice for PersistentBitVec {
+    fn len(&self) -> usize { self.n }
+    fn words(&self) -> &[u64] { self.data_words() }
+}
+
+impl BitSlice for PersistentBitVecBuilder {
+    fn len(&self) -> usize { self.n }
+    fn words(&self) -> &[u64] { self.data_words() }
+}
+
+impl BitSliceMut for PersistentBitVecBuilder {
+    fn words_mut(&mut self) -> &mut [u64] { self.data_words_mut() }
+}
diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs
index 32d711f..f2b5326 100644
--- a/src/obicompactvec/src/builder.rs
+++ b/src/obicompactvec/src/builder.rs
@@ -34,6 +34,36 @@ impl PersistentCompactIntVecBuilder {
         })
     }
 
+    /// Create from a [`MemoryIntVec`], copying primary bytes directly into the mmap.
+    /// O(n) memcpy + O(n_overflow) HashMap clone — no per-slot `set` overhead.
+    pub fn from_memory(src: &crate::memoryintvec::MemoryIntVec, path: &Path) -> io::Result<Self> {
+        let n = src.len();
+        let file = OpenOptions::new()
+            .read(true).write(true).create(true).truncate(true)
+            .open(path)?;
+        file.set_len((HEADER_SIZE + n) as u64)?;
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(src.primary_bytes());
+        Ok(Self {
+            path: path.to_path_buf(),
+            mmap,
+            n,
+            overflow: src.overflow_map().clone(),
+        })
+    }
+
+    /// Create from raw primary bytes + an already-built overflow map (no per-slot overhead).
+    pub(crate) fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
+        let n = primary.len();
+        let file = OpenOptions::new()
+            .read(true).write(true).create(true).truncate(true)
+            .open(path)?;
+        file.set_len((HEADER_SIZE + n) as u64)?;
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(primary);
+        Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
+    }
+
     /// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
     /// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow).
     pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
@@ -91,39 +121,6 @@ impl PersistentCompactIntVecBuilder {
         self.n == 0
     }
 
-    pub fn min(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            if other_val < self.get(slot) {
-                self.set(slot, other_val);
-            }
-        }
-    }
-
-    pub fn max(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            if other_val > self.get(slot) {
-                self.set(slot, other_val);
-            }
-        }
-    }
-
-    pub fn add(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            let cur = self.get(slot);
-            self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add"));
-        }
-    }
-
-    pub fn diff(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            self.set(slot, self.get(slot).saturating_sub(other_val));
-        }
-    }
-
     /// Flush the primary mmap, then write sorted overflow data + index and fix the header.
     pub fn close(self) -> io::Result<()> {
         self.mmap.flush()?;
@@ -141,3 +138,16 @@ impl PersistentCompactIntVecBuilder {
         finalize_pciv(&path, n, &entries)
     }
 }
+
+// ── IntSlice / IntSliceMut impls ──────────────────────────────────────────────
+
+use crate::traits::{IntSlice, IntSliceMut};
+
+impl IntSlice for PersistentCompactIntVecBuilder {
+    fn len(&self) -> usize { self.n }
+    fn get(&self, slot: usize) -> u32 { self.get(slot) }
+}
+
+impl IntSliceMut for PersistentCompactIntVecBuilder {
+    fn set(&mut self, slot: usize, value: u32) { self.set(slot, value); }
+}
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index b563335..91c3b1b 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -1,4 +1,5 @@
 use std::cmp::Ordering;
+use std::collections::HashMap;
 use std::fs::{self, File};
 use std::io::{self, BufWriter, Write as _};
 use std::path::{Path, PathBuf};
@@ -8,6 +9,7 @@ use ndarray::{Array1, Array2};
 use rayon::prelude::*;
 
 use crate::builder::PersistentCompactIntVecBuilder;
+use crate::memoryintvec::MemoryIntVec;
 use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
@@ -194,6 +196,32 @@ impl PackedCompactIntMatrix {
         Ok(Self { mmap, n_rows, n_cols, columns })
     }
 
+    pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
+        let ci = &self.columns[c];
+        let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
+        let mut overflow = HashMap::with_capacity(ci.n_overflow);
+        for i in 0..ci.n_overflow {
+            let off   = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
+            let slot  = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
+            let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
+            overflow.insert(slot, value);
+        }
+        PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path)
+    }
+
+    pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
+        let ci = &self.columns[c];
+        let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
+        let mut overflow = HashMap::with_capacity(ci.n_overflow);
+        for i in 0..ci.n_overflow {
+            let off   = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
+            let slot  = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
+            let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
+            overflow.insert(slot, value);
+        }
+        MemoryIntVec::from_primary_and_overflow(primary, overflow)
+    }
+
     #[inline]
     pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
         let ci = &self.columns[col];
@@ -442,6 +470,20 @@ impl PersistentCompactIntMatrix {
         }
     }
 
+    pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
+        match self {
+            Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path),
+            Self::Packed(m)   => m.col_persist(c, path),
+        }
+    }
+
+    pub fn col_as_memory(&self, c: usize) -> MemoryIntVec {
+        match self {
+            Self::Columnar(m) => MemoryIntVec::from(m.col(c)),
+            Self::Packed(m)   => m.col_as_memory(c),
+        }
+    }
+
     pub fn row(&self, slot: usize) -> Box<[u32]> {
         match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
     }
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index 8a1e5bb..b3c2ff4 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -4,6 +4,8 @@ mod builder;
 mod format;
 mod intmatrix;
 mod layer_meta;
+mod memoryintvec;
+mod memoryvec;
 mod meta;
 mod reader;
 pub mod traits;
@@ -13,8 +15,10 @@ pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_ma
 pub use builder::PersistentCompactIntVecBuilder;
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
+pub use memoryintvec::MemoryIntVec;
+pub use memoryvec::MemoryBitVec;
 pub use reader::PersistentCompactIntVec;
-pub use traits::{BitPartials, ColumnWeights, CountPartials};
+pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
 
 #[cfg(test)]
 #[path = "tests/mod.rs"]
diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs
new file mode 100644
index 0000000..486a0f1
--- /dev/null
+++ b/src/obicompactvec/src/memoryintvec.rs
@@ -0,0 +1,120 @@
+use std::collections::HashMap;
+use std::io;
+use std::ops::{Add, AddAssign, Sub, SubAssign};
+use std::path::Path;
+
+use crate::builder::PersistentCompactIntVecBuilder;
+use crate::traits::{IntSlice, IntSliceMut};
+
+// ── MemoryIntVec ──────────────────────────────────────────────────────────────
+
+#[derive(Clone)]
+pub struct MemoryIntVec {
+    primary:  Vec<u8>,
+    overflow: HashMap<usize, u32>,
+    n:        usize,
+}
+
+impl MemoryIntVec {
+    pub fn new(n: usize) -> Self {
+        Self { primary: vec![0u8; n], overflow: HashMap::new(), n }
+    }
+
+    pub fn len(&self) -> usize { self.n }
+    pub fn is_empty(&self) -> bool { self.n == 0 }
+
+    /// Construct directly from a pre-built primary array (no overflow — all values < 255).
+    pub(crate) fn from_primary(primary: Vec<u8>) -> Self {
+        let n = primary.len();
+        Self { primary, overflow: HashMap::new(), n }
+    }
+
+    pub(crate) fn from_primary_and_overflow(primary: Vec<u8>, overflow: HashMap<usize, u32>) -> Self {
+        let n = primary.len();
+        Self { primary, overflow, n }
+    }
+
+    pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary }
+    pub(crate) fn overflow_map(&self) -> &HashMap<usize, u32> { &self.overflow }
+
+    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
+        (0..self.n).map(move |slot| self.get(slot))
+    }
+
+    /// Write to disk and return a writable builder at `path`.
+    pub fn persist(&self, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
+        PersistentCompactIntVecBuilder::from_memory(self, path)
+    }
+}
+
+// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
+
+impl IntSlice for MemoryIntVec {
+    fn len(&self) -> usize { self.n }
+
+    fn get(&self, slot: usize) -> u32 {
+        match self.primary[slot] {
+            255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
+            v   => v as u32,
+        }
+    }
+}
+
+impl IntSliceMut for MemoryIntVec {
+    fn set(&mut self, slot: usize, value: u32) {
+        if value < 255 {
+            self.primary[slot] = value as u8;
+            self.overflow.remove(&slot);
+        } else {
+            self.primary[slot] = 255;
+            self.overflow.insert(slot, value);
+        }
+    }
+}
+
+// ── From conversions ──────────────────────────────────────────────────────────
+
+impl<S: IntSlice> From<&S> for MemoryIntVec {
+    fn from(src: &S) -> Self {
+        let mut v = Self::new(src.len());
+        for slot in 0..src.len() {
+            let val = src.get(slot);
+            if val != 0 { v.set(slot, val); }
+        }
+        v
+    }
+}
+
+// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
+
+impl<B: IntSlice> Add<&B> for MemoryIntVec {
+    type Output = MemoryIntVec;
+    fn add(mut self, rhs: &B) -> MemoryIntVec { IntSliceMut::add(&mut self, rhs); self }
+}
+
+impl<B: IntSlice> Sub<&B> for MemoryIntVec {
+    type Output = MemoryIntVec;
+    fn sub(mut self, rhs: &B) -> MemoryIntVec { self.diff(rhs); self }
+}
+
+// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
+
+impl<B: IntSlice> Add<&B> for &MemoryIntVec {
+    type Output = MemoryIntVec;
+    fn add(self, rhs: &B) -> MemoryIntVec { self.clone().add(rhs) }
+}
+
+impl<B: IntSlice> Sub<&B> for &MemoryIntVec {
+    type Output = MemoryIntVec;
+    fn sub(self, rhs: &B) -> MemoryIntVec { self.clone().sub(rhs) }
+}
+
+// ── std::ops — in-place assign ────────────────────────────────────────────────
+
+impl<B: IntSlice> AddAssign<&B> for MemoryIntVec {
+    fn add_assign(&mut self, rhs: &B) { IntSliceMut::add(self, rhs); }
+}
+
+impl<B: IntSlice> SubAssign<&B> for MemoryIntVec {
+    fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); }
+}
diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs
new file mode 100644
index 0000000..102a6d6
--- /dev/null
+++ b/src/obicompactvec/src/memoryvec.rs
@@ -0,0 +1,138 @@
+use std::io;
+use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not};
+use std::path::Path;
+
+use crate::bitvec::PersistentBitVecBuilder;
+use crate::traits::{BitSlice, BitSliceMut};
+
+#[inline]
+fn n_words(n: usize) -> usize { n.div_ceil(64) } // number of u64 words needed to hold `n` bits
+
+// ── MemoryBitVec ──────────────────────────────────────────────────────────────
+
+#[derive(Clone)]
+pub struct MemoryBitVec {
+    words: Vec<u64>, // bit `i` lives in `words[i >> 6]` at position `i & 63`
+    n: usize, // logical length in bits
+}
+
+impl MemoryBitVec {
+    pub fn new(n: usize) -> Self { // vector of `n` bits, all cleared
+        Self { words: vec![0u64; n_words(n)], n }
+    }
+
+    pub fn ones(n: usize) -> Self { // vector of `n` bits, all set
+        let rem = n % 64;
+        let mut words = vec![u64::MAX; n_words(n)];
+        if rem != 0 {
+            if let Some(last) = words.last_mut() { *last = (1u64 << rem) - 1; } // keep only the low `rem` bits so the padding stays zero
+        }
+        Self { words, n }
+    }
+
+    pub(crate) fn from_words(words: Vec<u64>, n: usize) -> Self { // stores both arguments as-is; nothing checks that `words` matches `n`
+        Self { words, n }
+    }
+
+    pub fn len(&self) -> usize { self.n }
+    pub fn is_empty(&self) -> bool { self.n == 0 }
+
+    pub fn get(&self, slot: usize) -> bool { // panics only when `slot >> 6` is past the last word
+        (self.words[slot >> 6] >> (slot & 63)) & 1 != 0
+    }
+
+    pub fn set(&mut self, slot: usize, value: bool) { // NOTE(review): `slot` is not checked against `n`; an index inside the last word's padding sets a padding bit
+        let bit = 1u64 << (slot & 63);
+        if value { self.words[slot >> 6] |= bit; } else { self.words[slot >> 6] &= !bit; }
+    }
+
+    pub fn count_ones(&self) -> u64 { // sums the popcount of every stored word, padding included
+        self.words.iter().map(|w| w.count_ones() as u64).sum()
+    }
+
+    pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() } // only correct while the padding bits are zero
+
+    /// Write to disk and return a writable builder positioned at the same path.
+    pub fn persist(&self, path: &Path) -> io::Result<PersistentBitVecBuilder> {
+        let mut b = PersistentBitVecBuilder::new(self.n, path)?; // builder sized for the same number of bits
+        b.copy_from(self); // presumably `BitSliceMut::copy_from`, i.e. a word-for-word copy — confirm
+        Ok(b)
+    }
+}
+
+// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
+
+impl BitSlice for MemoryBitVec {
+    fn len(&self) -> usize { self.n } // length in bits, not in words
+    fn words(&self) -> &[u64] { &self.words } // exposes the backing vector directly
+}
+
+impl BitSliceMut for MemoryBitVec {
+    fn words_mut(&mut self) -> &mut [u64] { &mut self.words } // the only required method; the bulk operations come from the trait defaults
+}
+
+// ── From conversions ──────────────────────────────────────────────────────────
+
+impl<S: BitSlice> From<&S> for MemoryBitVec {
+    fn from(src: &S) -> Self { // copies the source words into a new owned vector
+        Self { words: src.words().to_vec(), n: src.len() }
+    }
+}
+
+// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
+
+impl<B: BitSlice> BitAnd<&B> for MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn bitand(mut self, rhs: &B) -> MemoryBitVec { self.and(rhs); self } // reuses the left operand's storage for the result
+}
+
+impl<B: BitSlice> BitOr<&B> for MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn bitor(mut self, rhs: &B) -> MemoryBitVec { self.or(rhs); self } // reuses the left operand's storage for the result
+}
+
+impl<B: BitSlice> BitXor<&B> for MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn bitxor(mut self, rhs: &B) -> MemoryBitVec { self.xor(rhs); self } // reuses the left operand's storage for the result
+}
+
+impl Not for MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn not(mut self) -> MemoryBitVec { BitSliceMut::not(&mut self); self } // qualified call picks the in-place trait method, not this operator
+}
+
+// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
+
+impl<B: BitSlice> BitAnd<&B> for &MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn bitand(self, rhs: &B) -> MemoryBitVec { self.clone().bitand(rhs) } // clones the left operand, then uses the owned impl
+}
+
+impl<B: BitSlice> BitOr<&B> for &MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn bitor(self, rhs: &B) -> MemoryBitVec { self.clone().bitor(rhs) } // clones the left operand, then uses the owned impl
+}
+
+impl<B: BitSlice> BitXor<&B> for &MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn bitxor(self, rhs: &B) -> MemoryBitVec { self.clone().bitxor(rhs) } // clones the left operand, then uses the owned impl
+}
+
+impl Not for &MemoryBitVec {
+    type Output = MemoryBitVec;
+    fn not(self) -> MemoryBitVec { !self.clone() } // `!` on the owned clone goes through the owned `Not` impl
+}
+
+// ── std::ops — in-place assign ────────────────────────────────────────────────
+
+impl<B: BitSlice> BitAndAssign<&B> for MemoryBitVec {
+    fn bitand_assign(&mut self, rhs: &B) { self.and(rhs); } // the `&mut Self` returned by `and` is discarded
+}
+
+impl<B: BitSlice> BitOrAssign<&B> for MemoryBitVec {
+    fn bitor_assign(&mut self, rhs: &B) { self.or(rhs); } // the `&mut Self` returned by `or` is discarded
+}
+
+impl<B: BitSlice> BitXorAssign<&B> for MemoryBitVec {
+    fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); } // the `&mut Self` returned by `xor` is discarded
+}
diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs
index 057ce29..e4c59e4 100644
--- a/src/obicompactvec/src/reader.rs
+++ b/src/obicompactvec/src/reader.rs
@@ -353,6 +353,17 @@ impl PersistentCompactIntVec {
     }
 }
 
+// ── IntSlice impl ─────────────────────────────────────────────────────────────
+
+use crate::traits::IntSlice;
+
+impl IntSlice for PersistentCompactIntVec {
+    fn len(&self) -> usize { self.n }
+    fn get(&self, slot: usize) -> u32 { self.get(slot) } // NOTE(review): presumably resolves to the inherent `get` (inherent methods win over trait methods); without one this would recurse — confirm
+    fn sum(&self) -> u64 { self.sum() } // NOTE(review): same assumption — relies on an inherent `sum`
+    fn count_nonzero(&self) -> u64 { self.count_nonzero() } // NOTE(review): same assumption — relies on an inherent `count_nonzero`
+}
+
 impl<'a> IntoIterator for &'a PersistentCompactIntVec {
     type Item = u32;
     type IntoIter = Iter<'a>;
diff --git a/src/obicompactvec/src/tests/bitvec.rs b/src/obicompactvec/src/tests/bitvec.rs
index 6b20568..a408e7d 100644
--- a/src/obicompactvec/src/tests/bitvec.rs
+++ b/src/obicompactvec/src/tests/bitvec.rs
@@ -1,5 +1,6 @@
 use tempfile::tempdir;
 
+use crate::traits::BitSliceMut;
 use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
 
 fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) {
diff --git a/src/obicompactvec/src/tests/memoryvec.rs b/src/obicompactvec/src/tests/memoryvec.rs
new file mode 100644
index 0000000..21c12c9
--- /dev/null
+++ b/src/obicompactvec/src/tests/memoryvec.rs
@@ -0,0 +1,359 @@
+use tempfile::tempdir;
+
+use crate::traits::{BitSlice, BitSliceMut, BitToInt, IntSlice, IntSliceMut, IntToBit};
+use crate::{MemoryBitVec, MemoryIntVec, PersistentBitVec, PersistentBitVecBuilder};
+
+// ── MemoryBitVec ──────────────────────────────────────────────────────────────
+
+#[test]
+fn mbv_new_all_zero() {
+    let v = MemoryBitVec::new(10);
+    assert_eq!(v.len(), 10);
+    assert!(!(0..10).any(|s| v.get(s)));
+    assert_eq!(v.count_ones(), 0);
+}
+
+#[test]
+fn mbv_ones_all_set() {
+    let v = MemoryBitVec::ones(10);
+    assert!((0..10).all(|s| v.get(s)));
+    assert_eq!(v.count_ones(), 10);
+    assert_eq!(v.count_zeros(), 0);
+}
+
+#[test]
+fn mbv_ones_no_padding_leak() {
+    // 5 bits: padding bits in last word must stay 0
+    let v = MemoryBitVec::ones(5);
+    assert_eq!(v.words()[0], 0b11111);
+}
+
+#[test]
+fn mbv_set_get_roundtrip() {
+    let mut v = MemoryBitVec::new(64);
+    v.set(0, true);
+    v.set(63, true);
+    assert!(v.get(0));
+    assert!(!v.get(1));
+    assert!(v.get(63));
+    assert_eq!(v.count_ones(), 2);
+}
+
+#[test]
+fn mbv_and() {
+    let mut a = MemoryBitVec::new(4);
+    a.set(0, true); a.set(1, true);
+    let mut b = MemoryBitVec::new(4);
+    b.set(0, true); b.set(2, true);
+    a.and(&b);
+    assert!(a.get(0)); assert!(!a.get(1)); assert!(!a.get(2));
+}
+
+#[test]
+fn mbv_or() {
+    let mut a = MemoryBitVec::new(4);
+    a.set(0, true); a.set(1, true);
+    let mut b = MemoryBitVec::new(4);
+    b.set(0, true); b.set(2, true);
+    a.or(&b);
+    assert!(a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
+}
+
+#[test]
+fn mbv_xor() {
+    let mut a = MemoryBitVec::new(4);
+    a.set(0, true); a.set(1, true);
+    let mut b = MemoryBitVec::new(4);
+    b.set(0, true); b.set(2, true);
+    a.xor(&b);
+    assert!(!a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
+}
+
+#[test]
+fn mbv_not() {
+    let mut a = MemoryBitVec::new(4);
+    a.set(0, true); a.set(2, true);
+    a.not();
+    assert!(!a.get(0)); assert!(a.get(1)); assert!(!a.get(2)); assert!(a.get(3));
+}
+
+#[test]
+fn mbv_not_no_padding_leak() {
+    let mut v = MemoryBitVec::new(5);
+    v.not();
+    assert_eq!(v.count_ones(), 5);
+    assert_eq!(v.words()[0], 0b11111);
+}
+
+#[test]
+fn mbv_ops_chaining() {
+    let mut a = MemoryBitVec::ones(8);
+    let b = MemoryBitVec::new(8); // all zeros
+    a.and(&b).or(&b).not(); // ones & zeros = zeros; zeros | zeros = zeros; !zeros = all ones
+    assert_eq!(a.count_ones(), 8);
+}
+
+#[test]
+fn mbv_std_ops_owned() {
+    let mut a = MemoryBitVec::new(4);
+    a.set(0, true); a.set(1, true);
+    let mut b = MemoryBitVec::new(4);
+    b.set(1, true); b.set(2, true);
+    let c = a & &b;
+    assert!(!c.get(0)); assert!(c.get(1)); assert!(!c.get(2));
+}
+
+#[test]
+fn mbv_std_ops_assign() {
+    let mut a = MemoryBitVec::new(4);
+    a.set(0, true); a.set(1, true);
+    let mut b = MemoryBitVec::new(4);
+    b.set(1, true); b.set(2, true);
+    a &= &b;
+    assert!(!a.get(0)); assert!(a.get(1));
+}
+
+#[test]
+fn mbv_from_persistent() {
+    let dir = tempdir().unwrap();
+    let path = dir.path().join("v.pbiv");
+    let mut builder = PersistentBitVecBuilder::new(4, &path).unwrap();
+    builder.set(1, true); builder.set(3, true);
+    builder.close().unwrap();
+    let pv = PersistentBitVec::open(&path).unwrap();
+    let mv = MemoryBitVec::from(&pv);
+    assert!(!mv.get(0)); assert!(mv.get(1)); assert!(!mv.get(2)); assert!(mv.get(3));
+}
+
+#[test]
+fn mbv_persist_roundtrip() {
+    let dir = tempdir().unwrap();
+    let path = dir.path().join("out.pbiv");
+    let mut v = MemoryBitVec::new(8);
+    v.set(2, true); v.set(5, true);
+    let builder = v.persist(&path).unwrap();
+    builder.close().unwrap();
+    let pv = PersistentBitVec::open(&path).unwrap();
+    assert!(pv.get(2)); assert!(pv.get(5));
+    assert_eq!(pv.count_ones(), 2);
+}
+
+// ── MemoryIntVec ──────────────────────────────────────────────────────────────
+
+#[test]
+fn miv_new_all_zero() {
+    let v = MemoryIntVec::new(10);
+    assert_eq!(v.len(), 10);
+    assert!((0..10).all(|s| v.get(s) == 0));
+}
+
+#[test]
+fn miv_set_get_roundtrip() {
+    let mut v = MemoryIntVec::new(4);
+    v.set(0, 42); v.set(3, 200); // both values are below 255
+    assert_eq!(v.get(0), 42);
+    assert_eq!(v.get(1), 0); // untouched slot still reads zero
+    assert_eq!(v.get(3), 200);
+}
+
+#[test]
+fn miv_overflow_roundtrip() {
+    let mut v = MemoryIntVec::new(4);
+    v.set(1, 1000); // 1000 does not fit in a byte
+    assert_eq!(v.get(1), 1000);
+    assert_eq!(v.get(0), 0); // neighbouring slot is unaffected
+}
+
+#[test]
+fn miv_inc_dec() {
+    let mut v = MemoryIntVec::new(4);
+    v.inc(2); v.inc(2); v.inc(2);
+    assert_eq!(v.get(2), 3);
+    v.dec(2);
+    assert_eq!(v.get(2), 2);
+}
+
+#[test]
+fn miv_dec_saturates_at_zero() {
+    let mut v = MemoryIntVec::new(4);
+    v.dec(0);
+    assert_eq!(v.get(0), 0);
+}
+
+#[test]
+fn miv_add_at() {
+    let mut v = MemoryIntVec::new(4);
+    v.add_at(1, 100); v.add_at(1, 200); // 100, then 300: the second add crosses the one-byte range
+    assert_eq!(v.get(1), 300);
+}
+
+#[test]
+fn miv_min_max() {
+    let mut a = MemoryIntVec::new(4);
+    a.set(0, 5); a.set(1, 2); a.set(2, 8);
+    let mut b = MemoryIntVec::new(4);
+    b.set(0, 3); b.set(1, 7); b.set(2, 8);
+    let mut c = MemoryIntVec::from(&a);
+    IntSliceMut::min(&mut c, &b);
+    assert_eq!(c.get(0), 3); assert_eq!(c.get(1), 2); assert_eq!(c.get(2), 8);
+    let mut d = MemoryIntVec::from(&a);
+    IntSliceMut::max(&mut d, &b);
+    assert_eq!(d.get(0), 5); assert_eq!(d.get(1), 7); assert_eq!(d.get(2), 8);
+}
+
+#[test]
+fn miv_add_diff() {
+    let mut a = MemoryIntVec::new(3);
+    a.set(0, 10); a.set(1, 5);
+    let mut b = MemoryIntVec::new(3);
+    b.set(0, 3); b.set(1, 8);
+    let mut c = MemoryIntVec::from(&a);
+    c.add(&b);
+    assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
+    let mut d = MemoryIntVec::from(&a);
+    d.diff(&b);
+    assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); // saturating sub
+}
+
+#[test]
+fn miv_std_ops() {
+    let mut a = MemoryIntVec::new(3);
+    a.set(0, 10); a.set(1, 5);
+    let mut b = MemoryIntVec::new(3);
+    b.set(0, 3); b.set(1, 8);
+    let c = &a + &b;
+    assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
+    let d = &a - &b;
+    assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0);
+}
+
+#[test]
+fn miv_from_persistent() {
+    use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
+    let dir = tempdir().unwrap();
+    let path = dir.path().join("v.pciv");
+    let mut b = PersistentCompactIntVecBuilder::new(4, &path).unwrap();
+    b.set(1, 42); b.set(3, 1000);
+    b.close().unwrap();
+    let pv = PersistentCompactIntVec::open(&path).unwrap();
+    let mv = MemoryIntVec::from(&pv);
+    assert_eq!(mv.get(0), 0); assert_eq!(mv.get(1), 42); assert_eq!(mv.get(3), 1000);
+}
+
+// ── Cross-type conversions ────────────────────────────────────────────────────
+
+#[test]
+fn to_bitvec_threshold() {
+    let mut v = MemoryIntVec::new(5);
+    v.set(0, 0); v.set(1, 1); v.set(2, 5); v.set(3, 10); v.set(4, 3);
+    let bv = v.to_bitvec(5); // >= 5 (inclusive): slot 2 sits exactly on the threshold, slot 3 is above it
+    assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
+    assert!(bv.get(3)); assert!(!bv.get(4));
+}
+
+#[test]
+fn to_presence() {
+    let mut v = MemoryIntVec::new(4);
+    v.set(1, 1); v.set(3, 100);
+    let bv = v.to_presence();
+    assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
+}
+
+#[test]
+fn to_intvec_roundtrip() {
+    let mut bv = MemoryBitVec::new(8);
+    bv.set(0, true); bv.set(3, true); bv.set(7, true);
+    let iv = bv.to_intvec();
+    assert_eq!(iv.get(0), 1); assert_eq!(iv.get(1), 0);
+    assert_eq!(iv.get(3), 1); assert_eq!(iv.get(7), 1);
+}
+
+#[test]
+fn to_intvec_word_boundary() {
+    // 65 bits: spans two words
+    let mut bv = MemoryBitVec::new(65);
+    bv.set(63, true); bv.set(64, true);
+    let iv = bv.to_intvec();
+    assert_eq!(iv.get(63), 1); assert_eq!(iv.get(64), 1); assert_eq!(iv.get(62), 0);
+}
+
+#[test]
+fn count_bits_accumulates() {
+    let mut count = MemoryIntVec::new(8);
+    let mut b1 = MemoryBitVec::new(8);
+    b1.set(0, true); b1.set(2, true);
+    let mut b2 = MemoryBitVec::new(8);
+    b2.set(0, true); b2.set(3, true);
+    let mut b3 = MemoryBitVec::new(8);
+    b3.set(2, true); b3.set(3, true);
+    count.count_bits(&b1).count_bits(&b2).count_bits(&b3);
+    assert_eq!(count.get(0), 2);
+    assert_eq!(count.get(2), 2);
+    assert_eq!(count.get(3), 2);
+    assert_eq!(count.get(1), 0);
+}
+
+#[test]
+fn count_bits_skips_zero_words() {
+    // Entire first word is zero — should not touch those slots
+    let mut count = MemoryIntVec::new(128);
+    let mut bv = MemoryBitVec::new(128);
+    bv.set(64, true); bv.set(127, true);
+    count.count_bits(&bv);
+    assert_eq!(count.get(0), 0);
+    assert_eq!(count.get(64), 1);
+    assert_eq!(count.get(127), 1);
+}
+
+// ── Comparison operators ──────────────────────────────────────────────────────
+
+#[test]
+fn cmp_gt() {
+    let mut v = MemoryIntVec::new(5);
+    v.set(0, 0); v.set(1, 3); v.set(2, 5); v.set(3, 3); v.set(4, 10);
+    let bv = v.gt(3);
+    assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
+    assert!(!bv.get(3)); assert!(bv.get(4));
+}
+
+#[test]
+fn cmp_geq() {
+    let mut v = MemoryIntVec::new(4);
+    v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 1);
+    let bv = v.geq(3);
+    assert!(!bv.get(0)); assert!(bv.get(1)); assert!(bv.get(2)); assert!(!bv.get(3));
+}
+
+#[test]
+fn cmp_lt() {
+    let mut v = MemoryIntVec::new(4);
+    v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 0);
+    let bv = v.lt(3);
+    assert!(bv.get(0)); assert!(!bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
+}
+
+#[test]
+fn cmp_leq() {
+    let mut v = MemoryIntVec::new(4);
+    v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 3);
+    let bv = v.leq(3);
+    assert!(bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
+}
+
+#[test]
+fn filter_pattern() {
+    // Typical filter: ingroup >= min_count AND outgroup <= max_outgroup
+    let mut ingroup  = MemoryIntVec::new(6);
+    let mut outgroup = MemoryIntVec::new(6);
+    // slot 2: ingroup=3, outgroup=0  → keep
+    // slot 4: ingroup=2, outgroup=1  → drop (outgroup > 0)
+    // slot 5: ingroup=1, outgroup=0  → drop (ingroup < 2)
+    ingroup.set(2, 3); ingroup.set(4, 2); ingroup.set(5, 1);
+    outgroup.set(4, 1);
+    let out_mask  = outgroup.leq(0);
+    let mut in_mask = ingroup.geq(2);
+    let keep = in_mask.and(&out_mask);
+    assert!(!keep.get(0)); assert!(!keep.get(1));
+    assert!(keep.get(2));
+    assert!(!keep.get(4)); assert!(!keep.get(5));
+}
diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs
index 4d2d9ad..c0be93a 100644
--- a/src/obicompactvec/src/tests/mod.rs
+++ b/src/obicompactvec/src/tests/mod.rs
@@ -1,9 +1,12 @@
 mod bitmatrix;
 mod bitvec;
 mod intmatrix;
+mod memoryvec;
 
 use tempfile::tempdir;
 
+use crate::traits::IntSliceMut;
+
 use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
 
 fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec<u32> {
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index b61e69b..91ee8d8 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -1,6 +1,213 @@
 use ndarray::{Array1, Array2};
 
-/// Column-level weight statistic — total count or presence count per column.
+// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
+
+/// Read-only view over the u64 word array of a bit vector.
+///
+/// Bit `i` is in `words()[i >> 6]` at position `i & 63`.
+/// Padding bits in the last word are zero.
+pub trait BitSlice {
+    fn len(&self) -> usize; // logical length in bits
+    fn words(&self) -> &[u64]; // backing words in the layout described above
+    fn is_empty(&self) -> bool { self.len() == 0 }
+    fn get(&self, slot: usize) -> bool { // indexes `words()` directly, so it panics only when `slot >> 6` is past the last word
+        (self.words()[slot >> 6] >> (slot & 63)) & 1 != 0
+    }
+}
+
+/// Mutable view over a bit-vector word array; default methods maintain the
+/// zero-padding invariant on the last word.
+pub trait BitSliceMut: BitSlice {
+    fn words_mut(&mut self) -> &mut [u64]; // the only required method
+
+    fn copy_from<S: BitSlice>(&mut self, src: &S) -> &mut Self {
+        assert_eq!(self.len(), src.len(), "BitSlice length mismatch");
+        self.words_mut().copy_from_slice(src.words()); // `copy_from_slice` itself panics if the two word slices differ in length
+        self
+    }
+
+    fn and<S: BitSlice>(&mut self, other: &S) -> &mut Self {
+        assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
+        for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w &= o; } // word-wise; zero padding on both sides stays zero
+        self
+    }
+
+    fn or<S: BitSlice>(&mut self, other: &S) -> &mut Self {
+        assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
+        for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w |= o; } // word-wise; zero padding on both sides stays zero
+        self
+    }
+
+    fn xor<S: BitSlice>(&mut self, other: &S) -> &mut Self {
+        assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
+        for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w ^= o; } // word-wise; zero padding on both sides stays zero
+        self
+    }
+
+    fn not(&mut self) -> &mut Self {
+        let rem = self.len() % 64; // number of valid bits in the last word, 0 when it is full
+        let words = self.words_mut();
+        for w in words.iter_mut() { *w ^= u64::MAX; } // flips every bit, padding included
+        if rem != 0 {
+            if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; } // clear the padding bits that were just flipped to 1
+        }
+        self
+    }
+}
+
+// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
+
+/// Read-only access to a compact integer vector (values encoded as u32).
+pub trait IntSlice {
+    fn len(&self) -> usize; // number of slots
+    fn get(&self, slot: usize) -> u32;
+    fn is_empty(&self) -> bool { self.len() == 0 }
+    fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() } // accumulates in u64, one `get` per slot
+    fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 }
+
+    fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <  threshold) } // bit set iff value < threshold
+    fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) } // bit set iff value <= threshold
+    fn gt(&self, threshold: u32) -> MemoryBitVec  { self.cmp_scalar(|v| v >  threshold) } // bit set iff value > threshold
+    fn geq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v >= threshold) } // bit set iff value >= threshold
+
+    fn cmp_scalar(&self, pred: impl Fn(u32) -> bool) -> MemoryBitVec { // builds a mask with bit `s` set iff `pred(get(s))`
+        let n = self.len();
+        let mut words = vec![0u64; n.div_ceil(64)];
+        for s in 0..n { // stops at `n`, so padding bits are never set
+            if pred(self.get(s)) { words[s >> 6] |= 1u64 << (s & 63); }
+        }
+        MemoryBitVec::from_words(words, n)
+    }
+}
+
+/// Mutable access; default methods use only `get` / `set` and maintain the
+/// compact encoding invariants on the implementor's side.
+pub trait IntSliceMut: IntSlice {
+    fn set(&mut self, slot: usize, value: u32); // the only required method
+
+    fn inc(&mut self, slot: usize) -> &mut Self {
+        let v = self.get(slot);
+        self.set(slot, v.saturating_add(1)); // sticks at u32::MAX instead of wrapping
+        self
+    }
+
+    fn dec(&mut self, slot: usize) -> &mut Self {
+        let v = self.get(slot);
+        self.set(slot, v.saturating_sub(1)); // sticks at 0 instead of wrapping
+        self
+    }
+
+    fn add_at(&mut self, slot: usize, delta: u32) -> &mut Self {
+        let v = self.get(slot);
+        self.set(slot, v.saturating_add(delta)); // sticks at u32::MAX instead of wrapping
+        self
+    }
+
+    fn copy_from<S: IntSlice>(&mut self, src: &S) -> &mut Self {
+        assert_eq!(self.len(), src.len(), "IntSlice length mismatch");
+        for s in 0..src.len() { self.set(s, src.get(s)); } // every slot is rewritten, zeros included
+        self
+    }
+
+    fn min<S: IntSlice>(&mut self, other: &S) -> &mut Self {
+        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
+        for s in 0..other.len() { self.set(s, self.get(s).min(other.get(s))); } // slot-wise minimum, in place
+        self
+    }
+
+    fn max<S: IntSlice>(&mut self, other: &S) -> &mut Self {
+        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
+        for s in 0..other.len() { self.set(s, self.get(s).max(other.get(s))); } // slot-wise maximum, in place
+        self
+    }
+
+    fn add<S: IntSlice>(&mut self, other: &S) -> &mut Self {
+        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
+        for s in 0..other.len() { self.set(s, self.get(s).saturating_add(other.get(s))); } // slot-wise saturating sum, in place
+        self
+    }
+
+    fn diff<S: IntSlice>(&mut self, other: &S) -> &mut Self {
+        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
+        for s in 0..other.len() { self.set(s, self.get(s).saturating_sub(other.get(s))); } // slot-wise saturating difference, in place
+        self
+    }
+
+    /// For each slot where `bits` is true, increment `self` by 1.
+    /// Skips zero words entirely — O(n/64 + n_ones) rather than O(n).
+    fn count_bits<B: BitSlice>(&mut self, bits: &B) -> &mut Self {
+        assert_eq!(self.len(), bits.len(), "IntSlice/BitSlice length mismatch");
+        for (w_idx, &word) in bits.words().iter().enumerate() {
+            if word == 0 { continue; }
+            let base = w_idx * 64; // slot index of this word's bit 0
+            let mut w = word;
+            while w != 0 {
+                let bit = w.trailing_zeros() as usize; // position of the lowest set bit still pending
+                let slot = base + bit;
+                if slot < self.len() { self.inc(slot); } // ignores any stray bit set in the padding
+                w &= w - 1; // clear the lowest set bit
+            }
+        }
+        self
+    }
+}
+
+// ── IntSlice → MemoryBitVec conversions ───────────────────────────────────────
+
+use crate::memoryvec::MemoryBitVec;
+
+pub trait IntToBit: IntSlice {
+    /// Bit set iff value >= threshold. Consistent with `geq` and `build_from_counts`.
+    fn to_bitvec(&self, threshold: u32) -> MemoryBitVec { self.geq(threshold) } // the threshold is inclusive
+
+    /// Bit set iff value >= 1 (slot is present).
+    fn to_presence(&self) -> MemoryBitVec { self.geq(1) } // same mask as `to_bitvec(1)`
+}
+
+impl<T: IntSlice> IntToBit for T {}
+
+// ── BitSlice → MemoryIntVec conversion ───────────────────────────────────────
+
+use crate::memoryintvec::MemoryIntVec;
+
+pub trait BitToInt: BitSlice {
+    fn to_intvec(&self) -> MemoryIntVec { // expands each bit into a 0/1 integer slot
+        let n = self.len();
+        let mut primary = vec![0u8; n]; // one output byte per input bit
+
+        // Unpack u64 words: each byte within a word yields 8 output bytes.
+        // Values are always 0 or 1 → no overflow entries needed.
+        let words = self.words();
+        let full_words = n / 64; // words whose 64 bits all lie below `n`
+
+        for (w_idx, &word) in words[..full_words].iter().enumerate() {
+            let base = w_idx * 64;
+            for byte_off in 0..8usize {
+                let byte = (word >> (byte_off * 8)) as u8; // the cast keeps only the low 8 bits after the shift
+                let out = &mut primary[base + byte_off * 8..base + byte_off * 8 + 8];
+                for bit in 0..8usize {
+                    out[bit] = (byte >> bit) & 1;
+                }
+            }
+        }
+
+        // Remaining bits in the last partial word
+        let rem = n % 64;
+        if rem > 0 {
+            let word = words[full_words];
+            let base = full_words * 64;
+            for bit in 0..rem { // stops at `rem`, so padding bits are never read into the output
+                primary[base + bit] = ((word >> bit) & 1) as u8;
+            }
+        }
+
+        MemoryIntVec::from_primary(primary)
+    }
+}
+
+impl<T: BitSlice> BitToInt for T {}
+
+/// Column-level weight statistic — total count or presence count per column.
 /// Additive across layers and partitions; used as denominator in normalised distances.
 ///
 /// `partial_kmer_counts` returns the number of **distinct k-mers** present per
diff --git a/src/obikindex/src/rebuild.rs b/src/obikindex/src/rebuild.rs
index b1a8b5c..83a416d 100644
--- a/src/obikindex/src/rebuild.rs
+++ b/src/obikindex/src/rebuild.rs
@@ -98,7 +98,9 @@ impl KmerIndex {
         fs::File::create(output.join(SENTINEL_INDEXED))?;
 
         let idx = KmerIndex::open(output)?;
+        let t_pack = Stage::start("pack");
         idx.pack_matrices()?;
+        rep.push(t_pack.stop());
         Ok(idx)
     }
 }

From d1717688d20b08fb2990b79f03ab7bfffeabc8cc Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Tue, 16 Jun 2026 23:36:25 +0200
Subject: [PATCH 03/24] refactor: extract matrix helpers and improve bit
 iteration ergonomics

Refactor parallel matrix construction by extracting reusable `pairwise_matrix` and `pairwise2_matrix` helpers, and consolidate binary record deserialization into dedicated parsing functions. Add `set` and `iter` methods to `BitSliceMut` and `MemoryBitVec` for ergonomic bit manipulation and iteration. Standardize JSON field extraction via `meta::field`, expose `MemoryBitIter`, and improve test reliability by automatically cleaning up temporary directories.
---
 src/obicompactvec/src/bitmatrix.rs       |  76 ++++++-------
 src/obicompactvec/src/bitvec.rs          |  12 +--
 src/obicompactvec/src/builder.rs         |   6 +-
 src/obicompactvec/src/format.rs          |  18 ++++
 src/obicompactvec/src/intmatrix.rs       | 129 +++++------------------
 src/obicompactvec/src/layer_meta.rs      |   7 +-
 src/obicompactvec/src/lib.rs             |   2 +-
 src/obicompactvec/src/memoryvec.rs       |  48 +++++++--
 src/obicompactvec/src/meta.rs            |   2 +-
 src/obicompactvec/src/reader.rs          |   7 +-
 src/obicompactvec/src/tests/bitmatrix.rs |   2 +-
 src/obicompactvec/src/traits.rs          |   6 ++
 src/obikpartitionner/src/common.rs       |   1 +
 src/obikpartitionner/src/select_layer.rs |   1 +
 src/obilayeredmap/src/layer.rs           |   1 +
 src/obilayeredmap/src/layered_store.rs   |   1 +
 16 files changed, 136 insertions(+), 183 deletions(-)

diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index 2dbc266..cd7e0e9 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -56,34 +56,11 @@ impl ColumnarBitMatrix {
     }
 
     pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| {
-                let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j));
-                (i, j, inter, union)
-            })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_jaccard_dist(self.col(j)))
     }
 
     pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| (i, j, f(i, j)))
-            .collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).hamming_dist(self.col(j)))
     }
 
     pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> {
@@ -228,27 +205,11 @@ impl PackedBitMatrix {
     }
 
     pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| { let (inter, union) = self.partial_jaccard_col(i, j); (i, j, inter, union) })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols, |i, j| self.partial_jaccard_col(i, j))
     }
 
     pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| (i, j, self.pair_op(i, j, false)))
-            .collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_op(i, j, false))
     }
 }
 
@@ -488,7 +449,7 @@ impl PersistentBitMatrixBuilder {
     }
 }
 
-// ── Helpers ───────────────────────────────────────────────────────────────────
+// ── Shared matrix helpers (also used by intmatrix.rs) ─────────────────────────
 
 fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
     (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
@@ -500,3 +461,30 @@ where T: Clone + Default {
     for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
     m
 }
+
+/// Compute a symmetric `n×n` matrix in parallel by evaluating `f(i,j)` for
+/// all upper-triangle pairs. `T: Copy` avoids the `.clone()` needed for the
+/// lower-triangle mirror.
+pub(crate) fn pairwise_matrix<T>(n: usize, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
+where T: Copy + Default + Send {
+    let results: Vec<(usize, usize, T)> = upper_pairs(n) // every (i, j) with i < j; `f` is never called on the diagonal
+        .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
+    fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) // the same value goes to [i, j] and [j, i]
+}
+
+/// Same as `pairwise_matrix` but `f` returns two values that fill two
+/// symmetric matrices simultaneously (e.g. intersection + union for Jaccard).
+pub(crate) fn pairwise2_matrix<T>(n: usize, f: impl Fn(usize, usize) -> (T, T) + Sync) -> (Array2<T>, Array2<T>)
+where T: Copy + Default + Send {
+    let results: Vec<(usize, usize, T, T)> = upper_pairs(n) // every (i, j) with i < j; `f` is never called on the diagonal
+        .into_par_iter()
+        .map(|(i, j)| { let (a, b) = f(i, j); (i, j, a, b) })
+        .collect();
+    let mut m0 = Array2::from_elem((n, n), T::default()); // the diagonal keeps this default value
+    let mut m1 = Array2::from_elem((n, n), T::default());
+    for (i, j, a, b) in results { // sequential fill once all pairs are computed
+        m0[[i, j]] = a; m0[[j, i]] = a;
+        m1[[i, j]] = b; m1[[j, i]] = b;
+    }
+    (m0, m1)
+}
diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index dc95512..dcb52ba 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -14,7 +14,7 @@ const MAGIC: [u8; 4] = *b"PBIV";
 const HEADER_SIZE: usize = 16;
 
 #[inline]
-fn n_words(n: usize) -> usize {
+pub(crate) fn n_words(n: usize) -> usize {
     n.div_ceil(64)
 }
 
@@ -222,16 +222,6 @@ impl PersistentBitVecBuilder {
         (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
     }
 
-    pub fn set(&mut self, slot: usize, value: bool) {
-        let byte = HEADER_SIZE + (slot >> 3);
-        let bit = 1u8 << (slot & 7);
-        if value {
-            self.mmap[byte] |= bit;
-        } else {
-            self.mmap[byte] &= !bit;
-        }
-    }
-
     fn data_words(&self) -> &[u64] {
         let nw = n_words(self.n);
         let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs
index f2b5326..4885216 100644
--- a/src/obicompactvec/src/builder.rs
+++ b/src/obicompactvec/src/builder.rs
@@ -5,7 +5,7 @@ use std::path::{Path, PathBuf};
 
 use memmap2::MmapMut;
 
-use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, finalize_pciv};
+use crate::format::{HEADER_SIZE, finalize_pciv, parse_overflow_entry};
 use crate::reader::PersistentCompactIntVec;
 
 pub struct PersistentCompactIntVecBuilder {
@@ -78,9 +78,7 @@ impl PersistentCompactIntVecBuilder {
 
         let mut overflow = HashMap::with_capacity(n_overflow);
         for i in 0..n_overflow {
-            let off = data_offset + i * OVERFLOW_ENTRY_SIZE;
-            let slot  = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
-            let value = u32::from_le_bytes(mmap[off + 8..off + 12].try_into().unwrap());
+            let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
             overflow.insert(slot, value);
         }
 
diff --git a/src/obicompactvec/src/format.rs b/src/obicompactvec/src/format.rs
index 08f0079..265167d 100644
--- a/src/obicompactvec/src/format.rs
+++ b/src/obicompactvec/src/format.rs
@@ -13,6 +13,24 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12;
 // Index entry: slot(u64) + pos(u64) = 16 bytes.
 pub const INDEX_ENTRY_SIZE: usize = 16;
 
+/// Parse a single overflow entry `(slot, value)` from a byte slice.
+#[inline]
+pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) {
+    let off = base + i * OVERFLOW_ENTRY_SIZE;
+    let slot  = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize;
+    let value = u32::from_le_bytes(data[off+8..off+12].try_into().unwrap());
+    (slot, value)
+}
+
+/// Parse a single sparse-index entry `(slot, pos)` from a byte slice.
+#[inline]
+pub fn parse_index_entry(data: &[u8], base: usize, i: usize) -> (usize, usize) {
+    let off = base + i * INDEX_ENTRY_SIZE;
+    let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize;
+    let pos  = u64::from_le_bytes(data[off+8..off+16].try_into().unwrap()) as usize;
+    (slot, pos)
+}
+
 // Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries).
 pub const L1_INDEX_ENTRIES: usize = 2048;
 
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index 91c3b1b..9d97f8e 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -8,9 +8,10 @@ use memmap2::Mmap;
 use ndarray::{Array1, Array2};
 use rayon::prelude::*;
 
+use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
 use crate::builder::PersistentCompactIntVecBuilder;
 use crate::memoryintvec::MemoryIntVec;
-use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
+use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
 
@@ -65,49 +66,35 @@ impl ColumnarCompactIntMatrix {
     }
 
     pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
     }
 
     pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
-        self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
     }
 
     pub(crate) fn partial_threshold_jaccard_dist_matrix(
         &self, threshold: u32,
     ) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols();
-        let pairs = upper_pairs(n);
-        let results: Vec<(usize, usize, u64, u64)> = pairs
-            .into_par_iter()
-            .map(|(i, j)| {
-                let (inter, union) =
-                    self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
-                (i, j, inter, union)
-            })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols(), |i, j| {
+            self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)
+        })
     }
 
     pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
 
     pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
 
     pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
@@ -121,19 +108,6 @@ impl ColumnarCompactIntMatrix {
         meta.save(dir)
     }
 
-    fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, f64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
 }
 
 // ── PackedCompactIntMatrix ────────────────────────────────────────────────────
@@ -185,10 +159,7 @@ impl PackedCompactIntMatrix {
 
             let mut index = Vec::with_capacity(n_idx);
             for i in 0..n_idx {
-                let ioff  = index_offset + i * INDEX_ENTRY_SIZE;
-                let slot  = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap())   as usize;
-                let pos   = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize;
-                index.push((slot, pos));
+                index.push(parse_index_entry(&mmap, index_offset, i));
             }
             columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
         }
@@ -196,30 +167,25 @@ impl PackedCompactIntMatrix {
         Ok(Self { mmap, n_rows, n_cols, columns })
     }
 
+    fn col_overflow_map(&self, ci: &ColInfo) -> HashMap<usize, u32> {
+        let mut overflow = HashMap::with_capacity(ci.n_overflow);
+        for i in 0..ci.n_overflow {
+            let (slot, value) = parse_overflow_entry(&self.mmap, ci.data_offset, i);
+            overflow.insert(slot, value);
+        }
+        overflow
+    }
+
     pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
         let ci = &self.columns[c];
         let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
-        let mut overflow = HashMap::with_capacity(ci.n_overflow);
-        for i in 0..ci.n_overflow {
-            let off   = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
-            let slot  = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
-            let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
-            overflow.insert(slot, value);
-        }
-        PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path)
+        PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path)
     }
 
     pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
         let ci = &self.columns[c];
         let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
-        let mut overflow = HashMap::with_capacity(ci.n_overflow);
-        for i in 0..ci.n_overflow {
-            let off   = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
-            let slot  = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
-            let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
-            overflow.insert(slot, value);
-        }
-        MemoryIntVec::from_primary_and_overflow(primary, overflow)
+        MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci))
     }
 
     #[inline]
@@ -327,55 +293,28 @@ impl PackedCompactIntMatrix {
 
     // ── Matrix methods ────────────────────────────────────────────────────────
 
-    fn pairwise<T>(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
-    where T: Clone + Default + Send {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, T)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| { let w = v.clone(); (i, j, v, w) }))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
-
     pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.pair_partial_bray(i, j))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
     }
 
-
     pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_euclidean(i, j))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
     }
 
     pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
     }
 
     pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
 
     pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
 
     pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
 
 }
@@ -570,15 +509,3 @@ impl PersistentCompactIntMatrixBuilder {
     }
 }
 
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
-    (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
-}
-
-fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
-where T: Clone + Default {
-    let mut m = Array2::from_elem((n, n), T::default());
-    for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
-    m
-}
diff --git a/src/obicompactvec/src/layer_meta.rs b/src/obicompactvec/src/layer_meta.rs
index 65dc5bc..28fff0c 100644
--- a/src/obicompactvec/src/layer_meta.rs
+++ b/src/obicompactvec/src/layer_meta.rs
@@ -23,11 +23,6 @@ impl LayerMeta {
     }
 
     fn parse(s: &str) -> Option<Self> {
-        let key = "\"n\":";
-        let pos = s.find(key)? + key.len();
-        let rest = s[pos..].trim_start();
-        let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len());
-        let n = rest[..end].parse().ok()?;
-        Some(Self { n })
+        Some(Self { n: crate::meta::field(s, "n")? })
     }
 }
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index b3c2ff4..fb2d5e2 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -16,7 +16,7 @@ pub use builder::PersistentCompactIntVecBuilder;
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
 pub use memoryintvec::MemoryIntVec;
-pub use memoryvec::MemoryBitVec;
+pub use memoryvec::{MemoryBitIter, MemoryBitVec};
 pub use reader::PersistentCompactIntVec;
 pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
 
diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs
index 102a6d6..9195982 100644
--- a/src/obicompactvec/src/memoryvec.rs
+++ b/src/obicompactvec/src/memoryvec.rs
@@ -2,12 +2,9 @@ use std::io;
 use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not};
 use std::path::Path;
 
-use crate::bitvec::PersistentBitVecBuilder;
+use crate::bitvec::{PersistentBitVecBuilder, n_words};
 use crate::traits::{BitSlice, BitSliceMut};
 
-#[inline]
-fn n_words(n: usize) -> usize { n.div_ceil(64) }
-
 // ── MemoryBitVec ──────────────────────────────────────────────────────────────
 
 #[derive(Clone)]
@@ -41,11 +38,6 @@ impl MemoryBitVec {
         (self.words[slot >> 6] >> (slot & 63)) & 1 != 0
     }
 
-    pub fn set(&mut self, slot: usize, value: bool) {
-        let bit = 1u64 << (slot & 63);
-        if value { self.words[slot >> 6] |= bit; } else { self.words[slot >> 6] &= !bit; }
-    }
-
     pub fn count_ones(&self) -> u64 {
         self.words.iter().map(|w| w.count_ones() as u64).sum()
     }
@@ -136,3 +128,41 @@ impl<B: BitSlice> BitOrAssign<&B> for MemoryBitVec {
 impl<B: BitSlice> BitXorAssign<&B> for MemoryBitVec {
     fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); }
 }
+
+// ── Iterator ──────────────────────────────────────────────────────────────────
+
+pub struct MemoryBitIter<'a> {
+    words: &'a [u64],
+    slot: usize,
+    n: usize,
+}
+
+impl Iterator for MemoryBitIter<'_> {
+    type Item = bool;
+
+    fn next(&mut self) -> Option<bool> {
+        if self.slot >= self.n { return None; }
+        let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
+        self.slot += 1;
+        Some(v)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let rem = self.n - self.slot;
+        (rem, Some(rem))
+    }
+}
+
+impl ExactSizeIterator for MemoryBitIter<'_> {}
+
+impl MemoryBitVec {
+    pub fn iter(&self) -> MemoryBitIter<'_> {
+        MemoryBitIter { words: &self.words, slot: 0, n: self.n }
+    }
+}
+
+impl<'a> IntoIterator for &'a MemoryBitVec {
+    type Item = bool;
+    type IntoIter = MemoryBitIter<'a>;
+    fn into_iter(self) -> MemoryBitIter<'a> { self.iter() }
+}
diff --git a/src/obicompactvec/src/meta.rs b/src/obicompactvec/src/meta.rs
index d8d8466..09deedc 100644
--- a/src/obicompactvec/src/meta.rs
+++ b/src/obicompactvec/src/meta.rs
@@ -23,7 +23,7 @@ fn parse(s: &str) -> Option<MatrixMeta> {
     Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? })
 }
 
-fn field(s: &str, name: &str) -> Option<usize> {
+pub(crate) fn field(s: &str, name: &str) -> Option<usize> {
     let key = format!("\"{}\":", name);
     let pos = s.find(&key)? + key.len();
     let rest = s[pos..].trim_start();
diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs
index e4c59e4..bd3d7d7 100644
--- a/src/obicompactvec/src/reader.rs
+++ b/src/obicompactvec/src/reader.rs
@@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
 
 use memmap2::Mmap;
 
-use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE};
+use crate::format::{HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
 
 pub struct PersistentCompactIntVec {
     mmap: Mmap,
@@ -43,10 +43,7 @@ impl PersistentCompactIntVec {
 
         let mut index = Vec::with_capacity(n_index);
         for i in 0..n_index {
-            let off = index_offset + i * INDEX_ENTRY_SIZE;
-            let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
-            let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
-            index.push((slot, pos));
+            index.push(parse_index_entry(&mmap, index_offset, i));
         }
 
         Ok(Self {
diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs
index 741a07c..3304410 100644
--- a/src/obicompactvec/src/tests/bitmatrix.rs
+++ b/src/obicompactvec/src/tests/bitmatrix.rs
@@ -1,7 +1,7 @@
 use tempfile::tempdir;
 
 use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
-use crate::traits::BitPartials;
+use crate::traits::{BitPartials, BitSliceMut};
 
 fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
     let n = cols.first().map_or(0, |c| c.len());
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index 91ee8d8..32e40a1 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -20,6 +20,11 @@ pub trait BitSlice {
 pub trait BitSliceMut: BitSlice {
     fn words_mut(&mut self) -> &mut [u64];
 
+    fn set(&mut self, slot: usize, value: bool) {
+        let bit = 1u64 << (slot & 63);
+        if value { self.words_mut()[slot >> 6] |= bit; } else { self.words_mut()[slot >> 6] &= !bit; }
+    }
+
     fn copy_from<S: BitSlice>(&mut self, src: &S) -> &mut Self {
         assert_eq!(self.len(), src.len(), "BitSlice length mismatch");
         self.words_mut().copy_from_slice(src.words());
@@ -62,6 +67,7 @@ pub trait IntSlice {
     fn len(&self) -> usize;
     fn get(&self, slot: usize) -> u32;
     fn is_empty(&self) -> bool { self.len() == 0 }
+    fn iter(&self) -> impl Iterator<Item = u32> + '_ { (0..self.len()).map(|i| self.get(i)) }
     fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() }
     fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 }
 
diff --git a/src/obikpartitionner/src/common.rs b/src/obikpartitionner/src/common.rs
index 99e345e..76d3bf3 100644
--- a/src/obikpartitionner/src/common.rs
+++ b/src/obikpartitionner/src/common.rs
@@ -3,6 +3,7 @@ use std::io;
 use std::path::{Path, PathBuf};
 
 use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder};
+use obicompactvec::traits::BitSliceMut;
 use obilayeredmap::meta::PartitionMeta;
 use obilayeredmap::{IndexMode, OLMError};
 use obiskio::{SKError, SKResult};
diff --git a/src/obikpartitionner/src/select_layer.rs b/src/obikpartitionner/src/select_layer.rs
index 36286c0..56b2ac7 100644
--- a/src/obikpartitionner/src/select_layer.rs
+++ b/src/obikpartitionner/src/select_layer.rs
@@ -6,6 +6,7 @@ use obicompactvec::{
     PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
 };
+use obicompactvec::traits::BitSliceMut;
 use obilayeredmap::meta::PartitionMeta;
 use obilayeredmap::OLMError;
 use obiskio::{SKError, SKResult};
diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs
index 72b38ea..c79e781 100644
--- a/src/obilayeredmap/src/layer.rs
+++ b/src/obilayeredmap/src/layer.rs
@@ -6,6 +6,7 @@ use obicompactvec::{
     PersistentBitMatrix, PersistentBitMatrixBuilder,
     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
 };
+use obicompactvec::traits::BitSliceMut;
 use obikseq::CanonicalKmer;
 use obiskio::{UnitigFileReader, UnitigFileWriter};
 
diff --git a/src/obilayeredmap/src/layered_store.rs b/src/obilayeredmap/src/layered_store.rs
index 433183e..6ebf343 100644
--- a/src/obilayeredmap/src/layered_store.rs
+++ b/src/obilayeredmap/src/layered_store.rs
@@ -102,6 +102,7 @@ mod tests {
         PersistentBitMatrix, PersistentBitMatrixBuilder,
         PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
     };
+    use obicompactvec::traits::BitSliceMut;
     use tempfile::tempdir;
 
     fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {

From df7b400fdabfcb79e69ffd74e14b756edbd9ac84 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 00:13:16 +0200
Subject: [PATCH 04/24] perf: optimize aggregation with byte-level helpers and
 direct mmap

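Introduce `byte_sum` and `byte_count_nonzero` to efficiently aggregate compact-int byte slices, bypassing per-element decoding and overflow map lookups. Refactor the inherent `sum()` and `count_nonzero()` of the packed matrix and of the reader to work directly on the memory-mapped primary bytes, and rewrite the `IntSlice` default implementations with idiomatic Rust iterators. Additionally, add a public `MemoryIntIter` together with `IntoIterator` for `&MemoryIntVec`, move `MemoryIntVec::get` to an inherent method, add `MemoryIntVec::filled`, and make `MemoryIntVec` delegate `sum()` and `count_nonzero()` (inherent and via `IntSlice`) to the new helpers. Note that the `IntSlice` overrides of `sum` and `count_nonzero` for `PersistentCompactIntVec` are removed: trait-generic callers now go through the default iterator-based implementations, and only the inherent methods take the byte-level fast path.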
---
 src/obicompactvec/src/format.rs       | 20 ++++++++
 src/obicompactvec/src/intmatrix.rs    | 18 +++++--
 src/obicompactvec/src/lib.rs          |  2 +-
 src/obicompactvec/src/memoryintvec.rs | 69 +++++++++++++++++++++++----
 src/obicompactvec/src/reader.rs       | 12 ++---
 src/obicompactvec/src/traits.rs       |  4 +-
 6 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/src/obicompactvec/src/format.rs b/src/obicompactvec/src/format.rs
index 265167d..b3c24d0 100644
--- a/src/obicompactvec/src/format.rs
+++ b/src/obicompactvec/src/format.rs
@@ -13,6 +13,26 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12;
 // Index entry: slot(u64) + pos(u64) = 16 bytes.
 pub const INDEX_ENTRY_SIZE: usize = 16;
 
+/// Sum all values in a compact-int primary byte slice, correcting for overflow sentinels.
+///
+/// `primary` is the raw `&[u8]` where 255 is a sentinel for large values.
+/// `overflow` yields the true values (≥ 255) for each sentinel, in any order.
+#[inline]
+pub(crate) fn byte_sum(primary: &[u8], overflow: impl Iterator<Item = u32>) -> u64 {
+    let raw: u64 = primary.iter().map(|&b| b as u64).sum();
+    let (n, ov) = overflow.fold((0u64, 0u64), |(n, s), v| (n + 1, s + v as u64));
+    raw - 255 * n + ov
+}
+
+/// Count non-zero values in a compact-int primary byte slice.
+///
+/// Overflow sentinels (255) are always non-zero by construction, so a single
+/// `b != 0` test is sufficient — no overflow map lookup needed.
+#[inline]
+pub(crate) fn byte_count_nonzero(primary: &[u8]) -> u64 {
+    primary.iter().filter(|&&b| b != 0).count() as u64
+}
+
 /// Parse a single overflow entry `(slot, value)` from a byte slice.
 #[inline]
 pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) {
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index 9d97f8e..69240dd 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -11,7 +11,7 @@ use rayon::prelude::*;
 use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
 use crate::builder::PersistentCompactIntVecBuilder;
 use crate::memoryintvec::MemoryIntVec;
-use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
+use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
 
@@ -230,16 +230,24 @@ impl PackedCompactIntMatrix {
 
     pub(crate) fn sum(&self) -> Array1<u64> {
         Array1::from_vec(
-            (0..self.n_cols).into_par_iter()
-                .map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum())
+            self.columns.par_iter()
+                .map(|ci| {
+                    let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
+                    let overflow = (0..ci.n_overflow)
+                        .map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1);
+                    byte_sum(primary, overflow)
+                })
                 .collect()
         )
     }
 
     pub(crate) fn count_nonzero(&self) -> Array1<u64> {
         Array1::from_vec(
-            (0..self.n_cols).into_par_iter()
-                .map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
+            self.columns.par_iter()
+                .map(|ci| {
+                    let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
+                    byte_count_nonzero(primary)
+                })
                 .collect()
         )
     }
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index fb2d5e2..ced509b 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -15,7 +15,7 @@ pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_ma
 pub use builder::PersistentCompactIntVecBuilder;
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
-pub use memoryintvec::MemoryIntVec;
+pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
 pub use memoryvec::{MemoryBitIter, MemoryBitVec};
 pub use reader::PersistentCompactIntVec;
 pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs
index 486a0f1..735431c 100644
--- a/src/obicompactvec/src/memoryintvec.rs
+++ b/src/obicompactvec/src/memoryintvec.rs
@@ -4,6 +4,7 @@ use std::ops::{Add, AddAssign, Sub, SubAssign};
 use std::path::Path;
 
 use crate::builder::PersistentCompactIntVecBuilder;
+use crate::format::{byte_count_nonzero, byte_sum};
 use crate::traits::{IntSlice, IntSliceMut};
 
 // ── MemoryIntVec ──────────────────────────────────────────────────────────────
@@ -37,8 +38,31 @@ impl MemoryIntVec {
     pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary }
     pub(crate) fn overflow_map(&self) -> &HashMap<usize, u32> { &self.overflow }
 
-    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
-        (0..self.n).map(move |slot| self.get(slot))
+    pub fn get(&self, slot: usize) -> u32 {
+        match self.primary[slot] {
+            255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
+            v   => v as u32,
+        }
+    }
+
+    pub fn sum(&self) -> u64 {
+        byte_sum(&self.primary, self.overflow.values().copied())
+    }
+
+    pub fn count_nonzero(&self) -> u64 {
+        byte_count_nonzero(&self.primary)
+    }
+
+    pub fn filled(n: usize, value: u32) -> Self {
+        if value < 255 {
+            Self { primary: vec![value as u8; n], overflow: HashMap::new(), n }
+        } else {
+            Self { primary: vec![255u8; n], overflow: (0..n).map(|i| (i, value)).collect(), n }
+        }
+    }
+
+    pub fn iter(&self) -> MemoryIntIter<'_> {
+        MemoryIntIter { vec: self, slot: 0 }
     }
 
     /// Write to disk and return a writable builder at `path`.
@@ -51,13 +75,9 @@ impl MemoryIntVec {
 
 impl IntSlice for MemoryIntVec {
     fn len(&self) -> usize { self.n }
-
-    fn get(&self, slot: usize) -> u32 {
-        match self.primary[slot] {
-            255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
-            v   => v as u32,
-        }
-    }
+    fn get(&self, slot: usize) -> u32 { self.get(slot) }
+    fn sum(&self) -> u64 { self.sum() }
+    fn count_nonzero(&self) -> u64 { self.count_nonzero() }
 }
 
 impl IntSliceMut for MemoryIntVec {
@@ -118,3 +138,34 @@ impl<B: IntSlice> AddAssign<&B> for MemoryIntVec {
 impl<B: IntSlice> SubAssign<&B> for MemoryIntVec {
     fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); }
 }
+
+// ── Iterator ──────────────────────────────────────────────────────────────────
+
+pub struct MemoryIntIter<'a> {
+    vec: &'a MemoryIntVec,
+    slot: usize,
+}
+
+impl Iterator for MemoryIntIter<'_> {
+    type Item = u32;
+
+    fn next(&mut self) -> Option<u32> {
+        if self.slot >= self.vec.n { return None; }
+        let v = self.vec.get(self.slot);
+        self.slot += 1;
+        Some(v)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let rem = self.vec.n - self.slot;
+        (rem, Some(rem))
+    }
+}
+
+impl ExactSizeIterator for MemoryIntIter<'_> {}
+
+impl<'a> IntoIterator for &'a MemoryIntVec {
+    type Item = u32;
+    type IntoIter = MemoryIntIter<'a>;
+    fn into_iter(self) -> MemoryIntIter<'a> { self.iter() }
+}
diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs
index bd3d7d7..4d5b9e0 100644
--- a/src/obicompactvec/src/reader.rs
+++ b/src/obicompactvec/src/reader.rs
@@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
 
 use memmap2::Mmap;
 
-use crate::format::{HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
+use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
 
 pub struct PersistentCompactIntVec {
     mmap: Mmap,
@@ -129,14 +129,14 @@ impl PersistentCompactIntVec {
         u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
     }
 
-    #[inline]
     pub fn sum(&self) -> u64 {
-        self.iter().map(|v| v as u64).sum()
+        let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
+        byte_sum(primary, (0..self.n_overflow).map(|i| self.data_value(i)))
     }
 
-    #[inline]
     pub fn count_nonzero(&self) -> u64 {
-        self.iter().filter(|&v| v > 0).count() as u64
+        let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
+        byte_count_nonzero(primary)
     }
 
     #[inline]
@@ -357,8 +357,6 @@ use crate::traits::IntSlice;
 impl IntSlice for PersistentCompactIntVec {
     fn len(&self) -> usize { self.n }
     fn get(&self, slot: usize) -> u32 { self.get(slot) }
-    fn sum(&self) -> u64 { self.sum() }
-    fn count_nonzero(&self) -> u64 { self.count_nonzero() }
 }
 
 impl<'a> IntoIterator for &'a PersistentCompactIntVec {
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index 32e40a1..ff9df71 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -68,8 +68,8 @@ pub trait IntSlice {
     fn get(&self, slot: usize) -> u32;
     fn is_empty(&self) -> bool { self.len() == 0 }
     fn iter(&self) -> impl Iterator<Item = u32> + '_ { (0..self.len()).map(|i| self.get(i)) }
-    fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() }
-    fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 }
+    fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() }
+    fn count_nonzero(&self) -> u64 { self.iter().filter(|v| *v > 0).count() as u64 }
 
     fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <  threshold) }
     fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) }

From 5ff5b04d2dfe74c93a217f6811667367244d3b3c Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 09:19:30 +0200
Subject: [PATCH 05/24] refactor: replace manual bit ops with BitSlice traits

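Moves the population-count and distance computations (`count_ones`, `count_zeros`, `partial_jaccard_dist`, `jaccard_dist`, `hamming_dist`) into default methods of the `BitSlice` trait, replacing the duplicated per-type byte/word implementations in `PackedBitMatrix`, `PersistentBitVec`, and `MemoryBitVec`; packed matrix columns are now exposed through a new `PackedCol` word view. Extends `IntSlice` and `IntSliceMut` to expose the raw primary byte slice and the overflow entries (`primary_bytes`, `overflow_entries`, `primary_bytes_mut`, `clear_overflow`), so that `copy_from` and `MemoryIntVec::from` bulk-copy the primary bytes instead of visiting every slot, and adds `MemoryIntVec::copy_from_memory`. Replaces the per-bit shift loop in the full-word path of `BitToInt::to_intvec` with a lookup table (`EXPAND_BYTE`). `PersistentBitVecBuilder` now records its file path and gains `finish()`, which flushes the mapping and reopens the file as a read-only `PersistentBitVec`. Test imports are updated to bring `BitSlice` into scope.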
---
 src/obicompactvec/src/bitmatrix.rs       | 100 ++++++++---------------
 src/obicompactvec/src/bitvec.rs          |  58 +++----------
 src/obicompactvec/src/builder.rs         |   6 ++
 src/obicompactvec/src/memoryintvec.rs    |  26 ++++--
 src/obicompactvec/src/memoryvec.rs       |   6 --
 src/obicompactvec/src/reader.rs          |   6 ++
 src/obicompactvec/src/tests/bitmatrix.rs |   2 +-
 src/obicompactvec/src/tests/bitvec.rs    |   2 +-
 src/obicompactvec/src/traits.rs          |  55 +++++++++++--
 9 files changed, 128 insertions(+), 133 deletions(-)

diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index cd7e0e9..591e4af 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -8,7 +8,7 @@ use rayon::prelude::*;
 
 use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
 use crate::memoryvec::MemoryBitVec;
-use crate::traits::BitSliceMut;
+use crate::traits::{BitSlice, BitSliceMut};
 use crate::layer_meta::LayerMeta;
 use crate::meta::MatrixMeta;
 
@@ -126,11 +126,22 @@ impl PackedBitMatrix {
         }).collect()
     }
 
-    #[inline]
     fn col_bytes(&self, c: usize) -> &[u8] {
         let start = self.data_offsets[c];
-        let len = (self.n_rows + 7) / 8;
-        &self.mmap[start..start + len]
+        &self.mmap[start..start + self.n_rows.div_ceil(8)]
+    }
+
+    fn col_words(&self, c: usize) -> &[u64] {
+        let nw = self.n_rows.div_ceil(64);
+        // SAFETY: data_offsets[c] is always 8-byte aligned.
+        // PBMX header = 24 + n_cols×8 (multiple of 8); each PBIV blob =
+        // 16 + nwords×8 (multiple of 8); mmap base is page-aligned.
+        let ptr = self.mmap[self.data_offsets[c]..].as_ptr() as *const u64;
+        unsafe { std::slice::from_raw_parts(ptr, nw) }
+    }
+
+    pub(crate) fn col_slice(&self, c: usize) -> PackedCol<'_> {
+        PackedCol { words: self.col_words(c), n: self.n_rows }
     }
 
     pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
@@ -138,81 +149,40 @@ impl PackedBitMatrix {
     }
 
     pub(crate) fn col_as_memory(&self, c: usize) -> MemoryBitVec {
-        let bytes = self.col_bytes(c);
-        let n = self.n_rows;
-        let n_words = n.div_ceil(64);
-        let mut words = vec![0u64; n_words];
-        let full = bytes.len() / 8;
-        for (i, chunk) in bytes[..full * 8].chunks_exact(8).enumerate() {
-            words[i] = u64::from_le_bytes(chunk.try_into().unwrap());
-        }
-        let rem = bytes.len() % 8;
-        if rem > 0 {
-            let mut last = [0u8; 8];
-            last[..rem].copy_from_slice(&bytes[full * 8..]);
-            words[full] = u64::from_le_bytes(last);
-        }
-        MemoryBitVec::from_words(words, n)
-    }
-
-    fn count_ones_col(&self, c: usize) -> u64 {
-        let bytes = self.col_bytes(c);
-        let full = self.n_rows / 8;
-        let rem  = self.n_rows % 8;
-        let mut n: u64 = bytes[..full].iter().map(|b| b.count_ones() as u64).sum();
-        if rem > 0 { n += (bytes[full] & ((1u8 << rem) - 1)).count_ones() as u64; }
-        n
-    }
-
-    fn pair_op(&self, i: usize, j: usize, and_or: bool) -> u64 {
-        let ai = self.col_bytes(i);
-        let aj = self.col_bytes(j);
-        let full = self.n_rows / 8;
-        let rem  = self.n_rows % 8;
-        let mut n: u64 = ai[..full].iter().zip(aj[..full].iter())
-            .map(|(a, b)| if and_or { a & b } else { a ^ b }.count_ones() as u64)
-            .sum();
-        if rem > 0 {
-            let mask = (1u8 << rem) - 1;
-            let last = if and_or { ai[full] & aj[full] } else { ai[full] ^ aj[full] };
-            n += (last & mask).count_ones() as u64;
-        }
-        n
-    }
-
-    fn partial_jaccard_col(&self, i: usize, j: usize) -> (u64, u64) {
-        let ai = self.col_bytes(i);
-        let aj = self.col_bytes(j);
-        let full = self.n_rows / 8;
-        let rem  = self.n_rows % 8;
-        let (mut inter, mut union) = ai[..full].iter().zip(aj[..full].iter())
-            .fold((0u64, 0u64), |(inter, union), (a, b)| {
-                (inter + (a & b).count_ones() as u64,
-                 union + (a | b).count_ones() as u64)
-            });
-        if rem > 0 {
-            let mask = (1u8 << rem) - 1;
-            inter += ((ai[full] & aj[full]) & mask).count_ones() as u64;
-            union += ((ai[full] | aj[full]) & mask).count_ones() as u64;
-        }
-        (inter, union)
+        MemoryBitVec::from(&self.col_slice(c))
     }
 
     pub(crate) fn count_ones(&self) -> Array1<u64> {
         Array1::from_vec(
-            (0..self.n_cols).into_par_iter().map(|c| self.count_ones_col(c)).collect()
+            (0..self.n_cols).into_par_iter()
+                .map(|c| self.col_slice(c).count_ones())
+                .collect()
         )
     }
 
     pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
-        pairwise2_matrix(self.n_cols, |i, j| self.partial_jaccard_col(i, j))
+        pairwise2_matrix(self.n_cols, |i, j| {
+            self.col_slice(i).partial_jaccard_dist(&self.col_slice(j))
+        })
     }
 
     pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
-        pairwise_matrix(self.n_cols, |i, j| self.pair_op(i, j, false))
+        pairwise_matrix(self.n_cols, |i, j| {
+            self.col_slice(i).hamming_dist(&self.col_slice(j))
+        })
     }
 }
 
+pub(crate) struct PackedCol<'a> {
+    words: &'a [u64],
+    n: usize,
+}
+
+impl BitSlice for PackedCol<'_> {
+    fn len(&self) -> usize { self.n }
+    fn words(&self) -> &[u64] { self.words }
+}
+
 /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
 pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
     let packed_path = dir.join("matrix.pbmx");
diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index dcb52ba..9d78e88 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -78,48 +78,6 @@ impl PersistentBitVec {
         unsafe { std::slice::from_raw_parts(ptr, nw) }
     }
 
-    pub fn count_ones(&self) -> u64 {
-        // Padding bits in the last word are 0, so no masking needed.
-        self.data_words()
-            .iter()
-            .map(|w| w.count_ones() as u64)
-            .sum()
-    }
-
-    pub fn count_zeros(&self) -> u64 {
-        self.n as u64 - self.count_ones()
-    }
-
-    pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
-        let (inter, union) = self.partial_jaccard_dist(other);
-        if union == 0 {
-            return 0.0;
-        }
-        1.0 - inter as f64 / union as f64
-    }
-
-    pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        self.data_words()
-            .iter()
-            .zip(other.data_words())
-            .fold((0u64, 0u64), |(i, u), (&a, &b)| {
-                (
-                    i + (a & b).count_ones() as u64,
-                    u + (a | b).count_ones() as u64,
-                )
-            })
-    }
-
-    pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
-        assert_eq!(self.n, other.n, "length mismatch");
-        self.data_words()
-            .iter()
-            .zip(other.data_words())
-            .map(|(&a, &b)| (a ^ b).count_ones() as u64)
-            .sum()
-    }
-
     pub fn iter(&self) -> BitIter<'_> {
         BitIter {
             bytes: self.data_bytes(),
@@ -168,6 +126,7 @@ impl Iterator for BitIter<'_> {
 pub struct PersistentBitVecBuilder {
     mmap: MmapMut,
     n: usize,
+    path: PathBuf,
 }
 
 impl PersistentBitVecBuilder {
@@ -185,7 +144,7 @@ impl PersistentBitVecBuilder {
         file.seek(SeekFrom::Start(0))?;
         file.set_len(file_size as u64)?;
         let mmap = unsafe { MmapMut::map_mut(&file)? };
-        Ok(Self { mmap, n })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
     /// Create a PBIV file from raw packed bit-bytes, zero-padding to the next word boundary.
@@ -200,7 +159,7 @@ impl PersistentBitVecBuilder {
         mmap[0..4].copy_from_slice(&MAGIC);
         mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes());
         mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()].copy_from_slice(bytes);
-        Ok(Self { mmap, n })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
     pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
@@ -208,7 +167,7 @@ impl PersistentBitVecBuilder {
         let file = OpenOptions::new().read(true).write(true).open(path)?;
         let mmap = unsafe { MmapMut::map_mut(&file)? };
         let n = source.len();
-        Ok(Self { mmap, n })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
     pub fn len(&self) -> usize {
@@ -268,7 +227,7 @@ impl PersistentBitVecBuilder {
             }
         }
 
-        Ok(Self { mmap, n })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
     /// Convert a count vector to a presence/absence bit vector (threshold = 1).
@@ -279,6 +238,13 @@ impl PersistentBitVecBuilder {
     pub fn close(self) -> io::Result<()> {
         self.mmap.flush()
     }
+
+    /// Flush, close, and reopen as a read-only `PersistentBitVec`.
+    pub fn finish(self) -> io::Result<PersistentBitVec> {
+        let path = self.path.clone();
+        self.close()?;
+        PersistentBitVec::open(&path)
+    }
 }
 
 // ── BitSlice / BitSliceMut impls ──────────────────────────────────────────────
diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs
index 4885216..080254b 100644
--- a/src/obicompactvec/src/builder.rs
+++ b/src/obicompactvec/src/builder.rs
@@ -144,8 +144,14 @@ use crate::traits::{IntSlice, IntSliceMut};
 impl IntSlice for PersistentCompactIntVecBuilder {
     fn len(&self) -> usize { self.n }
     fn get(&self, slot: usize) -> u32 { self.get(slot) }
+    fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        self.overflow.iter().map(|(&k, &v)| (k, v))
+    }
 }
 
 impl IntSliceMut for PersistentCompactIntVecBuilder {
     fn set(&mut self, slot: usize, value: u32) { self.set(slot, value); }
+    fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
+    fn clear_overflow(&mut self) { self.overflow.clear(); }
 }
diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs
index 735431c..3c40377 100644
--- a/src/obicompactvec/src/memoryintvec.rs
+++ b/src/obicompactvec/src/memoryintvec.rs
@@ -76,6 +76,10 @@ impl MemoryIntVec {
 impl IntSlice for MemoryIntVec {
     fn len(&self) -> usize { self.n }
     fn get(&self, slot: usize) -> u32 { self.get(slot) }
+    fn primary_bytes(&self) -> &[u8] { &self.primary }
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        self.overflow.iter().map(|(&k, &v)| (k, v))
+    }
     fn sum(&self) -> u64 { self.sum() }
     fn count_nonzero(&self) -> u64 { self.count_nonzero() }
 }
@@ -90,18 +94,28 @@ impl IntSliceMut for MemoryIntVec {
             self.overflow.insert(slot, value);
         }
     }
+    fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.primary }
+    fn clear_overflow(&mut self) { self.overflow.clear(); }
 }
 
 // ── From conversions ──────────────────────────────────────────────────────────
 
+impl MemoryIntVec {
+    /// Bulk copy from another `MemoryIntVec`: memcpy for the primary bytes,
+    /// clone for the overflow map.
+    pub fn copy_from_memory(&mut self, src: &MemoryIntVec) {
+        assert_eq!(self.n, src.n, "MemoryIntVec length mismatch");
+        self.primary.copy_from_slice(&src.primary);
+        self.overflow = src.overflow.clone();
+    }
+}
+
 impl<S: IntSlice> From<&S> for MemoryIntVec {
     fn from(src: &S) -> Self {
-        let mut v = Self::new(src.len());
-        for slot in 0..src.len() {
-            let val = src.get(slot);
-            if val != 0 { v.set(slot, val); }
-        }
-        v
+        Self::from_primary_and_overflow(
+            src.primary_bytes().to_vec(),
+            src.overflow_entries().collect(),
+        )
     }
 }
 
diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs
index 9195982..3076325 100644
--- a/src/obicompactvec/src/memoryvec.rs
+++ b/src/obicompactvec/src/memoryvec.rs
@@ -38,12 +38,6 @@ impl MemoryBitVec {
         (self.words[slot >> 6] >> (slot & 63)) & 1 != 0
     }
 
-    pub fn count_ones(&self) -> u64 {
-        self.words.iter().map(|w| w.count_ones() as u64).sum()
-    }
-
-    pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() }
-
     /// Write to disk and return a writable builder positioned at the same path.
     pub fn persist(&self, path: &Path) -> io::Result<PersistentBitVecBuilder> {
         let mut b = PersistentBitVecBuilder::new(self.n, path)?;
diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs
index 4d5b9e0..4c75762 100644
--- a/src/obicompactvec/src/reader.rs
+++ b/src/obicompactvec/src/reader.rs
@@ -357,6 +357,12 @@ use crate::traits::IntSlice;
 impl IntSlice for PersistentCompactIntVec {
     fn len(&self) -> usize { self.n }
     fn get(&self, slot: usize) -> u32 { self.get(slot) }
+    fn primary_bytes(&self) -> &[u8] {
+        &self.mmap[self.primary_offset..self.primary_offset + self.n]
+    }
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        (0..self.n_overflow).map(|i| (self.data_slot(i), self.data_value(i)))
+    }
 }
 
 impl<'a> IntoIterator for &'a PersistentCompactIntVec {
diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs
index 3304410..dced37f 100644
--- a/src/obicompactvec/src/tests/bitmatrix.rs
+++ b/src/obicompactvec/src/tests/bitmatrix.rs
@@ -1,7 +1,7 @@
 use tempfile::tempdir;
 
 use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
-use crate::traits::{BitPartials, BitSliceMut};
+use crate::traits::{BitPartials, BitSlice, BitSliceMut};
 
 fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
     let n = cols.first().map_or(0, |c| c.len());
diff --git a/src/obicompactvec/src/tests/bitvec.rs b/src/obicompactvec/src/tests/bitvec.rs
index a408e7d..7382e14 100644
--- a/src/obicompactvec/src/tests/bitvec.rs
+++ b/src/obicompactvec/src/tests/bitvec.rs
@@ -1,6 +1,6 @@
 use tempfile::tempdir;
 
-use crate::traits::BitSliceMut;
+use crate::traits::{BitSlice, BitSliceMut};
 use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
 
 fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) {
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index ff9df71..e1f15f0 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -13,6 +13,27 @@ pub trait BitSlice {
     fn get(&self, slot: usize) -> bool {
         (self.words()[slot >> 6] >> (slot & 63)) & 1 != 0
     }
+    fn count_ones(&self) -> u64 {
+        self.words().iter().map(|w| w.count_ones() as u64).sum()
+    }
+    fn count_zeros(&self) -> u64 { self.len() as u64 - self.count_ones() }
+    fn partial_jaccard_dist<S: BitSlice>(&self, other: &S) -> (u64, u64) {
+        assert_eq!(self.len(), other.len(), "length mismatch");
+        self.words().iter().zip(other.words())
+            .fold((0u64, 0u64), |(i, u), (&a, &b)| {
+                (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64)
+            })
+    }
+    fn jaccard_dist<S: BitSlice>(&self, other: &S) -> f64 {
+        let (inter, union) = self.partial_jaccard_dist(other);
+        if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
+    }
+    fn hamming_dist<S: BitSlice>(&self, other: &S) -> u64 {
+        assert_eq!(self.len(), other.len(), "length mismatch");
+        self.words().iter().zip(other.words())
+            .map(|(&a, &b)| (a ^ b).count_ones() as u64)
+            .sum()
+    }
 }
 
 /// Mutable view over a bit-vector word array; default methods maintain the
@@ -66,6 +87,10 @@ pub trait BitSliceMut: BitSlice {
 pub trait IntSlice {
     fn len(&self) -> usize;
     fn get(&self, slot: usize) -> u32;
+    /// Raw primary byte slice (sentinel 255 marks overflow slots).
+    fn primary_bytes(&self) -> &[u8];
+    /// Iterator over `(slot, true_value)` pairs for all overflow entries (value >= 255).
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_;
     fn is_empty(&self) -> bool { self.len() == 0 }
     fn iter(&self) -> impl Iterator<Item = u32> + '_ { (0..self.len()).map(|i| self.get(i)) }
     fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() }
@@ -90,6 +115,8 @@ pub trait IntSlice {
 /// compact encoding invariants on the implementor's side.
 pub trait IntSliceMut: IntSlice {
     fn set(&mut self, slot: usize, value: u32);
+    fn primary_bytes_mut(&mut self) -> &mut [u8];
+    fn clear_overflow(&mut self);
 
     fn inc(&mut self, slot: usize) -> &mut Self {
         let v = self.get(slot);
@@ -111,7 +138,9 @@ pub trait IntSliceMut: IntSlice {
 
     fn copy_from<S: IntSlice>(&mut self, src: &S) -> &mut Self {
         assert_eq!(self.len(), src.len(), "IntSlice length mismatch");
-        for s in 0..src.len() { self.set(s, src.get(s)); }
+        self.primary_bytes_mut().copy_from_slice(src.primary_bytes());
+        self.clear_overflow();
+        for (slot, val) in src.overflow_entries() { self.set(slot, val); }
         self
     }
 
@@ -176,13 +205,26 @@ impl<T: IntSlice> IntToBit for T {}
 
 use crate::memoryintvec::MemoryIntVec;
 
+// Maps each byte value to its 8 constituent bits as individual u8 (0 or 1).
+static EXPAND_BYTE: [[u8; 8]; 256] = {
+    let mut table = [[0u8; 8]; 256];
+    let mut b = 0usize;
+    while b < 256 {
+        let mut bit = 0usize;
+        while bit < 8 {
+            table[b][bit] = ((b >> bit) & 1) as u8;
+            bit += 1;
+        }
+        b += 1;
+    }
+    table
+};
+
 pub trait BitToInt: BitSlice {
     fn to_intvec(&self) -> MemoryIntVec {
         let n = self.len();
         let mut primary = vec![0u8; n];
 
-        // Unpack u64 words: each byte within a word yields 8 output bytes.
-        // Values are always 0 or 1 → no overflow entries needed.
         let words = self.words();
         let full_words = n / 64;
 
@@ -190,14 +232,11 @@ pub trait BitToInt: BitSlice {
             let base = w_idx * 64;
             for byte_off in 0..8usize {
                 let byte = (word >> (byte_off * 8)) as u8;
-                let out = &mut primary[base + byte_off * 8..base + byte_off * 8 + 8];
-                for bit in 0..8usize {
-                    out[bit] = (byte >> bit) & 1;
-                }
+                primary[base + byte_off * 8..base + byte_off * 8 + 8]
+                    .copy_from_slice(&EXPAND_BYTE[byte as usize]);
             }
         }
 
-        // Remaining bits in the last partial word
         let rem = n % 64;
         if rem > 0 {
             let word = words[full_words];

From aa98e82875c3cc07ddf13d4c82180bbace34c2ec Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 09:28:16 +0200
Subject: [PATCH 06/24] refactor: introduce PackedIntCol view and use iterators

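Introduces `PackedIntCol`, a lightweight `IntSlice` view over one column of the shared mmap, and moves the overflow lookup (`overflow_get`) from `PackedCompactIntMatrix` onto it. Column-level operations (`col_persist`, `col_as_memory`, `get`, `sum`, `count_nonzero`) now go through `col_slice()`. The pairwise distance primitives zip two column iterators instead of calling `get(i, s)` for every row; the dedicated `PackedIntColIter` walks the overflow table with a running cursor, so a sequential scan no longer performs a binary search for each overflow slot.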
---
 src/obicompactvec/src/intmatrix.rs | 229 ++++++++++++++++++-----------
 1 file changed, 146 insertions(+), 83 deletions(-)

diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index 69240dd..0be16fb 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -14,6 +14,7 @@ use crate::memoryintvec::MemoryIntVec;
 use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
+use crate::traits::IntSlice;
 
 fn col_path(dir: &Path, col: usize) -> PathBuf {
     dir.join(format!("col_{col:06}.pciv"))
@@ -124,6 +125,107 @@ struct ColInfo {
     index:         Vec<(usize, usize)>,
 }
 
+// ── PackedIntCol — lightweight column view backed by the shared mmap ──────────
+
+pub(crate) struct PackedIntCol<'a> {
+    primary:    &'a [u8],
+    overflow:   &'a [u8],  // raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE
+    n_overflow: usize,
+    step:       usize,
+    index:      &'a [(usize, usize)],
+    n:          usize,
+}
+
+impl PackedIntCol<'_> {
+    fn overflow_get(&self, slot: usize) -> u32 {
+        let (pos_start, pos_end) = if self.step == 0 {
+            (0, self.n_overflow)
+        } else {
+            let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
+            let start = self.index[i].1;
+            let end   = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
+            (start, end)
+        };
+        let mut lo = pos_start;
+        let mut hi = pos_end;
+        while lo < hi {
+            let mid = lo + (hi - lo) / 2;
+            let (stored, val) = parse_overflow_entry(self.overflow, 0, mid);
+            match stored.cmp(&slot) {
+                Ordering::Equal   => return val,
+                Ordering::Less    => lo = mid + 1,
+                Ordering::Greater => hi = mid,
+            }
+        }
+        panic!("slot {slot} marked overflow but not found")
+    }
+}
+
+impl IntSlice for PackedIntCol<'_> {
+    fn len(&self) -> usize { self.n }
+
+    fn get(&self, slot: usize) -> u32 {
+        let v = self.primary[slot];
+        if v < 255 { v as u32 } else { self.overflow_get(slot) }
+    }
+
+    fn primary_bytes(&self) -> &[u8] { self.primary }
+
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i))
+    }
+
+    fn iter(&self) -> impl Iterator<Item = u32> + '_ {
+        PackedIntColIter {
+            primary:      self.primary,
+            overflow:     self.overflow,
+            slot:         0,
+            overflow_pos: 0,
+            n:            self.n,
+        }
+    }
+
+    fn sum(&self) -> u64 {
+        byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1))
+    }
+
+    fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) }
+}
+
+struct PackedIntColIter<'a> {
+    primary:      &'a [u8],
+    overflow:     &'a [u8],
+    slot:         usize,
+    overflow_pos: usize,
+    n:            usize,
+}
+
+impl Iterator for PackedIntColIter<'_> {
+    type Item = u32;
+
+    fn next(&mut self) -> Option<u32> {
+        if self.slot >= self.n { return None; }
+        let v = self.primary[self.slot];
+        self.slot += 1;
+        if v < 255 {
+            Some(v as u32)
+        } else {
+            let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos);
+            self.overflow_pos += 1;
+            Some(val)
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let rem = self.n - self.slot;
+        (rem, Some(rem))
+    }
+}
+
+impl ExactSizeIterator for PackedIntColIter<'_> {}
+
+// ─────────────────────────────────────────────────────────────────────────────
+
 pub struct PackedCompactIntMatrix {
     mmap:    Mmap,
     n_rows:  usize,
@@ -148,10 +250,10 @@ impl PackedCompactIntMatrix {
             let off_pos  = PCMX_HEADER + c * 8;
             let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
             // Parse embedded PCIV header at col_base
-            let n_ov    = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
-            let n_idx   = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
-            let step    = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
-            let n_pciv  = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap())  as usize;
+            let n_ov   = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
+            let n_idx  = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
+            let step   = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
+            let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap())  as usize;
 
             let primary_start = col_base + HEADER_SIZE;
             let data_offset   = primary_start + n_pciv;
@@ -167,57 +269,31 @@ impl PackedCompactIntMatrix {
         Ok(Self { mmap, n_rows, n_cols, columns })
     }
 
-    fn col_overflow_map(&self, ci: &ColInfo) -> HashMap<usize, u32> {
-        let mut overflow = HashMap::with_capacity(ci.n_overflow);
-        for i in 0..ci.n_overflow {
-            let (slot, value) = parse_overflow_entry(&self.mmap, ci.data_offset, i);
-            overflow.insert(slot, value);
+    pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> {
+        let ci = &self.columns[c];
+        PackedIntCol {
+            primary:    &self.mmap[ci.primary_start..ci.primary_start + self.n_rows],
+            overflow:   &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE],
+            n_overflow: ci.n_overflow,
+            step:       ci.step,
+            index:      &ci.index,
+            n:          self.n_rows,
         }
-        overflow
     }
 
     pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
-        let ci = &self.columns[c];
-        let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
-        PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path)
+        let col = self.col_slice(c);
+        let overflow: HashMap<usize, u32> = col.overflow_entries().collect();
+        PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path)
     }
 
     pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
-        let ci = &self.columns[c];
-        let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
-        MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci))
+        MemoryIntVec::from(&self.col_slice(c))
     }
 
     #[inline]
     pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
-        let ci = &self.columns[col];
-        let v = self.mmap[ci.primary_start + slot];
-        if v < 255 { return v as u32; }
-        self.overflow_get(ci, slot)
-    }
-
-    fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 {
-        let (pos_start, pos_end) = if ci.step == 0 {
-            (0, ci.n_overflow)
-        } else {
-            let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
-            let start = ci.index[i].1;
-            let end   = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow };
-            (start, end)
-        };
-        let mut lo = pos_start;
-        let mut hi = pos_end;
-        while lo < hi {
-            let mid = lo + (hi - lo) / 2;
-            let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE;
-            let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
-            match stored.cmp(&slot) {
-                Ordering::Equal   => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()),
-                Ordering::Less    => lo = mid + 1,
-                Ordering::Greater => hi = mid,
-            }
-        }
-        panic!("slot {slot} marked overflow but not found")
+        self.col_slice(col).get(slot)
     }
 
     pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
@@ -230,73 +306,62 @@ impl PackedCompactIntMatrix {
 
     pub(crate) fn sum(&self) -> Array1<u64> {
         Array1::from_vec(
-            self.columns.par_iter()
-                .map(|ci| {
-                    let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
-                    let overflow = (0..ci.n_overflow)
-                        .map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1);
-                    byte_sum(primary, overflow)
-                })
+            (0..self.n_cols).into_par_iter()
+                .map(|c| self.col_slice(c).sum())
                 .collect()
         )
     }
 
     pub(crate) fn count_nonzero(&self) -> Array1<u64> {
         Array1::from_vec(
-            self.columns.par_iter()
-                .map(|ci| {
-                    let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
-                    byte_count_nonzero(primary)
-                })
+            (0..self.n_cols).into_par_iter()
+                .map(|c| self.col_slice(c).count_nonzero())
                 .collect()
         )
     }
 
-    // ── Pair primitives ───────────────────────────────────────────────────────
+    // ── Pair primitives — sequential scan via col_slice().iter() ─────────────
 
     fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
-        (0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum()
+        self.col_slice(i).iter().zip(self.col_slice(j).iter())
+            .map(|(a, b)| a.min(b) as u64)
+            .sum()
     }
 
     fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
-        (0..self.n_rows).map(|s| {
-            let d = self.get(i, s) as f64 - self.get(j, s) as f64;
-            d * d
-        }).sum()
+        self.col_slice(i).iter().zip(self.col_slice(j).iter())
+            .map(|(a, b)| { let d = a as f64 - b as f64; d * d })
+            .sum()
     }
 
     fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
-        let (mut inter, mut union) = (0u64, 0u64);
-        for s in 0..self.n_rows {
-            let a = self.get(i, s) >= t;
-            let b = self.get(j, s) >= t;
-            if a && b { inter += 1; }
-            if a || b { union += 1; }
-        }
-        (inter, union)
+        self.col_slice(i).iter().zip(self.col_slice(j).iter())
+            .fold((0u64, 0u64), |(inter, uni), (a, b)| {
+                let ap = a >= t;
+                let bp = b >= t;
+                (inter + (ap & bp) as u64, uni + (ap | bp) as u64)
+            })
     }
 
     fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
         if si == 0.0 || sj == 0.0 { return 0.0; }
-        (0..self.n_rows).map(|s| {
-            (self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj)
-        }).sum()
+        self.col_slice(i).iter().zip(self.col_slice(j).iter())
+            .map(|(a, b)| (a as f64 / si).min(b as f64 / sj))
+            .sum()
     }
 
     fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
         if si == 0.0 || sj == 0.0 { return 0.0; }
-        (0..self.n_rows).map(|s| {
-            let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj;
-            d * d
-        }).sum()
+        self.col_slice(i).iter().zip(self.col_slice(j).iter())
+            .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d })
+            .sum()
     }
 
     fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
         if si == 0.0 || sj == 0.0 { return 0.0; }
-        (0..self.n_rows).map(|s| {
-            let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt();
-            d * d
-        }).sum()
+        self.col_slice(i).iter().zip(self.col_slice(j).iter())
+            .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d })
+            .sum()
     }
 
     // ── Matrix methods ────────────────────────────────────────────────────────
@@ -324,7 +389,6 @@ impl PackedCompactIntMatrix {
     pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
-
 }
 
 /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
@@ -516,4 +580,3 @@ impl PersistentCompactIntMatrixBuilder {
         MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
     }
 }
-

From 497d250d8aaf3224ac1d1b63532afd3342d2b718 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 09:32:12 +0200
Subject: [PATCH 07/24] refactor: replace byte-level bit iteration with 64-bit
 words

Refactor `BitIter` to read bits from `u64` words using word-indexed shifts instead of byte-level operations. Remove the now-duplicate `MemoryBitIter`: `MemoryBitVec::iter()` and its `IntoIterator` implementation return the shared `BitIter` instead, and `MemoryBitIter` is dropped from the crate's public re-exports, narrowing the crate's interface. For `PersistentBitVec`, the word view relies on the alignment guarantee documented on `data_words()` (page-aligned mmap, 16-byte header).
---
 src/obicompactvec/src/bitvec.rs    | 25 ++++++----------------
 src/obicompactvec/src/lib.rs       |  2 +-
 src/obicompactvec/src/memoryvec.rs | 34 +++++-------------------------
 3 files changed, 13 insertions(+), 48 deletions(-)

diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index 9d78e88..1d91b10 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -65,12 +65,7 @@ impl PersistentBitVec {
         (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
     }
 
-    // Used by iter() and get(): exact byte window, no padding.
-    fn data_bytes(&self) -> &[u8] {
-        &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n.div_ceil(8)]
-    }
-
-    // Bulk word view. SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
+    // SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
     // so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes.
     fn data_words(&self) -> &[u64] {
         let nw = n_words(self.n);
@@ -79,11 +74,7 @@ impl PersistentBitVec {
     }
 
     pub fn iter(&self) -> BitIter<'_> {
-        BitIter {
-            bytes: self.data_bytes(),
-            slot: 0,
-            n: self.n,
-        }
+        BitIter { words: self.data_words(), slot: 0, n: self.n }
     }
 }
 
@@ -96,9 +87,9 @@ impl<'a> IntoIterator for &'a PersistentBitVec {
 }
 
 pub struct BitIter<'a> {
-    bytes: &'a [u8],
-    slot: usize,
-    n: usize,
+    pub(crate) words: &'a [u64],
+    pub(crate) slot: usize,
+    pub(crate) n: usize,
 }
 
 impl ExactSizeIterator for BitIter<'_> {}
@@ -107,10 +98,8 @@ impl Iterator for BitIter<'_> {
     type Item = bool;
 
     fn next(&mut self) -> Option<bool> {
-        if self.slot >= self.n {
-            return None;
-        }
-        let v = (self.bytes[self.slot >> 3] >> (self.slot & 7)) & 1 != 0;
+        if self.slot >= self.n { return None; }
+        let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
         self.slot += 1;
         Some(v)
     }
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index ced509b..3a5f1c4 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -16,7 +16,7 @@ pub use builder::PersistentCompactIntVecBuilder;
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
 pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
-pub use memoryvec::{MemoryBitIter, MemoryBitVec};
+pub use memoryvec::MemoryBitVec;
 pub use reader::PersistentCompactIntVec;
 pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
 
diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs
index 3076325..fef0960 100644
--- a/src/obicompactvec/src/memoryvec.rs
+++ b/src/obicompactvec/src/memoryvec.rs
@@ -2,7 +2,7 @@ use std::io;
 use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not};
 use std::path::Path;
 
-use crate::bitvec::{PersistentBitVecBuilder, n_words};
+use crate::bitvec::{BitIter, PersistentBitVecBuilder, n_words};
 use crate::traits::{BitSlice, BitSliceMut};
 
 // ── MemoryBitVec ──────────────────────────────────────────────────────────────
@@ -125,38 +125,14 @@ impl<B: BitSlice> BitXorAssign<&B> for MemoryBitVec {
 
 // ── Iterator ──────────────────────────────────────────────────────────────────
 
-pub struct MemoryBitIter<'a> {
-    words: &'a [u64],
-    slot: usize,
-    n: usize,
-}
-
-impl Iterator for MemoryBitIter<'_> {
-    type Item = bool;
-
-    fn next(&mut self) -> Option<bool> {
-        if self.slot >= self.n { return None; }
-        let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
-        self.slot += 1;
-        Some(v)
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let rem = self.n - self.slot;
-        (rem, Some(rem))
-    }
-}
-
-impl ExactSizeIterator for MemoryBitIter<'_> {}
-
 impl MemoryBitVec {
-    pub fn iter(&self) -> MemoryBitIter<'_> {
-        MemoryBitIter { words: &self.words, slot: 0, n: self.n }
+    pub fn iter(&self) -> BitIter<'_> {
+        BitIter { words: &self.words, slot: 0, n: self.n }
     }
 }
 
 impl<'a> IntoIterator for &'a MemoryBitVec {
     type Item = bool;
-    type IntoIter = MemoryBitIter<'a>;
-    fn into_iter(self) -> MemoryBitIter<'a> { self.iter() }
+    type IntoIter = BitIter<'a>;
+    fn into_iter(self) -> BitIter<'a> { self.iter() }
 }

From 26de90f18d0260c300484ba518696709c194171b Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 09:48:09 +0200
Subject: [PATCH 08/24] feat: add iteration and aggregation to compact int vec

Override the `IntSlice` methods `sum()`, `count_nonzero()`, and `iter()` in the concrete types to complete the numeric vector interface. The builder computes `sum()` and `count_nonzero()` from the memory-mapped primary bytes (passing the overflow values along for `sum()`), while the reader and `MemoryIntVec` delegate to their inherent methods. The iterators yield decoded `u32` values.
---
 src/obicompactvec/src/builder.rs      | 8 +++++++-
 src/obicompactvec/src/memoryintvec.rs | 1 +
 src/obicompactvec/src/reader.rs       | 3 +++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs
index 080254b..3e622d9 100644
--- a/src/obicompactvec/src/builder.rs
+++ b/src/obicompactvec/src/builder.rs
@@ -5,7 +5,7 @@ use std::path::{Path, PathBuf};
 
 use memmap2::MmapMut;
 
-use crate::format::{HEADER_SIZE, finalize_pciv, parse_overflow_entry};
+use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
 use crate::reader::PersistentCompactIntVec;
 
 pub struct PersistentCompactIntVecBuilder {
@@ -148,6 +148,12 @@ impl IntSlice for PersistentCompactIntVecBuilder {
     fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
         self.overflow.iter().map(|(&k, &v)| (k, v))
     }
+    fn sum(&self) -> u64 {
+        byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
+    }
+    fn count_nonzero(&self) -> u64 {
+        byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
+    }
 }
 
 impl IntSliceMut for PersistentCompactIntVecBuilder {
diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs
index 3c40377..d5ca280 100644
--- a/src/obicompactvec/src/memoryintvec.rs
+++ b/src/obicompactvec/src/memoryintvec.rs
@@ -80,6 +80,7 @@ impl IntSlice for MemoryIntVec {
     fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
         self.overflow.iter().map(|(&k, &v)| (k, v))
     }
+    fn iter(&self) -> impl Iterator<Item = u32> + '_ { self.iter() }
     fn sum(&self) -> u64 { self.sum() }
     fn count_nonzero(&self) -> u64 { self.count_nonzero() }
 }
diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs
index 4c75762..af7d05c 100644
--- a/src/obicompactvec/src/reader.rs
+++ b/src/obicompactvec/src/reader.rs
@@ -363,6 +363,9 @@ impl IntSlice for PersistentCompactIntVec {
     fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
         (0..self.n_overflow).map(|i| (self.data_slot(i), self.data_value(i)))
     }
+    fn iter(&self) -> impl Iterator<Item = u32> + '_ { self.iter() }
+    fn sum(&self) -> u64 { self.sum() }
+    fn count_nonzero(&self) -> u64 { self.count_nonzero() }
 }
 
 impl<'a> IntoIterator for &'a PersistentCompactIntVec {

From 7ed7b2603910ba689f723de0feceb73299c81147 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 12:30:40 +0200
Subject: [PATCH 09/24] perf: optimize vec arithmetic and add overflow tests

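Refactor `cmp_scalar`, `min`, `max`, `add`, and `diff` to operate directly on the primary byte array, handling overflow slots in a separate pass (`cmp_scalar`, `min`, `max`) or in a cold branch (`add`, `diff`). This removes overflow lookups from the hot path, and the byte loops of `min` and `max` are written to be auto-vectorizable. Note that `add` now uses plain `+` instead of `saturating_add` when either operand is in overflow. Add six unit tests to validate correct promotion and demotion between the primary array and the overflow store when values cross the 255 threshold.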
---
 src/obicompactvec/src/tests/memoryvec.rs | 125 +++++++++++++++++++++++
 src/obicompactvec/src/traits.rs          |  83 ++++++++++++++-
 2 files changed, 203 insertions(+), 5 deletions(-)

diff --git a/src/obicompactvec/src/tests/memoryvec.rs b/src/obicompactvec/src/tests/memoryvec.rs
index 21c12c9..3fd4afb 100644
--- a/src/obicompactvec/src/tests/memoryvec.rs
+++ b/src/obicompactvec/src/tests/memoryvec.rs
@@ -305,6 +305,110 @@ fn count_bits_skips_zero_words() {
     assert_eq!(count.get(127), 1);
 }
 
+// ── min / max / add / diff — overflow edge cases ──────────────────────────────
+
+#[test]
+fn miv_min_overflow_edges() {
+    // [300, 50, 400, 300] min [50, 300, 500, 200]
+    // slot 0: self=overflow(300), other=primary(50)  → 50   (overflow removed)
+    // slot 1: self=primary(50),   other=overflow(300) → 50   (no overflow created)
+    // slot 2: self=overflow(400), other=overflow(500) → 400  (overflow updated)
+    // slot 3: self=overflow(300), other=primary(200)  → 200  (overflow removed, 200 < 255)
+    let mut a = MemoryIntVec::new(4);
+    a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 300);
+    let mut b = MemoryIntVec::new(4);
+    b.set(0, 50); b.set(1, 300); b.set(2, 500); b.set(3, 200);
+    IntSliceMut::min(&mut a, &b);
+    assert_eq!(a.get(0), 50);
+    assert_eq!(a.get(1), 50);
+    assert_eq!(a.get(2), 400);
+    assert_eq!(a.get(3), 200);
+    // Only slot 2 should still have an overflow entry.
+    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
+    assert_eq!(ov.len(), 1);
+    assert_eq!(ov[&2], 400);
+}
+
+#[test]
+fn miv_max_overflow_edges() {
+    // [50, 300, 100, 400] max [300, 50, 500, 200]
+    // slot 0: self=primary(50),   other=overflow(300) → 300  (overflow created)
+    // slot 1: self=overflow(300), other=primary(50)   → 300  (overflow unchanged)
+    // slot 2: self=primary(100),  other=overflow(500) → 500  (overflow created)
+    // slot 3: self=overflow(400), other=primary(200)  → 400  (overflow unchanged)
+    // Note: 200 < 255, so other's slot 3 lives in the primary array, not in overflow.
+    let mut a = MemoryIntVec::new(4);
+    a.set(0, 50); a.set(1, 300); a.set(2, 100); a.set(3, 400);
+    let mut b = MemoryIntVec::new(4);
+    b.set(0, 300); b.set(1, 50); b.set(2, 500); b.set(3, 200);
+    IntSliceMut::max(&mut a, &b);
+    assert_eq!(a.get(0), 300);
+    assert_eq!(a.get(1), 300);
+    assert_eq!(a.get(2), 500);
+    assert_eq!(a.get(3), 400);
+    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
+    assert_eq!(ov.len(), 4); // all four results >= 255
+    assert_eq!(ov[&0], 300);
+    assert_eq!(ov[&1], 300);
+    assert_eq!(ov[&2], 500);
+    assert_eq!(ov[&3], 400);
+}
+
+#[test]
+fn miv_add_overflow_edges() {
+    // [300, 50, 400, 200] + [50, 300, 200, 200]
+    // slot 0: self=overflow(300), other=primary(50)   → 350  (overflow updated)
+    // slot 1: self=primary(50),   other=overflow(300) → 350  (overflow created from primary)
+    // slot 2: self=overflow(400), other=primary(200)  → 600  (overflow updated;
+    //         200 < 255, so other's slot 2 lives in the primary array)
+    // slot 3: self=primary(200),  other=primary(200)  → 400  (overflow created, 400 >= 255)
+    let mut a = MemoryIntVec::new(4);
+    a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 200);
+    let mut b = MemoryIntVec::new(4);
+    b.set(0, 50); b.set(1, 300); b.set(2, 200); b.set(3, 200);
+    a.add(&b);
+    assert_eq!(a.get(0), 350);
+    assert_eq!(a.get(1), 350);
+    assert_eq!(a.get(2), 600);
+    assert_eq!(a.get(3), 400);
+    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
+    assert_eq!(ov.len(), 4);
+}
+
+#[test]
+fn miv_add_both_overflow() {
+    // [300] + [400] = [700]
+    let mut a = MemoryIntVec::new(1);
+    a.set(0, 300);
+    let mut b = MemoryIntVec::new(1);
+    b.set(0, 400);
+    a.add(&b);
+    assert_eq!(a.get(0), 700);
+    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
+    assert_eq!(ov[&0], 700);
+}
+
+#[test]
+fn miv_diff_overflow_edges() {
+    // [300, 400, 400, 50] - [100, 50, 350, 300]
+    // slot 0: self=overflow(300), other=primary(100)   → 200  (overflow removed, 200 < 255)
+    // slot 1: self=overflow(400), other=primary(50)    → 350  (overflow updated, 350 >= 255)
+    // slot 2: self=overflow(400), other=overflow(350)  → 50   (overflow removed, 50 < 255)
+    // slot 3: self=primary(50),   other=overflow(300)  → 0    (saturating, stays primary)
+    let mut a = MemoryIntVec::new(4);
+    a.set(0, 300); a.set(1, 400); a.set(2, 400); a.set(3, 50);
+    let mut b = MemoryIntVec::new(4);
+    b.set(0, 100); b.set(1, 50); b.set(2, 350); b.set(3, 300);
+    a.diff(&b);
+    assert_eq!(a.get(0), 200);
+    assert_eq!(a.get(1), 350);
+    assert_eq!(a.get(2), 50);
+    assert_eq!(a.get(3), 0);
+    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
+    assert_eq!(ov.len(), 1); // only slot 1 remains overflow
+    assert_eq!(ov[&1], 350);
+}
+
 // ── Comparison operators ──────────────────────────────────────────────────────
 
 #[test]
@@ -340,6 +444,27 @@ fn cmp_leq() {
     assert!(bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
 }
 
+#[test]
+fn cmp_scalar_with_overflow() {
+    // Slots: [10, 1000, 50, 500, 0]
+    // geq(100): slots 1 (1000) and 3 (500) → both overflow, must qualify
+    // lt(500):  slots 0 (10), 2 (50), 4 (0) → primary; slot 1 (1000) → no; slot 3 (500) → no
+    // geq(2000): even the largest value (slot 1 = 1000) is below 2000, so no slot qualifies
+    let mut v = MemoryIntVec::new(5);
+    v.set(0, 10); v.set(1, 1000); v.set(2, 50); v.set(3, 500); v.set(4, 0);
+
+    let bv = v.geq(100);
+    assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2));
+    assert!(bv.get(3)); assert!(!bv.get(4));
+
+    let bv = v.lt(500);
+    assert!(bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
+    assert!(!bv.get(3)); assert!(bv.get(4));
+
+    let bv = v.geq(2000);
+    assert!(!(0..5).any(|s| bv.get(s)));
+}
+
 #[test]
 fn filter_pattern() {
     // Typical filter: ingroup >= min_count AND outgroup <= max_outgroup
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index e1f15f0..0e0e903 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+
 use ndarray::{Array1, Array2};
 
 // ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
@@ -104,8 +106,18 @@ pub trait IntSlice {
     fn cmp_scalar(&self, pred: impl Fn(u32) -> bool) -> MemoryBitVec {
         let n = self.len();
         let mut words = vec![0u64; n.div_ceil(64)];
+        let primary = self.primary_bytes();
+        // Pass 1: byte scan — no HashMap access, vectorisable for simple predicates.
+        // Overflow slots (b == 255) are left as 0 and fixed in pass 2.
         for s in 0..n {
-            if pred(self.get(s)) { words[s >> 6] |= 1u64 << (s & 63); }
+            let b = primary[s];
+            if b < 255 && pred(b as u32) {
+                words[s >> 6] |= 1u64 << (s & 63);
+            }
+        }
+        // Pass 2: fix up overflow slots — O(k), negligible.
+        for (s, val) in self.overflow_entries() {
+            if pred(val) { words[s >> 6] |= 1u64 << (s & 63); }
         }
         MemoryBitVec::from_words(words, n)
     }
@@ -146,25 +158,86 @@ pub trait IntSliceMut: IntSlice {
 
     fn min<S: IntSlice>(&mut self, other: &S) -> &mut Self {
         assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        for s in 0..other.len() { self.set(s, self.get(s).min(other.get(s))); }
+        // Snapshot both overflow sets (O(k), tiny) before mutating self.
+        // 255 = +∞ on u8, so byte-level min is correct in all cases except
+        // both-overflow: only those slots need a fixup pass.
+        let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
+        let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
+        self.clear_overflow();
+        // Pass 1 — SIMD-vectorizable byte min over the full primary array.
+        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
+            if b < *a { *a = b; }
+        }
+        // Pass 2 — fixup slots where BOTH sides were overflow (primary = 255 after pass 1,
+        // but the overflow value may have changed).  Slots where only self was overflow are
+        // already correct: pass 1 wrote other.primary[slot] < 255 and clear_overflow removed
+        // the stale entry.
+        for (slot, self_val) in self_ov {
+            if let Some(&other_val) = other_ov.get(&slot) {
+                self.set(slot, self_val.min(other_val));
+            }
+        }
         self
     }
 
     fn max<S: IntSlice>(&mut self, other: &S) -> &mut Self {
         assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        for s in 0..other.len() { self.set(s, self.get(s).max(other.get(s))); }
+        // Pre-pass — process other's overflow entries BEFORE the byte pass.
+        // After the byte pass, self.primary[slot] = 255 for all slots in other_ov,
+        // making it impossible to recover the original self value; we need it now.
+        for (slot, other_val) in other.overflow_entries() {
+            let self_val = self.get(slot);
+            self.set(slot, self_val.max(other_val));
+        }
+        // Pass 1 — SIMD-vectorizable byte max over the full primary array.
+        // 255 = +∞ on u8 → max(a, 255) = 255 is the correct sentinel for all
+        // overflow slots, whether handled by the pre-pass or already in self.
+        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
+            if b > *a { *a = b; }
+        }
         self
     }
 
     fn add<S: IntSlice>(&mut self, other: &S) -> &mut Self {
         assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        for s in 0..other.len() { self.set(s, self.get(s).saturating_add(other.get(s))); }
+        let n = self.len();
+        for s in 0..n {
+            // Read both primary bytes first — u8 is Copy, borrows released immediately.
+            let sb = self.primary_bytes()[s];
+            let ob = other.primary_bytes()[s];
+            if sb < 255 && ob < 255 {
+                // Hot path: no overflow lookup, no HashMap write in the common case.
+                let sum = sb as u32 + ob as u32;
+                if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; }
+                else         { self.set(s, sum); }
+            } else {
+                // At least one side is in overflow — get() is unavoidable.
+                let self_val = self.get(s);
+                let other_val = other.get(s);
+                self.set(s, self_val + other_val);
+            }
+        }
         self
     }
 
     fn diff<S: IntSlice>(&mut self, other: &S) -> &mut Self {
         assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        for s in 0..other.len() { self.set(s, self.get(s).saturating_sub(other.get(s))); }
+        let n = self.len();
+        for s in 0..n {
+            let sb = self.primary_bytes()[s];
+            let ob = other.primary_bytes()[s];
+            if sb < 255 {
+                // Result is always < 255 — no overflow created or consulted.
+                // ob == 255 means b ≥ 255 > a, so saturating result = 0.
+                self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 };
+            } else {
+                // sb == 255: self has overflow — get() unavoidable.
+                // other.get() only needed when ob == 255 too (both-overflow case).
+                let self_val = self.get(s);
+                let other_val = if ob < 255 { ob as u32 } else { other.get(s) };
+                self.set(s, self_val.saturating_sub(other_val));
+            }
+        }
         self
     }
 

From eeba43ac4fd002c9df31c05697d5441be7befb82 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 13:14:11 +0200
Subject: [PATCH 10/24] docs: add technical reference for obicompactvec module

Document the two-tier compact integer encoding, BitSlice/IntSlice trait hierarchy, and SIMD-friendly O(n+k) algorithms. Include details on concrete memory and persistent vector types, matrix aggregation traits, and planned group-filtering APIs.
---
 docmd/implementation/obicompactvec.md | 455 ++++++++++++++++++++++++++
 1 file changed, 455 insertions(+)
 create mode 100644 docmd/implementation/obicompactvec.md

diff --git a/docmd/implementation/obicompactvec.md b/docmd/implementation/obicompactvec.md
new file mode 100644
index 0000000..7e9d0fc
--- /dev/null
+++ b/docmd/implementation/obicompactvec.md
@@ -0,0 +1,455 @@
+# obicompactvec — Complete Reference
+
+## Module structure
+
+```
+src/obicompactvec/src/
+  lib.rs            public re-exports
+  traits.rs         BitSlice, BitSliceMut, IntSlice, IntSliceMut + conversion traits
+  bitvec.rs         PersistentBitVec, PersistentBitVecBuilder, BitIter
+  memoryvec.rs      MemoryBitVec
+  reader.rs         PersistentCompactIntVec (read-only)
+  builder.rs        PersistentCompactIntVecBuilder (read-write)
+  memoryintvec.rs   MemoryIntVec
+  bitmatrix.rs      PersistentBitMatrix, PersistentBitMatrixBuilder
+  intmatrix.rs      PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder
+  format.rs         file format constants, encode/decode helpers
+  layer_meta.rs     LayerMeta (column metadata)
+  meta.rs           matrix metadata
+```
+
+---
+
+## Compact int encoding
+
+All integer vectors use the same two-tier encoding regardless of storage backend.
+
+**Primary array** — one `u8` per slot:
+
+- Values **0–254** are stored directly. No overhead.
+- Value **255 is a sentinel**: the slot's actual value is ≥ 255 and lives in the overflow store.
+
+**Overflow store** — maps slot index to a `u32` value ≥ 255:
+
+- In `MemoryIntVec` and `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
+- In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
+
+**Key property — sentinel 255 = +∞ on `u8`:**
+
+This is exploited throughout the binary operations. On a `u8` comparison, 255 behaves as positive infinity:
+- `min(a, 255) = a` for all `a ≤ 254` → correct when only one side is overflow
+- `max(a, 255) = 255` → correct sentinel when either side is overflow
+- Only the **both-overflow** case requires reading actual values from the overflow store.
+
+In practice, k (overflow count) ≪ n (total slots). Observed genomic data: ~0.07% of kmer slots are in overflow.
+
+---
+
+## Trait hierarchy
+
+### BitSlice (read-only)
+
+Required: `len()`, `words() -> &[u64]`.
+
+Bit `i` is at `words()[i >> 6]` bit `i & 63` (LSB-first). Padding bits in the last word are always zero — this invariant must be maintained by all implementors.
+
+| Provided method | Implementation | Cost |
+|---|---|---|
+| `is_empty()` | `len() == 0` | O(1) |
+| `get(slot)` | word extract | O(1) |
+| `count_ones()` | POPCNT per word | O(n/64) |
+| `count_zeros()` | `n − count_ones()` | O(n/64) |
+| `partial_jaccard_dist(other)` | `(a&b).popcount`, `(a\|b).popcount` per word | O(n/64) |
+| `jaccard_dist(other)` | from partial | O(n/64) |
+| `hamming_dist(other)` | `(a^b).popcount` per word | O(n/64) |
+
+### BitSliceMut: BitSlice (mutable)
+
+Required: `words_mut() -> &mut [u64]`.
+
+All bulk operations work at the word level (64 bits/iteration). These loops are simple enough for the compiler to auto-vectorize (e.g. to AVX2/AVX-512 when those target features are enabled). The zero-padding invariant is maintained: `not()` re-masks the last word after flipping.
+
+| Provided method | Implementation | Cost |
+|---|---|---|
+| `set(slot, value)` | OR / AND-NOT on one word | O(1) |
+| `copy_from(src)` | `copy_from_slice` = memcpy | O(n/64) |
+| `and(other)` | `w &= o` per word | O(n/64) |
+| `or(other)` | `w \|= o` per word | O(n/64) |
+| `xor(other)` | `w ^= o` per word | O(n/64) |
+| `not()` | `w ^= u64::MAX` per word, then mask last | O(n/64) |
+
+**No overflow complexity here.** The packed `u64` representation is already the natural unit for SIMD operations. No sentinel, no HashMap — just bitwise word ops.
+
+---
+
+### IntSlice (read-only)
+
+Required:
+- `len() -> usize`
+- `get(slot) -> u32` — handles sentinel transparently (binary search into overflow for persistent, HashMap for memory)
+- `primary_bytes() -> &[u8]` — raw primary array including 255 sentinels
+- `overflow_entries() -> impl Iterator<Item = (usize, u32)>` — (slot, true_value) pairs for all overflow slots
+
+| Provided method | Default implementation | Note |
+|---|---|---|
+| `is_empty()` | `len() == 0` | |
+| `iter()` | `(0..n).map(\|i\| self.get(i))` | Overridden in all concrete types |
+| `sum()` | `iter().map(\|v\| v as u64).sum()` | Overridden in concrete types |
+| `count_nonzero()` | `iter().filter(\|v\| *v > 0).count()` | Overridden in concrete types |
+| `lt(t)` | `cmp_scalar(\|v\| v < t)` | |
+| `leq(t)` | `cmp_scalar(\|v\| v <= t)` | |
+| `gt(t)` | `cmp_scalar(\|v\| v > t)` | |
+| `geq(t)` | `cmp_scalar(\|v\| v >= t)` | |
+| `cmp_scalar(pred)` | two-pass (see below) | |
+
+**`cmp_scalar` algorithm — two passes:**
+
+```
+Pass 1 — byte scan, O(n):
+  for s in 0..n:
+    b = primary[s]
+    if b < 255 AND pred(b as u32):
+      set bit s in result word
+
+Pass 2 — overflow fixup, O(k):
+  for (s, val) in overflow_entries():
+    if pred(val): set bit s in result word
+```
+
+Pass 1 reads only the primary byte array — no HashMap access. For simple predicates (`geq`, `lt`, etc.) the compiler inlines `pred` and can auto-vectorize the byte comparison loop. Pass 2 handles the O(k) overflow slots that were left as 0 in pass 1.
+
+Previous implementation: `pred(self.get(s))` for every slot → O(n log k) due to binary search in overflow. New: O(n) + O(k).
+
+---
+
+### IntSliceMut: IntSlice (mutable)
+
+Required:
+- `set(slot, value: u32)` — writes primary byte (or 255 + overflow entry if value ≥ 255); removes stale overflow entry if value drops below 255
+- `primary_bytes_mut() -> &mut [u8]` — direct mutable access to the primary array
+- `clear_overflow()` — empties the entire overflow store
+
+The required methods expose the encoding internals. All provided methods are implemented in terms of these three + the `IntSlice` required methods.
+
+| Provided method | Hot path | Overflow case | Cost |
+|---|---|---|---|
+| `inc(slot)` | `get` + `set` | — | O(1) or O(log k) |
+| `dec(slot)` | `get` + `set` (saturating) | — | O(1) or O(log k) |
+| `add_at(slot, delta)` | `get` + `set` (saturating) | — | O(1) or O(log k) |
+| `copy_from(src)` | `copy_from_slice` + `clear_overflow` + replay overflows | — | O(n) + O(k) |
+| `min(other)` | byte-level min, O(n) | both-overflow fixup, O(k) | O(n) |
+| `max(other)` | byte-level max, O(n) | pre-pass on other's overflows, O(k) | O(n) |
+| `add(other)` | byte add when both < 255, O(n) | `get` + `+` when either = 255 | O(n) |
+| `diff(other)` | byte saturating_sub when self < 255, O(n) | `get` + `saturating_sub` when self = 255 | O(n) |
+| `count_bits(bits)` | iterate set bits via word scan | — | O(n/64 + n_ones) |
+| `cmp_scalar` | inherited from IntSlice | — | O(n) + O(k) |
+
+**`min` algorithm:**
+
+Exploits 255 = +∞: `u8::min(a, 255) = a` and `u8::min(255, b) = b`. Only the case where both sides are ≥ 255 needs actual overflow values.
+
+```
+1. Snapshot self's overflow:  self_ov:  Vec<(slot, value)>
+   Snapshot other's overflow: other_ov: HashMap<slot, value>
+2. clear_overflow()  — removes all self's overflow entries
+3. Pass 1 (byte min, SIMD-vectorizable):
+     for each byte pair: self.primary[s] = min(self.primary[s], other.primary[s])
+4. Pass 2 (both-overflow fixup):
+     for (slot, self_val) in self_ov:
+       if slot in other_ov:
+         self.set(slot, min(self_val, other_ov[slot]))
+       // else: byte pass already wrote other.primary[slot] < 255 — correct
+```
+
+Overflow entries where only self was overflow are correctly handled: after `clear_overflow` + byte pass, `self.primary[slot] = min(255, other.primary[slot]) = other.primary[slot]` (which is < 255). No overflow entry — correct.
+
+**`max` algorithm:**
+
+Exploits 255 = +∞: `u8::max(a, 255) = 255` → any slot where either side is overflow will have sentinel 255 in the primary after the byte pass. At a slot where only other is overflow, the byte pass overwrites self's primary byte with 255, so self's original value can no longer be read back afterwards.
+
+Solution: read and update self's original value at other's overflow slots *before* the byte pass overwrites them.
+
+```
+Pre-pass (O(k_other)):
+  for (slot, other_val) in other.overflow_entries():
+    self_val = self.get(slot)     // reads original value
+    self.set(slot, max(self_val, other_val))
+
+Pass 1 (byte max, SIMD-vectorizable):
+  for each byte pair: self.primary[s] = max(self.primary[s], other.primary[s])
+  // Overflow slots: max(255, 255) = 255 — primary unchanged, overflow entry from pre-pass preserved
+```
+
+After the pre-pass, self.primary[slot] = 255 for all slots in other's overflow. The byte pass leaves those 255s intact. Self's own overflow slots not in other's overflow are also 255 in primary — byte max(255, b < 255) = 255, unchanged. Correct in all cases.
+
+**`add` algorithm:**
+
+No sentinel property useful for add: any pair (sb, ob) with sb + ob ≥ 255 creates a new overflow entry, even when neither input was overflow. Cannot simplify via byte arithmetic.
+
+```
+for s in 0..n:
+  sb = self.primary[s]
+  ob = other.primary[s]
+  if sb < 255 AND ob < 255:      // hot path: no HashMap
+    sum = sb as u32 + ob as u32
+    if sum < 255: self.primary[s] = sum as u8   // direct byte write
+    else:         self.set(s, sum)               // creates overflow if needed
+  else:                           // at least one is overflow
+    self.set(s, self.get(s) + other.get(s))
+```
+
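+The `+` on `u32` values is a plain addition, not `saturating_add`: if the sum exceeded `u32::MAX` it would panic in debug builds and wrap silently in release builds (unless overflow checks are enabled). This is not a practical risk for kmer counts. The hot path (both < 255, sum < 255) is a single byte write with no HashMap access.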
+
+**`diff` (saturating sub) algorithm:**
+
+`saturating_sub(a, b) = a − min(a, b) = max(0, a − b)`. Key insight: if self's primary byte < 255, the result is always < 255 (result ≤ a), so no new overflow entries are created and no overflow lookup is needed for self. Only self's overflow slots (primary = 255) need `get()`.
+
+| sb | ob | result | get() needed |
+|----|----|--------|-------------|
+| < 255 | < 255 | `sb.saturating_sub(ob)` < 255 | none |
+| < 255 | 255 | 0 (b ≥ 255 > a) | none |
+| 255 | < 255 | `self.get(s) − ob` | self only |
+| 255 | 255 | `self.get(s) − other.get(s)` | both |
+
+```
+for s in 0..n:
+  sb = self.primary[s]
+  ob = other.primary[s]
+  if sb < 255:          // hot path: O(n), no HashMap
+    self.primary[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 }
+  else:                 // cold path: O(k_self)
+    self_val = self.get(s)
+    other_val = if ob < 255 { ob as u32 } else { other.get(s) }
+    self.set(s, self_val.saturating_sub(other_val))
+```
+
+Overflow entries that drop below 255 (case sb=255, result < 255) are removed by `set()`. Overflow entries that remain ≥ 255 are updated. Correct in all four cases.
+
+**`count_bits` algorithm:**
+
+Increments self at each slot where the corresponding bit in `bits` is set. Iterates `bits.words()` word by word and skips zero words entirely — O(n/64 + n_ones) rather than O(n) per-slot tests.
+
+```
+for (w_idx, word) in bits.words():
+  if word == 0: continue
+  base = w_idx * 64
+  while word != 0:
+    bit = trailing_zeros(word)
+    self.inc(base + bit)
+    word &= word − 1        // clear lowest set bit
+```
+
+---
+
+## Concrete types
+
+### Memory types
+
+**`MemoryBitVec`**
+
+```rust
+struct MemoryBitVec { words: Vec<u64>, n: usize }
+```
+
+Implements `BitSlice` + `BitSliceMut`. Owns its word array. Used as the result type of `cmp_scalar` / filter operations and as an intermediate for bit-level computations.
+
+Std ops: `BitAnd`, `BitOr`, `BitXor`, `Not` (owned and borrowed), `BitAndAssign`, `BitOrAssign`, `BitXorAssign` — all delegate to `BitSliceMut` methods.
+
+`iter()` returns a `BitIter<'_>` (word-level, see below).
+
+**`MemoryIntVec`**
+
+```rust
+struct MemoryIntVec {
+    primary:  Vec<u8>,
+    overflow: HashMap<usize, u32>,
+    n:        usize,
+}
+```
+
+Implements `IntSlice` + `IntSliceMut`. Overrides: `iter()` → inherent `iter()` (merge-scan), `sum()`, `count_nonzero()`.
+
+`IntSlice` required impls: `primary_bytes()` → `&self.primary`; `overflow_entries()` → `self.overflow.iter().map(...)`.
+
+`IntSliceMut` required impls: `set()` writes to `self.primary[slot]` and inserts/removes from `self.overflow`; `primary_bytes_mut()` → `&mut self.primary`; `clear_overflow()` → `self.overflow.clear()`.
+
+Std ops: `Add<&B>`, `Sub<&B>` (owned and borrowed), `AddAssign<&B>`, `SubAssign<&B>` — delegate to `IntSliceMut::add` / `diff`.
+
+`From<&S: IntSlice>`: copies primary bytes + overflow entries. O(n) + O(k).
+
+---
+
+### Persistent types
+
+**`PersistentBitVec` / `PersistentBitVecBuilder`**
+
+See `persistent_bit_vec.md`. `PersistentBitVec` is read-only (implements `BitSlice`). `PersistentBitVecBuilder` is read-write (implements `BitSlice` + `BitSliceMut`).
+
+`BitIter<'a>` — shared iterator type for both `MemoryBitVec` and `PersistentBitVec`:
+
+```rust
+pub struct BitIter<'a> { pub(crate) words: &'a [u64], pub(crate) slot: usize, pub(crate) n: usize }
+```
+
+Word-level scan: `(words[slot >> 6] >> (slot & 63)) & 1 != 0`. One word serves 64 iterations. `pub type MemoryBitIter<'a> = BitIter<'a>` preserves the public API name.
+
+**`PersistentCompactIntVec` / `PersistentCompactIntVecBuilder`**
+
+See `persistent_compact_int_vec.md` for file format and lifecycle.
+
+`PersistentCompactIntVec` implements `IntSlice`. Overrides: `iter()` → inherent merge-scan `Iter`; `sum()`; `count_nonzero()`. `overflow_entries()` returns a sequential scan `(0..n_overflow).map(|i| (data_slot(i), data_value(i)))` — no binary search since entries are stored sorted.
+
+`PersistentCompactIntVecBuilder` implements `IntSlice` + `IntSliceMut`. `iter()` is NOT overridden (default `get`-per-slot) because the overflow `HashMap` is unsorted. `sum()` and `count_nonzero()` are overridden using `byte_sum` / `byte_count_nonzero` on the mmap primary slice — avoids per-slot overhead.
+
+**Override rationale:** the default `iter()`, `sum()`, `count_nonzero()` on `IntSlice` call `self.get(s)` per slot, which is O(log k) binary search for `PersistentCompactIntVec`. Overrides provide O(n + k) merge-scan or O(n) byte scan instead.
+
+---
+
+### IntSlice implementors — override summary
+
+| Type | `iter()` | `sum()` | `count_nonzero()` |
+|------|----------|---------|-------------------|
+| `MemoryIntVec` | inherent merge-scan ✓ | `byte_sum` ✓ | `byte_count_nonzero` ✓ |
+| `PersistentCompactIntVecBuilder` | default (get-per-slot) | `byte_sum` on mmap ✓ | `byte_count_nonzero` on mmap ✓ |
+| `PersistentCompactIntVec` | inherent merge-scan Iter ✓ | inherent `sum()` ✓ | inherent `count_nonzero()` ✓ |
+| `PackedIntCol<'a>` | inherent PackedIntColIter ✓ | byte_sum ✓ | byte_count_nonzero ✓ |
+
+`PackedIntCol` is used internally by `PersistentCompactIntMatrix` (packed format) for column views.
+
+---
+
+## Matrix types
+
+Four matrix types, two encodings × two formats:
+
+| | Columnar format | Packed format |
+|---|---|---|
+| **Bit** | `PersistentBitMatrix` (columnar) | `PersistentBitMatrix` (packed) |
+| **Int** | `PersistentCompactIntMatrix` (columnar) | `PersistentCompactIntMatrix` (packed) |
+
+`PersistentCompactIntMatrix` is an enum behind a transparent API — the caller does not see whether the on-disk format is columnar (one `.pciv` per column) or packed (one `.pcmx` file interleaving all columns). `col(c)` and `col_slice(c)` return column views that implement `IntSlice`.
+
+`pack_compact_int_matrix` and `pack_bit_matrix` convert a columnar matrix to packed format.
+
+For details see `persistent_compact_int_vec.md` and `persistent_bit_vec.md`.
+
+---
+
+## Conversion traits
+
+Two blanket-impl traits on top of `BitSlice` / `IntSlice`:
+
+**`IntToBit: IntSlice`**
+- `to_bitvec(threshold: u32) -> MemoryBitVec` — bit set iff value ≥ threshold (delegates to `geq`)
+- `to_presence() -> MemoryBitVec` — bit set iff value ≥ 1 (delegates to `geq(1)`)
+
+**`BitToInt: BitSlice`**
+- `to_intvec() -> MemoryIntVec` — expands each bit to a `u8` (0 or 1) in a new primary array
+- Uses a `static EXPAND_BYTE: [[u8; 8]; 256]` lookup table — 8 bits expanded per byte, word-level outer loop
+
+Both `IntToBit` and `BitToInt` are implemented for all `T: IntSlice` / `T: BitSlice` via blanket impls.
+
+---
+
+## Aggregation traits (matrix level)
+
+### ColumnWeights
+
+```rust
+trait ColumnWeights: Send + Sync {
+    fn col_weights(&self) -> Array1<u64>;         // sum per column
+    fn partial_kmer_counts(&self) -> Array1<u64>; // default = col_weights()
+}
+```
+
+`partial_kmer_counts` is overridden for count matrices to return `count_nonzero` per column (distinct kmers) rather than total count.
+
+### CountPartials
+
+Abstract required methods: `partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`, `partial_relfreq_bray`, `partial_relfreq_euclidean`, `partial_hellinger`.
+
+**Additivity rule:** self-contained partials (`partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`) can be element-wise summed across all `(partition, layer)` pairs before applying the finalisation. Normalised partials (`partial_relfreq_*`, `partial_hellinger`) require the **global** `col_weights` (accumulated across all layers and all partitions) as parameter — not per-layer or per-partition weights.
+
+**`partial_threshold_jaccard` returns `(inter, union)`**, not a single matrix, because `union[i,j]` depends on both columns simultaneously and cannot be reconstructed from per-column statistics.
+
+Provided finalisations (default implementations):
+
+| Finalisation | Formula |
+|---|---|
+| `bray_dist_matrix()` | `1 − 2·partial_bray[i,j] / (w[i] + w[j])` |
+| `euclidean_dist_matrix()` | `√partial_euclidean[i,j]` |
+| `threshold_jaccard_dist_matrix(t)` | `1 − inter[i,j] / union[i,j]` |
+| `relfreq_bray_dist_matrix()` | `1 − partial_relfreq_bray[i,j]` (two-pass: col_weights then partial) |
+| `relfreq_euclidean_dist_matrix()` | `√partial_relfreq_euclidean[i,j]` |
+| `hellinger_dist_matrix()` | `√partial_hellinger[i,j] / √2` |
+| `hellinger_euclidean_dist_matrix()` | `√partial_hellinger[i,j]` |
+
+### BitPartials
+
+Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)` (inter, union), `partial_hamming() -> Array2<u64>`. Both additive across layers and partitions.
+
+---
+
+## Planned — Filter / Select API
+
+### ColGroup
+
+```rust
+struct ColGroup { name: String, indices: Vec<usize> }
+```
+
+Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions because column structure is identical across the entire hierarchy (same samples/genomes everywhere; only rows = kmer slots are partitioned).
+
+`ColGroup` is passed by reference unchanged to any matrix — no index translation.
+
+### Composition axis
+
+- **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges).
+- **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.).
+
+### MatrixGroupOps (planned trait)
+
+Group operations live on the matrix and expose only **additive intermediates** (`MemoryIntVec`). Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
+
+```rust
+trait MatrixGroupOps {
+    // How many columns in group have value >= threshold, per kmer slot
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec;
+
+    // Sum of values across group columns, per kmer slot
+    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec;
+
+    // Kmer present (value >= threshold) in at least one column of group
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec;
+}
+```
+
+Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — they are composed at the index level from the additive intermediates:
+
+```
+// "present in >= 2 ingroup columns with count >= 3, absent from all outgroup"
+let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)).sum();
+let in_mask  = presence.geq(2);                                     // MemoryBitVec
+
+let out_sum  = layers.map(|l| l.partial_group_sum(&outgroup)).sum();
+let out_mask = out_sum.leq(0);                                      // MemoryBitVec
+
+let mask = in_mask.and(&out_mask);    // BitSliceMut::and — O(n/64)
+```
+
+### mask_with (planned IntSliceMut method)
+
+Apply a bit mask to a count vector: zero slots where the mask bit is 0. Scans the mask word by word and visits only zero bits — O(n/64 + n_zeros); an all-ones mask costs only the word scan, with no `set()` call.
+
+```
+for (w_idx, word) in mask.words():
+  if word == u64::MAX: continue
+  zeros = !word
+  while zeros != 0:
+    bit = trailing_zeros(zeros)
+    s = w_idx * 64 + bit
+    self.set(s, 0)
+    zeros &= zeros − 1
+```
+
+This is the terminal operation for both Filter (zero non-selected kmer slots in a count matrix) and Select (positional selection without MPHF).

From 1f0d77d5bff086cf11c6e8ae1c7363c76e2de0c1 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 14:24:57 +0200
Subject: [PATCH 11/24] docs: document compact vector implementation with
 Mermaid diagrams

Add Mermaid diagrams to visualize the trait hierarchy, compact int storage layout, and SIMD-vectorizable arithmetic operations for MemoryIntVec and PersistentCompactIntVec. Also document concrete type structures and planned layer/partition composition rules to improve documentation clarity.
---
 docmd/implementation/obicompactvec.md | 279 +++++++++++++++++++++++---
 1 file changed, 249 insertions(+), 30 deletions(-)

diff --git a/docmd/implementation/obicompactvec.md b/docmd/implementation/obicompactvec.md
index 7e9d0fc..2926443 100644
--- a/docmd/implementation/obicompactvec.md
+++ b/docmd/implementation/obicompactvec.md
@@ -18,6 +18,25 @@ src/obicompactvec/src/
   meta.rs           matrix metadata
 ```
 
+```mermaid
+graph TD
+    traits --> memoryvec
+    traits --> memoryintvec
+    bitvec --> memoryvec
+    bitvec --> bitmatrix
+    format --> reader
+    format --> builder
+    reader --> intmatrix
+    builder --> intmatrix
+    builder --> memoryintvec
+    memoryvec --> traits
+    memoryintvec --> traits
+    layer_meta --> bitmatrix
+    layer_meta --> intmatrix
+    meta --> bitmatrix
+    meta --> intmatrix
+```
+
 ---
 
 ## Compact int encoding
@@ -34,6 +53,15 @@ All integer vectors use the same two-tier encoding regardless of storage backend
 - In `MemoryIntVec` and `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
 - In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
 
+```mermaid
+flowchart LR
+    slot --> P["primary[slot]: u8"]
+    P -->|"< 255"| V["value = byte (0–254)"]
+    P -->|"= 255 sentinel"| OV["overflow store"]
+    OV -->|"MemoryIntVec / Builder"| HM["HashMap&lt;usize, u32&gt;\nin RAM"]
+    OV -->|"PersistentCompactIntVec"| SA["sorted [(slot,value)] in mmap\n+ sparse L1 index"]
+```
+
 **Key property — sentinel 255 = +∞ on `u8`:**
 
 This is exploited throughout the binary operations. On a `u8` comparison, 255 behaves as positive infinity:
@@ -47,6 +75,66 @@ In practice, k (overflow count) ≪ n (total slots). Observed genomic data: ~0.0
 
 ## Trait hierarchy
 
+```mermaid
+classDiagram
+    class BitSlice {
+        <<trait>>
+        +len() usize
+        +words() &[u64]
+        +get(slot) bool
+        +count_ones() u64
+        +count_zeros() u64
+        +partial_jaccard_dist(other) (u64,u64)
+        +jaccard_dist(other) f64
+        +hamming_dist(other) u64
+    }
+    class BitSliceMut {
+        <<trait>>
+        +words_mut() &mut [u64]
+        +set(slot, value)
+        +copy_from(src)
+        +and(other)
+        +or(other)
+        +xor(other)
+        +not()
+    }
+    class IntSlice {
+        <<trait>>
+        +len() usize
+        +get(slot) u32
+        +primary_bytes() &[u8]
+        +overflow_entries() Iterator
+        +iter() Iterator
+        +sum() u64
+        +count_nonzero() u64
+        +cmp_scalar(pred) MemoryBitVec
+        +lt/leq/gt/geq(t) MemoryBitVec
+    }
+    class IntSliceMut {
+        <<trait>>
+        +set(slot, value)
+        +primary_bytes_mut() &mut [u8]
+        +clear_overflow()
+        +inc/dec/add_at(slot)
+        +copy_from(src)
+        +min/max/add/diff(other)
+        +count_bits(bits)
+    }
+    class IntToBit {
+        <<trait blanket>>
+        +to_bitvec(threshold) MemoryBitVec
+        +to_presence() MemoryBitVec
+    }
+    class BitToInt {
+        <<trait blanket>>
+        +to_intvec() MemoryIntVec
+    }
+    BitSliceMut --|> BitSlice : extends
+    IntSliceMut --|> IntSlice : extends
+    IntToBit --|> IntSlice : blanket T:IntSlice
+    BitToInt --|> BitSlice : blanket T:BitSlice
+```
+
 ### BitSlice (read-only)
 
 Required: `len()`, `words() -> &[u64]`.
@@ -148,17 +236,17 @@ The required methods expose the encoding internals. All provided methods are imp
 
 Exploits 255 = +∞: `u8::min(a, 255) = a` and `u8::min(255, b) = b`. Only the case where both sides are ≥ 255 needs actual overflow values.
 
-```
-1. Snapshot self's overflow:  self_ov:  Vec<(slot, value)>
-   Snapshot other's overflow: other_ov: HashMap<slot, value>
-2. clear_overflow()  — removes all self's overflow entries
-3. Pass 1 (byte min, SIMD-vectorizable):
-     for each byte pair: self.primary[s] = min(self.primary[s], other.primary[s])
-4. Pass 2 (both-overflow fixup):
-     for (slot, self_val) in self_ov:
-       if slot in other_ov:
-         self.set(slot, min(self_val, other_ov[slot]))
-       // else: byte pass already wrote other.primary[slot] < 255 — correct
+```mermaid
+flowchart TD
+    A["min(self, other)"] --> B["snapshot self_ov: Vec&lt;(slot,val)&gt;\nsnapshot other_ov: HashMap&lt;slot,val&gt;"]
+    B --> C["clear_overflow()"]
+    C --> D["Pass 1 — byte min, SIMD-vectorizable\nprimary[s] = min(self[s], other[s])  ∀s"]
+    D --> E["Pass 2 — both-overflow fixup\nfor (slot, self_val) in self_ov"]
+    E --> F{"slot ∈ other_ov?"}
+    F -->|yes| G["set(slot, min(self_val, other_ov[slot]))"]
+    F -->|no| H["byte pass wrote other.primary &lt; 255\nclear_overflow removed stale entry\nno action"]
+    G --> I[done]
+    H --> I
 ```
 
 Overflow entries where only self was overflow are correctly handled: after `clear_overflow` + byte pass, `self.primary[slot] = min(255, other.primary[slot]) = other.primary[slot]` (which is < 255). No overflow entry — correct.
@@ -169,15 +257,13 @@ Exploits 255 = +∞: `u8::max(a, 255) = 255` → any slot where either side is o
 
 Solution: read and update self's original value at other's overflow slots *before* the byte pass overwrites them.
 
-```
-Pre-pass (O(k_other)):
-  for (slot, other_val) in other.overflow_entries():
-    self_val = self.get(slot)     // reads original value
-    self.set(slot, max(self_val, other_val))
-
-Pass 1 (byte max, SIMD-vectorizable):
-  for each byte pair: self.primary[s] = max(self.primary[s], other.primary[s])
-  // Overflow slots: max(255, 255) = 255 — primary unchanged, overflow entry from pre-pass preserved
+```mermaid
+flowchart TD
+    A["max(self, other)"] --> B["Pre-pass O(k_other)\nfor (slot, other_val) in other.overflow_entries()"]
+    B --> C["self_val = self.get(slot)\nself.set(slot, max(self_val, other_val))"]
+    C --> D["Pass 1 — byte max, SIMD-vectorizable\nprimary[s] = max(self[s], other[s])  ∀s"]
+    D --> E["Overflow slots: max(255,255)=255\nprimary unchanged\noverflow entry from pre-pass preserved"]
+    E --> F[done]
 ```
 
 After the pre-pass, self.primary[slot] = 255 for all slots in other's overflow. The byte pass leaves those 255s intact. Self's own overflow slots not in other's overflow are also 255 in primary — byte max(255, b < 255) = 255, unchanged. Correct in all cases.
@@ -198,6 +284,18 @@ for s in 0..n:
     self.set(s, self.get(s) + other.get(s))
 ```
 
+```mermaid
+flowchart TD
+    A["add(self, other)"] --> B{"sb &lt; 255\nAND ob &lt; 255"}
+    B -->|"yes — hot path\nno HashMap"| C{"sb + ob &lt; 255"}
+    C -->|yes| D["primary[s] = sum as u8\nsingle byte write"]
+    C -->|no| E["set(s, sum)\ncreates overflow entry"]
+    B -->|"no — ≥1 side is overflow"| F["self_val = self.get(s)\nother_val = other.get(s)\nset(s, self_val + other_val)"]
+    D --> Z[next slot]
+    E --> Z
+    F --> Z
+```
+
 The `+` on `u32` values is exact (no `saturating_add`). Overflow at u32 level panics in debug — not a real risk for kmer counts. The hot path (both < 255, sum < 255) is a single byte write with no HashMap access.
 
 **`diff` (saturating sub) algorithm:**
@@ -211,16 +309,21 @@ The `+` on `u32` values is exact (no `saturating_add`). Overflow at u32 level pa
 | 255 | < 255 | `self.get(s) − ob` | self only |
 | 255 | 255 | `self.get(s) − other.get(s)` | both |
 
-```
-for s in 0..n:
-  sb = self.primary[s]
-  ob = other.primary[s]
-  if sb < 255:          // hot path: O(n), no HashMap
-    self.primary[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 }
-  else:                 // cold path: O(k_self)
-    self_val = self.get(s)
-    other_val = if ob < 255 { ob as u32 } else { other.get(s) }
-    self.set(s, self_val.saturating_sub(other_val))
+```mermaid
+flowchart TD
+    A["diff(self, other)"] --> B{"sb &lt; 255\nself not overflow"}
+    B -->|"yes — hot path O(n)"| C{"ob &lt; 255"}
+    C -->|yes| D["primary[s] = sb.saturating_sub(ob)\nbyte write, no HashMap"]
+    C -->|"no: b ≥ 255 > a"| E["primary[s] = 0"]
+    B -->|"no — cold path O(k_self)"| F["self_val = self.get(s)"]
+    F --> G{"ob &lt; 255"}
+    G -->|yes| H["other_val = ob as u32"]
+    G -->|no| I["other_val = other.get(s)"]
+    H --> J["set(s, self_val.saturating_sub(other_val))"]
+    I --> J
+    D --> Z[next slot]
+    E --> Z
+    J --> Z
 ```
 
 Overflow entries that drop below 255 (case sb=255, result < 255) are removed by `set()`. Overflow entries that remain ≥ 255 are updated. Correct in all four cases.
@@ -243,6 +346,70 @@ for (w_idx, word) in bits.words():
 
 ## Concrete types
 
+```mermaid
+classDiagram
+    class MemoryBitVec {
+        -words: Vec~u64~
+        -n: usize
+        +iter() BitIter
+        +ones(n) Self
+        +persist(path) Builder
+    }
+    class MemoryIntVec {
+        -primary: Vec~u8~
+        -overflow: HashMap~usize,u32~
+        -n: usize
+        +iter() MemoryIntIter
+        +filled(n, value) Self
+        +persist(path) Builder
+    }
+    class PersistentBitVec {
+        -mmap: Mmap
+        -n: usize
+        +iter() BitIter
+        +count_ones() u64
+    }
+    class PersistentBitVecBuilder {
+        -mmap: MmapMut
+        -n: usize
+        +close()
+        +build_from(src, path)
+        +build_from_counts(src, t, path)
+    }
+    class PersistentCompactIntVec {
+        -mmap: Mmap
+        -n: usize
+        -n_overflow: usize
+        -step: usize
+        -index: Vec~(usize,usize)~
+        +iter() Iter
+        +get(slot) u32
+        +sum() u64
+    }
+    class PersistentCompactIntVecBuilder {
+        -mmap: MmapMut
+        -n: usize
+        -overflow: HashMap~usize,u32~
+        +set(slot, value)
+        +close()
+        +build_from(src, path)
+    }
+
+    MemoryBitVec ..|> BitSlice
+    MemoryBitVec ..|> BitSliceMut
+    PersistentBitVec ..|> BitSlice
+    PersistentBitVecBuilder ..|> BitSlice
+    PersistentBitVecBuilder ..|> BitSliceMut
+    MemoryIntVec ..|> IntSlice
+    MemoryIntVec ..|> IntSliceMut
+    PersistentCompactIntVec ..|> IntSlice
+    PersistentCompactIntVecBuilder ..|> IntSlice
+    PersistentCompactIntVecBuilder ..|> IntSliceMut
+
+    PersistentBitVecBuilder --> PersistentBitVec : close() then open()
+    PersistentCompactIntVecBuilder --> PersistentCompactIntVec : close() then open()
+```
+
 ### Memory types
 
 **`MemoryBitVec`**
@@ -392,6 +559,39 @@ Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)` (inter, union), `par
 
 ## Planned — Filter / Select API
 
+### Composition across layers and partitions
+
+```mermaid
+graph TD
+    subgraph Index
+        CG["ColGroup\nVec&lt;usize&gt; — valid everywhere"]
+        ACC["MemoryIntVec\nglobal accumulator"]
+        PRED["geq / leq / and / or\n→ MemoryBitVec mask"]
+    end
+
+    subgraph "Layer 1"
+        subgraph "Partition A  kmers 0..k/2"
+            MA["Matrix A\npartial_group_presence_count"]
+        end
+        subgraph "Partition B  kmers k/2..k"
+            MB["Matrix B\npartial_group_presence_count"]
+        end
+        CONCAT1["concat → MemoryIntVec[0..k]"]
+    end
+
+    subgraph "Layer 2"
+        CONCAT2["concat → MemoryIntVec[0..k]"]
+    end
+
+    CG -->|"same indices"| MA
+    CG -->|"same indices"| MB
+    MA -->|"kmer range A"| CONCAT1
+    MB -->|"kmer range B"| CONCAT1
+    CONCAT1 -->|"IntSliceMut::add"| ACC
+    CONCAT2 -->|"IntSliceMut::add"| ACC
+    ACC --> PRED
+```
+
 ### ColGroup
 
 ```rust
@@ -407,6 +607,25 @@ Defined **once at the index level** from column metadata. Valid in all matrices
 - **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges).
 - **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.).
 
+### Additivity rules
+
+```mermaid
+flowchart LR
+    subgraph "Matrix level — returns MemoryIntVec"
+        PGP["partial_group_presence_count\npartial_group_sum\npartial_group_any → MemoryBitVec"]
+    end
+    subgraph "Index level — applies predicate"
+        GA["group_at_least(k)\n= accumulate.geq(k)"]
+        GALL["group_all\n= accumulate.geq(n_cols)"]
+        GANY["group_any\n= OR of partial_group_any"]
+    end
+    PGP -->|"concat across partitions\nadd across layers"| GA
+    PGP --> GALL
+    PGP --> GANY
+```
+
+Non-additive predicates (`group_all`, `group_at_least`) do **not** exist at matrix level — they require the global accumulated count.
+
 ### MatrixGroupOps (planned trait)
 
 Group operations live on the matrix and expose only **additive intermediates** (`MemoryIntVec`). Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.

From 93559c3294494078b07f038c7a1dfed56724c6a7 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 14:48:31 +0200
Subject: [PATCH 12/24] feat: introduce unified column view types for bit and
 int matrices

This commit introduces `BitColView` and `IntColView` to abstract over Columnar and Packed storage formats, implementing `BitSlice` and `IntSlice` for uniform column access. It adds `col_view()` accessors to `PersistentBitMatrix` and `PersistentCompactIntMatrix`; the bit-matrix accessor explicitly panics on the `Implicit` variant. The new types are publicly re-exported, and unit tests are added to validate per-element retrieval, aggregation methods, and parity with the original columnar representation.
---
 src/obicompactvec/src/bitmatrix.rs       | 28 +++++++++++
 src/obicompactvec/src/intmatrix.rs       | 44 ++++++++++++++++++
 src/obicompactvec/src/lib.rs             |  4 +-
 src/obicompactvec/src/tests/bitmatrix.rs | 56 +++++++++++++++++++++-
 src/obicompactvec/src/tests/intmatrix.rs | 59 +++++++++++++++++++++++-
 5 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index 591e4af..0e70a96 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -183,6 +183,26 @@ impl BitSlice for PackedCol<'_> {
     fn words(&self) -> &[u64] { self.words }
 }
 
+// ── BitColView — uniform column access across Columnar and Packed ─────────────
+
+enum BitColViewInner<'a> {
+    Columnar(&'a PersistentBitVec),
+    Packed(PackedCol<'a>),
+}
+
+/// Opaque column view returned by [`PersistentBitMatrix::col_view`].
+/// Implements [`BitSlice`] uniformly for both Columnar and Packed matrix formats.
+pub struct BitColView<'a>(BitColViewInner<'a>);
+
+impl BitSlice for BitColView<'_> {
+    fn len(&self) -> usize {
+        match &self.0 { BitColViewInner::Columnar(c) => c.len(), BitColViewInner::Packed(c) => c.len() }
+    }
+    fn words(&self) -> &[u64] {
+        match &self.0 { BitColViewInner::Columnar(c) => c.words(), BitColViewInner::Packed(c) => c.words() }
+    }
+}
+
 /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
 pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
     let packed_path = dir.join("matrix.pbmx");
@@ -298,6 +318,14 @@ impl PersistentBitMatrix {
         }
     }
 
+    pub fn col_view(&self, c: usize) -> BitColView<'_> {
+        match self {
+            Self::Columnar(m) => BitColView(BitColViewInner::Columnar(m.col(c))),
+            Self::Packed(m)   => BitColView(BitColViewInner::Packed(m.col_slice(c))),
+            Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
+        }
+    }
+
     pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
         match self {
             Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index 0be16fb..a719e97 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -224,6 +224,43 @@ impl Iterator for PackedIntColIter<'_> {
 
 impl ExactSizeIterator for PackedIntColIter<'_> {}
 
+// ── IntColView — uniform column access across Columnar and Packed ─────────────
+
+enum IntColViewInner<'a> {
+    Columnar(&'a PersistentCompactIntVec),
+    Packed(PackedIntCol<'a>),
+}
+
+/// Opaque column view returned by [`PersistentCompactIntMatrix::col_view`].
+/// Implements [`IntSlice`] uniformly for both Columnar and Packed matrix formats.
+pub struct IntColView<'a>(IntColViewInner<'a>);
+
+impl IntSlice for IntColView<'_> {
+    fn len(&self) -> usize {
+        match &self.0 { IntColViewInner::Columnar(c) => c.len(), IntColViewInner::Packed(c) => c.len() }
+    }
+    fn get(&self, slot: usize) -> u32 {
+        match &self.0 { IntColViewInner::Columnar(c) => c.get(slot), IntColViewInner::Packed(c) => c.get(slot) }
+    }
+    fn primary_bytes(&self) -> &[u8] {
+        match &self.0 { IntColViewInner::Columnar(c) => c.primary_bytes(), IntColViewInner::Packed(c) => c.primary_bytes() }
+    }
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        // Box<dyn Iterator> implements Iterator, satisfying RPITIT across two distinct types.
+        let it: Box<dyn Iterator<Item = (usize, u32)> + '_> = match &self.0 {
+            IntColViewInner::Columnar(c) => Box::new(c.overflow_entries()),
+            IntColViewInner::Packed(c)   => Box::new(c.overflow_entries()),
+        };
+        it
+    }
+    fn sum(&self) -> u64 {
+        match &self.0 { IntColViewInner::Columnar(c) => c.sum(), IntColViewInner::Packed(c) => c.sum() }
+    }
+    fn count_nonzero(&self) -> u64 {
+        match &self.0 { IntColViewInner::Columnar(c) => c.count_nonzero(), IntColViewInner::Packed(c) => c.count_nonzero() }
+    }
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 
 pub struct PackedCompactIntMatrix {
@@ -481,6 +518,13 @@ impl PersistentCompactIntMatrix {
         }
     }
 
+    pub fn col_view(&self, c: usize) -> IntColView<'_> {
+        match self {
+            Self::Columnar(m) => IntColView(IntColViewInner::Columnar(m.col(c))),
+            Self::Packed(m)   => IntColView(IntColViewInner::Packed(m.col_slice(c))),
+        }
+    }
+
     pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
         match self {
             Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path),
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index 3a5f1c4..2dc1453 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -11,9 +11,9 @@ mod reader;
 pub mod traits;
 
 pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
-pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
+pub use bitmatrix::{BitColView, PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
 pub use builder::PersistentCompactIntVecBuilder;
-pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
+pub use intmatrix::{IntColView, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
 pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
 pub use memoryvec::MemoryBitVec;
diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs
index dced37f..5d93222 100644
--- a/src/obicompactvec/src/tests/bitmatrix.rs
+++ b/src/obicompactvec/src/tests/bitmatrix.rs
@@ -1,6 +1,6 @@
 use tempfile::tempdir;
 
-use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
+use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
 use crate::traits::{BitPartials, BitSlice, BitSliceMut};
 
 fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
@@ -203,3 +203,57 @@ fn partial_hamming_matches_hamming() {
     let full    = m.hamming_dist_matrix();
     assert_eq!(partial, full);
 }
+
+// ── col_view on Packed ────────────────────────────────────────────────────────
+
+#[test]
+fn col_view_packed_values() {
+    let (dir, _) = make_matrix(&[
+        &[true, false, true, true],
+        &[false, true, false, true],
+    ]);
+    pack_bit_matrix(&dir.path().join("presence")).unwrap();
+    let m = PersistentBitMatrix::open(dir.path()).unwrap();
+
+    // col 0: [T, F, T, T]
+    let v0 = m.col_view(0);
+    assert_eq!(v0.len(), 4);
+    assert_eq!(v0.get(0), true);
+    assert_eq!(v0.get(1), false);
+    assert_eq!(v0.get(2), true);
+    assert_eq!(v0.get(3), true);
+    assert_eq!(v0.count_ones(), 3);
+
+    // col 1: [F, T, F, T]
+    let v1 = m.col_view(1);
+    assert_eq!(v1.get(0), false);
+    assert_eq!(v1.get(1), true);
+    assert_eq!(v1.get(2), false);
+    assert_eq!(v1.get(3), true);
+    assert_eq!(v1.count_ones(), 2);
+}
+
+#[test]
+fn col_view_packed_matches_columnar() {
+    let data: &[&[bool]] = &[
+        &[true, false, true, false, true, true, false, true],
+        &[false, false, true, true, false, true, true, false],
+        &[true, true, true, false, false, false, true, true],
+    ];
+    let (dir_col, m_col) = make_matrix(data);
+    let (dir_pack, _)    = make_matrix(data);
+    pack_bit_matrix(&dir_pack.path().join("presence")).unwrap();
+    let m_pack = PersistentBitMatrix::open(dir_pack.path()).unwrap();
+
+    for c in 0..data.len() {
+        let col_ref  = m_col.col(c);
+        let col_view = m_pack.col_view(c);
+        assert_eq!(col_view.len(), col_ref.len(), "col={c} len");
+        for s in 0..col_ref.len() {
+            assert_eq!(col_view.get(s), col_ref.get(s), "col={c} slot={s}");
+        }
+        assert_eq!(col_view.count_ones(), col_ref.count_ones(), "col={c} count_ones");
+        assert_eq!(col_view.words(), col_ref.words(), "col={c} words");
+    }
+    drop(dir_col);
+}
diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs
index c4c0a98..d9869aa 100644
--- a/src/obicompactvec/src/tests/intmatrix.rs
+++ b/src/obicompactvec/src/tests/intmatrix.rs
@@ -1,7 +1,7 @@
 use tempfile::tempdir;
 
-use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
-use crate::traits::CountPartials;
+use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
+use crate::traits::{CountPartials, IntSlice};
 
 fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
     let n = cols.first().map_or(0, |c| c.len());
@@ -243,6 +243,61 @@ fn partial_hellinger_matches_full() {
     }
 }
 
+#[test]
+fn col_view_packed_values() {
+    // Build Columnar with overflow values (≥ 255), pack, reopen as Packed, exercise col_view().
+    let (dir, _col) = make_matrix(&[&[10, 300, 500], &[200, 50, 1000]]);
+    pack_compact_int_matrix(&dir.path().join("counts")).unwrap();
+    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
+
+    // col 0: [10, 300, 500] — two overflow slots
+    let v0 = m.col_view(0);
+    assert_eq!(v0.get(0), 10);
+    assert_eq!(v0.get(1), 300);
+    assert_eq!(v0.get(2), 500);
+    assert_eq!(v0.sum(), 810);
+    assert_eq!(v0.count_nonzero(), 3);
+    let mut ov0: Vec<(usize, u32)> = v0.overflow_entries().collect();
+    ov0.sort_unstable_by_key(|&(s, _)| s);
+    assert_eq!(ov0, vec![(1, 300), (2, 500)]);
+
+    // col 1: [200, 50, 1000] — one overflow slot
+    let v1 = m.col_view(1);
+    assert_eq!(v1.get(0), 200);
+    assert_eq!(v1.get(1), 50);
+    assert_eq!(v1.get(2), 1000);
+    let mut ov1: Vec<(usize, u32)> = v1.overflow_entries().collect();
+    ov1.sort_unstable_by_key(|&(s, _)| s);
+    assert_eq!(ov1, vec![(2, 1000)]);
+}
+
+#[test]
+fn col_view_packed_matches_columnar() {
+    // Same data, compare col_view() on Packed against col() on Columnar slot-by-slot.
+    let data: &[&[u32]] = &[&[0, 255, 1, 300, 128], &[500, 3, 0, 700, 42]];
+    let (dir_col, m_col) = make_matrix(data);
+    // Re-build in a separate dir so we can pack without touching m_col's files.
+    let (dir_pack, _) = make_matrix(data);
+    pack_compact_int_matrix(&dir_pack.path().join("counts")).unwrap();
+    let m_pack = PersistentCompactIntMatrix::open(dir_pack.path()).unwrap();
+
+    for c in 0..data.len() {
+        let col_ref  = m_col.col(c);
+        let col_view = m_pack.col_view(c);
+        assert_eq!(col_view.len(), col_ref.len());
+        for s in 0..col_ref.len() {
+            assert_eq!(col_view.get(s), col_ref.get(s), "col={c} slot={s}");
+        }
+        assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum");
+        let mut ov_view: Vec<(usize, u32)> = col_view.overflow_entries().collect();
+        let mut ov_ref:  Vec<(usize, u32)> = col_ref.overflow_entries().collect();
+        ov_view.sort_unstable_by_key(|&(s, _)| s);
+        ov_ref.sort_unstable_by_key(|&(s, _)| s);
+        assert_eq!(ov_view, ov_ref, "col={c} overflow_entries");
+    }
+    drop(dir_col);
+}
+
 #[test]
 fn partial_relfreq_bray_additive_across_split() {
     // Split rows [1,2,3,4,5] between two matrices; partial sums should add up.

From 1d38d87ff9fab59624320e8815057d6d2bea45f9 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 14:50:28 +0200
Subject: [PATCH 13/24] Add column group operations and mask_with trait method

Introduce the `ColGroup` struct and `MatrixGroupOps` trait to manage named subsets of column indices and perform additive aggregations (count, sum, any). Implement these operations for `PersistentBitMatrix` and `PersistentCompactIntMatrix`: presence counts are accumulated directly into raw `u8` bytes when the group has fewer than 255 columns, and through `count_bits` (which handles overflow) for larger groups. Additionally, add a `mask_with` method to the `IntSliceMut` trait that zeroes every slot whose mask bit is 0; it skips all-ones mask words and visits only the zero bits, so it is cheapest for masks that are mostly set. Include tests covering overflow handling, slot masking, and per-partition partial results.
---
 src/obicompactvec/src/bitmatrix.rs      |  43 ++++-
 src/obicompactvec/src/colgroup.rs       |  59 +++++++
 src/obicompactvec/src/intmatrix.rs      |  50 +++++-
 src/obicompactvec/src/lib.rs            |   2 +
 src/obicompactvec/src/tests/colgroup.rs | 215 ++++++++++++++++++++++++
 src/obicompactvec/src/tests/mod.rs      |   1 +
 src/obicompactvec/src/traits.rs         |  23 +++
 7 files changed, 391 insertions(+), 2 deletions(-)
 create mode 100644 src/obicompactvec/src/colgroup.rs
 create mode 100644 src/obicompactvec/src/tests/colgroup.rs

diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index 0e70a96..a51058e 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -7,8 +7,10 @@ use ndarray::{Array1, Array2};
 use rayon::prelude::*;
 
 use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
+use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
+use crate::memoryintvec::MemoryIntVec;
 use crate::memoryvec::MemoryBitVec;
-use crate::traits::{BitSlice, BitSliceMut};
+use crate::traits::{BitSlice, BitSliceMut, IntSliceMut};
 use crate::layer_meta::LayerMeta;
 use crate::meta::MatrixMeta;
 
@@ -447,6 +449,45 @@ impl PersistentBitMatrixBuilder {
     }
 }
 
+// ── MatrixGroupOps ────────────────────────────────────────────────────────────
+
+impl MatrixGroupOps for PersistentBitMatrix {
+    fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> MemoryIntVec {
+        // Bit matrices store 0/1 — threshold is structurally always 1.
+        // Materialize each column to a MemoryBitVec and accumulate directly.
+        let n = self.n();
+        if g.indices.len() < 255 {
+            let mut primary = vec![0u8; n];
+            for &c in &g.indices {
+                let mbv = MemoryBitVec::from(&self.col_view(c));
+                inc_primary_bits(&mut primary, &mbv);
+            }
+            MemoryIntVec::from_primary(primary)
+        } else {
+            let mut result = MemoryIntVec::new(n);
+            for &c in &g.indices {
+                let mbv = MemoryBitVec::from(&self.col_view(c));
+                result.count_bits(&mbv);
+            }
+            result
+        }
+    }
+
+    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
+        // For bit matrices, sum = count of 1-bits — identical to presence_count.
+        self.partial_group_presence_count(g, 1)
+    }
+
+    fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> MemoryBitVec {
+        let n = self.n();
+        let mut result = MemoryBitVec::new(n);
+        for &c in &g.indices {
+            result.or(&self.col_view(c));
+        }
+        result
+    }
+}
+
 // ── Shared matrix helpers (also used by intmatrix.rs) ─────────────────────────
 
 fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs
new file mode 100644
index 0000000..df4576f
--- /dev/null
+++ b/src/obicompactvec/src/colgroup.rs
@@ -0,0 +1,59 @@
+use crate::memoryintvec::MemoryIntVec;
+use crate::memoryvec::MemoryBitVec;
+use crate::traits::BitSlice;
+
+// ── ColGroup ──────────────────────────────────────────────────────────────────
+
+/// A named subset of columns, identified by their indices within the matrix.
+///
+/// Defined once at the index level; the same indices are valid across all
+/// partitions and layers because the column structure (samples / genomes) is
+/// identical everywhere — only the row space (kmer slots) is partitioned.
+pub struct ColGroup {
+    pub name:    String,
+    pub indices: Vec<usize>,
+}
+
+impl ColGroup {
+    pub fn new(name: impl Into<String>, indices: Vec<usize>) -> Self {
+        Self { name: name.into(), indices }
+    }
+}
+
+// ── MatrixGroupOps ────────────────────────────────────────────────────────────
+
+/// Per-matrix group aggregations that return **additive intermediates**.
+///
+/// Results must be composed by the caller (concat across partitions, add across
+/// layers) before applying final predicates (`geq`, `leq`, …).  Non-additive
+/// predicates like `group_all` or `group_at_least(k)` are intentionally absent
+/// — they are derived at the index level from these intermediates.
+pub trait MatrixGroupOps {
+    /// Per-slot count of group columns whose value ≥ `threshold`.
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec;
+
+    /// Per-slot sum of values across all group columns.
+    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec;
+
+    /// Per-slot OR: true if any group column has value ≥ `threshold`.
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec;
+}
+
+// ── Internal helper ───────────────────────────────────────────────────────────
+
+/// Iterate 1-bits of a `MemoryBitVec` and increment the corresponding raw
+/// byte.  Caller must guarantee that no counter will reach 255 (group size
+/// < 255 columns), so that incrementing `u8` is safe and no sentinel is
+/// accidentally written.
+pub(crate) fn inc_primary_bits(primary: &mut [u8], mask: &MemoryBitVec) {
+    let n = primary.len();
+    for (wi, &word) in mask.words().iter().enumerate() {
+        let mut w = word;
+        while w != 0 {
+            let bit = w.trailing_zeros() as usize;
+            let s = wi * 64 + bit;
+            if s < n { primary[s] += 1; }
+            w &= w - 1;
+        }
+    }
+}
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index a719e97..172d7b0 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -10,11 +10,13 @@ use rayon::prelude::*;
 
 use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
 use crate::builder::PersistentCompactIntVecBuilder;
+use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
 use crate::memoryintvec::MemoryIntVec;
+use crate::memoryvec::MemoryBitVec;
 use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
-use crate::traits::IntSlice;
+use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
 
 fn col_path(dir: &Path, col: usize) -> PathBuf {
     dir.join(format!("col_{col:06}.pciv"))
@@ -624,3 +626,49 @@ impl PersistentCompactIntMatrixBuilder {
         MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
     }
 }
+
+// ── MatrixGroupOps ────────────────────────────────────────────────────────────
+
+impl MatrixGroupOps for PersistentCompactIntMatrix {
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec {
+        let n = self.n();
+        if g.indices.len() < 255 {
+            // Fast path: counts fit in u8 — accumulate directly into raw bytes,
+            // no overflow map involved.
+            let mut primary = vec![0u8; n];
+            for &c in &g.indices {
+                let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
+                inc_primary_bits(&mut primary, &mask);
+            }
+            MemoryIntVec::from_primary(primary)
+        } else {
+            // Slow path (rare): use IntSliceMut::count_bits which handles overflow.
+            let mut result = MemoryIntVec::new(n);
+            for &c in &g.indices {
+                let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
+                result.count_bits(&mask);
+            }
+            result
+        }
+    }
+
+    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
+        let n = self.n();
+        let mut result = MemoryIntVec::new(n);
+        for &c in &g.indices {
+            let view = self.col_view(c);
+            IntSliceMut::add(&mut result, &view);
+        }
+        result
+    }
+
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec {
+        let n = self.n();
+        let mut result = MemoryBitVec::new(n);
+        for &c in &g.indices {
+            let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
+            result.or(&mask);
+        }
+        result
+    }
+}
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index 2dc1453..c5f3705 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -1,6 +1,7 @@
 mod bitvec;
 mod bitmatrix;
 mod builder;
+mod colgroup;
 mod format;
 mod intmatrix;
 mod layer_meta;
@@ -13,6 +14,7 @@ pub mod traits;
 pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
 pub use bitmatrix::{BitColView, PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
 pub use builder::PersistentCompactIntVecBuilder;
+pub use colgroup::{ColGroup, MatrixGroupOps};
 pub use intmatrix::{IntColView, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
 pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
diff --git a/src/obicompactvec/src/tests/colgroup.rs b/src/obicompactvec/src/tests/colgroup.rs
new file mode 100644
index 0000000..813d4fa
--- /dev/null
+++ b/src/obicompactvec/src/tests/colgroup.rs
@@ -0,0 +1,215 @@
+use tempfile::tempdir;
+
+use crate::{
+    ColGroup, MatrixGroupOps,
+    PersistentBitMatrix, PersistentBitMatrixBuilder,
+    PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
+};
+use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
+use crate::{MemoryBitVec, MemoryIntVec};
+
+// ── helpers ───────────────────────────────────────────────────────────────────
+
+fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
+    let n = cols.first().map_or(0, |c| c.len());
+    let dir = tempdir().unwrap();
+    let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap();
+    for &col in cols {
+        let mut cb = b.add_col().unwrap();
+        for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
+        cb.close().unwrap();
+    }
+    b.close().unwrap();
+    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
+    (dir, m)
+}
+
+fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
+    let n = cols.first().map_or(0, |c| c.len());
+    let dir = tempdir().unwrap();
+    let presence = dir.path().join("presence");
+    let mut b = PersistentBitMatrixBuilder::new(n, &presence).unwrap();
+    for &col in cols {
+        let mut cb = b.add_col().unwrap();
+        for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
+        cb.close().unwrap();
+    }
+    b.close().unwrap();
+    let m = PersistentBitMatrix::open(dir.path()).unwrap();
+    (dir, m)
+}
+
+// ── IntMatrix: partial_group_sum ──────────────────────────────────────────────
+
+#[test]
+fn int_partial_group_sum_basic() {
+    // col0=[1,2,3], col1=[10,20,30], col2=[100,0,5]
+    // group {0,2}: sum = [101, 2, 8]
+    let (_d, m) = make_int_matrix(&[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]]);
+    let g = ColGroup::new("g", vec![0, 2]);
+    let result = m.partial_group_sum(&g);
+    assert_eq!(result.get(0), 101);
+    assert_eq!(result.get(1), 2);
+    assert_eq!(result.get(2), 8);
+}
+
+#[test]
+fn int_partial_group_sum_with_overflow() {
+    // col0=[300,0], col1=[200,400]: group {0,1}: sum=[500, 400]
+    let (_d, m) = make_int_matrix(&[&[300, 0], &[200, 400]]);
+    let g = ColGroup::new("g", vec![0, 1]);
+    let result = m.partial_group_sum(&g);
+    assert_eq!(result.get(0), 500);
+    assert_eq!(result.get(1), 400);
+    assert_eq!(result.sum(), 900);
+}
+
+// ── IntMatrix: partial_group_presence_count ───────────────────────────────────
+
+#[test]
+fn int_partial_group_presence_count() {
+    // col0=[5,1,0,3], col1=[2,0,4,3], col2=[0,3,1,0]
+    // threshold=2: col0: [T,F,F,T], col1: [T,F,T,T], col2: [F,T,F,F]
+    // group {0,1,2}: counts = [2, 1, 1, 2]
+    let (_d, m) = make_int_matrix(&[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]]);
+    let g = ColGroup::new("g", vec![0, 1, 2]);
+    let result = m.partial_group_presence_count(&g, 2);
+    assert_eq!(result.get(0), 2);
+    assert_eq!(result.get(1), 1);
+    assert_eq!(result.get(2), 1);
+    assert_eq!(result.get(3), 2);
+}
+
+#[test]
+fn int_partial_group_presence_count_with_overflow() {
+    // col0=[300,0,10], col1=[0,400,10], col2=[1,1,10]
+    // threshold=5: col0: [T,F,T], col1: [F,T,T], col2: [F,F,T]
+    // group {0,1,2}: counts = [1, 1, 3]
+    let (_d, m) = make_int_matrix(&[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]]);
+    let g = ColGroup::new("g", vec![0, 1, 2]);
+    let result = m.partial_group_presence_count(&g, 5);
+    assert_eq!(result.get(0), 1);
+    assert_eq!(result.get(1), 1);
+    assert_eq!(result.get(2), 3);
+}
+
+// ── IntMatrix: partial_group_any ──────────────────────────────────────────────
+
+#[test]
+fn int_partial_group_any() {
+    // col0=[0,3,0,1], col1=[2,0,0,0], col2=[0,0,5,0]
+    // threshold=2: col0: [F,T,F,F], col1: [T,F,F,F], col2: [F,F,T,F]
+    // group {0,1,2}: any = [T, T, T, F]
+    let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]);
+    let g = ColGroup::new("g", vec![0, 1, 2]);
+    let result = m.partial_group_any(&g, 2);
+    assert_eq!(result.get(0), true);
+    assert_eq!(result.get(1), true);
+    assert_eq!(result.get(2), true);
+    assert_eq!(result.get(3), false);
+}
+
+// ── IntMatrix: mask_with ──────────────────────────────────────────────────────
+
+#[test]
+fn mask_with_zeros_selected_slots() {
+    // count vec [10, 20, 30, 40], mask [T, F, T, F] → [10, 0, 30, 0]
+    let mut v = MemoryIntVec::new(4);
+    v.set(0, 10); v.set(1, 20); v.set(2, 30); v.set(3, 40);
+    let mut mask = MemoryBitVec::new(4);
+    mask.set(0, true); mask.set(2, true);
+    v.mask_with(&mask);
+    assert_eq!(v.get(0), 10);
+    assert_eq!(v.get(1), 0);
+    assert_eq!(v.get(2), 30);
+    assert_eq!(v.get(3), 0);
+}
+
+#[test]
+fn mask_with_overflow_slot_zeroed() {
+    // overflow slot (value 500) masked out → removed from overflow, primary=0
+    let mut v = MemoryIntVec::new(3);
+    v.set(0, 10); v.set(1, 500); v.set(2, 5);
+    let mut mask = MemoryBitVec::new(3);
+    mask.set(0, true); mask.set(2, true);  // slot 1 masked out
+    v.mask_with(&mask);
+    assert_eq!(v.get(0), 10);
+    assert_eq!(v.get(1), 0);
+    assert_eq!(v.get(2), 5);
+    let ov: Vec<_> = v.overflow_entries().collect();
+    assert!(ov.is_empty(), "overflow entry for masked-out slot should be gone");
+}
+
+#[test]
+fn mask_with_all_ones_is_noop() {
+    let mut v = MemoryIntVec::new(4);
+    v.set(0, 300); v.set(1, 1); v.set(2, 0); v.set(3, 42);
+    let mask = MemoryBitVec::ones(4);
+    v.mask_with(&mask);
+    assert_eq!(v.get(0), 300);
+    assert_eq!(v.get(1), 1);
+    assert_eq!(v.get(2), 0);
+    assert_eq!(v.get(3), 42);
+}
+
+// ── BitMatrix: partial_group_presence_count ───────────────────────────────────
+
+#[test]
+fn bit_partial_group_presence_count() {
+    // col0=[T,F,T,F], col1=[T,T,F,F], col2=[F,T,T,F]
+    // group {0,1,2}: counts = [2, 2, 2, 0]
+    let (_d, m) = make_bit_matrix(&[
+        &[true, false, true,  false],
+        &[true, true,  false, false],
+        &[false,true,  true,  false],
+    ]);
+    let g = ColGroup::new("g", vec![0, 1, 2]);
+    let result = m.partial_group_presence_count(&g, 1);
+    assert_eq!(result.get(0), 2);
+    assert_eq!(result.get(1), 2);
+    assert_eq!(result.get(2), 2);
+    assert_eq!(result.get(3), 0);
+}
+
+// ── BitMatrix: partial_group_any ──────────────────────────────────────────────
+
+#[test]
+fn bit_partial_group_any() {
+    // col0=[T,F,F], col1=[F,F,T], group {0,1}: any = [T, F, T]
+    let (_d, m) = make_bit_matrix(&[
+        &[true, false, false],
+        &[false, false, true],
+    ]);
+    let g = ColGroup::new("g", vec![0, 1]);
+    let result = m.partial_group_any(&g, 1);
+    assert_eq!(result.get(0), true);
+    assert_eq!(result.get(1), false);
+    assert_eq!(result.get(2), true);
+}
+
+// ── Composition: partial results are additive ─────────────────────────────────
+
+#[test]
+fn int_presence_count_additive_across_split() {
+    // Simulate two partitions (different kmer ranges) whose counts should add.
+    // Global data for col0: [5,1,0,3,2], col1: [2,0,4,3,1] — threshold=2
+    // Split: partition A = slots 0..2, partition B = slots 2..5
+    let data_a: &[&[u32]] = &[&[5, 1], &[2, 0]];
+    let data_b: &[&[u32]] = &[&[0, 3, 2], &[4, 3, 1]];
+    let (_da, ma) = make_int_matrix(data_a);
+    let (_db, mb) = make_int_matrix(data_b);
+    let g = ColGroup::new("g", vec![0, 1]);
+
+    let pa = ma.partial_group_presence_count(&g, 2);
+    let pb = mb.partial_group_presence_count(&g, 2);
+
+    // Concatenate by adding (disjoint kmer ranges — here we just verify
+    // individual results match the expected per-partition counts).
+    // partition A: col0=[5≥2,1<2]=[T,F], col1=[2≥2,0<2]=[T,F] → [2, 0]
+    assert_eq!(pa.get(0), 2);
+    assert_eq!(pa.get(1), 0);
+    // partition B: col0=[0<2,3≥2,2≥2]=[F,T,T], col1=[4≥2,3≥2,1<2]=[T,T,F] → [1, 2, 1]
+    assert_eq!(pb.get(0), 1);
+    assert_eq!(pb.get(1), 2);
+    assert_eq!(pb.get(2), 1);
+}
diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs
index c0be93a..3a61ab3 100644
--- a/src/obicompactvec/src/tests/mod.rs
+++ b/src/obicompactvec/src/tests/mod.rs
@@ -1,5 +1,6 @@
 mod bitmatrix;
 mod bitvec;
+mod colgroup;
 mod intmatrix;
 mod memoryvec;
 
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index 0e0e903..9a647ec 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -258,6 +258,29 @@ pub trait IntSliceMut: IntSlice {
         }
         self
     }
+
+    /// Zero every slot where the corresponding bit in `mask` is 0.
+    /// Scans mask words, skipping all-ones words — O(n/64 + n_zeros).
+    fn mask_with<B: BitSlice>(&mut self, mask: &B) -> &mut Self {
+        assert_eq!(self.len(), mask.len(), "IntSlice/BitSlice length mismatch");
+        let n = self.len();
+        for (wi, &word) in mask.words().iter().enumerate() {
+            if word == u64::MAX { continue; }
+            let mut zeros = !word;
+            while zeros != 0 {
+                let bit = zeros.trailing_zeros() as usize;
+                let s   = wi * 64 + bit;
+                if s < n {
+                    // u8 is Copy — the immutable borrow from primary_bytes() ends
+                    // before the mutable borrow from set() begins.
+                    let b = self.primary_bytes()[s];
+                    if b != 0 { self.set(s, 0); }
+                }
+                zeros &= zeros - 1;
+            }
+        }
+        self
+    }
 }
 
 // ── IntSlice → MemoryBitVec conversions ───────────────────────────────────────

From fb4962c4fee90cc2bd77a503a3e92dbd7d4f13fc Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 15:13:22 +0200
Subject: [PATCH 14/24] refactor: replace in-memory vectors with
 temp-file-backed storage

Introduces `TempCompactIntVec` and `TempBitVec` as temporary, file-backed intermediates to replace eager in-memory vectors, enabling OS-level paging under memory pressure. Updates the `MatrixGroupOps` trait to return `io::Result` types, allowing proper error propagation and supporting chunked accumulation for large column groups. Includes builder patterns with `.freeze()` finalization, automatic `TempDir` cleanup on drop, and necessary test updates to handle the new fallible signatures. Also fixes `Cargo.toml` section ordering.
---
 docmd/implementation/obicompactvec.md   | 184 +++++++++++++++---------
 obicompactvector_reflexion.md           |  44 ++++++
 src/obicompactvec/Cargo.toml            |   2 +-
 src/obicompactvec/src/bitmatrix.rs      |  45 +++---
 src/obicompactvec/src/builder.rs        |  16 +--
 src/obicompactvec/src/colgroup.rs       |  11 +-
 src/obicompactvec/src/intmatrix.rs      |  53 ++++---
 src/obicompactvec/src/lib.rs            |   4 +
 src/obicompactvec/src/tempbitvec.rs     |  69 +++++++++
 src/obicompactvec/src/tempintvec.rs     |  82 +++++++++++
 src/obicompactvec/src/tests/colgroup.rs |  20 +--
 11 files changed, 399 insertions(+), 131 deletions(-)
 create mode 100644 obicompactvector_reflexion.md
 create mode 100644 src/obicompactvec/src/tempbitvec.rs
 create mode 100644 src/obicompactvec/src/tempintvec.rs

diff --git a/docmd/implementation/obicompactvec.md b/docmd/implementation/obicompactvec.md
index 2926443..71dc939 100644
--- a/docmd/implementation/obicompactvec.md
+++ b/docmd/implementation/obicompactvec.md
@@ -11,8 +11,11 @@ src/obicompactvec/src/
   reader.rs         PersistentCompactIntVec (read-only)
   builder.rs        PersistentCompactIntVecBuilder (read-write)
   memoryintvec.rs   MemoryIntVec
+  tempintvec.rs     TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed)
+  tempbitvec.rs     TempBitVec, TempBitVecBuilder (temp-file-backed)
   bitmatrix.rs      PersistentBitMatrix, PersistentBitMatrixBuilder
   intmatrix.rs      PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder
+  colgroup.rs       ColGroup, MatrixGroupOps trait
   format.rs         file format constants, encode/decode helpers
   layer_meta.rs     LayerMeta (column metadata)
   meta.rs           matrix metadata
@@ -24,13 +27,22 @@ graph TD
     traits --> memoryintvec
     bitvec --> memoryvec
     bitvec --> bitmatrix
+    bitvec --> tempbitvec
     format --> reader
     format --> builder
     reader --> intmatrix
+    reader --> tempintvec
     builder --> intmatrix
     builder --> memoryintvec
+    builder --> tempintvec
     memoryvec --> traits
     memoryintvec --> traits
+    tempintvec --> intmatrix
+    tempintvec --> bitmatrix
+    tempbitvec --> intmatrix
+    tempbitvec --> bitmatrix
+    colgroup --> intmatrix
+    colgroup --> bitmatrix
     layer_meta --> bitmatrix
     layer_meta --> intmatrix
     meta --> bitmatrix
@@ -479,6 +491,8 @@ See `persistent_compact_int_vec.md` for file format and lifecycle.
 | `MemoryIntVec` | inherent merge-scan ✓ | `byte_sum` ✓ | `byte_count_nonzero` ✓ |
 | `PersistentCompactIntVecBuilder` | default (get-per-slot) | `byte_sum` on mmap ✓ | `byte_count_nonzero` on mmap ✓ |
 | `PersistentCompactIntVec` | inherent merge-scan Iter ✓ | inherent `sum()` ✓ | inherent `count_nonzero()` ✓ |
+| `TempCompactIntVec` | delegates to inner `PersistentCompactIntVec` | delegates | delegates |
+| `TempCompactIntVecBuilder` | default (get-per-slot) | delegates to builder | delegates to builder |
 | `PackedIntCol<'a>` | inherent PackedIntColIter ✓ | byte_sum ✓ | byte_count_nonzero ✓ |
 
 `PackedIntCol` is used internally by `PersistentCompactIntMatrix` (packed format) for column views.
@@ -557,45 +571,68 @@ Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)` (inter, union), `par
 
 ---
 
-## Planned — Filter / Select API
+## Temp-file-backed types
 
-### Composition across layers and partitions
+`MemoryBitVec` and `MemoryIntVec` are reserved for truly transient intra-method intermediates (e.g. a single `cmp_scalar` result that lives for one loop iteration). **All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory.
 
-```mermaid
-graph TD
-    subgraph Index
-        CG["ColGroup\nVec&lt;usize&gt; — valid everywhere"]
-        ACC["MemoryIntVec\nglobal accumulator"]
-        PRED["geq / leq / and / or\n→ MemoryBitVec mask"]
-    end
+### Lifecycle
 
-    subgraph "Layer 1"
-        subgraph "Partition A  kmers 0..k/2"
-            MA["Matrix A\npartial_group_presence_count"]
-        end
-        subgraph "Partition B  kmers k/2..k"
-            MB["Matrix B\npartial_group_presence_count"]
-        end
-        CONCAT1["concat → MemoryIntVec\[0..k\]"]
-    end
-
-    subgraph "Layer 2"
-        CONCAT2["concat → MemoryIntVec\[0..k\]"]
-    end
-
-    CG -->|"same indices"| MA
-    CG -->|"same indices"| MB
-    MA -->|"kmer range A"| CONCAT1
-    MB -->|"kmer range B"| CONCAT1
-    CONCAT1 -->|"IntSliceMut::add"| ACC
-    CONCAT2 -->|"IntSliceMut::add"| ACC
-    ACC --> PRED
 ```
+TempCompactIntVecBuilder::new(n)   →  writable mmap in TempDir
+     ↓  (set / add / count_bits / mask_with / …)
+ .freeze()                          →  TempCompactIntVec  (read-only mmap + TempDir)
+     ↓  (optional)
+ .make_persistent(path)             →  PersistentCompactIntVec  (permanent file)
+```
+
+Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`.
+
+**Drop order**: in `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }`, Rust drops fields in declaration order — `vec` (mmap) is released before `_temp` (directory) is deleted. No explicit `drop()` needed.
+
+### TempCompactIntVec / TempCompactIntVecBuilder
+
+```rust
+pub struct TempCompactIntVec {
+    vec:   PersistentCompactIntVec,
+    _temp: TempDir,        // dropped after vec
+}
+
+pub(crate) struct TempCompactIntVecBuilder {
+    builder: PersistentCompactIntVecBuilder,
+    temp:    TempDir,
+}
+```
+
+`TempCompactIntVec` implements `IntSlice` (full delegation to inner `PersistentCompactIntVec`).  
+`TempCompactIntVecBuilder` implements `IntSlice` + `IntSliceMut` (delegation to inner builder).  
+`make_persistent(path)` copies the temp file to `path` and opens it as `PersistentCompactIntVec`.
+
+### TempBitVec / TempBitVecBuilder
+
+```rust
+pub struct TempBitVec {
+    vec:   PersistentBitVec,
+    _temp: TempDir,
+}
+
+pub(crate) struct TempBitVecBuilder {
+    builder: PersistentBitVecBuilder,
+    temp:    TempDir,
+}
+```
+
+`TempBitVec` implements `BitSlice`.  
+`TempBitVecBuilder` implements `BitSlice` + `BitSliceMut`.  
+`make_persistent(path)` copies the temp file and opens as `PersistentBitVec`.
+
+---
+
+## Filter / Select API
 
 ### ColGroup
 
 ```rust
-struct ColGroup { name: String, indices: Vec<usize> }
+pub struct ColGroup { pub name: String, pub indices: Vec<usize> }
 ```
 
 Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions because column structure is identical across the entire hierarchy (same samples/genomes everywhere; only rows = kmer slots are partitioned).
@@ -607,68 +644,75 @@ Defined **once at the index level** from column metadata. Valid in all matrices
 - **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges).
 - **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.).
 
-### Additivity rules
+### MatrixGroupOps
 
-```mermaid
-flowchart LR
-    subgraph "Matrix level — returns MemoryIntVec"
-        PGP["partial_group_presence_count\npartial_group_sum\npartial_group_any → MemoryBitVec"]
-    end
-    subgraph "Index level — applies predicate"
-        GA["group_at_least(k)\n= accumulate.geq(k)"]
-        GALL["group_all\n= accumulate.geq(n_cols)"]
-        GANY["group_any\n= OR of partial_group_any"]
-    end
-    PGP -->|"concat across partitions\nadd across layers"| GA
-    PGP --> GALL
-    PGP --> GANY
-```
-
-Non-additive predicates (`group_all`, `group_at_least`) do **not** exist at matrix level — they require the global accumulated count.
-
-### MatrixGroupOps (planned trait)
-
-Group operations live on the matrix and expose only **additive intermediates** (`MemoryIntVec`). Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
+Group operations live on the matrix and expose only **additive intermediates** backed by temp files. Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
 
 ```rust
-trait MatrixGroupOps {
-    // How many columns in group have value >= threshold, per kmer slot
-    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec;
+pub trait MatrixGroupOps {
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempCompactIntVec>;
 
-    // Sum of values across group columns, per kmer slot
-    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec;
+    fn partial_group_sum(&self, g: &ColGroup)
+        -> io::Result<TempCompactIntVec>;
 
-    // Kmer present (value >= threshold) in at least one column of group
-    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec;
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempBitVec>;
 }
 ```
 
-Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — they are composed at the index level from the additive intermediates:
+Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`. For bit matrices, `partial_group_sum` delegates to `partial_group_presence_count(g, 1)` since values are 0/1.
+
+**`partial_group_presence_count` — chunking for large groups:**
+
+When `g.indices.len() < 255`, per-slot counts fit in a raw `u8` — fast path: accumulate directly into `primary_bytes_mut()` using `inc_primary_bits`, then `freeze()`. No overflow map needed.
+
+When `g.indices.len() ≥ 255`, process in chunks of 254 columns — each chunk stays within `u8` range — then add chunks into a running `TempCompactIntVecBuilder` accumulator via `IntSliceMut::add`. This keeps peak memory proportional to one partition, not the number of columns × partitions.
+
+```
+fast path (< 255 cols):
+  builder = TempCompactIntVecBuilder::new(n)
+  for c in group:
+    mask = col_view(c).cmp_scalar(|v| v >= threshold)  // MemoryBitVec
+    inc_primary_bits(primary_bytes_mut, mask)           // u8 safe
+  builder.freeze()
+
+slow path (≥ 255 cols):
+  result = TempCompactIntVecBuilder::new(n)
+  for chunk in group.chunks(254):
+    chunk_builder = TempCompactIntVecBuilder::new(n)
+    inc_primary_bits(chunk_builder, …)
+    chunk_frozen = chunk_builder.freeze()
+    IntSliceMut::add(&mut result, &chunk_frozen)
+  result.freeze()
+```
+
+Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — composed at the index level:
 
 ```
 // "present in >= 2 ingroup columns with count >= 3, absent from all outgroup"
-let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)).sum();
-let in_mask  = presence.geq(2);                                     // MemoryBitVec
+let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)?).add_all()?;
+let in_mask  = presence.geq(2);
 
-let out_sum  = layers.map(|l| l.partial_group_sum(&outgroup)).sum();
-let out_mask = out_sum.leq(0);                                      // MemoryBitVec
+let out_sum  = layers.map(|l| l.partial_group_sum(&outgroup)?).add_all()?;
+let out_mask = out_sum.leq(0);
 
-let mask = in_mask.and(&out_mask);    // BitSliceMut::and — O(n/64)
+let mask = in_mask & &out_mask;    // BitSliceMut::and — O(n/64)
 ```
 
-### mask_with (planned IntSliceMut method)
+### mask_with (IntSliceMut)
 
-Apply a bit mask to a count vector: zero slots where the mask bit is 0. Iterates only zero bits — O(n_zeros), O(1) when mask is all-ones.
+Provided method on `IntSliceMut`. Zeros every slot where the corresponding mask bit is 0. Scans the mask word by word and visits only zero bits — O(n/64 + n_zeros); an all-ones word is skipped with a single comparison.
 
 ```
 for (w_idx, word) in mask.words():
-  if word == u64::MAX: continue
+  if word == u64::MAX: continue   // skip all-ones words
   zeros = !word
   while zeros != 0:
     bit = trailing_zeros(zeros)
     s = w_idx * 64 + bit
-    self.set(s, 0)
+    if primary[s] != 0: self.set(s, 0)   // clears overflow entry too
     zeros &= zeros − 1
 ```
 
-This is the terminal operation for both Filter (zero non-selected kmer slots in a count matrix) and Select (positional selection without MPHF).
+Terminal operation for Filter (retain only selected kmer slots in a count vector) and Select (positional selection without MPHF).
diff --git a/obicompactvector_reflexion.md b/obicompactvector_reflexion.md
new file mode 100644
index 0000000..a8e2356
--- /dev/null
+++ b/obicompactvector_reflexion.md
@@ -0,0 +1,44 @@
+# La crate obicompactvector
+
+Le code actuel est ce qu'il est. Ce n'est pas la vérité absolue, c'est un premier effort d'implémentation, rien de plus. Ci-dessous je vais décrire les objectifs et la structure qui devrait être. LA VÉRITÉ À ATTEINDRE.
+
+La crate fournit des représentations les plus compactes possible en mémoire de matrices de comptage ou de présence de k-mers dans des génomes. Chaque colonne représente un génome, chaque ligne un k-mer. Une matrice est une collection de vecteurs où chacun des vecteurs est une colonne de la matrice.
+
+Les matrices comme les colonnes ont vocation à être persistantes. Les données sont stockées dans des fichiers binaires et mappées en mémoire via `mmap`.
+
+Les structures sont par essence immutables. Il existe des représentations mutables des colonnes qui permettent leur construction. À la fin de leur construction, les colonnes sont fermées, ce qui les rend immutables.
+
+Les matrices peuvent être représentées de deux façons :
+    - via un répertoire contenant une collection de fichiers colonnes ;
+    - via un fichier matrice qui est la concaténation de plusieurs fichiers colonnes.
+
+
+## Les matrices de comptage 
+
+Ce sont des matrices d'entiers positifs, la plupart du temps de petites valeurs (inférieures à 255). On suppose que toutes les valeurs sont représentables sur un `u32`.
+
+## Les matrices de presence
+
+Ce sont des matrices de booléens représentées comme des champs de bits.
+
+Il existe une forme implicite des vecteurs de présence, qui n'est représentée par aucun fichier et pour laquelle toutes les valeurs sont vraies.
+
+## Représentation légère des colonnes
+
+Les colonnes, qu'elles soient unitaires (fichier colonne) ou partie d'un fichier composite matrice, peuvent être représentées par un objet léger donnant accès à leurs valeurs ainsi qu'à la longueur du vecteur. Toutes les méthodes de calcul doivent uniquement travailler à partir de ces représentations légères unifiées des colonnes.
+
+### Représentation légère d'un vecteur de présence
+
+Le vecteur est représenté par :
+    - un champ de bits encodé comme un `[u64]`
+    - un `usize` encodant la longueur du champ de bits
+    
+### Représentation légère d'un vecteur de comptage
+
+Le vecteur est représenté par :
+    - un vecteur `[u8]` encodant directement les valeurs faibles du vecteur, dans [0,255[.
+      La valeur 255 est une valeur sentinelle indiquant que la valeur vraie est >= 255
+      et se trouve dans une structure d'overflow
+    - un itérateur de `(usize, u32)` listant les valeurs d'overflow correspondant aux valeurs
+      sentinelles (255) du `[u8]`
+    - un `usize` encodant la longueur du vecteur
diff --git a/src/obicompactvec/Cargo.toml b/src/obicompactvec/Cargo.toml
index ddb1e40..777b606 100644
--- a/src/obicompactvec/Cargo.toml
+++ b/src/obicompactvec/Cargo.toml
@@ -7,6 +7,6 @@ edition = "2024"
 memmap2  = "0.9"
 ndarray  = "0.16"
 rayon    = "1"
+tempfile = "3"
 
 [dev-dependencies]
-tempfile = "3"
diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index a51058e..8039e29 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -8,8 +8,9 @@ use rayon::prelude::*;
 
 use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
 use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
-use crate::memoryintvec::MemoryIntVec;
 use crate::memoryvec::MemoryBitVec;
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
+use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
 use crate::traits::{BitSlice, BitSliceMut, IntSliceMut};
 use crate::layer_meta::LayerMeta;
 use crate::meta::MatrixMeta;
@@ -452,39 +453,49 @@ impl PersistentBitMatrixBuilder {
 // ── MatrixGroupOps ────────────────────────────────────────────────────────────
 
 impl MatrixGroupOps for PersistentBitMatrix {
-    fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> MemoryIntVec {
+    fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempCompactIntVec> {
         // Bit matrices store 0/1 — threshold is structurally always 1.
-        // Materialize each column to a MemoryBitVec and accumulate directly.
         let n = self.n();
         if g.indices.len() < 255 {
-            let mut primary = vec![0u8; n];
-            for &c in &g.indices {
-                let mbv = MemoryBitVec::from(&self.col_view(c));
-                inc_primary_bits(&mut primary, &mbv);
+            let mut builder = TempCompactIntVecBuilder::new(n)?;
+            {
+                let primary = builder.primary_bytes_mut();
+                for &c in &g.indices {
+                    let mbv = MemoryBitVec::from(&self.col_view(c));
+                    inc_primary_bits(primary, &mbv);
+                }
             }
-            MemoryIntVec::from_primary(primary)
+            builder.freeze()
         } else {
-            let mut result = MemoryIntVec::new(n);
-            for &c in &g.indices {
-                let mbv = MemoryBitVec::from(&self.col_view(c));
-                result.count_bits(&mbv);
+            let mut result = TempCompactIntVecBuilder::new(n)?;
+            for chunk in g.indices.chunks(254) {
+                let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
+                {
+                    let primary = chunk_builder.primary_bytes_mut();
+                    for &c in chunk {
+                        let mbv = MemoryBitVec::from(&self.col_view(c));
+                        inc_primary_bits(primary, &mbv);
+                    }
+                }
+                let chunk_frozen = chunk_builder.freeze()?;
+                IntSliceMut::add(&mut result, &chunk_frozen);
             }
-            result
+            result.freeze()
         }
     }
 
-    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
+    fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
         // For bit matrices, sum = count of 1-bits — identical to presence_count.
         self.partial_group_presence_count(g, 1)
     }
 
-    fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> MemoryBitVec {
+    fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempBitVec> {
         let n = self.n();
-        let mut result = MemoryBitVec::new(n);
+        let mut result = TempBitVecBuilder::new(n)?;
         for &c in &g.indices {
             result.or(&self.col_view(c));
         }
-        result
+        result.freeze()
     }
 }
 
diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs
index 3e622d9..271b5d8 100644
--- a/src/obicompactvec/src/builder.rs
+++ b/src/obicompactvec/src/builder.rs
@@ -122,19 +122,19 @@ impl PersistentCompactIntVecBuilder {
     /// Flush the primary mmap, then write sorted overflow data + index and fix the header.
     pub fn close(self) -> io::Result<()> {
         self.mmap.flush()?;
-        let Self {
-            path,
-            mmap,
-            n,
-            overflow,
-        } = self;
+        let Self { path, mmap, n, overflow } = self;
         drop(mmap);
-
         let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
         entries.sort_unstable_by_key(|&(slot, _)| slot);
-
         finalize_pciv(&path, n, &entries)
     }
+
+    /// Close and reopen as a read-only [`PersistentCompactIntVec`].
+    pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
+        let path = self.path.clone();
+        self.close()?;
+        PersistentCompactIntVec::open(&path)
+    }
 }
 
 // ── IntSlice / IntSliceMut impls ──────────────────────────────────────────────
diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs
index df4576f..9fe1659 100644
--- a/src/obicompactvec/src/colgroup.rs
+++ b/src/obicompactvec/src/colgroup.rs
@@ -1,5 +1,8 @@
-use crate::memoryintvec::MemoryIntVec;
+use std::io;
+
 use crate::memoryvec::MemoryBitVec;
+use crate::tempbitvec::TempBitVec;
+use crate::tempintvec::TempCompactIntVec;
 use crate::traits::BitSlice;
 
 // ── ColGroup ──────────────────────────────────────────────────────────────────
@@ -30,13 +33,13 @@ impl ColGroup {
 /// — they are derived at the index level from these intermediates.
 pub trait MatrixGroupOps {
     /// Per-slot count of group columns whose value ≥ `threshold`.
-    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec;
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
 
     /// Per-slot sum of values across all group columns.
-    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec;
+    fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
 
     /// Per-slot OR: true if any group column has value ≥ `threshold`.
-    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec;
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
 }
 
 // ── Internal helper ───────────────────────────────────────────────────────────
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index 172d7b0..fc64c48 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -12,7 +12,8 @@ use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
 use crate::builder::PersistentCompactIntVecBuilder;
 use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
 use crate::memoryintvec::MemoryIntVec;
-use crate::memoryvec::MemoryBitVec;
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
+use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
 use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
@@ -630,45 +631,55 @@ impl PersistentCompactIntMatrixBuilder {
 // ── MatrixGroupOps ────────────────────────────────────────────────────────────
 
 impl MatrixGroupOps for PersistentCompactIntMatrix {
-    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec {
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
         let n = self.n();
         if g.indices.len() < 255 {
-            // Fast path: counts fit in u8 — accumulate directly into raw bytes,
-            // no overflow map involved.
-            let mut primary = vec![0u8; n];
-            for &c in &g.indices {
-                let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
-                inc_primary_bits(&mut primary, &mask);
+            // Fast path: counts fit in u8 — accumulate directly into raw bytes.
+            let mut builder = TempCompactIntVecBuilder::new(n)?;
+            {
+                let primary = builder.primary_bytes_mut();
+                for &c in &g.indices {
+                    let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
+                    inc_primary_bits(primary, &mask);
+                }
             }
-            MemoryIntVec::from_primary(primary)
+            builder.freeze()
         } else {
-            // Slow path (rare): use IntSliceMut::count_bits which handles overflow.
-            let mut result = MemoryIntVec::new(n);
-            for &c in &g.indices {
-                let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
-                result.count_bits(&mask);
+            // Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks.
+            let mut result = TempCompactIntVecBuilder::new(n)?;
+            for chunk in g.indices.chunks(254) {
+                let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
+                {
+                    let primary = chunk_builder.primary_bytes_mut();
+                    for &c in chunk {
+                        let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
+                        inc_primary_bits(primary, &mask);
+                    }
+                }
+                let chunk_frozen = chunk_builder.freeze()?;
+                IntSliceMut::add(&mut result, &chunk_frozen);
             }
-            result
+            result.freeze()
         }
     }
 
-    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
+    fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
         let n = self.n();
-        let mut result = MemoryIntVec::new(n);
+        let mut result = TempCompactIntVecBuilder::new(n)?;
         for &c in &g.indices {
             let view = self.col_view(c);
             IntSliceMut::add(&mut result, &view);
         }
-        result
+        result.freeze()
     }
 
-    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec {
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
         let n = self.n();
-        let mut result = MemoryBitVec::new(n);
+        let mut result = TempBitVecBuilder::new(n)?;
         for &c in &g.indices {
             let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
             result.or(&mask);
         }
-        result
+        result.freeze()
     }
 }
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index c5f3705..6625ab6 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -9,6 +9,8 @@ mod memoryintvec;
 mod memoryvec;
 mod meta;
 mod reader;
+mod tempbitvec;
+mod tempintvec;
 pub mod traits;
 
 pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
@@ -20,6 +22,8 @@ pub use layer_meta::LayerMeta;
 pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
 pub use memoryvec::MemoryBitVec;
 pub use reader::PersistentCompactIntVec;
+pub use tempbitvec::TempBitVec;
+pub use tempintvec::TempCompactIntVec;
 pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
 
 #[cfg(test)]
diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs
new file mode 100644
index 0000000..3945075
--- /dev/null
+++ b/src/obicompactvec/src/tempbitvec.rs
@@ -0,0 +1,69 @@
+use std::io;
+use std::path::Path;
+
+use tempfile::TempDir;
+
+use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
+use crate::traits::{BitSlice, BitSliceMut};
+
+// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
+
+/// A bit vector backed by a temporary file.
+/// Implements [`BitSlice`]; the file is deleted when this value is dropped.
+/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
+pub struct TempBitVec {
+    vec:   PersistentBitVec,
+    // Dropped after `vec` (field order), so the mmap is released before the
+    // temp directory is deleted.
+    _temp: TempDir,
+}
+
+impl TempBitVec {
+    /// Copy to a permanent file and open as a [`PersistentBitVec`].
+    pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
+        std::fs::copy(self.vec.path(), path)?;
+        PersistentBitVec::open(path)
+    }
+
+    pub fn len(&self)      -> usize { self.vec.len() }
+    pub fn is_empty(&self) -> bool  { self.vec.is_empty() }
+}
+
+impl BitSlice for TempBitVec {
+    fn len(&self)   -> usize  { self.vec.len() }
+    fn words(&self) -> &[u64] { self.vec.words() }
+}
+
+// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
+
+/// Writable builder for a [`TempBitVec`].  `pub(crate)` — callers receive
+/// only the frozen result via [`freeze`](Self::freeze).
+pub(crate) struct TempBitVecBuilder {
+    builder: PersistentBitVecBuilder,
+    temp:    TempDir,
+}
+
+impl TempBitVecBuilder {
+    pub(crate) fn new(n: usize) -> io::Result<Self> {
+        let temp = TempDir::new()?;
+        let path = temp.path().join("data.pbiv");
+        let builder = PersistentBitVecBuilder::new(n, &path)?;
+        Ok(Self { builder, temp })
+    }
+
+    /// Finalize writes and return a frozen, read-only [`TempBitVec`].
+    pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
+        let Self { builder, temp } = self;
+        let vec = builder.finish()?;
+        Ok(TempBitVec { vec, _temp: temp })
+    }
+}
+
+impl BitSlice for TempBitVecBuilder {
+    fn len(&self)   -> usize  { self.builder.len() }
+    fn words(&self) -> &[u64] { self.builder.words() }
+}
+
+impl BitSliceMut for TempBitVecBuilder {
+    fn words_mut(&mut self) -> &mut [u64] { self.builder.words_mut() }
+}
diff --git a/src/obicompactvec/src/tempintvec.rs b/src/obicompactvec/src/tempintvec.rs
new file mode 100644
index 0000000..ced3cef
--- /dev/null
+++ b/src/obicompactvec/src/tempintvec.rs
@@ -0,0 +1,82 @@
+use std::io;
+use std::path::Path;
+
+use tempfile::TempDir;
+
+use crate::builder::PersistentCompactIntVecBuilder;
+use crate::reader::PersistentCompactIntVec;
+use crate::traits::{IntSlice, IntSliceMut};
+
+// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
+
+/// A compact int vector backed by a temporary file.
+/// Implements [`IntSlice`]; the file is deleted when this value is dropped.
+/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
+pub struct TempCompactIntVec {
+    vec:   PersistentCompactIntVec,
+    // Dropped after `vec` (field order), so the mmap is released before the
+    // temp directory is deleted.
+    _temp: TempDir,
+}
+
+impl TempCompactIntVec {
+    /// Copy to a permanent file and open as a [`PersistentCompactIntVec`].
+    pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
+        std::fs::copy(self.vec.path(), path)?;
+        PersistentCompactIntVec::open(path)
+    }
+
+    pub fn len(&self)      -> usize { self.vec.len() }
+    pub fn is_empty(&self) -> bool  { self.vec.is_empty() }
+}
+
+impl IntSlice for TempCompactIntVec {
+    fn len(&self)              -> usize { self.vec.len() }
+    fn get(&self, slot: usize) -> u32   { self.vec.get(slot) }
+    fn primary_bytes(&self)    -> &[u8] { self.vec.primary_bytes() }
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        self.vec.overflow_entries()
+    }
+    fn sum(&self)           -> u64 { self.vec.sum() }
+    fn count_nonzero(&self) -> u64 { self.vec.count_nonzero() }
+}
+
+// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
+
+/// Writable builder for a [`TempCompactIntVec`].  `pub(crate)` — callers
+/// receive only the frozen result via [`freeze`](Self::freeze).
+pub(crate) struct TempCompactIntVecBuilder {
+    builder: PersistentCompactIntVecBuilder,
+    temp:    TempDir,
+}
+
+impl TempCompactIntVecBuilder {
+    pub(crate) fn new(n: usize) -> io::Result<Self> {
+        let temp = TempDir::new()?;
+        let path = temp.path().join("data.pciv");
+        let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
+        Ok(Self { builder, temp })
+    }
+
+    /// Finalize writes and return a frozen, read-only [`TempCompactIntVec`].
+    pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
+        let Self { builder, temp } = self;
+        let vec = builder.finish()?;
+        Ok(TempCompactIntVec { vec, _temp: temp })
+    }
+}
+
+impl IntSlice for TempCompactIntVecBuilder {
+    fn len(&self)              -> usize { self.builder.len() }
+    fn get(&self, slot: usize) -> u32   { self.builder.get(slot) }
+    fn primary_bytes(&self)    -> &[u8] { self.builder.primary_bytes() }
+    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        self.builder.overflow_entries()
+    }
+}
+
+impl IntSliceMut for TempCompactIntVecBuilder {
+    fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
+    fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
+    fn clear_overflow(&mut self)                  { self.builder.clear_overflow(); }
+}
diff --git a/src/obicompactvec/src/tests/colgroup.rs b/src/obicompactvec/src/tests/colgroup.rs
index 813d4fa..388508d 100644
--- a/src/obicompactvec/src/tests/colgroup.rs
+++ b/src/obicompactvec/src/tests/colgroup.rs
@@ -5,7 +5,7 @@ use crate::{
     PersistentBitMatrix, PersistentBitMatrixBuilder,
     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
 };
-use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
+use crate::traits::{BitSlice, BitSliceMut, IntSlice, IntSliceMut};
 use crate::{MemoryBitVec, MemoryIntVec};
 
 // ── helpers ───────────────────────────────────────────────────────────────────
@@ -47,7 +47,7 @@ fn int_partial_group_sum_basic() {
     // group {0,2}: sum = [101, 2, 8]
     let (_d, m) = make_int_matrix(&[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]]);
     let g = ColGroup::new("g", vec![0, 2]);
-    let result = m.partial_group_sum(&g);
+    let result = m.partial_group_sum(&g).unwrap();
     assert_eq!(result.get(0), 101);
     assert_eq!(result.get(1), 2);
     assert_eq!(result.get(2), 8);
@@ -58,7 +58,7 @@ fn int_partial_group_sum_with_overflow() {
     // col0=[300,0], col1=[200,400]: group {0,1}: sum=[500, 400]
     let (_d, m) = make_int_matrix(&[&[300, 0], &[200, 400]]);
     let g = ColGroup::new("g", vec![0, 1]);
-    let result = m.partial_group_sum(&g);
+    let result = m.partial_group_sum(&g).unwrap();
     assert_eq!(result.get(0), 500);
     assert_eq!(result.get(1), 400);
     assert_eq!(result.sum(), 900);
@@ -73,7 +73,7 @@ fn int_partial_group_presence_count() {
     // group {0,1,2}: counts = [2, 1, 1, 2]
     let (_d, m) = make_int_matrix(&[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]]);
     let g = ColGroup::new("g", vec![0, 1, 2]);
-    let result = m.partial_group_presence_count(&g, 2);
+    let result = m.partial_group_presence_count(&g, 2).unwrap();
     assert_eq!(result.get(0), 2);
     assert_eq!(result.get(1), 1);
     assert_eq!(result.get(2), 1);
@@ -87,7 +87,7 @@ fn int_partial_group_presence_count_with_overflow() {
     // group {0,1,2}: counts = [1, 1, 3]
     let (_d, m) = make_int_matrix(&[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]]);
     let g = ColGroup::new("g", vec![0, 1, 2]);
-    let result = m.partial_group_presence_count(&g, 5);
+    let result = m.partial_group_presence_count(&g, 5).unwrap();
     assert_eq!(result.get(0), 1);
     assert_eq!(result.get(1), 1);
     assert_eq!(result.get(2), 3);
@@ -102,7 +102,7 @@ fn int_partial_group_any() {
     // group {0,1,2}: any = [T, T, T, F]
     let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]);
     let g = ColGroup::new("g", vec![0, 1, 2]);
-    let result = m.partial_group_any(&g, 2);
+    let result = m.partial_group_any(&g, 2).unwrap();
     assert_eq!(result.get(0), true);
     assert_eq!(result.get(1), true);
     assert_eq!(result.get(2), true);
@@ -164,7 +164,7 @@ fn bit_partial_group_presence_count() {
         &[false,true,  true,  false],
     ]);
     let g = ColGroup::new("g", vec![0, 1, 2]);
-    let result = m.partial_group_presence_count(&g, 1);
+    let result = m.partial_group_presence_count(&g, 1).unwrap();
     assert_eq!(result.get(0), 2);
     assert_eq!(result.get(1), 2);
     assert_eq!(result.get(2), 2);
@@ -181,7 +181,7 @@ fn bit_partial_group_any() {
         &[false, false, true],
     ]);
     let g = ColGroup::new("g", vec![0, 1]);
-    let result = m.partial_group_any(&g, 1);
+    let result = m.partial_group_any(&g, 1).unwrap();
     assert_eq!(result.get(0), true);
     assert_eq!(result.get(1), false);
     assert_eq!(result.get(2), true);
@@ -200,8 +200,8 @@ fn int_presence_count_additive_across_split() {
     let (_db, mb) = make_int_matrix(data_b);
     let g = ColGroup::new("g", vec![0, 1]);
 
-    let pa = ma.partial_group_presence_count(&g, 2);
-    let pb = mb.partial_group_presence_count(&g, 2);
+    let pa = ma.partial_group_presence_count(&g, 2).unwrap();
+    let pb = mb.partial_group_presence_count(&g, 2).unwrap();
 
     // Concatenate by adding (disjoint kmer ranges — here we just verify
     // individual results match the expected per-partition counts).

From f91c5a3f7981216e2a49af4986d8358b75225ee2 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Wed, 17 Jun 2026 23:07:19 +0200
Subject: [PATCH 15/24] refactor(obicompactvec): unify bit and int vector slice
 views

Refactors column and matrix access to use unified `BitSliceView` and `IntSliceView` abstractions, replacing legacy `PackedCol`/`IntColView` types. Introduces `BitSlice`/`IntSlice` traits for zero-copy, trait-based bitwise and arithmetic operations across persistent and temporary vector types. Removes deprecated in-memory `MemoryBitVec` and `MemoryIntVec` implementations and their tests, while updating dependent crates to use the new view-based API and `BitSliceMut` trait.
---
 src/obicompactvec/src/bitmatrix.rs       |  89 +----
 src/obicompactvec/src/bitvec.rs          | 220 +++++------
 src/obicompactvec/src/builder.rs         | 274 +++++++++----
 src/obicompactvec/src/colgroup.rs        |  21 -
 src/obicompactvec/src/intmatrix.rs       | 355 +++--------------
 src/obicompactvec/src/lib.rs             |  12 +-
 src/obicompactvec/src/memoryintvec.rs    | 186 ---------
 src/obicompactvec/src/memoryvec.rs       | 138 -------
 src/obicompactvec/src/reader.rs          | 284 +++----------
 src/obicompactvec/src/tempbitvec.rs      |  69 ++--
 src/obicompactvec/src/tempintvec.rs      |  71 ++--
 src/obicompactvec/src/tests/bitmatrix.rs |   2 +-
 src/obicompactvec/src/tests/bitvec.rs    |   7 +-
 src/obicompactvec/src/tests/colgroup.rs  |  55 +--
 src/obicompactvec/src/tests/intmatrix.rs |   4 +-
 src/obicompactvec/src/tests/memoryvec.rs | 484 -----------------------
 src/obicompactvec/src/tests/mod.rs       |  11 +-
 src/obicompactvec/src/traits.rs          | 348 ----------------
 src/obicompactvec/src/views.rs           | 278 +++++++++++++
 src/obikpartitionner/src/common.rs       |   1 -
 src/obikpartitionner/src/select_layer.rs |   1 -
 src/obilayeredmap/src/layer.rs           |   1 -
 src/obilayeredmap/src/layered_store.rs   |   1 -
 23 files changed, 845 insertions(+), 2067 deletions(-)
 delete mode 100644 src/obicompactvec/src/memoryintvec.rs
 delete mode 100644 src/obicompactvec/src/memoryvec.rs
 delete mode 100644 src/obicompactvec/src/tests/memoryvec.rs
 create mode 100644 src/obicompactvec/src/views.rs

diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index 8039e29..2717174 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -7,13 +7,12 @@ use ndarray::{Array1, Array2};
 use rayon::prelude::*;
 
 use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
-use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
-use crate::memoryvec::MemoryBitVec;
-use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
-use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
-use crate::traits::{BitSlice, BitSliceMut, IntSliceMut};
+use crate::colgroup::{ColGroup, MatrixGroupOps};
 use crate::layer_meta::LayerMeta;
 use crate::meta::MatrixMeta;
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
+use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
+use crate::views::BitSliceView;
 
 fn col_path(dir: &Path, col: usize) -> PathBuf {
     dir.join(format!("col_{col:06}.pbiv"))
@@ -143,18 +142,14 @@ impl PackedBitMatrix {
         unsafe { std::slice::from_raw_parts(ptr, nw) }
     }
 
-    pub(crate) fn col_slice(&self, c: usize) -> PackedCol<'_> {
-        PackedCol { words: self.col_words(c), n: self.n_rows }
+    pub(crate) fn col_slice(&self, c: usize) -> BitSliceView<'_> {
+        BitSliceView::new(self.col_words(c), self.n_rows)
     }
 
     pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
         PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
     }
 
-    pub(crate) fn col_as_memory(&self, c: usize) -> MemoryBitVec {
-        MemoryBitVec::from(&self.col_slice(c))
-    }
-
     pub(crate) fn count_ones(&self) -> Array1<u64> {
         Array1::from_vec(
             (0..self.n_cols).into_par_iter()
@@ -165,47 +160,17 @@ impl PackedBitMatrix {
 
     pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
         pairwise2_matrix(self.n_cols, |i, j| {
-            self.col_slice(i).partial_jaccard_dist(&self.col_slice(j))
+            self.col_slice(i).partial_jaccard_dist(self.col_slice(j))
         })
     }
 
     pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
         pairwise_matrix(self.n_cols, |i, j| {
-            self.col_slice(i).hamming_dist(&self.col_slice(j))
+            self.col_slice(i).hamming_dist(self.col_slice(j))
         })
     }
 }
 
-pub(crate) struct PackedCol<'a> {
-    words: &'a [u64],
-    n: usize,
-}
-
-impl BitSlice for PackedCol<'_> {
-    fn len(&self) -> usize { self.n }
-    fn words(&self) -> &[u64] { self.words }
-}
-
-// ── BitColView — uniform column access across Columnar and Packed ─────────────
-
-enum BitColViewInner<'a> {
-    Columnar(&'a PersistentBitVec),
-    Packed(PackedCol<'a>),
-}
-
-/// Opaque column view returned by [`PersistentBitMatrix::col_view`].
-/// Implements [`BitSlice`] uniformly for both Columnar and Packed matrix formats.
-pub struct BitColView<'a>(BitColViewInner<'a>);
-
-impl BitSlice for BitColView<'_> {
-    fn len(&self) -> usize {
-        match &self.0 { BitColViewInner::Columnar(c) => c.len(), BitColViewInner::Packed(c) => c.len() }
-    }
-    fn words(&self) -> &[u64] {
-        match &self.0 { BitColViewInner::Columnar(c) => c.words(), BitColViewInner::Packed(c) => c.words() }
-    }
-}
-
 /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
 pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
     let packed_path = dir.join("matrix.pbmx");
@@ -321,10 +286,10 @@ impl PersistentBitMatrix {
         }
     }
 
-    pub fn col_view(&self, c: usize) -> BitColView<'_> {
+    pub fn col_view(&self, c: usize) -> BitSliceView<'_> {
         match self {
-            Self::Columnar(m) => BitColView(BitColViewInner::Columnar(m.col(c))),
-            Self::Packed(m)   => BitColView(BitColViewInner::Packed(m.col_slice(c))),
+            Self::Columnar(m) => m.col(c).view(),
+            Self::Packed(m)   => m.col_slice(c),
             Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
         }
     }
@@ -341,14 +306,6 @@ impl PersistentBitMatrix {
         }
     }
 
-    pub fn col_as_memory(&self, c: usize) -> MemoryBitVec {
-        match self {
-            Self::Columnar(m) => MemoryBitVec::from(m.col(c)),
-            Self::Packed(m)   => m.col_as_memory(c),
-            Self::Implicit { n_rows, .. } => MemoryBitVec::ones(*n_rows),
-        }
-    }
-
     pub fn row(&self, slot: usize) -> Box<[bool]> {
         match self {
             Self::Columnar(m)             => m.row(slot),
@@ -458,27 +415,19 @@ impl MatrixGroupOps for PersistentBitMatrix {
         let n = self.n();
         if g.indices.len() < 255 {
             let mut builder = TempCompactIntVecBuilder::new(n)?;
-            {
-                let primary = builder.primary_bytes_mut();
-                for &c in &g.indices {
-                    let mbv = MemoryBitVec::from(&self.col_view(c));
-                    inc_primary_bits(primary, &mbv);
-                }
+            for &c in &g.indices {
+                builder.inc_present_fast(self.col_view(c));
             }
             builder.freeze()
         } else {
             let mut result = TempCompactIntVecBuilder::new(n)?;
             for chunk in g.indices.chunks(254) {
-                let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
-                {
-                    let primary = chunk_builder.primary_bytes_mut();
-                    for &c in chunk {
-                        let mbv = MemoryBitVec::from(&self.col_view(c));
-                        inc_primary_bits(primary, &mbv);
-                    }
+                let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
+                for &c in chunk {
+                    chunk_b.inc_present_fast(self.col_view(c));
                 }
-                let chunk_frozen = chunk_builder.freeze()?;
-                IntSliceMut::add(&mut result, &chunk_frozen);
+                let frozen = chunk_b.freeze()?;
+                result.add(frozen.view());
             }
             result.freeze()
         }
@@ -493,7 +442,7 @@ impl MatrixGroupOps for PersistentBitMatrix {
         let n = self.n();
         let mut result = TempBitVecBuilder::new(n)?;
         for &c in &g.indices {
-            result.or(&self.col_view(c));
+            result.or(self.col_view(c));
         }
         result.freeze()
     }
diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index 1d91b10..8cde36b 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -5,29 +5,25 @@ use std::path::{Path, PathBuf};
 use memmap2::{Mmap, MmapMut};
 
 use crate::reader::PersistentCompactIntVec;
+use crate::views::{BitSliceView, BitSliceIter};
 
 const MAGIC: [u8; 4] = *b"PBIV";
 
 // Header: magic(4) + _pad(4) + n(8) = 16 bytes.
-// Data starts at offset 16, which is divisible by 8 → u64-aligned
-// (mmap base is page-aligned, 16 % 8 == 0).
+// Data starts at offset 16, u64-aligned (mmap base is page-aligned, 16 % 8 == 0).
 const HEADER_SIZE: usize = 16;
 
 #[inline]
-pub(crate) fn n_words(n: usize) -> usize {
-    n.div_ceil(64)
-}
+pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) }
 
 #[inline]
-fn n_bytes_for_words(n: usize) -> usize {
-    n_words(n) * 8
-}
+fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 }
 
-// ── Reader ────────────────────────────────────────────────────────────────────
+// ── PersistentBitVec ──────────────────────────────────────────────────────────
 
 pub struct PersistentBitVec {
     mmap: Mmap,
-    n: usize,
+    n:    usize,
     path: PathBuf,
 }
 
@@ -35,44 +31,49 @@ impl PersistentBitVec {
     pub fn open(path: &Path) -> io::Result<Self> {
         let mmap = unsafe { Mmap::map(&File::open(path)?)? };
         if mmap.len() < HEADER_SIZE {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidData,
-                "PBIV file too short",
-            ));
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short"));
         }
         if &mmap[0..4] != &MAGIC {
             return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
         }
         let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
-        Ok(Self {
-            mmap,
-            n,
-            path: path.to_path_buf(),
-        })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
-    pub fn path(&self) -> &Path {
-        &self.path
-    }
-    pub fn len(&self) -> usize {
-        self.n
-    }
-    pub fn is_empty(&self) -> bool {
-        self.n == 0
-    }
+    pub fn path(&self) -> &Path { &self.path }
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }
 
     pub fn get(&self, slot: usize) -> bool {
         (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
     }
 
-    // SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
-    // so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes.
+    // SAFETY: mmap is page-aligned, HEADER_SIZE=16 divisible by 8 → u64-aligned.
     fn data_words(&self) -> &[u64] {
-        let nw = n_words(self.n);
+        let nw  = n_words(self.n);
         let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
         unsafe { std::slice::from_raw_parts(ptr, nw) }
     }
 
+    pub fn view(&self) -> BitSliceView<'_> {
+        BitSliceView::new(self.data_words(), self.n)
+    }
+
+    pub fn words(&self) -> &[u64] { self.data_words() }
+
+    pub fn count_ones(&self)  -> u64 { self.view().count_ones() }
+    pub fn count_zeros(&self) -> u64 { self.view().count_zeros() }
+
+    pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) {
+        self.view().partial_jaccard_dist(other.view())
+    }
+    pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
+        self.view().jaccard_dist(other.view())
+    }
+    pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
+        self.view().hamming_dist(other.view())
+    }
+
     pub fn iter(&self) -> BitIter<'_> {
         BitIter { words: self.data_words(), slot: 0, n: self.n }
     }
@@ -81,40 +82,38 @@ impl PersistentBitVec {
 impl<'a> IntoIterator for &'a PersistentBitVec {
     type Item = bool;
     type IntoIter = BitIter<'a>;
-    fn into_iter(self) -> BitIter<'a> {
-        self.iter()
-    }
+    fn into_iter(self) -> BitIter<'a> { self.iter() }
 }
 
+// ── BitIter ───────────────────────────────────────────────────────────────────
+
 pub struct BitIter<'a> {
     pub(crate) words: &'a [u64],
-    pub(crate) slot: usize,
-    pub(crate) n: usize,
+    pub(crate) slot:  usize,
+    pub(crate) n:     usize,
 }
 
 impl ExactSizeIterator for BitIter<'_> {}
 
 impl Iterator for BitIter<'_> {
     type Item = bool;
-
     fn next(&mut self) -> Option<bool> {
         if self.slot >= self.n { return None; }
         let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
         self.slot += 1;
         Some(v)
     }
-
     fn size_hint(&self) -> (usize, Option<usize>) {
         let rem = self.n - self.slot;
         (rem, Some(rem))
     }
 }
 
-// ── Builder ───────────────────────────────────────────────────────────────────
+// ── PersistentBitVecBuilder ───────────────────────────────────────────────────
 
 pub struct PersistentBitVecBuilder {
     mmap: MmapMut,
-    n: usize,
+    n:    usize,
     path: PathBuf,
 }
 
@@ -122,13 +121,10 @@ impl PersistentBitVecBuilder {
     pub fn new(n: usize, path: &Path) -> io::Result<Self> {
         let file_size = HEADER_SIZE + n_bytes_for_words(n);
         let mut file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(true)
+            .read(true).write(true).create(true).truncate(true)
             .open(path)?;
         file.write_all(&MAGIC)?;
-        file.write_all(&[0u8; 4])?; // padding
+        file.write_all(&[0u8; 4])?;
         file.write_all(&(n as u64).to_le_bytes())?;
         file.seek(SeekFrom::Start(0))?;
         file.set_len(file_size as u64)?;
@@ -136,8 +132,6 @@ impl PersistentBitVecBuilder {
         Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
-    /// Create a PBIV file from raw packed bit-bytes, zero-padding to the next word boundary.
-    /// `bytes` is `n.div_ceil(8)` bytes; `n` is the number of bits.
     pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
         let file_size = HEADER_SIZE + n_bytes_for_words(n);
         let file = OpenOptions::new()
@@ -159,44 +153,11 @@ impl PersistentBitVecBuilder {
         Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
-    pub fn len(&self) -> usize {
-        self.n
-    }
-    pub fn is_empty(&self) -> bool {
-        self.n == 0
-    }
-
-    pub fn get(&self, slot: usize) -> bool {
-        (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
-    }
-
-    fn data_words(&self) -> &[u64] {
-        let nw = n_words(self.n);
-        let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
-        unsafe { std::slice::from_raw_parts(ptr, nw) }
-    }
-
-    // SAFETY: same alignment argument as PersistentBitVec::data_words.
-    fn data_words_mut(&mut self) -> &mut [u64] {
-        let nw = n_words(self.n);
-        let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
-        unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
-    }
-
-    /// Convert a count vector to a bit vector: bit set iff count >= threshold.
-    /// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
-    pub fn build_from_counts(
-        source: &PersistentCompactIntVec,
-        threshold: u32,
-        path: &Path,
-    ) -> io::Result<Self> {
+    pub fn build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result<Self> {
         let n = source.len();
         let file_size = HEADER_SIZE + n_bytes_for_words(n);
         let mut file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(true)
+            .read(true).write(true).create(true).truncate(true)
             .open(path)?;
         file.write_all(&MAGIC)?;
         file.write_all(&[0u8; 4])?;
@@ -204,52 +165,91 @@ impl PersistentBitVecBuilder {
         file.seek(SeekFrom::Start(0))?;
         file.set_len(file_size as u64)?;
         let mut mmap = unsafe { MmapMut::map_mut(&file)? };
-
         {
-            let nw = n_words(n);
+            let nw  = n_words(n);
             let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
             let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
             for (slot, count) in source.iter().enumerate() {
-                if count >= threshold {
-                    words[slot >> 6] |= 1u64 << (slot & 63);
-                }
+                if count >= threshold { words[slot >> 6] |= 1u64 << (slot & 63); }
             }
         }
-
         Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
-    /// Convert a count vector to a presence/absence bit vector (threshold = 1).
     pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
         Self::build_from_counts(source, 1, path)
     }
 
-    pub fn close(self) -> io::Result<()> {
-        self.mmap.flush()
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }
+
+    pub fn get(&self, slot: usize) -> bool {
+        (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
     }
 
-    /// Flush, close, and reopen as a read-only `PersistentBitVec`.
+    pub fn set(&mut self, slot: usize, value: bool) { // NOTE(review): writes via native u64 words while get() reads single bytes; the two agree on little-endian targets only
+        let bit = 1u64 << (slot & 63); // mask for the bit at position `slot % 64` inside word `slot / 64`
+        if value { self.data_words_mut()[slot >> 6] |=  bit; }
+        else     { self.data_words_mut()[slot >> 6] &= !bit; }
+    }
+
+    fn data_words(&self) -> &[u64] {
+        let nw  = n_words(self.n);
+        let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
+        unsafe { std::slice::from_raw_parts(ptr, nw) }
+    }
+
+    // SAFETY: same alignment argument as PersistentBitVec::data_words.
+    fn data_words_mut(&mut self) -> &mut [u64] {
+        let nw  = n_words(self.n);
+        let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
+        unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
+    }
+
+    pub fn view(&self) -> BitSliceView<'_> {
+        BitSliceView::new(self.data_words(), self.n)
+    }
+
+    pub fn words(&self) -> &[u64] { self.data_words() }
+
+    pub fn copy_from(&mut self, src: BitSliceView<'_>) {
+        assert_eq!(self.n, src.len(), "BitSliceView length mismatch");
+        self.data_words_mut().copy_from_slice(src.words());
+    }
+
+    pub fn and(&mut self, other: BitSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
+        for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w &= o; }
+    }
+
+    pub fn or(&mut self, other: BitSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
+        for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w |= o; }
+    }
+
+    pub fn xor(&mut self, other: BitSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
+        for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w ^= o; }
+    }
+
+    pub fn not(&mut self) {
+        let rem   = self.n % 64; // number of valid bits in a partial last word (0: no partial word)
+        let words = self.data_words_mut();
+        for w in words.iter_mut() { *w ^= u64::MAX; } // flip every bit, padding bits included
+        if rem != 0 {
+            if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; } // zero the padding bits again
+        }
+    }
+
+    pub fn iter(&self) -> BitSliceIter<'_> {
+        self.view().iter()
+    }
+
+    pub fn close(self) -> io::Result<()> { self.mmap.flush() }
+
     pub fn finish(self) -> io::Result<PersistentBitVec> {
         let path = self.path.clone();
         self.close()?;
         PersistentBitVec::open(&path)
     }
 }
-
-// ── BitSlice / BitSliceMut impls ──────────────────────────────────────────────
-
-use crate::traits::{BitSlice, BitSliceMut};
-
-impl BitSlice for PersistentBitVec {
-    fn len(&self) -> usize { self.n }
-    fn words(&self) -> &[u64] { self.data_words() }
-}
-
-impl BitSlice for PersistentBitVecBuilder {
-    fn len(&self) -> usize { self.n }
-    fn words(&self) -> &[u64] { self.data_words() }
-}
-
-impl BitSliceMut for PersistentBitVecBuilder {
-    fn words_mut(&mut self) -> &mut [u64] { self.data_words_mut() }
-}
diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs
index 271b5d8..266b3c1 100644
--- a/src/obicompactvec/src/builder.rs
+++ b/src/obicompactvec/src/builder.rs
@@ -7,53 +7,26 @@ use memmap2::MmapMut;
 
 use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
 use crate::reader::PersistentCompactIntVec;
+use crate::views::{BitSliceView, IntSliceView};
 
 pub struct PersistentCompactIntVecBuilder {
-    path: PathBuf,
-    mmap: MmapMut,
-    n: usize,
+    path:     PathBuf,
+    mmap:     MmapMut,
+    n:        usize,
     overflow: HashMap<usize, u32>,
 }
 
 impl PersistentCompactIntVecBuilder {
-    /// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately.
     pub fn new(n: usize, path: &Path) -> io::Result<Self> {
-        let file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(true)
-            .open(path)?;
-        file.set_len((HEADER_SIZE + n) as u64)?;
-        let mmap = unsafe { MmapMut::map_mut(&file)? };
-        Ok(Self {
-            path: path.to_path_buf(),
-            mmap,
-            n,
-            overflow: HashMap::new(),
-        })
-    }
-
-    /// Create from a [`MemoryIntVec`], copying primary bytes directly into the mmap.
-    /// O(n) memcpy + O(n_overflow) HashMap clone — no per-slot `set` overhead.
-    pub fn from_memory(src: &crate::memoryintvec::MemoryIntVec, path: &Path) -> io::Result<Self> {
-        let n = src.len();
         let file = OpenOptions::new()
             .read(true).write(true).create(true).truncate(true)
             .open(path)?;
         file.set_len((HEADER_SIZE + n) as u64)?;
-        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
-        mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(src.primary_bytes());
-        Ok(Self {
-            path: path.to_path_buf(),
-            mmap,
-            n,
-            overflow: src.overflow_map().clone(),
-        })
+        let mmap = unsafe { MmapMut::map_mut(&file)? };
+        Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
     }
 
-    /// Create from raw primary bytes + an already-built overflow map (no per-slot overhead).
-    pub(crate) fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
+    pub fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
         let n = primary.len();
         let file = OpenOptions::new()
             .read(true).write(true).create(true).truncate(true)
@@ -64,40 +37,25 @@ impl PersistentCompactIntVecBuilder {
         Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
     }
 
-    /// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
-    /// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow).
     pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
         fs::copy(source.path(), path)?;
-
         let file = OpenOptions::new().read(true).write(true).open(path)?;
         let mmap = unsafe { MmapMut::map_mut(&file)? };
-
-        let n = source.len();
+        let n          = source.len();
         let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
         let data_offset = HEADER_SIZE + n;
-
         let mut overflow = HashMap::with_capacity(n_overflow);
         for i in 0..n_overflow {
             let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
             overflow.insert(slot, value);
         }
-
-        Ok(Self {
-            path: path.to_path_buf(),
-            mmap,
-            n,
-            overflow,
-        })
+        Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
     }
 
-    /// Get the value at the given slot, handling overflow if necessary.
     pub fn get(&self, slot: usize) -> u32 {
         match self.mmap[HEADER_SIZE + slot] {
-            255 => *self
-                .overflow
-                .get(&slot)
-                .expect("sentinel without overflow entry"),
-            v => v as u32,
+            255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
+            v   => v as u32,
         }
     }
 
@@ -111,15 +69,189 @@ impl PersistentCompactIntVecBuilder {
         }
     }
 
-    pub fn len(&self) -> usize {
-        self.n
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }
+
+    pub fn primary_bytes(&self)     -> &[u8]      { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
+    pub fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
+    pub fn clear_overflow(&mut self) { self.overflow.clear(); }
+
+    pub fn sum(&self) -> u64 {
+        byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
+    }
+    pub fn count_nonzero(&self) -> u64 {
+        byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
     }
 
-    pub fn is_empty(&self) -> bool {
-        self.n == 0
+    pub fn view(&self) -> IntSliceView<'_> {
+        // Primary-only view: the builder keeps its overflow values in a HashMap,
+        // and none of them are handed to the view (`&[]` and `0` are passed where,
+        // presumably, the raw overflow bytes and their count go — TODO confirm
+        // against `IntSliceView::new`). Nothing is allocated or converted here.
+        // NOTE(review): slots holding the 255 sentinel presumably cannot be resolved
+        // through this view; while building, use `get(slot)` or `overflow_entries()`.
+        let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n];
+        IntSliceView::new(primary, &[], 0, self.n)
+    }
+
+    pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        self.overflow.iter().map(|(&k, &v)| (k, v))
+    }
+
+    pub fn inc(&mut self, slot: usize) {
+        let v = self.get(slot);
+        self.set(slot, v.saturating_add(1));
+    }
+
+    // ── Computation methods ───────────────────────────────────────────────────
+
+    /// Increment one counter per 1-bit of `col`.  Safe for any group size.
+    pub fn inc_present(&mut self, col: BitSliceView<'_>) {
+        let n = self.n;
+        for (wi, &word) in col.words().iter().enumerate() {
+            if word == 0 { continue; }
+            let mut w = word;
+            while w != 0 {
+                let bit  = w.trailing_zeros() as usize;
+                let slot = wi * 64 + bit;
+                if slot < n { self.inc(slot); }
+                w &= w - 1;
+            }
+        }
+    }
+
+    /// Increment one counter per 1-bit of `col`, using raw u8 arithmetic.
+    /// Caller guarantees no counter will reach 255 (group size < 255).
+    pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
+        {
+            let primary = self.primary_bytes_mut();
+            let n       = primary.len();
+            for (wi, &word) in col.words().iter().enumerate() {
+                if word == 0 { continue; }
+                let mut w = word;
+                while w != 0 {
+                    let bit  = w.trailing_zeros() as usize;
+                    let s    = wi * 64 + bit;
+                    if s < n { primary[s] += 1; }
+                    w &= w - 1;
+                }
+            }
+        }
+        debug_assert!(
+            !self.primary_bytes().contains(&255),
+            "sentinel 255 reached in inc_present_fast — group size must be < 255"
+        );
+    }
+
+    /// Two-pass: primary bytes then overflow.  Increments `self[slot]` for each
+    /// slot where `pred(col[slot])` is true.  Safe for any group size.
+    pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        let n = col.len();
+        for slot in 0..n {
+            let b = col.primary_bytes()[slot];
+            if b < 255 && pred(b as u32) {
+                self.inc(slot);
+            }
+        }
+        for (slot, val) in col.overflow_entries() {
+            if pred(val) { self.inc(slot); }
+        }
+    }
+
+    /// Fast two-pass: raw u8 arithmetic.  Caller guarantees no counter reaches 255.
+    pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        let n = col.len();
+        {
+            let primary = self.primary_bytes_mut();
+            for slot in 0..n {
+                let b = col.primary_bytes()[slot];
+                if b < 255 && pred(b as u32) {
+                    primary[slot] += 1;
+                }
+            }
+        }
+        for (slot, val) in col.overflow_entries() {
+            if pred(val) { self.primary_bytes_mut()[slot] += 1; }
+        }
+        debug_assert!(
+            !self.primary_bytes().contains(&255),
+            "sentinel 255 reached in inc_predicate_fast — group size must be < 255"
+        );
+    }
+
+    /// Slot-wise `self[s] += other[s]` over all `self.n` slots, saturating at `u32::MAX` like `inc`.
+    pub fn add(&mut self, other: IntSliceView<'_>) {
+        for s in 0..self.n {
+            let sb = self.primary_bytes()[s];
+            let ob = other.primary_bytes()[s];
+            if sb < 255 && ob < 255 {
+                let sum = sb as u32 + ob as u32;
+                if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; }
+                else         { self.set(s, sum); }
+            } else {
+                // A 255 sentinel is involved: use get/set, and saturate instead of overflowing u32.
+                let total = self.get(s).saturating_add(other.get(s));
+                self.set(s, total);
+            }
+        }
+    }
+
+    pub fn min(&mut self, other: IntSliceView<'_>) {
+        let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect(); // snapshot taken before the map is cleared below
+        let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
+        self.clear_overflow();
+        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
+            if b < *a { *a = b; } // byte-wise min: a 255 byte remains only where both sides hold 255
+        }
+        for (slot, self_val) in self_ov {
+            if let Some(&other_val) = other_ov.get(&slot) { // slot has an overflow entry on both sides
+                self.set(slot, self_val.min(other_val));
+            }
+        }
+    }
+
+    pub fn max(&mut self, other: IntSliceView<'_>) {
+        for (slot, other_val) in other.overflow_entries() {
+            let sv = self.get(slot);
+            self.set(slot, sv.max(other_val));
+        }
+        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
+            if b > *a { *a = b; }
+        }
+    }
+
+    pub fn diff(&mut self, other: IntSliceView<'_>) {
+        let n = self.n;
+        for s in 0..n {
+            let sb = self.primary_bytes()[s];
+            let ob = other.primary_bytes()[s];
+            if sb < 255 {
+                self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 };
+            } else {
+                let sv = self.get(s);
+                let ov = if ob < 255 { ob as u32 } else { other.get(s) };
+                self.set(s, sv.saturating_sub(ov));
+            }
+        }
+    }
+
+    pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
+        let n = self.n;
+        for (wi, &word) in mask.words().iter().enumerate() {
+            if word == u64::MAX { continue; } // all 64 mask bits set: nothing to clear for this word
+            let mut zeros = !word; // 1-bits now mark the slots whose mask bit is 0
+            while zeros != 0 {
+                let bit = zeros.trailing_zeros() as usize;
+                let s   = wi * 64 + bit;
+                if s < n { // ignore padding bits past the last slot
+                    let b = self.primary_bytes()[s];
+                    if b != 0 { self.set(s, 0); } // only slots with a non-zero primary byte are rewritten
+                }
+                zeros &= zeros - 1; // clear the lowest set bit
+            }
+        }
+    }
 
-    /// Flush the primary mmap, then write sorted overflow data + index and fix the header.
     pub fn close(self) -> io::Result<()> {
         self.mmap.flush()?;
         let Self { path, mmap, n, overflow } = self;
@@ -129,35 +261,9 @@ impl PersistentCompactIntVecBuilder {
         finalize_pciv(&path, n, &entries)
     }
 
-    /// Close and reopen as a read-only [`PersistentCompactIntVec`].
     pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
         let path = self.path.clone();
         self.close()?;
         PersistentCompactIntVec::open(&path)
     }
 }
-
-// ── IntSlice / IntSliceMut impls ──────────────────────────────────────────────
-
-use crate::traits::{IntSlice, IntSliceMut};
-
-impl IntSlice for PersistentCompactIntVecBuilder {
-    fn len(&self) -> usize { self.n }
-    fn get(&self, slot: usize) -> u32 { self.get(slot) }
-    fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
-        self.overflow.iter().map(|(&k, &v)| (k, v))
-    }
-    fn sum(&self) -> u64 {
-        byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
-    }
-    fn count_nonzero(&self) -> u64 {
-        byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
-    }
-}
-
-impl IntSliceMut for PersistentCompactIntVecBuilder {
-    fn set(&mut self, slot: usize, value: u32) { self.set(slot, value); }
-    fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
-    fn clear_overflow(&mut self) { self.overflow.clear(); }
-}
diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs
index 9fe1659..c238a62 100644
--- a/src/obicompactvec/src/colgroup.rs
+++ b/src/obicompactvec/src/colgroup.rs
@@ -1,9 +1,7 @@
 use std::io;
 
-use crate::memoryvec::MemoryBitVec;
 use crate::tempbitvec::TempBitVec;
 use crate::tempintvec::TempCompactIntVec;
-use crate::traits::BitSlice;
 
 // ── ColGroup ──────────────────────────────────────────────────────────────────
 
@@ -41,22 +39,3 @@ pub trait MatrixGroupOps {
     /// Per-slot OR: true if any group column has value ≥ `threshold`.
     fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
 }
-
-// ── Internal helper ───────────────────────────────────────────────────────────
-
-/// Iterate 1-bits of a `MemoryBitVec` and increment the corresponding raw
-/// byte.  Caller must guarantee that no counter will reach 255 (group size
-/// < 255 columns), so that incrementing `u8` is safe and no sentinel is
-/// accidentally written.
-pub(crate) fn inc_primary_bits(primary: &mut [u8], mask: &MemoryBitVec) {
-    let n = primary.len();
-    for (wi, &word) in mask.words().iter().enumerate() {
-        let mut w = word;
-        while w != 0 {
-            let bit = w.trailing_zeros() as usize;
-            let s = wi * 64 + bit;
-            if s < n { primary[s] += 1; }
-            w &= w - 1;
-        }
-    }
-}
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index fc64c48..f3486d6 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -1,5 +1,3 @@
-use std::cmp::Ordering;
-use std::collections::HashMap;
 use std::fs::{self, File};
 use std::io::{self, BufWriter, Write as _};
 use std::path::{Path, PathBuf};
@@ -10,14 +8,13 @@ use rayon::prelude::*;
 
 use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
 use crate::builder::PersistentCompactIntVecBuilder;
-use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
-use crate::memoryintvec::MemoryIntVec;
-use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
-use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
-use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
+use crate::colgroup::{ColGroup, MatrixGroupOps};
+use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
-use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
+use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
+use crate::views::IntSliceView;
 
 fn col_path(dir: &Path, col: usize) -> PathBuf {
     dir.join(format!("col_{col:06}.pciv"))
@@ -48,9 +45,7 @@ impl ColumnarCompactIntMatrix {
     }
 
     pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
-        for (c, col) in self.cols.iter().enumerate() {
-            buf[c] = col.get(slot);
-        }
+        for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); }
     }
 
     pub(crate) fn sum(&self) -> Array1<u64> {
@@ -72,31 +67,22 @@ impl ColumnarCompactIntMatrix {
     pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
         pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
     }
-
     pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
         pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
     }
-
-    pub(crate) fn partial_threshold_jaccard_dist_matrix(
-        &self, threshold: u32,
-    ) -> (Array2<u64>, Array2<u64>) {
-        pairwise2_matrix(self.n_cols(), |i, j| {
-            self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)
-        })
+    pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
+        pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold))
     }
-
     pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
-
     pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
-
     pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
@@ -111,7 +97,6 @@ impl ColumnarCompactIntMatrix {
         meta.n_cols += 1;
         meta.save(dir)
     }
-
 }
 
 // ── PackedCompactIntMatrix ────────────────────────────────────────────────────
@@ -119,153 +104,12 @@ impl ColumnarCompactIntMatrix {
 const PCMX_MAGIC:  [u8; 4] = *b"PCMX";
 const PCMX_HEADER: usize   = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)
 
-/// Per-column metadata pre-parsed from the embedded PCIV header.
 struct ColInfo {
-    primary_start: usize,  // absolute mmap offset to primary array
-    data_offset:   usize,  // absolute mmap offset to overflow array
+    primary_start: usize,
+    data_offset:   usize,
     n_overflow:    usize,
-    step:          usize,
-    index:         Vec<(usize, usize)>,
 }
 
-// ── PackedIntCol — lightweight column view backed by the shared mmap ──────────
-
-pub(crate) struct PackedIntCol<'a> {
-    primary:    &'a [u8],
-    overflow:   &'a [u8],  // raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE
-    n_overflow: usize,
-    step:       usize,
-    index:      &'a [(usize, usize)],
-    n:          usize,
-}
-
-impl PackedIntCol<'_> {
-    fn overflow_get(&self, slot: usize) -> u32 {
-        let (pos_start, pos_end) = if self.step == 0 {
-            (0, self.n_overflow)
-        } else {
-            let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
-            let start = self.index[i].1;
-            let end   = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
-            (start, end)
-        };
-        let mut lo = pos_start;
-        let mut hi = pos_end;
-        while lo < hi {
-            let mid = lo + (hi - lo) / 2;
-            let (stored, val) = parse_overflow_entry(self.overflow, 0, mid);
-            match stored.cmp(&slot) {
-                Ordering::Equal   => return val,
-                Ordering::Less    => lo = mid + 1,
-                Ordering::Greater => hi = mid,
-            }
-        }
-        panic!("slot {slot} marked overflow but not found")
-    }
-}
-
-impl IntSlice for PackedIntCol<'_> {
-    fn len(&self) -> usize { self.n }
-
-    fn get(&self, slot: usize) -> u32 {
-        let v = self.primary[slot];
-        if v < 255 { v as u32 } else { self.overflow_get(slot) }
-    }
-
-    fn primary_bytes(&self) -> &[u8] { self.primary }
-
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
-        (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i))
-    }
-
-    fn iter(&self) -> impl Iterator<Item = u32> + '_ {
-        PackedIntColIter {
-            primary:      self.primary,
-            overflow:     self.overflow,
-            slot:         0,
-            overflow_pos: 0,
-            n:            self.n,
-        }
-    }
-
-    fn sum(&self) -> u64 {
-        byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1))
-    }
-
-    fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) }
-}
-
-struct PackedIntColIter<'a> {
-    primary:      &'a [u8],
-    overflow:     &'a [u8],
-    slot:         usize,
-    overflow_pos: usize,
-    n:            usize,
-}
-
-impl Iterator for PackedIntColIter<'_> {
-    type Item = u32;
-
-    fn next(&mut self) -> Option<u32> {
-        if self.slot >= self.n { return None; }
-        let v = self.primary[self.slot];
-        self.slot += 1;
-        if v < 255 {
-            Some(v as u32)
-        } else {
-            let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos);
-            self.overflow_pos += 1;
-            Some(val)
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let rem = self.n - self.slot;
-        (rem, Some(rem))
-    }
-}
-
-impl ExactSizeIterator for PackedIntColIter<'_> {}
-
-// ── IntColView — uniform column access across Columnar and Packed ─────────────
-
-enum IntColViewInner<'a> {
-    Columnar(&'a PersistentCompactIntVec),
-    Packed(PackedIntCol<'a>),
-}
-
-/// Opaque column view returned by [`PersistentCompactIntMatrix::col_view`].
-/// Implements [`IntSlice`] uniformly for both Columnar and Packed matrix formats.
-pub struct IntColView<'a>(IntColViewInner<'a>);
-
-impl IntSlice for IntColView<'_> {
-    fn len(&self) -> usize {
-        match &self.0 { IntColViewInner::Columnar(c) => c.len(), IntColViewInner::Packed(c) => c.len() }
-    }
-    fn get(&self, slot: usize) -> u32 {
-        match &self.0 { IntColViewInner::Columnar(c) => c.get(slot), IntColViewInner::Packed(c) => c.get(slot) }
-    }
-    fn primary_bytes(&self) -> &[u8] {
-        match &self.0 { IntColViewInner::Columnar(c) => c.primary_bytes(), IntColViewInner::Packed(c) => c.primary_bytes() }
-    }
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
-        // Box<dyn Iterator> implements Iterator, satisfying RPITIT across two distinct types.
-        let it: Box<dyn Iterator<Item = (usize, u32)> + '_> = match &self.0 {
-            IntColViewInner::Columnar(c) => Box::new(c.overflow_entries()),
-            IntColViewInner::Packed(c)   => Box::new(c.overflow_entries()),
-        };
-        it
-    }
-    fn sum(&self) -> u64 {
-        match &self.0 { IntColViewInner::Columnar(c) => c.sum(), IntColViewInner::Packed(c) => c.sum() }
-    }
-    fn count_nonzero(&self) -> u64 {
-        match &self.0 { IntColViewInner::Columnar(c) => c.count_nonzero(), IntColViewInner::Packed(c) => c.count_nonzero() }
-    }
-}
-
-// ─────────────────────────────────────────────────────────────────────────────
-
 pub struct PackedCompactIntMatrix {
     mmap:    Mmap,
     n_rows:  usize,
@@ -289,52 +133,30 @@ impl PackedCompactIntMatrix {
         for c in 0..n_cols {
             let off_pos  = PCMX_HEADER + c * 8;
             let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
-            // Parse embedded PCIV header at col_base
             let n_ov   = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
-            let n_idx  = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
-            let step   = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
             let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap())  as usize;
-
             let primary_start = col_base + HEADER_SIZE;
             let data_offset   = primary_start + n_pciv;
-            let index_offset  = data_offset + n_ov * OVERFLOW_ENTRY_SIZE;
-
-            let mut index = Vec::with_capacity(n_idx);
-            for i in 0..n_idx {
-                index.push(parse_index_entry(&mmap, index_offset, i));
-            }
-            columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
+            columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov });
         }
-
         Ok(Self { mmap, n_rows, n_cols, columns })
     }
 
-    pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> {
+    pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> {
         let ci = &self.columns[c];
-        PackedIntCol {
-            primary:    &self.mmap[ci.primary_start..ci.primary_start + self.n_rows],
-            overflow:   &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE],
-            n_overflow: ci.n_overflow,
-            step:       ci.step,
-            index:      &ci.index,
-            n:          self.n_rows,
-        }
+        let primary     = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
+        let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE];
+        IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows)
     }
 
     pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
-        let col = self.col_slice(c);
-        let overflow: HashMap<usize, u32> = col.overflow_entries().collect();
-        PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path)
-    }
-
-    pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
-        MemoryIntVec::from(&self.col_slice(c))
+        let view = self.col_view(c);
+        let overflow: std::collections::HashMap<usize, u32> = view.overflow_entries().collect();
+        PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path)
     }
 
     #[inline]
-    pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
-        self.col_slice(col).get(slot)
-    }
+    pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) }
 
     pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
         for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
@@ -346,86 +168,61 @@ impl PackedCompactIntMatrix {
 
     pub(crate) fn sum(&self) -> Array1<u64> {
         Array1::from_vec(
-            (0..self.n_cols).into_par_iter()
-                .map(|c| self.col_slice(c).sum())
-                .collect()
+            (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect()
         )
     }
 
     pub(crate) fn count_nonzero(&self) -> Array1<u64> {
         Array1::from_vec(
-            (0..self.n_cols).into_par_iter()
-                .map(|c| self.col_slice(c).count_nonzero())
-                .collect()
+            (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect()
         )
     }
 
-    // ── Pair primitives — sequential scan via col_slice().iter() ─────────────
-
     fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
-        self.col_slice(i).iter().zip(self.col_slice(j).iter())
-            .map(|(a, b)| a.min(b) as u64)
-            .sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum()
     }
-
     fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
-        self.col_slice(i).iter().zip(self.col_slice(j).iter())
-            .map(|(a, b)| { let d = a as f64 - b as f64; d * d })
-            .sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum()
     }
-
     fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
-        self.col_slice(i).iter().zip(self.col_slice(j).iter())
+        self.col_view(i).iter().zip(self.col_view(j).iter())
             .fold((0u64, 0u64), |(inter, uni), (a, b)| {
-                let ap = a >= t;
-                let bp = b >= t;
+                let ap = a >= t; let bp = b >= t;
                 (inter + (ap & bp) as u64, uni + (ap | bp) as u64)
             })
     }
-
     fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
         if si == 0.0 || sj == 0.0 { return 0.0; }
-        self.col_slice(i).iter().zip(self.col_slice(j).iter())
-            .map(|(a, b)| (a as f64 / si).min(b as f64 / sj))
-            .sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum()
     }
-
     fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
         if si == 0.0 || sj == 0.0 { return 0.0; }
-        self.col_slice(i).iter().zip(self.col_slice(j).iter())
-            .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d })
-            .sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum()
     }
-
     fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
         if si == 0.0 || sj == 0.0 { return 0.0; }
-        self.col_slice(i).iter().zip(self.col_slice(j).iter())
-            .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d })
-            .sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum()
     }
 
-    // ── Matrix methods ────────────────────────────────────────────────────────
-
     pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
         pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
     }
-
     pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
         pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
     }
-
     pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
         pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
     }
-
     pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
-
     pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
-
     pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
@@ -435,32 +232,21 @@ impl PackedCompactIntMatrix {
 pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
     let packed_path = dir.join("matrix.pcmx");
     if packed_path.exists() {
-        // Matrix complete; remove any leftover column files from a killed cleanup.
         if let Ok(meta) = MatrixMeta::load(dir) {
             for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
             let _ = fs::remove_file(dir.join("meta.json"));
         }
         return Ok(());
     }
-
-    let meta = MatrixMeta::load(dir)?;
+    let meta   = MatrixMeta::load(dir)?;
     let n_cols = meta.n_cols;
-
-    // Compute offsets from file sizes — no column data loaded into RAM.
     let col_sizes: Vec<u64> = (0..n_cols)
         .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
         .collect::<io::Result<_>>()?;
-
     let header_size = (PCMX_HEADER + n_cols * 8) as u64;
     let mut col_offset = header_size;
     let mut offsets = Vec::with_capacity(n_cols);
-    for &size in &col_sizes {
-        offsets.push(col_offset);
-        col_offset += size;
-    }
-
-    // Write to a temp file; rename atomically so a killed process never leaves
-    // a truncated matrix.pcmx that would be mistaken for a complete file.
+    for &size in &col_sizes { offsets.push(col_offset); col_offset += size; }
     let tmp_path = dir.join("matrix.pcmx.tmp");
     let mut out = BufWriter::new(File::create(&tmp_path)?);
     out.write_all(&PCMX_MAGIC)?;
@@ -468,13 +254,10 @@ pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
     out.write_all(&(meta.n as u64).to_le_bytes())?;
     out.write_all(&(n_cols as u64).to_le_bytes())?;
     for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
-    for c in 0..n_cols {
-        io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
-    }
+    for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; }
     out.flush()?;
     drop(out);
     fs::rename(&tmp_path, &packed_path)?;
-
     for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
     fs::remove_file(dir.join("meta.json"))?;
     Ok(())
@@ -488,18 +271,14 @@ pub enum PersistentCompactIntMatrix {
 }
 
 impl PersistentCompactIntMatrix {
-    /// Open from `layer_dir`, auto-detecting Packed or Columnar.
     pub fn open(layer_dir: &Path) -> io::Result<Self> {
         let counts_dir = layer_dir.join("counts");
-
         if counts_dir.join("matrix.pcmx").exists() {
             return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
         }
-
         if MatrixMeta::load(&counts_dir).is_ok() {
             return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
         }
-
         Err(io::Error::new(
             io::ErrorKind::NotFound,
             format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
@@ -509,7 +288,6 @@ impl PersistentCompactIntMatrix {
     pub fn n(&self) -> usize {
         match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
     }
-
     pub fn n_cols(&self) -> usize {
         match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
     }
@@ -521,10 +299,10 @@ impl PersistentCompactIntMatrix {
         }
     }
 
-    pub fn col_view(&self, c: usize) -> IntColView<'_> {
+    pub fn col_view(&self, c: usize) -> IntSliceView<'_> {
         match self {
-            Self::Columnar(m) => IntColView(IntColViewInner::Columnar(m.col(c))),
-            Self::Packed(m)   => IntColView(IntColViewInner::Packed(m.col_slice(c))),
+            Self::Columnar(m) => m.col(c).view(),
+            Self::Packed(m)   => m.col_view(c),
         }
     }
 
@@ -535,29 +313,18 @@ impl PersistentCompactIntMatrix {
         }
     }
 
-    pub fn col_as_memory(&self, c: usize) -> MemoryIntVec {
-        match self {
-            Self::Columnar(m) => MemoryIntVec::from(m.col(c)),
-            Self::Packed(m)   => m.col_as_memory(c),
-        }
-    }
-
     pub fn row(&self, slot: usize) -> Box<[u32]> {
         match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
     }
-
     pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
         match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
     }
-
     pub fn sum(&self) -> Array1<u64> {
         match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
     }
-
     pub fn count_nonzero(&self) -> Array1<u64> {
         match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
     }
-
     pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
         match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
     }
@@ -576,7 +343,6 @@ impl PersistentCompactIntMatrix {
     pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
         match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
     }
-
     pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
         ColumnarCompactIntMatrix::append_column(dir, value_of)
     }
@@ -592,12 +358,12 @@ impl ColumnWeights for PersistentCompactIntMatrix {
 }
 
 impl CountPartials for PersistentCompactIntMatrix {
-    fn partial_bray(&self) -> Array2<u64>                        { self.partial_bray_dist_matrix() }
-    fn partial_euclidean(&self) -> Array2<f64>                   { self.partial_euclidean_dist_matrix() }
+    fn partial_bray(&self) -> Array2<u64>                                 { self.partial_bray_dist_matrix() }
+    fn partial_euclidean(&self) -> Array2<f64>                            { self.partial_euclidean_dist_matrix() }
     fn partial_threshold_jaccard(&self, t: u32) -> (Array2<u64>, Array2<u64>) { self.partial_threshold_jaccard_dist_matrix(t) }
-    fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64>     { self.partial_relfreq_bray_dist_matrix(g) }
-    fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_euclidean_dist_matrix(g) }
-    fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64>         { self.partial_hellinger_euclidean_dist_matrix(g) }
+    fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64>        { self.partial_relfreq_bray_dist_matrix(g) }
+    fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64>   { self.partial_relfreq_euclidean_dist_matrix(g) }
+    fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64>           { self.partial_hellinger_euclidean_dist_matrix(g) }
 }
 
 // ── Builder ───────────────────────────────────────────────────────────────────
@@ -613,16 +379,13 @@ impl PersistentCompactIntMatrixBuilder {
         fs::create_dir_all(dir)?;
         Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
     }
-
     pub fn n(&self)      -> usize { self.n }
     pub fn n_cols(&self) -> usize { self.n_cols }
-
     pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
         let path = col_path(&self.dir, self.n_cols);
         self.n_cols += 1;
         PersistentCompactIntVecBuilder::new(self.n, &path)
     }
-
     pub fn close(self) -> io::Result<()> {
         MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
     }
@@ -634,30 +397,20 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
     fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
         let n = self.n();
         if g.indices.len() < 255 {
-            // Fast path: counts fit in u8 — accumulate directly into raw bytes.
             let mut builder = TempCompactIntVecBuilder::new(n)?;
-            {
-                let primary = builder.primary_bytes_mut();
-                for &c in &g.indices {
-                    let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
-                    inc_primary_bits(primary, &mask);
-                }
+            for &c in &g.indices {
+                builder.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
             }
             builder.freeze()
         } else {
-            // Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks.
             let mut result = TempCompactIntVecBuilder::new(n)?;
             for chunk in g.indices.chunks(254) {
-                let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
-                {
-                    let primary = chunk_builder.primary_bytes_mut();
-                    for &c in chunk {
-                        let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
-                        inc_primary_bits(primary, &mask);
-                    }
+                let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
+                for &c in chunk {
+                    chunk_b.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
                 }
-                let chunk_frozen = chunk_builder.freeze()?;
-                IntSliceMut::add(&mut result, &chunk_frozen);
+                let frozen = chunk_b.freeze()?;
+                result.add(frozen.view());
             }
             result.freeze()
         }
@@ -666,10 +419,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
     fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
         let n = self.n();
         let mut result = TempCompactIntVecBuilder::new(n)?;
-        for &c in &g.indices {
-            let view = self.col_view(c);
-            IntSliceMut::add(&mut result, &view);
-        }
+        for &c in &g.indices { result.add(self.col_view(c)); }
         result.freeze()
     }
 
@@ -677,8 +427,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
         let n = self.n();
         let mut result = TempBitVecBuilder::new(n)?;
         for &c in &g.indices {
-            let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
-            result.or(&mask);
+            result.or_where(self.col_view(c), |v| v >= threshold);
         }
         result.freeze()
     }
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index 6625ab6..ddd3bdc 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -5,26 +5,24 @@ mod colgroup;
 mod format;
 mod intmatrix;
 mod layer_meta;
-mod memoryintvec;
-mod memoryvec;
 mod meta;
 mod reader;
 mod tempbitvec;
 mod tempintvec;
+mod views;
 pub mod traits;
 
 pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
-pub use bitmatrix::{BitColView, PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
+pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
 pub use builder::PersistentCompactIntVecBuilder;
 pub use colgroup::{ColGroup, MatrixGroupOps};
-pub use intmatrix::{IntColView, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
+pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
-pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
-pub use memoryvec::MemoryBitVec;
 pub use reader::PersistentCompactIntVec;
 pub use tempbitvec::TempBitVec;
 pub use tempintvec::TempCompactIntVec;
-pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
+pub use traits::{BitPartials, ColumnWeights, CountPartials};
+pub use views::{BitSliceView, IntSliceView};
 
 #[cfg(test)]
 #[path = "tests/mod.rs"]
diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs
deleted file mode 100644
index d5ca280..0000000
--- a/src/obicompactvec/src/memoryintvec.rs
+++ /dev/null
@@ -1,186 +0,0 @@
-use std::collections::HashMap;
-use std::io;
-use std::ops::{Add, AddAssign, Sub, SubAssign};
-use std::path::Path;
-
-use crate::builder::PersistentCompactIntVecBuilder;
-use crate::format::{byte_count_nonzero, byte_sum};
-use crate::traits::{IntSlice, IntSliceMut};
-
-// ── MemoryIntVec ──────────────────────────────────────────────────────────────
-
-#[derive(Clone)]
-pub struct MemoryIntVec {
-    primary:  Vec<u8>,
-    overflow: HashMap<usize, u32>,
-    n:        usize,
-}
-
-impl MemoryIntVec {
-    pub fn new(n: usize) -> Self {
-        Self { primary: vec![0u8; n], overflow: HashMap::new(), n }
-    }
-
-    pub fn len(&self) -> usize { self.n }
-    pub fn is_empty(&self) -> bool { self.n == 0 }
-
-    /// Construct directly from a pre-built primary array (no overflow — all values < 255).
-    pub(crate) fn from_primary(primary: Vec<u8>) -> Self {
-        let n = primary.len();
-        Self { primary, overflow: HashMap::new(), n }
-    }
-
-    pub(crate) fn from_primary_and_overflow(primary: Vec<u8>, overflow: HashMap<usize, u32>) -> Self {
-        let n = primary.len();
-        Self { primary, overflow, n }
-    }
-
-    pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary }
-    pub(crate) fn overflow_map(&self) -> &HashMap<usize, u32> { &self.overflow }
-
-    pub fn get(&self, slot: usize) -> u32 {
-        match self.primary[slot] {
-            255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
-            v   => v as u32,
-        }
-    }
-
-    pub fn sum(&self) -> u64 {
-        byte_sum(&self.primary, self.overflow.values().copied())
-    }
-
-    pub fn count_nonzero(&self) -> u64 {
-        byte_count_nonzero(&self.primary)
-    }
-
-    pub fn filled(n: usize, value: u32) -> Self {
-        if value < 255 {
-            Self { primary: vec![value as u8; n], overflow: HashMap::new(), n }
-        } else {
-            Self { primary: vec![255u8; n], overflow: (0..n).map(|i| (i, value)).collect(), n }
-        }
-    }
-
-    pub fn iter(&self) -> MemoryIntIter<'_> {
-        MemoryIntIter { vec: self, slot: 0 }
-    }
-
-    /// Write to disk and return a writable builder at `path`.
-    pub fn persist(&self, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
-        PersistentCompactIntVecBuilder::from_memory(self, path)
-    }
-}
-
-// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
-
-impl IntSlice for MemoryIntVec {
-    fn len(&self) -> usize { self.n }
-    fn get(&self, slot: usize) -> u32 { self.get(slot) }
-    fn primary_bytes(&self) -> &[u8] { &self.primary }
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
-        self.overflow.iter().map(|(&k, &v)| (k, v))
-    }
-    fn iter(&self) -> impl Iterator<Item = u32> + '_ { self.iter() }
-    fn sum(&self) -> u64 { self.sum() }
-    fn count_nonzero(&self) -> u64 { self.count_nonzero() }
-}
-
-impl IntSliceMut for MemoryIntVec {
-    fn set(&mut self, slot: usize, value: u32) {
-        if value < 255 {
-            self.primary[slot] = value as u8;
-            self.overflow.remove(&slot);
-        } else {
-            self.primary[slot] = 255;
-            self.overflow.insert(slot, value);
-        }
-    }
-    fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.primary }
-    fn clear_overflow(&mut self) { self.overflow.clear(); }
-}
-
-// ── From conversions ──────────────────────────────────────────────────────────
-
-impl MemoryIntVec {
-    /// Bulk copy from another `MemoryIntVec`: memcpy for the primary bytes,
-    /// clone for the overflow map.
-    pub fn copy_from_memory(&mut self, src: &MemoryIntVec) {
-        assert_eq!(self.n, src.n, "MemoryIntVec length mismatch");
-        self.primary.copy_from_slice(&src.primary);
-        self.overflow = src.overflow.clone();
-    }
-}
-
-impl<S: IntSlice> From<&S> for MemoryIntVec {
-    fn from(src: &S) -> Self {
-        Self::from_primary_and_overflow(
-            src.primary_bytes().to_vec(),
-            src.overflow_entries().collect(),
-        )
-    }
-}
-
-// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
-
-impl<B: IntSlice> Add<&B> for MemoryIntVec {
-    type Output = MemoryIntVec;
-    fn add(mut self, rhs: &B) -> MemoryIntVec { IntSliceMut::add(&mut self, rhs); self }
-}
-
-impl<B: IntSlice> Sub<&B> for MemoryIntVec {
-    type Output = MemoryIntVec;
-    fn sub(mut self, rhs: &B) -> MemoryIntVec { self.diff(rhs); self }
-}
-
-// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
-
-impl<B: IntSlice> Add<&B> for &MemoryIntVec {
-    type Output = MemoryIntVec;
-    fn add(self, rhs: &B) -> MemoryIntVec { self.clone().add(rhs) }
-}
-
-impl<B: IntSlice> Sub<&B> for &MemoryIntVec {
-    type Output = MemoryIntVec;
-    fn sub(self, rhs: &B) -> MemoryIntVec { self.clone().sub(rhs) }
-}
-
-// ── std::ops — in-place assign ────────────────────────────────────────────────
-
-impl<B: IntSlice> AddAssign<&B> for MemoryIntVec {
-    fn add_assign(&mut self, rhs: &B) { IntSliceMut::add(self, rhs); }
-}
-
-impl<B: IntSlice> SubAssign<&B> for MemoryIntVec {
-    fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); }
-}
-
-// ── Iterator ──────────────────────────────────────────────────────────────────
-
-pub struct MemoryIntIter<'a> {
-    vec: &'a MemoryIntVec,
-    slot: usize,
-}
-
-impl Iterator for MemoryIntIter<'_> {
-    type Item = u32;
-
-    fn next(&mut self) -> Option<u32> {
-        if self.slot >= self.vec.n { return None; }
-        let v = self.vec.get(self.slot);
-        self.slot += 1;
-        Some(v)
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let rem = self.vec.n - self.slot;
-        (rem, Some(rem))
-    }
-}
-
-impl ExactSizeIterator for MemoryIntIter<'_> {}
-
-impl<'a> IntoIterator for &'a MemoryIntVec {
-    type Item = u32;
-    type IntoIter = MemoryIntIter<'a>;
-    fn into_iter(self) -> MemoryIntIter<'a> { self.iter() }
-}
diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs
deleted file mode 100644
index fef0960..0000000
--- a/src/obicompactvec/src/memoryvec.rs
+++ /dev/null
@@ -1,138 +0,0 @@
-use std::io;
-use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not};
-use std::path::Path;
-
-use crate::bitvec::{BitIter, PersistentBitVecBuilder, n_words};
-use crate::traits::{BitSlice, BitSliceMut};
-
-// ── MemoryBitVec ──────────────────────────────────────────────────────────────
-
-#[derive(Clone)]
-pub struct MemoryBitVec {
-    words: Vec<u64>,
-    n: usize,
-}
-
-impl MemoryBitVec {
-    pub fn new(n: usize) -> Self {
-        Self { words: vec![0u64; n_words(n)], n }
-    }
-
-    pub fn ones(n: usize) -> Self {
-        let rem = n % 64;
-        let mut words = vec![u64::MAX; n_words(n)];
-        if rem != 0 {
-            if let Some(last) = words.last_mut() { *last = (1u64 << rem) - 1; }
-        }
-        Self { words, n }
-    }
-
-    pub(crate) fn from_words(words: Vec<u64>, n: usize) -> Self {
-        Self { words, n }
-    }
-
-    pub fn len(&self) -> usize { self.n }
-    pub fn is_empty(&self) -> bool { self.n == 0 }
-
-    pub fn get(&self, slot: usize) -> bool {
-        (self.words[slot >> 6] >> (slot & 63)) & 1 != 0
-    }
-
-    /// Write to disk and return a writable builder positioned at the same path.
-    pub fn persist(&self, path: &Path) -> io::Result<PersistentBitVecBuilder> {
-        let mut b = PersistentBitVecBuilder::new(self.n, path)?;
-        b.copy_from(self);
-        Ok(b)
-    }
-}
-
-// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
-
-impl BitSlice for MemoryBitVec {
-    fn len(&self) -> usize { self.n }
-    fn words(&self) -> &[u64] { &self.words }
-}
-
-impl BitSliceMut for MemoryBitVec {
-    fn words_mut(&mut self) -> &mut [u64] { &mut self.words }
-}
-
-// ── From conversions ──────────────────────────────────────────────────────────
-
-impl<S: BitSlice> From<&S> for MemoryBitVec {
-    fn from(src: &S) -> Self {
-        Self { words: src.words().to_vec(), n: src.len() }
-    }
-}
-
-// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
-
-impl<B: BitSlice> BitAnd<&B> for MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn bitand(mut self, rhs: &B) -> MemoryBitVec { self.and(rhs); self }
-}
-
-impl<B: BitSlice> BitOr<&B> for MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn bitor(mut self, rhs: &B) -> MemoryBitVec { self.or(rhs); self }
-}
-
-impl<B: BitSlice> BitXor<&B> for MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn bitxor(mut self, rhs: &B) -> MemoryBitVec { self.xor(rhs); self }
-}
-
-impl Not for MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn not(mut self) -> MemoryBitVec { BitSliceMut::not(&mut self); self }
-}
-
-// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
-
-impl<B: BitSlice> BitAnd<&B> for &MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn bitand(self, rhs: &B) -> MemoryBitVec { self.clone().bitand(rhs) }
-}
-
-impl<B: BitSlice> BitOr<&B> for &MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn bitor(self, rhs: &B) -> MemoryBitVec { self.clone().bitor(rhs) }
-}
-
-impl<B: BitSlice> BitXor<&B> for &MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn bitxor(self, rhs: &B) -> MemoryBitVec { self.clone().bitxor(rhs) }
-}
-
-impl Not for &MemoryBitVec {
-    type Output = MemoryBitVec;
-    fn not(self) -> MemoryBitVec { !self.clone() }
-}
-
-// ── std::ops — in-place assign ────────────────────────────────────────────────
-
-impl<B: BitSlice> BitAndAssign<&B> for MemoryBitVec {
-    fn bitand_assign(&mut self, rhs: &B) { self.and(rhs); }
-}
-
-impl<B: BitSlice> BitOrAssign<&B> for MemoryBitVec {
-    fn bitor_assign(&mut self, rhs: &B) { self.or(rhs); }
-}
-
-impl<B: BitSlice> BitXorAssign<&B> for MemoryBitVec {
-    fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); }
-}
-
-// ── Iterator ──────────────────────────────────────────────────────────────────
-
-impl MemoryBitVec {
-    pub fn iter(&self) -> BitIter<'_> {
-        BitIter { words: &self.words, slot: 0, n: self.n }
-    }
-}
-
-impl<'a> IntoIterator for &'a MemoryBitVec {
-    type Item = bool;
-    type IntoIter = BitIter<'a>;
-    fn into_iter(self) -> BitIter<'a> { self.iter() }
-}
diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs
index af7d05c..f3b1dd6 100644
--- a/src/obicompactvec/src/reader.rs
+++ b/src/obicompactvec/src/reader.rs
@@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
 use memmap2::Mmap;
 
 use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
+use crate::views::IntSliceView;
 
 pub struct PersistentCompactIntVec {
     mmap: Mmap,
@@ -18,97 +19,60 @@ pub struct PersistentCompactIntVec {
 }
 
 impl PersistentCompactIntVec {
-    /// Opens a persistent compact int vector from the given path.
     pub fn open(path: &Path) -> io::Result<Self> {
         let mmap = unsafe { Mmap::map(&File::open(path)?)? };
 
         if mmap.len() < HEADER_SIZE {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidData,
-                "PCIV file too short",
-            ));
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short"));
         }
         if &mmap[0..4] != &MAGIC {
             return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
         }
 
-        let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
+        let n          = u64::from_le_bytes(mmap[8..16].try_into().unwrap())  as usize;
         let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
-        let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
-        let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
+        let n_index    = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
+        let step       = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
 
         let primary_offset = HEADER_SIZE;
-        let data_offset = primary_offset + n;
-        let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
+        let data_offset    = primary_offset + n;
+        let index_offset   = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
 
         let mut index = Vec::with_capacity(n_index);
         for i in 0..n_index {
             index.push(parse_index_entry(&mmap, index_offset, i));
         }
 
-        Ok(Self {
-            mmap,
-            n,
-            n_overflow,
-            step,
-            index,
-            primary_offset,
-            data_offset,
-            path: path.to_path_buf(),
-        })
+        Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset, path: path.to_path_buf() })
     }
 
-    /// Returns the path of the compact int vector file.
-    pub fn path(&self) -> &Path {
-        &self.path
-    }
+    pub fn path(&self) -> &Path { &self.path }
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }
 
-    /// Returns the length of the compact int vector.
-    pub fn len(&self) -> usize {
-        self.n
-    }
-
-    /// Returns whether the compact int vector is empty.
-    pub fn is_empty(&self) -> bool {
-        self.n == 0
-    }
-
-    /// Returns the value at the given slot.
     pub fn get(&self, slot: usize) -> u32 {
         match self.mmap[self.primary_offset + slot] {
             255 => self.overflow_get(slot),
-            v => v as u32,
+            v   => v as u32,
         }
     }
 
-    /// Returns the value at the given slot from the overflow region.
     fn overflow_get(&self, slot: usize) -> u32 {
-        let pos_start;
-        let pos_end;
-
-        if self.step == 0 {
-            pos_start = 0;
-            pos_end = self.n_overflow;
+        let (pos_start, pos_end) = if self.step == 0 {
+            (0, self.n_overflow)
         } else {
-            let i = self
-                .index
-                .partition_point(|&(s, _)| s <= slot)
-                .saturating_sub(1);
-            pos_start = self.index[i].1;
-            pos_end = if i + 1 < self.index.len() {
-                self.index[i + 1].1
-            } else {
-                self.n_overflow
-            };
-        }
-
+            let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
+            let start = self.index[i].1;
+            let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
+            (start, end)
+        };
         let mut lo = pos_start;
         let mut hi = pos_end;
         while lo < hi {
             let mid = lo + (hi - lo) / 2;
             match self.data_slot(mid).cmp(&slot) {
-                std::cmp::Ordering::Equal => return self.data_value(mid),
-                std::cmp::Ordering::Less => lo = mid + 1,
+                std::cmp::Ordering::Equal   => return self.data_value(mid),
+                std::cmp::Ordering::Less    => lo = mid + 1,
                 std::cmp::Ordering::Greater => hi = mid,
             }
         }
@@ -116,14 +80,12 @@ impl PersistentCompactIntVec {
     }
 
     #[inline]
-    /// Returns the slot at the given index in the overflow region.
     fn data_slot(&self, i: usize) -> usize {
         let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
         u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
     }
 
     #[inline]
-    /// Returns the value at the given index in the overflow region.
     fn data_value(&self, i: usize) -> u32 {
         let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
         u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
@@ -139,121 +101,70 @@ impl PersistentCompactIntVec {
         byte_count_nonzero(primary)
     }
 
-    #[inline]
-    /// Returns the Bray-Curtis distance between two compact int vectors.
+    /// Zero-copy [`IntSliceView`]: the primary and overflow slices borrow straight from the mmap.
+    pub fn view(&self) -> IntSliceView<'_> {
+        let overflow_len = self.n_overflow * OVERFLOW_ENTRY_SIZE;
+        let primary = &self.mmap[self.primary_offset..][..self.n];
+        IntSliceView::new(primary, &self.mmap[self.data_offset..][..overflow_len], self.n_overflow, self.n)
+    }
+
+    /// Iterator yielding `u32` values from slot 0 onward.
+    /// Both the slot cursor and the overflow cursor start at 0.
+    pub fn iter(&self) -> Iter<'_> { Iter { pciv: self, slot: 0, overflow_pos: 0 } }
+
+    // ── Distance methods ──────────────────────────────────────────────────────
+
     pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
         let sum_min = self.partial_bray_dist(other);
         let denom = self.sum() + other.sum();
-        if denom == 0 {
-            return 0.0;
-        }
-        1.0 - 2.0 * sum_min as f64 / denom as f64
+        if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
     }
 
-    /// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis.
-    /// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`.
     pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
         assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
-            .map(|(a, b)| a.min(b) as u64)
-            .sum()
+        self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
     }
 
-    /// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
-    ///
-    /// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts.
     pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
         assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_a = self.sum() as f64;
-        let sum_b = other.sum() as f64;
-        if sum_a == 0.0 && sum_b == 0.0 {
-            return 0.0;
-        }
-        let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b);
-        1.0 - sum_min
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; }
+        1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
     }
 
-    /// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors.
-    ///
-    /// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency
-    /// Bray-Curtis distance over a set of vector pairs.
-    ///
-    /// Arguments:
-    /// - `other`: the other compact int vector to compare with
-    /// - `sum_a`: the sum of the first vector's counts
-    /// - `sum_b`: the sum of the second vector's counts
-    ///
-    /// Returns the sum of the minimum relative frequencies at each index.
-    pub fn partial_relfreq_bray_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        sum_a: f64,
-        sum_b: f64,
-    ) -> f64 {
+    pub fn partial_relfreq_bray_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
         assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_min: f64 = self
-            .iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
             .map(|(a, b)| {
                 let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
                 let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
                 pa.min(pb)
             })
-            .sum();
-        sum_min
+            .sum()
     }
 
-    /// Returns the euclidean distance between two compact int vectors.
     pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
         self.partial_euclidean_dist(other).sqrt()
     }
 
-    /// Returns the partial euclidean distance between two compact int vectors.
-    ///
-    /// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance
-    /// over a set of vector pairs.
-    ///
-    /// The result is the sum of the squared differences between corresponding elements of the two
-    /// vectors.
     pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
         assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
-            .map(|(a, b)| {
-                let d = a as f64 - b as f64;
-                d * d
-            })
+        self.iter().zip(other.iter())
+            .map(|(a, b)| { let d = a as f64 - b as f64; d * d })
             .sum()
     }
 
-    /// Returns the relative frequency euclidean distance between two compact int vectors.
-    ///
-    /// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts.
     pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_a = self.sum() as f64;
-        let sum_b = other.sum() as f64;
-        if sum_a == 0.0 && sum_b == 0.0 {
-            return 0.0;
-        }
-        self.partial_relfreq_euclidean_dist(other, sum_a, sum_b)
-            .sqrt()
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; }
+        self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
     }
 
-    /// Returns the partial relative frequency euclidean distance between two compact int vectors.
-    ///
-    /// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency
-    /// euclidean distance over a set of vector pairs.
-    pub fn partial_relfreq_euclidean_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        sum_a: f64,
-        sum_b: f64,
-    ) -> f64 {
+    pub fn partial_relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
         assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
             .map(|(a, b)| {
                 let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
                 let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
@@ -263,46 +174,19 @@ impl PersistentCompactIntVec {
             .sum()
     }
 
-    /// Returns the Euclidean distance between two compact int vectors using the Hellinger transform.
-    ///
-    /// The Hellinger transform is applied to the raw counts of each vector, and the result is
-    /// the Euclidean distance between the transformed vectors. The Hellinger transform is defined
-    /// as the square root of the relative frequencies.
     pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_a = self.sum() as f64;
-        let sum_b = other.sum() as f64;
-        if sum_a == 0.0 && sum_b == 0.0 {
-            return 0.0;
-        }
-        self.partial_hellinger_euclidean_dist(other, sum_a, sum_b)
-            .sqrt()
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; }
+        self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
     }
 
-    /// Returns the partial Hellinger Euclidean distance between two compact int vectors.
-    ///
-    /// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger
-    /// Euclidean distance over a set of vector pairs.
-    pub fn partial_hellinger_euclidean_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        sum_a: f64,
-        sum_b: f64,
-    ) -> f64 {
+    pub fn partial_hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
         assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
             .map(|(a, b)| {
-                let pa = if sum_a > 0.0 {
-                    (a as f64 / sum_a).sqrt()
-                } else {
-                    0.0
-                };
-                let pb = if sum_b > 0.0 {
-                    (b as f64 / sum_b).sqrt()
-                } else {
-                    0.0
-                };
+                let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
+                let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
                 let d = pa - pb;
                 d * d
             })
@@ -314,22 +198,13 @@ impl PersistentCompactIntVec {
     }
 
     pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
-        assert_eq!(self.n, other.len(), "length mismatch");
         let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold);
-        if union == 0 {
-            return 0.0;
-        }
-        1.0 - intersection as f64 / union as f64
+        if union == 0 { 0.0 } else { 1.0 - intersection as f64 / union as f64 }
     }
 
-    pub fn partial_threshold_jaccard_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        threshold: u32,
-    ) -> (u64, u64) {
+    pub fn partial_threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> (u64, u64) {
         assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
             .fold((0u64, 0u64), |(inter, uni), (a, b)| {
                 let ap = a >= threshold;
                 let bp = b >= threshold;
@@ -340,41 +215,12 @@ impl PersistentCompactIntVec {
     pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
         self.threshold_jaccard_dist(other, 1)
     }
-
-    pub fn iter(&self) -> Iter<'_> {
-        Iter {
-            pciv: self,
-            slot: 0,
-            overflow_pos: 0,
-        }
-    }
-}
-
-// ── IntSlice impl ─────────────────────────────────────────────────────────────
-
-use crate::traits::IntSlice;
-
-impl IntSlice for PersistentCompactIntVec {
-    fn len(&self) -> usize { self.n }
-    fn get(&self, slot: usize) -> u32 { self.get(slot) }
-    fn primary_bytes(&self) -> &[u8] {
-        &self.mmap[self.primary_offset..self.primary_offset + self.n]
-    }
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
-        (0..self.n_overflow).map(|i| (self.data_slot(i), self.data_value(i)))
-    }
-    fn iter(&self) -> impl Iterator<Item = u32> + '_ { self.iter() }
-    fn sum(&self) -> u64 { self.sum() }
-    fn count_nonzero(&self) -> u64 { self.count_nonzero() }
 }
 
 impl<'a> IntoIterator for &'a PersistentCompactIntVec {
     type Item = u32;
     type IntoIter = Iter<'a>;
-
-    fn into_iter(self) -> Iter<'a> {
-        self.iter()
-    }
+    fn into_iter(self) -> Iter<'a> { self.iter() }
 }
 
 pub struct Iter<'a> {
@@ -389,9 +235,7 @@ impl Iterator for Iter<'_> {
     type Item = u32;
 
     fn next(&mut self) -> Option<u32> {
-        if self.slot >= self.pciv.n {
-            return None;
-        }
+        if self.slot >= self.pciv.n { return None; }
         let v = self.pciv.mmap[self.pciv.primary_offset + self.slot];
         self.slot += 1;
         if v < 255 {
diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs
index 3945075..3024ffb 100644
--- a/src/obicompactvec/src/tempbitvec.rs
+++ b/src/obicompactvec/src/tempbitvec.rs
@@ -4,43 +4,48 @@ use std::path::Path;
 use tempfile::TempDir;
 
 use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
-use crate::traits::{BitSlice, BitSliceMut};
+use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
 
 // ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
 
-/// A bit vector backed by a temporary file.
-/// Implements [`BitSlice`]; the file is deleted when this value is dropped.
-/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
 pub struct TempBitVec {
-    vec:   PersistentBitVec,
+    vec: PersistentBitVec,
     // Dropped after `vec` (field order), so the mmap is released before the
     // temp directory is deleted.
     _temp: TempDir,
 }
 
 impl TempBitVec {
-    /// Copy to a permanent file and open as a [`PersistentBitVec`].
     pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
         std::fs::copy(self.vec.path(), path)?;
         PersistentBitVec::open(path)
     }
 
-    pub fn len(&self)      -> usize { self.vec.len() }
-    pub fn is_empty(&self) -> bool  { self.vec.is_empty() }
-}
-
-impl BitSlice for TempBitVec {
-    fn len(&self)   -> usize  { self.vec.len() }
-    fn words(&self) -> &[u64] { self.vec.words() }
+    /// Length of the vector, forwarded from the wrapped
+    /// [`PersistentBitVec`].
+    pub fn len(&self) -> usize { self.vec.len() }
+    /// `true` when the wrapped [`PersistentBitVec`] reports
+    /// itself as empty.
+    pub fn is_empty(&self) -> bool { self.vec.is_empty() }
+    /// Bit stored at `slot`, read through the wrapped
+    /// [`PersistentBitVec`].
+    pub fn get(&self, slot: usize) -> bool { self.vec.get(slot) }
+    /// Result of `count_ones` on the wrapped [`PersistentBitVec`]
+    /// (presumably the number of set bits — confirm against that type).
+    pub fn count_ones(&self) -> u64 { self.vec.count_ones() }
+    /// [`BitSliceView`] obtained from the wrapped
+    /// [`PersistentBitVec`].
+    pub fn view(&self) -> BitSliceView<'_> { self.vec.view() }
+    /// Iterator produced by calling `iter()` on the
+    /// [`BitSliceView`] returned by [`view`](Self::view).
+    pub fn iter(&self) -> BitSliceIter<'_> { self.view().iter() }
 }
 
 // ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
 
-/// Writable builder for a [`TempBitVec`].  `pub(crate)` — callers receive
-/// only the frozen result via [`freeze`](Self::freeze).
 pub(crate) struct TempBitVecBuilder {
     builder: PersistentBitVecBuilder,
-    temp:    TempDir,
+    temp: TempDir,
 }
 
 impl TempBitVecBuilder {
@@ -51,19 +56,35 @@ impl TempBitVecBuilder {
         Ok(Self { builder, temp })
     }
 
-    /// Finalize writes and return a frozen, read-only [`TempBitVec`].
     pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
         let Self { builder, temp } = self;
         let vec = builder.finish()?;
         Ok(TempBitVec { vec, _temp: temp })
     }
-}
 
-impl BitSlice for TempBitVecBuilder {
-    fn len(&self)   -> usize  { self.builder.len() }
-    fn words(&self) -> &[u64] { self.builder.words() }
-}
+    /// Writes `value` into bit `slot` of the underlying builder.
+    /// Crate-private, like the builder type itself.
+    pub(crate) fn set(&mut self, slot: usize, value: bool) { self.builder.set(slot, value); }
+    /// [`BitSliceView`] of the vector being built, obtained from
+    /// the underlying builder.
+    pub(crate) fn view(&self) -> BitSliceView<'_> { self.builder.view() }
 
-impl BitSliceMut for TempBitVecBuilder {
-    fn words_mut(&mut self) -> &mut [u64] { self.builder.words_mut() }
+    /// Forwards to the underlying builder's `or`, combining `other` into this vector in place.
+    /// Crate-private, like the builder type itself.
+    pub(crate) fn or(&mut self, other: BitSliceView<'_>) { self.builder.or(other); }
+
+    /// Sets `self[slot]` wherever `pred(col[slot])` holds (never clears a bit): primary bytes below 255 first, then overflow entries.
+    pub(crate) fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        let primary = col.primary_bytes();
+        for slot in 0..col.len() {
+            if primary[slot] < 255 && pred(primary[slot] as u32) {
+                self.builder.set(slot, true);
+            }
+        }
+        for (slot, val) in col.overflow_entries() {
+            if pred(val) {
+                self.builder.set(slot, true);
+            }
+        }
+    }
 }
diff --git a/src/obicompactvec/src/tempintvec.rs b/src/obicompactvec/src/tempintvec.rs
index ced3cef..e5ff848 100644
--- a/src/obicompactvec/src/tempintvec.rs
+++ b/src/obicompactvec/src/tempintvec.rs
@@ -5,13 +5,10 @@ use tempfile::TempDir;
 
 use crate::builder::PersistentCompactIntVecBuilder;
 use crate::reader::PersistentCompactIntVec;
-use crate::traits::{IntSlice, IntSliceMut};
+use crate::views::{BitSliceView, IntSliceView};
 
 // ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
 
-/// A compact int vector backed by a temporary file.
-/// Implements [`IntSlice`]; the file is deleted when this value is dropped.
-/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
 pub struct TempCompactIntVec {
     vec:   PersistentCompactIntVec,
     // Dropped after `vec` (field order), so the mmap is released before the
@@ -20,7 +17,6 @@ pub struct TempCompactIntVec {
 }
 
 impl TempCompactIntVec {
-    /// Copy to a permanent file and open as a [`PersistentCompactIntVec`].
     pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
         std::fs::copy(self.vec.path(), path)?;
         PersistentCompactIntVec::open(path)
@@ -28,23 +24,14 @@ impl TempCompactIntVec {
 
     pub fn len(&self)      -> usize { self.vec.len() }
     pub fn is_empty(&self) -> bool  { self.vec.is_empty() }
-}
-
-impl IntSlice for TempCompactIntVec {
-    fn len(&self)              -> usize { self.vec.len() }
-    fn get(&self, slot: usize) -> u32   { self.vec.get(slot) }
-    fn primary_bytes(&self)    -> &[u8] { self.vec.primary_bytes() }
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
-        self.vec.overflow_entries()
-    }
-    fn sum(&self)           -> u64 { self.vec.sum() }
-    fn count_nonzero(&self) -> u64 { self.vec.count_nonzero() }
+    pub fn get(&self, slot: usize) -> u32  { self.vec.get(slot) }
+    pub fn sum(&self)      -> u64   { self.vec.sum() }
+    pub fn view(&self)     -> IntSliceView<'_> { self.vec.view() }
+    pub fn iter(&self) -> crate::reader::Iter<'_> { self.vec.iter() }
 }
 
 // ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
 
-/// Writable builder for a [`TempCompactIntVec`].  `pub(crate)` — callers
-/// receive only the frozen result via [`freeze`](Self::freeze).
 pub(crate) struct TempCompactIntVecBuilder {
     builder: PersistentCompactIntVecBuilder,
     temp:    TempDir,
@@ -58,25 +45,47 @@ impl TempCompactIntVecBuilder {
         Ok(Self { builder, temp })
     }
 
-    /// Finalize writes and return a frozen, read-only [`TempCompactIntVec`].
     pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
         let Self { builder, temp } = self;
         let vec = builder.finish()?;
         Ok(TempCompactIntVec { vec, _temp: temp })
     }
-}
 
-impl IntSlice for TempCompactIntVecBuilder {
-    fn len(&self)              -> usize { self.builder.len() }
-    fn get(&self, slot: usize) -> u32   { self.builder.get(slot) }
-    fn primary_bytes(&self)    -> &[u8] { self.builder.primary_bytes() }
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
-        self.builder.overflow_entries()
+    // ── Delegation methods ────────────────────────────────────────────────────
+
+    pub(crate) fn n(&self) -> usize { self.builder.len() }
+
+    pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
+    pub(crate) fn get(&self, slot: usize) -> u32           { self.builder.get(slot) }
+
+    pub(crate) fn primary_bytes(&self)       -> &[u8]      { self.builder.primary_bytes() }
+    pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
+
+    pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) {
+        self.builder.inc_present(col);
     }
-}
 
-impl IntSliceMut for TempCompactIntVecBuilder {
-    fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
-    fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
-    fn clear_overflow(&mut self)                  { self.builder.clear_overflow(); }
+    pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
+        self.builder.inc_present_fast(col);
+    }
+
+    pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.inc_predicate(col, pred);
+    }
+
+    pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.inc_predicate_fast(col, pred);
+    }
+
+    pub(crate) fn add(&mut self, other: IntSliceView<'_>) {
+        self.builder.add(other);
+    }
+
+    pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) {
+        self.builder.mask_with(mask);
+    }
+
+    pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
+    pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
+    pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
 }
diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs
index 5d93222..7600ac3 100644
--- a/src/obicompactvec/src/tests/bitmatrix.rs
+++ b/src/obicompactvec/src/tests/bitmatrix.rs
@@ -1,7 +1,7 @@
 use tempfile::tempdir;
 
 use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
-use crate::traits::{BitPartials, BitSlice, BitSliceMut};
+use crate::traits::BitPartials;
 
 fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
     let n = cols.first().map_or(0, |c| c.len());
diff --git a/src/obicompactvec/src/tests/bitvec.rs b/src/obicompactvec/src/tests/bitvec.rs
index 7382e14..4669489 100644
--- a/src/obicompactvec/src/tests/bitvec.rs
+++ b/src/obicompactvec/src/tests/bitvec.rs
@@ -1,6 +1,5 @@
 use tempfile::tempdir;
 
-use crate::traits::{BitSlice, BitSliceMut};
 use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
 
 fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) {
@@ -78,7 +77,7 @@ fn op_and() {
     let dir = tempdir().unwrap();
     let path = dir.path().join("out.pbiv");
     let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
-    b.and(&rb);
+    b.and(rb.view());
     b.close().unwrap();
     let r = PersistentBitVec::open(&path).unwrap();
     assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
@@ -91,7 +90,7 @@ fn op_or() {
     let dir = tempdir().unwrap();
     let path = dir.path().join("out.pbiv");
     let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
-    b.or(&rb);
+    b.or(rb.view());
     b.close().unwrap();
     let r = PersistentBitVec::open(&path).unwrap();
     assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
@@ -104,7 +103,7 @@ fn op_xor() {
     let dir = tempdir().unwrap();
     let path = dir.path().join("out.pbiv");
     let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
-    b.xor(&rb);
+    b.xor(rb.view());
     b.close().unwrap();
     let r = PersistentBitVec::open(&path).unwrap();
     assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
diff --git a/src/obicompactvec/src/tests/colgroup.rs b/src/obicompactvec/src/tests/colgroup.rs
index 388508d..884450f 100644
--- a/src/obicompactvec/src/tests/colgroup.rs
+++ b/src/obicompactvec/src/tests/colgroup.rs
@@ -5,8 +5,7 @@ use crate::{
     PersistentBitMatrix, PersistentBitMatrixBuilder,
     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
 };
-use crate::traits::{BitSlice, BitSliceMut, IntSlice, IntSliceMut};
-use crate::{MemoryBitVec, MemoryIntVec};
+use crate::{PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
 
 // ── helpers ───────────────────────────────────────────────────────────────────
 
@@ -114,42 +113,52 @@ fn int_partial_group_any() {
 #[test]
 fn mask_with_zeros_selected_slots() {
     // count vec [10, 20, 30, 40], mask [T, F, T, F] → [10, 0, 30, 0]
-    let mut v = MemoryIntVec::new(4);
+    let dir = tempdir().unwrap();
+    let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap();
     v.set(0, 10); v.set(1, 20); v.set(2, 30); v.set(3, 40);
-    let mut mask = MemoryBitVec::new(4);
+    let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap();
     mask.set(0, true); mask.set(2, true);
-    v.mask_with(&mask);
-    assert_eq!(v.get(0), 10);
-    assert_eq!(v.get(1), 0);
-    assert_eq!(v.get(2), 30);
-    assert_eq!(v.get(3), 0);
+    v.mask_with(mask.view());
+    v.close().unwrap();
+    let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap();
+    assert_eq!(r.get(0), 10);
+    assert_eq!(r.get(1), 0);
+    assert_eq!(r.get(2), 30);
+    assert_eq!(r.get(3), 0);
 }
 
 #[test]
 fn mask_with_overflow_slot_zeroed() {
     // overflow slot (value 500) masked out → removed from overflow, primary=0
-    let mut v = MemoryIntVec::new(3);
+    let dir = tempdir().unwrap();
+    let mut v = PersistentCompactIntVecBuilder::new(3, &dir.path().join("v.pciv")).unwrap();
     v.set(0, 10); v.set(1, 500); v.set(2, 5);
-    let mut mask = MemoryBitVec::new(3);
+    let mut mask = PersistentBitVecBuilder::new(3, &dir.path().join("m.pbiv")).unwrap();
     mask.set(0, true); mask.set(2, true);  // slot 1 masked out
-    v.mask_with(&mask);
-    assert_eq!(v.get(0), 10);
-    assert_eq!(v.get(1), 0);
-    assert_eq!(v.get(2), 5);
-    let ov: Vec<_> = v.overflow_entries().collect();
+    v.mask_with(mask.view());
+    v.close().unwrap();
+    let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap();
+    assert_eq!(r.get(0), 10);
+    assert_eq!(r.get(1), 0);
+    assert_eq!(r.get(2), 5);
+    let ov: Vec<_> = r.view().overflow_entries().collect();
     assert!(ov.is_empty(), "overflow entry for masked-out slot should be gone");
 }
 
 #[test]
 fn mask_with_all_ones_is_noop() {
-    let mut v = MemoryIntVec::new(4);
+    let dir = tempdir().unwrap();
+    let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap();
     v.set(0, 300); v.set(1, 1); v.set(2, 0); v.set(3, 42);
-    let mask = MemoryBitVec::ones(4);
-    v.mask_with(&mask);
-    assert_eq!(v.get(0), 300);
-    assert_eq!(v.get(1), 1);
-    assert_eq!(v.get(2), 0);
-    assert_eq!(v.get(3), 42);
+    let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap();
+    mask.not();  // all bits → 1
+    v.mask_with(mask.view());
+    v.close().unwrap();
+    let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap();
+    assert_eq!(r.get(0), 300);
+    assert_eq!(r.get(1), 1);
+    assert_eq!(r.get(2), 0);
+    assert_eq!(r.get(3), 42);
 }
 
 // ── BitMatrix: partial_group_presence_count ───────────────────────────────────
diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs
index d9869aa..9abd7b5 100644
--- a/src/obicompactvec/src/tests/intmatrix.rs
+++ b/src/obicompactvec/src/tests/intmatrix.rs
@@ -1,7 +1,7 @@
 use tempfile::tempdir;
 
 use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
-use crate::traits::{CountPartials, IntSlice};
+use crate::traits::CountPartials;
 
 fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
     let n = cols.first().map_or(0, |c| c.len());
@@ -290,7 +290,7 @@ fn col_view_packed_matches_columnar() {
         }
         assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum");
         let mut ov_view: Vec<(usize, u32)> = col_view.overflow_entries().collect();
-        let mut ov_ref:  Vec<(usize, u32)> = col_ref.overflow_entries().collect();
+        let mut ov_ref:  Vec<(usize, u32)> = col_ref.view().overflow_entries().collect();
         ov_view.sort_unstable_by_key(|&(s, _)| s);
         ov_ref.sort_unstable_by_key(|&(s, _)| s);
         assert_eq!(ov_view, ov_ref, "col={c} overflow_entries");
diff --git a/src/obicompactvec/src/tests/memoryvec.rs b/src/obicompactvec/src/tests/memoryvec.rs
deleted file mode 100644
index 3fd4afb..0000000
--- a/src/obicompactvec/src/tests/memoryvec.rs
+++ /dev/null
@@ -1,484 +0,0 @@
-use tempfile::tempdir;
-
-use crate::traits::{BitSlice, BitSliceMut, BitToInt, IntSlice, IntSliceMut, IntToBit};
-use crate::{MemoryBitVec, MemoryIntVec, PersistentBitVec, PersistentBitVecBuilder};
-
-// ── MemoryBitVec ──────────────────────────────────────────────────────────────
-
-#[test]
-fn mbv_new_all_zero() {
-    let v = MemoryBitVec::new(10);
-    assert_eq!(v.len(), 10);
-    assert!(!(0..10).any(|s| v.get(s)));
-    assert_eq!(v.count_ones(), 0);
-}
-
-#[test]
-fn mbv_ones_all_set() {
-    let v = MemoryBitVec::ones(10);
-    assert!((0..10).all(|s| v.get(s)));
-    assert_eq!(v.count_ones(), 10);
-    assert_eq!(v.count_zeros(), 0);
-}
-
-#[test]
-fn mbv_ones_no_padding_leak() {
-    // 5 bits: padding bits in last word must stay 0
-    let v = MemoryBitVec::ones(5);
-    assert_eq!(v.words()[0], 0b11111);
-}
-
-#[test]
-fn mbv_set_get_roundtrip() {
-    let mut v = MemoryBitVec::new(64);
-    v.set(0, true);
-    v.set(63, true);
-    assert!(v.get(0));
-    assert!(!v.get(1));
-    assert!(v.get(63));
-    assert_eq!(v.count_ones(), 2);
-}
-
-#[test]
-fn mbv_and() {
-    let mut a = MemoryBitVec::new(4);
-    a.set(0, true); a.set(1, true);
-    let mut b = MemoryBitVec::new(4);
-    b.set(0, true); b.set(2, true);
-    a.and(&b);
-    assert!(a.get(0)); assert!(!a.get(1)); assert!(!a.get(2));
-}
-
-#[test]
-fn mbv_or() {
-    let mut a = MemoryBitVec::new(4);
-    a.set(0, true); a.set(1, true);
-    let mut b = MemoryBitVec::new(4);
-    b.set(0, true); b.set(2, true);
-    a.or(&b);
-    assert!(a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
-}
-
-#[test]
-fn mbv_xor() {
-    let mut a = MemoryBitVec::new(4);
-    a.set(0, true); a.set(1, true);
-    let mut b = MemoryBitVec::new(4);
-    b.set(0, true); b.set(2, true);
-    a.xor(&b);
-    assert!(!a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
-}
-
-#[test]
-fn mbv_not() {
-    let mut a = MemoryBitVec::new(4);
-    a.set(0, true); a.set(2, true);
-    a.not();
-    assert!(!a.get(0)); assert!(a.get(1)); assert!(!a.get(2)); assert!(a.get(3));
-}
-
-#[test]
-fn mbv_not_no_padding_leak() {
-    let mut v = MemoryBitVec::new(5);
-    v.not();
-    assert_eq!(v.count_ones(), 5);
-    assert_eq!(v.words()[0], 0b11111);
-}
-
-#[test]
-fn mbv_ops_chaining() {
-    let mut a = MemoryBitVec::ones(8);
-    let b = MemoryBitVec::new(8); // all zeros
-    a.and(&b).or(&b).not();
-    assert_eq!(a.count_ones(), 8);
-}
-
-#[test]
-fn mbv_std_ops_owned() {
-    let mut a = MemoryBitVec::new(4);
-    a.set(0, true); a.set(1, true);
-    let mut b = MemoryBitVec::new(4);
-    b.set(1, true); b.set(2, true);
-    let c = a & &b;
-    assert!(!c.get(0)); assert!(c.get(1)); assert!(!c.get(2));
-}
-
-#[test]
-fn mbv_std_ops_assign() {
-    let mut a = MemoryBitVec::new(4);
-    a.set(0, true); a.set(1, true);
-    let mut b = MemoryBitVec::new(4);
-    b.set(1, true); b.set(2, true);
-    a &= &b;
-    assert!(!a.get(0)); assert!(a.get(1));
-}
-
-#[test]
-fn mbv_from_persistent() {
-    let dir = tempdir().unwrap();
-    let path = dir.path().join("v.pbiv");
-    let mut builder = PersistentBitVecBuilder::new(4, &path).unwrap();
-    builder.set(1, true); builder.set(3, true);
-    builder.close().unwrap();
-    let pv = PersistentBitVec::open(&path).unwrap();
-    let mv = MemoryBitVec::from(&pv);
-    assert!(!mv.get(0)); assert!(mv.get(1)); assert!(!mv.get(2)); assert!(mv.get(3));
-}
-
-#[test]
-fn mbv_persist_roundtrip() {
-    let dir = tempdir().unwrap();
-    let path = dir.path().join("out.pbiv");
-    let mut v = MemoryBitVec::new(8);
-    v.set(2, true); v.set(5, true);
-    let builder = v.persist(&path).unwrap();
-    builder.close().unwrap();
-    let pv = PersistentBitVec::open(&path).unwrap();
-    assert!(pv.get(2)); assert!(pv.get(5));
-    assert_eq!(pv.count_ones(), 2);
-}
-
-// ── MemoryIntVec ──────────────────────────────────────────────────────────────
-
-#[test]
-fn miv_new_all_zero() {
-    let v = MemoryIntVec::new(10);
-    assert_eq!(v.len(), 10);
-    assert!((0..10).all(|s| v.get(s) == 0));
-}
-
-#[test]
-fn miv_set_get_roundtrip() {
-    let mut v = MemoryIntVec::new(4);
-    v.set(0, 42); v.set(3, 200);
-    assert_eq!(v.get(0), 42);
-    assert_eq!(v.get(1), 0);
-    assert_eq!(v.get(3), 200);
-}
-
-#[test]
-fn miv_overflow_roundtrip() {
-    let mut v = MemoryIntVec::new(4);
-    v.set(1, 1000);
-    assert_eq!(v.get(1), 1000);
-    assert_eq!(v.get(0), 0);
-}
-
-#[test]
-fn miv_inc_dec() {
-    let mut v = MemoryIntVec::new(4);
-    v.inc(2); v.inc(2); v.inc(2);
-    assert_eq!(v.get(2), 3);
-    v.dec(2);
-    assert_eq!(v.get(2), 2);
-}
-
-#[test]
-fn miv_dec_saturates_at_zero() {
-    let mut v = MemoryIntVec::new(4);
-    v.dec(0);
-    assert_eq!(v.get(0), 0);
-}
-
-#[test]
-fn miv_add_at() {
-    let mut v = MemoryIntVec::new(4);
-    v.add_at(1, 100); v.add_at(1, 200);
-    assert_eq!(v.get(1), 300);
-}
-
-#[test]
-fn miv_min_max() {
-    let mut a = MemoryIntVec::new(4);
-    a.set(0, 5); a.set(1, 2); a.set(2, 8);
-    let mut b = MemoryIntVec::new(4);
-    b.set(0, 3); b.set(1, 7); b.set(2, 8);
-    let mut c = MemoryIntVec::from(&a);
-    IntSliceMut::min(&mut c, &b);
-    assert_eq!(c.get(0), 3); assert_eq!(c.get(1), 2); assert_eq!(c.get(2), 8);
-    let mut d = MemoryIntVec::from(&a);
-    IntSliceMut::max(&mut d, &b);
-    assert_eq!(d.get(0), 5); assert_eq!(d.get(1), 7); assert_eq!(d.get(2), 8);
-}
-
-#[test]
-fn miv_add_diff() {
-    let mut a = MemoryIntVec::new(3);
-    a.set(0, 10); a.set(1, 5);
-    let mut b = MemoryIntVec::new(3);
-    b.set(0, 3); b.set(1, 8);
-    let mut c = MemoryIntVec::from(&a);
-    c.add(&b);
-    assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
-    let mut d = MemoryIntVec::from(&a);
-    d.diff(&b);
-    assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); // saturating sub
-}
-
-#[test]
-fn miv_std_ops() {
-    let mut a = MemoryIntVec::new(3);
-    a.set(0, 10); a.set(1, 5);
-    let mut b = MemoryIntVec::new(3);
-    b.set(0, 3); b.set(1, 8);
-    let c = &a + &b;
-    assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
-    let d = &a - &b;
-    assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0);
-}
-
-#[test]
-fn miv_from_persistent() {
-    use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
-    let dir = tempdir().unwrap();
-    let path = dir.path().join("v.pciv");
-    let mut b = PersistentCompactIntVecBuilder::new(4, &path).unwrap();
-    b.set(1, 42); b.set(3, 1000);
-    b.close().unwrap();
-    let pv = PersistentCompactIntVec::open(&path).unwrap();
-    let mv = MemoryIntVec::from(&pv);
-    assert_eq!(mv.get(0), 0); assert_eq!(mv.get(1), 42); assert_eq!(mv.get(3), 1000);
-}
-
-// ── Cross-type conversions ────────────────────────────────────────────────────
-
-#[test]
-fn to_bitvec_threshold() {
-    let mut v = MemoryIntVec::new(5);
-    v.set(0, 0); v.set(1, 1); v.set(2, 5); v.set(3, 10); v.set(4, 3);
-    let bv = v.to_bitvec(4); // > 4: slots 2 (5) and 3 (10) pass
-    assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
-    assert!(bv.get(3)); assert!(!bv.get(4));
-}
-
-#[test]
-fn to_presence() {
-    let mut v = MemoryIntVec::new(4);
-    v.set(1, 1); v.set(3, 100);
-    let bv = v.to_presence();
-    assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
-}
-
-#[test]
-fn to_intvec_roundtrip() {
-    let mut bv = MemoryBitVec::new(8);
-    bv.set(0, true); bv.set(3, true); bv.set(7, true);
-    let iv = bv.to_intvec();
-    assert_eq!(iv.get(0), 1); assert_eq!(iv.get(1), 0);
-    assert_eq!(iv.get(3), 1); assert_eq!(iv.get(7), 1);
-}
-
-#[test]
-fn to_intvec_word_boundary() {
-    // 65 bits: spans two words
-    let mut bv = MemoryBitVec::new(65);
-    bv.set(63, true); bv.set(64, true);
-    let iv = bv.to_intvec();
-    assert_eq!(iv.get(63), 1); assert_eq!(iv.get(64), 1); assert_eq!(iv.get(62), 0);
-}
-
-#[test]
-fn count_bits_accumulates() {
-    let mut count = MemoryIntVec::new(8);
-    let mut b1 = MemoryBitVec::new(8);
-    b1.set(0, true); b1.set(2, true);
-    let mut b2 = MemoryBitVec::new(8);
-    b2.set(0, true); b2.set(3, true);
-    let mut b3 = MemoryBitVec::new(8);
-    b3.set(2, true); b3.set(3, true);
-    count.count_bits(&b1).count_bits(&b2).count_bits(&b3);
-    assert_eq!(count.get(0), 2);
-    assert_eq!(count.get(2), 2);
-    assert_eq!(count.get(3), 2);
-    assert_eq!(count.get(1), 0);
-}
-
-#[test]
-fn count_bits_skips_zero_words() {
-    // Entire first word is zero — should not touch those slots
-    let mut count = MemoryIntVec::new(128);
-    let mut bv = MemoryBitVec::new(128);
-    bv.set(64, true); bv.set(127, true);
-    count.count_bits(&bv);
-    assert_eq!(count.get(0), 0);
-    assert_eq!(count.get(64), 1);
-    assert_eq!(count.get(127), 1);
-}
-
-// ── min / max / add / diff — overflow edge cases ──────────────────────────────
-
-#[test]
-fn miv_min_overflow_edges() {
-    // [300, 50, 400, 300] min [50, 300, 500, 200]
-    // slot 0: self=overflow(300), other=primary(50)  → 50   (overflow removed)
-    // slot 1: self=primary(50),   other=overflow(300) → 50   (no overflow created)
-    // slot 2: self=overflow(400), other=overflow(500) → 400  (overflow updated)
-    // slot 3: self=overflow(300), other=primary(200)  → 200  (overflow removed, 200 < 255)
-    let mut a = MemoryIntVec::new(4);
-    a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 300);
-    let mut b = MemoryIntVec::new(4);
-    b.set(0, 50); b.set(1, 300); b.set(2, 500); b.set(3, 200);
-    IntSliceMut::min(&mut a, &b);
-    assert_eq!(a.get(0), 50);
-    assert_eq!(a.get(1), 50);
-    assert_eq!(a.get(2), 400);
-    assert_eq!(a.get(3), 200);
-    // Only slot 2 should still have an overflow entry.
-    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
-    assert_eq!(ov.len(), 1);
-    assert_eq!(ov[&2], 400);
-}
-
-#[test]
-fn miv_max_overflow_edges() {
-    // [50, 300, 100, 400] max [300, 50, 500, 200]
-    // slot 0: self=primary(50),   other=overflow(300) → 300  (overflow created)
-    // slot 1: self=overflow(300), other=primary(50)   → 300  (overflow unchanged)
-    // slot 2: self=primary(100),  other=overflow(500) → 500  (overflow created)
-    // slot 3: self=overflow(400), other=overflow(200) → 400  (overflow unchanged, 200 < 255 wait...)
-    // Wait — 200 < 255 so other slot 3 is NOT overflow. Correct: max(400, 200) = 400.
-    let mut a = MemoryIntVec::new(4);
-    a.set(0, 50); a.set(1, 300); a.set(2, 100); a.set(3, 400);
-    let mut b = MemoryIntVec::new(4);
-    b.set(0, 300); b.set(1, 50); b.set(2, 500); b.set(3, 200);
-    IntSliceMut::max(&mut a, &b);
-    assert_eq!(a.get(0), 300);
-    assert_eq!(a.get(1), 300);
-    assert_eq!(a.get(2), 500);
-    assert_eq!(a.get(3), 400);
-    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
-    assert_eq!(ov.len(), 4); // all four results >= 255
-    assert_eq!(ov[&0], 300);
-    assert_eq!(ov[&1], 300);
-    assert_eq!(ov[&2], 500);
-    assert_eq!(ov[&3], 400);
-}
-
-#[test]
-fn miv_add_overflow_edges() {
-    // [300, 50, 400, 200] + [50, 300, 200, 200]
-    // slot 0: self=overflow(300), other=primary(50)   → 350  (overflow updated)
-    // slot 1: self=primary(50),   other=overflow(300) → 350  (overflow created from primary)
-    // slot 2: self=overflow(400), other=overflow(200... wait 200 < 255)
-    //         other slot 2 is primary(200); 400+200=600 (overflow updated)
-    // slot 3: self=primary(200),  other=primary(200)  → 400  (overflow created, 400 >= 255)
-    let mut a = MemoryIntVec::new(4);
-    a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 200);
-    let mut b = MemoryIntVec::new(4);
-    b.set(0, 50); b.set(1, 300); b.set(2, 200); b.set(3, 200);
-    a.add(&b);
-    assert_eq!(a.get(0), 350);
-    assert_eq!(a.get(1), 350);
-    assert_eq!(a.get(2), 600);
-    assert_eq!(a.get(3), 400);
-    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
-    assert_eq!(ov.len(), 4);
-}
-
-#[test]
-fn miv_add_both_overflow() {
-    // [300] + [400] = [700]
-    let mut a = MemoryIntVec::new(1);
-    a.set(0, 300);
-    let mut b = MemoryIntVec::new(1);
-    b.set(0, 400);
-    a.add(&b);
-    assert_eq!(a.get(0), 700);
-    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
-    assert_eq!(ov[&0], 700);
-}
-
-#[test]
-fn miv_diff_overflow_edges() {
-    // [300, 400, 400, 50] - [100, 50, 350, 300]
-    // slot 0: self=overflow(300), other=primary(100)   → 200  (overflow removed, 200 < 255)
-    // slot 1: self=overflow(400), other=primary(50)    → 350  (overflow updated, 350 >= 255)
-    // slot 2: self=overflow(400), other=overflow(350)  → 50   (overflow removed, 50 < 255)
-    // slot 3: self=primary(50),   other=overflow(300)  → 0    (saturating, stays primary)
-    let mut a = MemoryIntVec::new(4);
-    a.set(0, 300); a.set(1, 400); a.set(2, 400); a.set(3, 50);
-    let mut b = MemoryIntVec::new(4);
-    b.set(0, 100); b.set(1, 50); b.set(2, 350); b.set(3, 300);
-    a.diff(&b);
-    assert_eq!(a.get(0), 200);
-    assert_eq!(a.get(1), 350);
-    assert_eq!(a.get(2), 50);
-    assert_eq!(a.get(3), 0);
-    let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
-    assert_eq!(ov.len(), 1); // only slot 1 remains overflow
-    assert_eq!(ov[&1], 350);
-}
-
-// ── Comparison operators ──────────────────────────────────────────────────────
-
-#[test]
-fn cmp_gt() {
-    let mut v = MemoryIntVec::new(5);
-    v.set(0, 0); v.set(1, 3); v.set(2, 5); v.set(3, 3); v.set(4, 10);
-    let bv = v.gt(3);
-    assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
-    assert!(!bv.get(3)); assert!(bv.get(4));
-}
-
-#[test]
-fn cmp_geq() {
-    let mut v = MemoryIntVec::new(4);
-    v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 1);
-    let bv = v.geq(3);
-    assert!(!bv.get(0)); assert!(bv.get(1)); assert!(bv.get(2)); assert!(!bv.get(3));
-}
-
-#[test]
-fn cmp_lt() {
-    let mut v = MemoryIntVec::new(4);
-    v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 0);
-    let bv = v.lt(3);
-    assert!(bv.get(0)); assert!(!bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
-}
-
-#[test]
-fn cmp_leq() {
-    let mut v = MemoryIntVec::new(4);
-    v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 3);
-    let bv = v.leq(3);
-    assert!(bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
-}
-
-#[test]
-fn cmp_scalar_with_overflow() {
-    // Slots: [10, 1000, 50, 500, 0]
-    // geq(100): slots 1 (1000) and 3 (500) → both overflow, must qualify
-    // lt(500):  slots 0 (10), 2 (50), 4 (0) → primary; slot 1 (1000) → no; slot 3 (500) → no
-    // geq(2000): only slot 1 (1000) fails, no slot qualifies
-    let mut v = MemoryIntVec::new(5);
-    v.set(0, 10); v.set(1, 1000); v.set(2, 50); v.set(3, 500); v.set(4, 0);
-
-    let bv = v.geq(100);
-    assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2));
-    assert!(bv.get(3)); assert!(!bv.get(4));
-
-    let bv = v.lt(500);
-    assert!(bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
-    assert!(!bv.get(3)); assert!(bv.get(4));
-
-    let bv = v.geq(2000);
-    assert!(!(0..5).any(|s| bv.get(s)));
-}
-
-#[test]
-fn filter_pattern() {
-    // Typical filter: ingroup >= min_count AND outgroup <= max_outgroup
-    let mut ingroup  = MemoryIntVec::new(6);
-    let mut outgroup = MemoryIntVec::new(6);
-    // slot 2: ingroup=3, outgroup=0  → keep
-    // slot 4: ingroup=2, outgroup=1  → drop (outgroup > 0)
-    // slot 5: ingroup=1, outgroup=0  → drop (ingroup < 2)
-    ingroup.set(2, 3); ingroup.set(4, 2); ingroup.set(5, 1);
-    outgroup.set(4, 1);
-    let out_mask  = outgroup.leq(0);
-    let mut in_mask = ingroup.geq(2);
-    let keep = in_mask.and(&out_mask);
-    assert!(!keep.get(0)); assert!(!keep.get(1));
-    assert!(keep.get(2));
-    assert!(!keep.get(4)); assert!(!keep.get(5));
-}
diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs
index 3a61ab3..31f630e 100644
--- a/src/obicompactvec/src/tests/mod.rs
+++ b/src/obicompactvec/src/tests/mod.rs
@@ -2,12 +2,9 @@ mod bitmatrix;
 mod bitvec;
 mod colgroup;
 mod intmatrix;
-mod memoryvec;
 
 use tempfile::tempdir;
 
-use crate::traits::IntSliceMut;
-
 use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
 
 fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec<u32> {
@@ -173,7 +170,7 @@ fn combine_min() {
     let dir = tempdir().unwrap();
     let path = dir.path().join("out.pciv");
     let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.min(&rb);
+    b.min(rb.view());
     b.close().unwrap();
     let r = PersistentCompactIntVec::open(&path).unwrap();
     assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 100, 0, 800]);
@@ -186,7 +183,7 @@ fn combine_max() {
     let dir = tempdir().unwrap();
     let path = dir.path().join("out.pciv");
     let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.max(&rb);
+    b.max(rb.view());
     b.close().unwrap();
     let r = PersistentCompactIntVec::open(&path).unwrap();
     assert_eq!(r.iter().collect::<Vec<_>>(), vec![20, 300, 500, 1000]);
@@ -199,7 +196,7 @@ fn combine_add() {
     let dir = tempdir().unwrap();
     let path = dir.path().join("out.pciv");
     let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.add(&rb);
+    b.add(rb.view());
     b.close().unwrap();
     let r = PersistentCompactIntVec::open(&path).unwrap();
     assert_eq!(r.iter().collect::<Vec<_>>(), vec![30, 300, 5, 101]);
@@ -224,7 +221,7 @@ fn combine_diff() {
     let dir = tempdir().unwrap();
     let path = dir.path().join("out.pciv");
     let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.diff(&rb);
+    b.diff(rb.view());
     b.close().unwrap();
     let r = PersistentCompactIntVec::open(&path).unwrap();
     assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
index 9a647ec..cc52bc1 100644
--- a/src/obicompactvec/src/traits.rs
+++ b/src/obicompactvec/src/traits.rs
@@ -1,353 +1,5 @@
-use std::collections::HashMap;
-
 use ndarray::{Array1, Array2};
 
-// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
-
-/// Read-only view over the u64 word array of a bit vector.
-///
-/// Bit `i` is in `words()[i >> 6]` at position `i & 63`.
-/// Padding bits in the last word are zero.
-pub trait BitSlice {
-    fn len(&self) -> usize;
-    fn words(&self) -> &[u64];
-    fn is_empty(&self) -> bool { self.len() == 0 }
-    fn get(&self, slot: usize) -> bool {
-        (self.words()[slot >> 6] >> (slot & 63)) & 1 != 0
-    }
-    fn count_ones(&self) -> u64 {
-        self.words().iter().map(|w| w.count_ones() as u64).sum()
-    }
-    fn count_zeros(&self) -> u64 { self.len() as u64 - self.count_ones() }
-    fn partial_jaccard_dist<S: BitSlice>(&self, other: &S) -> (u64, u64) {
-        assert_eq!(self.len(), other.len(), "length mismatch");
-        self.words().iter().zip(other.words())
-            .fold((0u64, 0u64), |(i, u), (&a, &b)| {
-                (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64)
-            })
-    }
-    fn jaccard_dist<S: BitSlice>(&self, other: &S) -> f64 {
-        let (inter, union) = self.partial_jaccard_dist(other);
-        if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
-    }
-    fn hamming_dist<S: BitSlice>(&self, other: &S) -> u64 {
-        assert_eq!(self.len(), other.len(), "length mismatch");
-        self.words().iter().zip(other.words())
-            .map(|(&a, &b)| (a ^ b).count_ones() as u64)
-            .sum()
-    }
-}
-
-/// Mutable view over a bit-vector word array; default methods maintain the
-/// zero-padding invariant on the last word.
-pub trait BitSliceMut: BitSlice {
-    fn words_mut(&mut self) -> &mut [u64];
-
-    fn set(&mut self, slot: usize, value: bool) {
-        let bit = 1u64 << (slot & 63);
-        if value { self.words_mut()[slot >> 6] |= bit; } else { self.words_mut()[slot >> 6] &= !bit; }
-    }
-
-    fn copy_from<S: BitSlice>(&mut self, src: &S) -> &mut Self {
-        assert_eq!(self.len(), src.len(), "BitSlice length mismatch");
-        self.words_mut().copy_from_slice(src.words());
-        self
-    }
-
-    fn and<S: BitSlice>(&mut self, other: &S) -> &mut Self {
-        assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
-        for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w &= o; }
-        self
-    }
-
-    fn or<S: BitSlice>(&mut self, other: &S) -> &mut Self {
-        assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
-        for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w |= o; }
-        self
-    }
-
-    fn xor<S: BitSlice>(&mut self, other: &S) -> &mut Self {
-        assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
-        for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w ^= o; }
-        self
-    }
-
-    fn not(&mut self) -> &mut Self {
-        let rem = self.len() % 64;
-        let words = self.words_mut();
-        for w in words.iter_mut() { *w ^= u64::MAX; }
-        if rem != 0 {
-            if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; }
-        }
-        self
-    }
-}
-
-// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
-
-/// Read-only access to a compact integer vector (values encoded as u32).
-pub trait IntSlice {
-    fn len(&self) -> usize;
-    fn get(&self, slot: usize) -> u32;
-    /// Raw primary byte slice (sentinel 255 marks overflow slots).
-    fn primary_bytes(&self) -> &[u8];
-    /// Iterator over `(slot, true_value)` pairs for all overflow entries (value >= 255).
-    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_;
-    fn is_empty(&self) -> bool { self.len() == 0 }
-    fn iter(&self) -> impl Iterator<Item = u32> + '_ { (0..self.len()).map(|i| self.get(i)) }
-    fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() }
-    fn count_nonzero(&self) -> u64 { self.iter().filter(|v| *v > 0).count() as u64 }
-
-    fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <  threshold) }
-    fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) }
-    fn gt(&self, threshold: u32) -> MemoryBitVec  { self.cmp_scalar(|v| v >  threshold) }
-    fn geq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v >= threshold) }
-
-    fn cmp_scalar(&self, pred: impl Fn(u32) -> bool) -> MemoryBitVec {
-        let n = self.len();
-        let mut words = vec![0u64; n.div_ceil(64)];
-        let primary = self.primary_bytes();
-        // Pass 1: byte scan — no HashMap access, vectorisable for simple predicates.
-        // Overflow slots (b == 255) are left as 0 and fixed in pass 2.
-        for s in 0..n {
-            let b = primary[s];
-            if b < 255 && pred(b as u32) {
-                words[s >> 6] |= 1u64 << (s & 63);
-            }
-        }
-        // Pass 2: fix up overflow slots — O(k), negligible.
-        for (s, val) in self.overflow_entries() {
-            if pred(val) { words[s >> 6] |= 1u64 << (s & 63); }
-        }
-        MemoryBitVec::from_words(words, n)
-    }
-}
-
-/// Mutable access; default methods use only `get` / `set` and maintain the
-/// compact encoding invariants on the implementor's side.
-pub trait IntSliceMut: IntSlice {
-    fn set(&mut self, slot: usize, value: u32);
-    fn primary_bytes_mut(&mut self) -> &mut [u8];
-    fn clear_overflow(&mut self);
-
-    fn inc(&mut self, slot: usize) -> &mut Self {
-        let v = self.get(slot);
-        self.set(slot, v.saturating_add(1));
-        self
-    }
-
-    fn dec(&mut self, slot: usize) -> &mut Self {
-        let v = self.get(slot);
-        self.set(slot, v.saturating_sub(1));
-        self
-    }
-
-    fn add_at(&mut self, slot: usize, delta: u32) -> &mut Self {
-        let v = self.get(slot);
-        self.set(slot, v.saturating_add(delta));
-        self
-    }
-
-    fn copy_from<S: IntSlice>(&mut self, src: &S) -> &mut Self {
-        assert_eq!(self.len(), src.len(), "IntSlice length mismatch");
-        self.primary_bytes_mut().copy_from_slice(src.primary_bytes());
-        self.clear_overflow();
-        for (slot, val) in src.overflow_entries() { self.set(slot, val); }
-        self
-    }
-
-    fn min<S: IntSlice>(&mut self, other: &S) -> &mut Self {
-        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        // Snapshot both overflow sets (O(k), tiny) before mutating self.
-        // 255 = +∞ on u8, so byte-level min is correct in all cases except
-        // both-overflow: only those slots need a fixup pass.
-        let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
-        let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
-        self.clear_overflow();
-        // Pass 1 — SIMD-vectorizable byte min over the full primary array.
-        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
-            if b < *a { *a = b; }
-        }
-        // Pass 2 — fixup slots where BOTH sides were overflow (primary = 255 after pass 1,
-        // but the overflow value may have changed).  Slots where only self was overflow are
-        // already correct: pass 1 wrote other.primary[slot] < 255 and clear_overflow removed
-        // the stale entry.
-        for (slot, self_val) in self_ov {
-            if let Some(&other_val) = other_ov.get(&slot) {
-                self.set(slot, self_val.min(other_val));
-            }
-        }
-        self
-    }
-
-    fn max<S: IntSlice>(&mut self, other: &S) -> &mut Self {
-        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        // Pre-pass — process other's overflow entries BEFORE the byte pass.
-        // After the byte pass, self.primary[slot] = 255 for all slots in other_ov,
-        // making it impossible to recover the original self value; we need it now.
-        for (slot, other_val) in other.overflow_entries() {
-            let self_val = self.get(slot);
-            self.set(slot, self_val.max(other_val));
-        }
-        // Pass 1 — SIMD-vectorizable byte max over the full primary array.
-        // 255 = +∞ on u8 → max(a, 255) = 255 is the correct sentinel for all
-        // overflow slots, whether handled by the pre-pass or already in self.
-        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
-            if b > *a { *a = b; }
-        }
-        self
-    }
-
-    fn add<S: IntSlice>(&mut self, other: &S) -> &mut Self {
-        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        let n = self.len();
-        for s in 0..n {
-            // Read both primary bytes first — u8 is Copy, borrows released immediately.
-            let sb = self.primary_bytes()[s];
-            let ob = other.primary_bytes()[s];
-            if sb < 255 && ob < 255 {
-                // Hot path: no overflow lookup, no HashMap write in the common case.
-                let sum = sb as u32 + ob as u32;
-                if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; }
-                else         { self.set(s, sum); }
-            } else {
-                // At least one side is in overflow — get() is unavoidable.
-                let self_val = self.get(s);
-                let other_val = other.get(s);
-                self.set(s, self_val + other_val);
-            }
-        }
-        self
-    }
-
-    fn diff<S: IntSlice>(&mut self, other: &S) -> &mut Self {
-        assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
-        let n = self.len();
-        for s in 0..n {
-            let sb = self.primary_bytes()[s];
-            let ob = other.primary_bytes()[s];
-            if sb < 255 {
-                // Result is always < 255 — no overflow created or consulted.
-                // ob == 255 means b ≥ 255 > a, so saturating result = 0.
-                self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 };
-            } else {
-                // sb == 255: self has overflow — get() unavoidable.
-                // other.get() only needed when ob == 255 too (both-overflow case).
-                let self_val = self.get(s);
-                let other_val = if ob < 255 { ob as u32 } else { other.get(s) };
-                self.set(s, self_val.saturating_sub(other_val));
-            }
-        }
-        self
-    }
-
-    /// For each slot where `bits` is true, increment `self` by 1.
-    /// Skips zero words entirely — O(n_ones) rather than O(n).
-    fn count_bits<B: BitSlice>(&mut self, bits: &B) -> &mut Self {
-        assert_eq!(self.len(), bits.len(), "IntSlice/BitSlice length mismatch");
-        for (w_idx, &word) in bits.words().iter().enumerate() {
-            if word == 0 { continue; }
-            let base = w_idx * 64;
-            let mut w = word;
-            while w != 0 {
-                let bit = w.trailing_zeros() as usize;
-                let slot = base + bit;
-                if slot < self.len() { self.inc(slot); }
-                w &= w - 1;
-            }
-        }
-        self
-    }
-
-    /// Zero every slot where the corresponding bit in `mask` is 0.
-    /// Iterates only the zero bits — O(n_zeros), O(1) when mask is all-ones.
-    fn mask_with<B: BitSlice>(&mut self, mask: &B) -> &mut Self {
-        assert_eq!(self.len(), mask.len(), "IntSlice/BitSlice length mismatch");
-        let n = self.len();
-        for (wi, &word) in mask.words().iter().enumerate() {
-            if word == u64::MAX { continue; }
-            let mut zeros = !word;
-            while zeros != 0 {
-                let bit = zeros.trailing_zeros() as usize;
-                let s   = wi * 64 + bit;
-                if s < n {
-                    // u8 is Copy — the immutable borrow from primary_bytes() ends
-                    // before the mutable borrow from set() begins.
-                    let b = self.primary_bytes()[s];
-                    if b != 0 { self.set(s, 0); }
-                }
-                zeros &= zeros - 1;
-            }
-        }
-        self
-    }
-}
-
-// ── IntSlice → MemoryBitVec conversions ───────────────────────────────────────
-
-use crate::memoryvec::MemoryBitVec;
-
-pub trait IntToBit: IntSlice {
-    /// Bit set iff value >= threshold. Consistent with `geq` and `build_from_counts`.
-    fn to_bitvec(&self, threshold: u32) -> MemoryBitVec { self.geq(threshold) }
-
-    /// Bit set iff value >= 1 (slot is present).
-    fn to_presence(&self) -> MemoryBitVec { self.geq(1) }
-}
-
-impl<T: IntSlice> IntToBit for T {}
-
-// ── BitSlice → MemoryIntVec conversion ───────────────────────────────────────
-
-use crate::memoryintvec::MemoryIntVec;
-
-// Maps each byte value to its 8 constituent bits as individual u8 (0 or 1).
-static EXPAND_BYTE: [[u8; 8]; 256] = {
-    let mut table = [[0u8; 8]; 256];
-    let mut b = 0usize;
-    while b < 256 {
-        let mut bit = 0usize;
-        while bit < 8 {
-            table[b][bit] = ((b >> bit) & 1) as u8;
-            bit += 1;
-        }
-        b += 1;
-    }
-    table
-};
-
-pub trait BitToInt: BitSlice {
-    fn to_intvec(&self) -> MemoryIntVec {
-        let n = self.len();
-        let mut primary = vec![0u8; n];
-
-        let words = self.words();
-        let full_words = n / 64;
-
-        for (w_idx, &word) in words[..full_words].iter().enumerate() {
-            let base = w_idx * 64;
-            for byte_off in 0..8usize {
-                let byte = (word >> (byte_off * 8)) as u8;
-                primary[base + byte_off * 8..base + byte_off * 8 + 8]
-                    .copy_from_slice(&EXPAND_BYTE[byte as usize]);
-            }
-        }
-
-        let rem = n % 64;
-        if rem > 0 {
-            let word = words[full_words];
-            let base = full_words * 64;
-            for bit in 0..rem {
-                primary[base + bit] = ((word >> bit) & 1) as u8;
-            }
-        }
-
-        MemoryIntVec::from_primary(primary)
-    }
-}
-
-impl<T: BitSlice> BitToInt for T {}
-
 // ── Column-level weight statistic — total count or presence count per column.
 /// Additive across layers and partitions; used as denominator in normalised distances.
 ///
diff --git a/src/obicompactvec/src/views.rs b/src/obicompactvec/src/views.rs
new file mode 100644
index 0000000..85e4165
--- /dev/null
+++ b/src/obicompactvec/src/views.rs
@@ -0,0 +1,278 @@
+use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry};
+
+// ── BitSliceView ──────────────────────────────────────────────────────────────
+
+/// Lightweight, `Copy` read-only view over a borrowed `u64` word array holding `n` bits.
+/// Bit `i` is in `words[i >> 6]` at position `i & 63`.  Padding bits are expected to be zero.
+#[derive(Clone, Copy)]
+pub struct BitSliceView<'a> {
+    pub(crate) words: &'a [u64], // borrowed backing words, 64 bits per word
+    pub(crate) n:     usize,     // logical length in bits (reported by `len`)
+}
+
+impl<'a> BitSliceView<'a> { // read-only accessors, iteration and bit-set distances
+    #[inline]
+    pub fn new(words: &'a [u64], n: usize) -> Self { Self { words, n } } // stores both as given; no validation of `n` against `words.len()`
+
+    pub fn len(&self)      -> usize  { self.n } // number of bits
+    pub fn is_empty(&self) -> bool   { self.n == 0 }
+    pub fn words(&self)    -> &'a [u64] { self.words } // raw backing words, borrowed for 'a
+
+    #[inline]
+    pub fn get(&self, slot: usize) -> bool { // bit at `slot`; `slot < n` is not checked here
+        (self.words[slot >> 6] >> (slot & 63)) & 1 != 0 // indexing panics if `slot >> 6` is past the last word
+    }
+
+    pub fn count_ones(&self) -> u64 { // popcount over every backing word, padding included
+        self.words.iter().map(|w| w.count_ones() as u64).sum()
+    }
+    pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() } // relies on padding bits being zero
+
+    pub fn iter(&self) -> BitSliceIter<'a> { // yields the `n` bits in slot order
+        BitSliceIter { words: self.words, slot: 0, n: self.n }
+    }
+
+    pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) { // returns (popcount of a & b, popcount of a | b)
+        assert_eq!(self.n, other.n, "BitSliceView length mismatch"); // panics when bit lengths differ
+        self.words.iter().zip(other.words)
+            .fold((0u64, 0u64), |(i, u), (&a, &b)| { // i = intersection count so far, u = union count so far
+                (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64)
+            })
+    }
+
+    pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 { // 1 - |a & b| / |a | b|
+        let (inter, union) = self.partial_jaccard_dist(other);
+        if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } // two all-zero vectors are at distance 0
+    }
+
+    pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 { // number of differing bits
+        assert_eq!(self.n, other.n, "BitSliceView length mismatch"); // panics when bit lengths differ
+        self.words.iter().zip(other.words)
+            .map(|(&a, &b)| (a ^ b).count_ones() as u64) // XOR marks the positions where the two words differ
+            .sum()
+    }
+}
+
+// ── BitSliceIter ──────────────────────────────────────────────────────────────
+
+pub struct BitSliceIter<'a> { // iterator state built by `BitSliceView::iter`
+    words: &'a [u64], // backing words being read
+    slot:  usize,     // index of the next bit to yield
+    n:     usize,     // total number of bits; iteration ends once `slot` reaches it
+}
+
+impl Iterator for BitSliceIter<'_> {
+    type Item = bool;
+    fn next(&mut self) -> Option<bool> {
+        if self.slot >= self.n { return None; } // exhausted
+        let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0; // same bit extraction as `BitSliceView::get`
+        self.slot += 1;
+        Some(v)
+    }
+    fn size_hint(&self) -> (usize, Option<usize>) { // exact: `next` never advances `slot` beyond `n`
+        let rem = self.n - self.slot;
+        (rem, Some(rem))
+    }
+}
+impl ExactSizeIterator for BitSliceIter<'_> {} // sound because `size_hint` returns the exact remaining count
+
+// ── IntSliceView ──────────────────────────────────────────────────────────────
+
+/// Lightweight, `Copy` read-only view over a compact-int primary array plus
+/// its sorted raw overflow bytes.  Zero-copy: all data is borrowed from the caller (typically an mmap).
+#[derive(Clone, Copy)]
+pub struct IntSliceView<'a> {
+    pub(crate) primary:      &'a [u8],   // one byte per slot; 255 means "look the value up in the overflow entries"
+    pub(crate) overflow_raw: &'a [u8],   // n_overflow × OVERFLOW_ENTRY_SIZE bytes, sorted by slot
+    pub(crate) n_overflow:   usize,      // number of entries encoded in `overflow_raw`
+    pub(crate) n:            usize,      // number of slots (reported by `len`)
+}
+
+impl<'a> IntSliceView<'a> { // read-only accessors, iteration and pairwise distances
+    #[inline]
+    pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self { // stores the arguments as given; nothing is validated
+        Self { primary, overflow_raw, n_overflow, n }
+    }
+
+    pub fn len(&self)        -> usize    { self.n } // number of slots
+    pub fn is_empty(&self)   -> bool     { self.n == 0 }
+    pub fn primary_bytes(&self) -> &'a [u8] { self.primary } // raw primary bytes, 255 = overflow sentinel
+    pub fn n_overflow(&self) -> usize    { self.n_overflow } // number of overflow entries
+
+    pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + 'a { // yields (slot, value) for each overflow entry, in stored order
+        let raw  = self.overflow_raw; // copied out so the returned iterator borrows for 'a, not from `&self`
+        let n_ov = self.n_overflow;
+        (0..n_ov).map(move |i| parse_overflow_entry(raw, 0, i)) // NOTE(review): the `0` is presumably a base offset into `raw` — confirm in `format`
+    }
+
+    /// Value at `slot`: the primary byte when < 255, else an O(log n_overflow) binary search of the overflow entries (assumed sorted by slot).
+    pub fn get(&self, slot: usize) -> u32 {
+        let b = self.primary[slot]; // indexing panics if `slot` is outside `primary`
+        if b < 255 { return b as u32; } // fast path: the value is stored inline
+        let mut lo = 0usize; // search window is the half-open range [lo, hi)
+        let mut hi = self.n_overflow;
+        while lo < hi {
+            let mid = lo + (hi - lo) / 2; // midpoint computed without risking `lo + hi` overflow
+            let (s, v) = parse_overflow_entry(self.overflow_raw, 0, mid);
+            match s.cmp(&slot) {
+                std::cmp::Ordering::Equal   => return v,
+                std::cmp::Ordering::Less    => lo = mid + 1,
+                std::cmp::Ordering::Greater => hi = mid,
+            }
+        }
+        panic!("slot {slot} marked overflow but not found") // sentinel byte without a matching overflow entry
+    }
+
+    /// Sequential merge scan: yields all n values in slot order, consuming one overflow entry per sentinel byte.
+    pub fn iter(&self) -> IntSliceViewIter<'a> {
+        IntSliceViewIter {
+            primary:      self.primary,
+            overflow_raw: self.overflow_raw,
+            slot:         0, // start at the first slot
+            overflow_pos: 0, // start at the first overflow entry
+            n:            self.n,
+        }
+    }
+
+    pub fn sum(&self) -> u64 { // total of all values; the combination is delegated to `format::byte_sum`
+        byte_sum(self.primary, self.overflow_entries().map(|(_, v)| v)) // passes the primary bytes plus the overflow values (slots dropped)
+    }
+
+    pub fn count_nonzero(&self) -> u64 { // NOTE(review): only `primary` is consulted; overflow slots hold byte 255 there, so presumably they count as non-zero — confirm in `format`
+        byte_count_nonzero(self.primary)
+    }
+
+    // ── Distance methods ──────────────────────────────────────────────────────
+
+    pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 { // sum over slots of min(a, b)
+        assert_eq!(self.n, other.n, "length mismatch"); // panics when slot counts differ
+        self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
+    }
+
+    pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 { // 1 - 2 * sum(min(a, b)) / (sum(a) + sum(b))
+        let sum_min = self.partial_bray_dist(other);
+        let denom = self.sum() + other.sum();
+        if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 } // two all-zero vectors are at distance 0
+    }
+
+    pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { // sum of min(a / sa, b / sb); `sa`, `sb` are caller-supplied totals
+        assert_eq!(self.n, other.n, "length mismatch"); // panics when slot counts differ
+        self.iter().zip(other.iter())
+            .map(|(a, b)| {
+                let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 }; // a non-positive total yields frequency 0 instead of dividing
+                let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
+                pa.min(pb)
+            })
+            .sum()
+    }
+
+    pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 { // 1 - sum of min relative frequencies
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; } // both empty: distance 0
+        1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
+    }
+
+    pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { // sum of squared differences (no square root yet)
+        assert_eq!(self.n, other.n, "length mismatch"); // panics when slot counts differ
+        self.iter().zip(other.iter())
+            .map(|(a, b)| { let d = a as f64 - b as f64; d * d })
+            .sum()
+    }
+
+    pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 { // square root of the partial sum
+        self.partial_euclidean_dist(other).sqrt()
+    }
+
+    pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { // sum of squared differences of a / sa and b / sb
+        assert_eq!(self.n, other.n, "length mismatch"); // panics when slot counts differ
+        self.iter().zip(other.iter())
+            .map(|(a, b)| {
+                let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 }; // a non-positive total yields frequency 0 instead of dividing
+                let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
+                let d = pa - pb;
+                d * d
+            })
+            .sum()
+    }
+
+    pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { // Euclidean distance between relative-frequency vectors
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; } // both empty: distance 0
+        self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
+    }
+
+    pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { // sum of squared differences of sqrt(a / sa) and sqrt(b / sb)
+        assert_eq!(self.n, other.n, "length mismatch"); // panics when slot counts differ
+        self.iter().zip(other.iter())
+            .map(|(a, b)| {
+                let pa = if sa > 0.0 { (a as f64 / sa).sqrt() } else { 0.0 }; // square root of the relative frequency; 0 for a non-positive total
+                let pb = if sb > 0.0 { (b as f64 / sb).sqrt() } else { 0.0 };
+                let d = pa - pb;
+                d * d
+            })
+            .sum()
+    }
+
+    pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { // square root of the partial sum, without the 1/sqrt(2) factor
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; } // both empty: distance 0
+        self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
+    }
+
+    pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 { // `hellinger_euclidean_dist` scaled by 1/sqrt(2)
+        self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2
+    }
+
+    pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) { // (intersection, union) counts of slots with value >= threshold
+        assert_eq!(self.n, other.n, "length mismatch"); // panics when slot counts differ
+        self.iter().zip(other.iter())
+            .fold((0u64, 0u64), |(inter, uni), (a, b)| {
+                let ap = a >= threshold; // slot counts as present in `self`
+                let bp = b >= threshold; // slot counts as present in `other`
+                (inter + (ap & bp) as u64, uni + (ap | bp) as u64)
+            })
+    }
+
+    pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 { // 1 - intersection / union on thresholded presence
+        let (inter, union) = self.partial_threshold_jaccard_dist(other, threshold);
+        if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } // no slot present on either side: distance 0
+    }
+
+    pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 { // presence/absence Jaccard: a slot is present when its value >= 1
+        self.threshold_jaccard_dist(other, 1)
+    }
+}
+
+// ── IntSliceViewIter ──────────────────────────────────────────────────────────
+
+pub struct IntSliceViewIter<'a> { // iterator state built by `IntSliceView::iter`
+    primary:      &'a [u8], // one byte per slot, 255 = overflow sentinel
+    overflow_raw: &'a [u8], // raw overflow entries, read in stored order
+    slot:         usize,    // index of the next slot to yield
+    overflow_pos: usize,    // index of the next overflow entry to consume
+    n:            usize,    // total number of slots
+}
+
+impl Iterator for IntSliceViewIter<'_> {
+    type Item = u32;
+    fn next(&mut self) -> Option<u32> {
+        if self.slot >= self.n { return None; } // exhausted
+        let v = self.primary[self.slot];
+        self.slot += 1;
+        if v < 255 {
+            Some(v as u32) // value stored inline in the primary byte
+        } else {
+            let (_, val) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos); // the entry's slot is discarded, not checked against the current slot
+            self.overflow_pos += 1; // relies on entries being stored in slot order, one per sentinel byte
+            Some(val)
+        }
+    }
+    fn size_hint(&self) -> (usize, Option<usize>) { // exact: `next` never advances `slot` beyond `n`
+        let rem = self.n - self.slot;
+        (rem, Some(rem))
+    }
+}
+impl ExactSizeIterator for IntSliceViewIter<'_> {} // sound because `size_hint` returns the exact remaining count
diff --git a/src/obikpartitionner/src/common.rs b/src/obikpartitionner/src/common.rs
index 76d3bf3..99e345e 100644
--- a/src/obikpartitionner/src/common.rs
+++ b/src/obikpartitionner/src/common.rs
@@ -3,7 +3,6 @@ use std::io;
 use std::path::{Path, PathBuf};
 
 use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder};
-use obicompactvec::traits::BitSliceMut;
 use obilayeredmap::meta::PartitionMeta;
 use obilayeredmap::{IndexMode, OLMError};
 use obiskio::{SKError, SKResult};
diff --git a/src/obikpartitionner/src/select_layer.rs b/src/obikpartitionner/src/select_layer.rs
index 56b2ac7..36286c0 100644
--- a/src/obikpartitionner/src/select_layer.rs
+++ b/src/obikpartitionner/src/select_layer.rs
@@ -6,7 +6,6 @@ use obicompactvec::{
     PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
 };
-use obicompactvec::traits::BitSliceMut;
 use obilayeredmap::meta::PartitionMeta;
 use obilayeredmap::OLMError;
 use obiskio::{SKError, SKResult};
diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs
index c79e781..72b38ea 100644
--- a/src/obilayeredmap/src/layer.rs
+++ b/src/obilayeredmap/src/layer.rs
@@ -6,7 +6,6 @@ use obicompactvec::{
     PersistentBitMatrix, PersistentBitMatrixBuilder,
     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
 };
-use obicompactvec::traits::BitSliceMut;
 use obikseq::CanonicalKmer;
 use obiskio::{UnitigFileReader, UnitigFileWriter};
 
diff --git a/src/obilayeredmap/src/layered_store.rs b/src/obilayeredmap/src/layered_store.rs
index 6ebf343..433183e 100644
--- a/src/obilayeredmap/src/layered_store.rs
+++ b/src/obilayeredmap/src/layered_store.rs
@@ -102,7 +102,6 @@ mod tests {
         PersistentBitMatrix, PersistentBitMatrixBuilder,
         PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
     };
-    use obicompactvec::traits::BitSliceMut;
     use tempfile::tempdir;
 
     fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {

From 7eea71fdcded0ada532042cf647b36e03ec38df2 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 18 Jun 2026 07:10:08 +0200
Subject: [PATCH 16/24] docs(obicompactvec): update API docs and algorithm
 descriptions

Replace the trait-based API documentation with documentation of the concrete, zero-copy view structs, and update all associated diagrams. Refine algorithmic descriptions for sentinel handling, overflow stores, and bulk operations. Clarify temporary file lifecycles and group-chunking strategies to support memory-efficient parallel aggregation.
---
 docmd/implementation/obicompactvec.md | 647 +++++++++-----------------
 1 file changed, 228 insertions(+), 419 deletions(-)

diff --git a/docmd/implementation/obicompactvec.md b/docmd/implementation/obicompactvec.md
index 71dc939..e3c8568 100644
--- a/docmd/implementation/obicompactvec.md
+++ b/docmd/implementation/obicompactvec.md
@@ -5,12 +5,11 @@
 ```
 src/obicompactvec/src/
   lib.rs            public re-exports
-  traits.rs         BitSlice, BitSliceMut, IntSlice, IntSliceMut + conversion traits
+  views.rs          BitSliceView<'a>, IntSliceView<'a> — zero-copy read views
+  traits.rs         ColumnWeights, CountPartials, BitPartials (matrix aggregation)
   bitvec.rs         PersistentBitVec, PersistentBitVecBuilder, BitIter
-  memoryvec.rs      MemoryBitVec
   reader.rs         PersistentCompactIntVec (read-only)
   builder.rs        PersistentCompactIntVecBuilder (read-write)
-  memoryintvec.rs   MemoryIntVec
   tempintvec.rs     TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed)
   tempbitvec.rs     TempBitVec, TempBitVecBuilder (temp-file-backed)
   bitmatrix.rs      PersistentBitMatrix, PersistentBitMatrixBuilder
@@ -23,20 +22,20 @@ src/obicompactvec/src/
 
 ```mermaid
 graph TD
-    traits --> memoryvec
-    traits --> memoryintvec
-    bitvec --> memoryvec
-    bitvec --> bitmatrix
-    bitvec --> tempbitvec
+    views --> bitvec
+    views --> builder
+    views --> tempbitvec
+    views --> tempintvec
+    views --> bitmatrix
+    views --> intmatrix
     format --> reader
     format --> builder
     reader --> intmatrix
     reader --> tempintvec
     builder --> intmatrix
-    builder --> memoryintvec
     builder --> tempintvec
-    memoryvec --> traits
-    memoryintvec --> traits
+    bitvec --> tempbitvec
+    bitvec --> bitmatrix
     tempintvec --> intmatrix
     tempintvec --> bitmatrix
     tempbitvec --> intmatrix
@@ -62,7 +61,7 @@ All integer vectors use the same two-tier encoding regardless of storage backend
 
 **Overflow store** — maps slot index to a `u32` value ≥ 255:
 
-- In `MemoryIntVec` and `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
+- In `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
 - In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
 
 ```mermaid
@@ -70,13 +69,12 @@ flowchart LR
     slot --> P["primary[slot]: u8"]
     P -->|"< 255"| V["value = byte (0–254)"]
     P -->|"= 255 sentinel"| OV["overflow store"]
-    OV -->|"MemoryIntVec / Builder"| HM["HashMap&lt;usize, u32&gt;\nin RAM"]
+    OV -->|"Builder"| HM["HashMap&lt;usize, u32&gt;\nin RAM"]
     OV -->|"PersistentCompactIntVec"| SA["sorted [(slot,value)] in mmap\n+ sparse L1 index"]
 ```
 
 **Key property — sentinel 255 = +∞ on `u8`:**
 
-This is exploited throughout the binary operations. On a `u8` comparison, 255 behaves as positive infinity:
 - `min(a, 255) = a` for all `a ≤ 254` → correct when only one side is overflow
 - `max(a, 255) = 255` → correct sentinel when either side is overflow
 - Only the **both-overflow** case requires reading actual values from the overflow store.
@@ -85,274 +83,60 @@ In practice, k (overflow count) ≪ n (total slots). Observed genomic data: ~0.0
 
 ---
 
-## Trait hierarchy
+## View types
 
-```mermaid
-classDiagram
-    class BitSlice {
-        <<trait>>
-        +len() usize
-        +words() &[u64]
-        +get(slot) bool
-        +count_ones() u64
-        +count_zeros() u64
-        +partial_jaccard_dist(other) (u64,u64)
-        +jaccard_dist(other) f64
-        +hamming_dist(other) u64
-    }
-    class BitSliceMut {
-        <<trait>>
-        +words_mut() &mut [u64]
-        +set(slot, value)
-        +copy_from(src)
-        +and(other)
-        +or(other)
-        +xor(other)
-        +not()
-    }
-    class IntSlice {
-        <<trait>>
-        +len() usize
-        +get(slot) u32
-        +primary_bytes() &[u8]
-        +overflow_entries() Iterator
-        +iter() Iterator
-        +sum() u64
-        +count_nonzero() u64
-        +cmp_scalar(pred) MemoryBitVec
-        +lt/leq/gt/geq(t) MemoryBitVec
-    }
-    class IntSliceMut {
-        <<trait>>
-        +set(slot, value)
-        +primary_bytes_mut() &mut [u8]
-        +clear_overflow()
-        +inc/dec/add_at(slot)
-        +copy_from(src)
-        +min/max/add/diff(other)
-        +count_bits(bits)
-    }
-    class IntToBit {
-        <<trait blanket>>
-        +to_bitvec(threshold) MemoryBitVec
-        +to_presence() MemoryBitVec
-    }
-    class BitToInt {
-        <<trait blanket>>
-        +to_intvec() MemoryIntVec
-    }
-    BitSliceMut --|> BitSlice : extends
-    IntSliceMut --|> IntSlice : extends
-    IntToBit --|> IntSlice : blanket T:IntSlice
-    BitToInt --|> BitSlice : blanket T:BitSlice
+The previous trait hierarchy (`BitSlice`, `BitSliceMut`, `IntSlice`, `IntSliceMut`) has been replaced by two concrete zero-copy view structs with inherent methods. Views are **`Copy`** — passing them is free. All read operations live on these two types.
+
+### `BitSliceView<'a>`
+
+```rust
+#[derive(Clone, Copy)]
+pub struct BitSliceView<'a> { pub(crate) words: &'a [u64], pub(crate) n: usize }
 ```
 
-### BitSlice (read-only)
+Bit `i` is at `words[i >> 6]` bit `i & 63` (LSB-first). Padding bits in the last word are zero.
 
-Required: `len()`, `words() -> &[u64]`.
+| Method | Cost |
+|---|---|
+| `len()`, `is_empty()` | O(1) |
+| `get(slot)` | O(1) |
+| `count_ones()` | POPCNT per word, O(n/64) |
+| `count_zeros()` | `n − count_ones()`, O(n/64) |
+| `iter() -> BitSliceIter<'a>` | O(1) setup, O(n) iteration |
+| `partial_jaccard_dist(other: BitSliceView)` | `(a&b).popcount`, `(a\|b).popcount` per word, O(n/64) |
+| `jaccard_dist(other: BitSliceView)` | from partial, O(n/64) |
+| `hamming_dist(other: BitSliceView)` | `(a^b).popcount` per word, O(n/64) |
 
-Bit `i` is at `words()[i >> 6]` bit `i & 63` (LSB-first). Padding bits in the last word are always zero — this invariant must be maintained by all implementors.
+`BitSliceIter<'a>`: word-level scan; one word per 64 iterations.
 
-| Provided method | Implementation | Cost |
-|---|---|---|
-| `is_empty()` | `len() == 0` | O(1) |
-| `get(slot)` | word extract | O(1) |
-| `count_ones()` | POPCNT per word | O(n/64) |
-| `count_zeros()` | `n − count_ones()` | O(n/64) |
-| `partial_jaccard_dist(other)` | `(a&b).popcount`, `(a\|b).popcount` per word | O(n/64) |
-| `jaccard_dist(other)` | from partial | O(n/64) |
-| `hamming_dist(other)` | `(a^b).popcount` per word | O(n/64) |
+### `IntSliceView<'a>`
 
-### BitSliceMut: BitSlice (mutable)
-
-Required: `words_mut() -> &mut [u64]`.
-
-All bulk operations work at the word level (64 bits/iteration). The compiler auto-vectorizes these loops to AVX2/AVX-512. The zero-padding invariant is maintained: `not()` re-masks the last word after flipping.
-
-| Provided method | Implementation | Cost |
-|---|---|---|
-| `set(slot, value)` | OR / AND-NOT on one word | O(1) |
-| `copy_from(src)` | `copy_from_slice` = memcpy | O(n/64) |
-| `and(other)` | `w &= o` per word | O(n/64) |
-| `or(other)` | `w \|= o` per word | O(n/64) |
-| `xor(other)` | `w ^= o` per word | O(n/64) |
-| `not()` | `w ^= u64::MAX` per word, then mask last | O(n/64) |
-
-**No overflow complexity here.** The packed `u64` representation is already the natural unit for SIMD operations. No sentinel, no HashMap — just bitwise word ops.
-
----
-
-### IntSlice (read-only)
-
-Required:
-- `len() -> usize`
-- `get(slot) -> u32` — handles sentinel transparently (binary search into overflow for persistent, HashMap for memory)
-- `primary_bytes() -> &[u8]` — raw primary array including 255 sentinels
-- `overflow_entries() -> impl Iterator<Item = (usize, u32)>` — (slot, true_value) pairs for all overflow slots
-
-| Provided method | Default implementation | Note |
-|---|---|---|
-| `is_empty()` | `len() == 0` | |
-| `iter()` | `(0..n).map(\|i\| self.get(i))` | Overridden in all concrete types |
-| `sum()` | `iter().map(\|v\| v as u64).sum()` | Overridden in concrete types |
-| `count_nonzero()` | `iter().filter(\|v\| *v > 0).count()` | Overridden in concrete types |
-| `lt(t)` | `cmp_scalar(\|v\| v < t)` | |
-| `leq(t)` | `cmp_scalar(\|v\| v <= t)` | |
-| `gt(t)` | `cmp_scalar(\|v\| v > t)` | |
-| `geq(t)` | `cmp_scalar(\|v\| v >= t)` | |
-| `cmp_scalar(pred)` | two-pass (see below) | |
-
-**`cmp_scalar` algorithm — two passes:**
-
-```
-Pass 1 — byte scan, O(n):
-  for s in 0..n:
-    b = primary[s]
-    if b < 255 AND pred(b as u32):
-      set bit s in result word
-
-Pass 2 — overflow fixup, O(k):
-  for (s, val) in overflow_entries():
-    if pred(val): set bit s in result word
+```rust
+#[derive(Clone, Copy)]
+pub struct IntSliceView<'a> {
+    pub(crate) primary:      &'a [u8],
+    pub(crate) overflow_raw: &'a [u8],   // sorted [(slot:u64, value:u32)] entries
+    pub(crate) n_overflow:   usize,
+    pub(crate) n:            usize,
+}
 ```
 
-Pass 1 reads only the primary byte array — no HashMap access. For simple predicates (`geq`, `lt`, etc.) the compiler inlines `pred` and can auto-vectorize the byte comparison loop. Pass 2 handles the O(k) overflow slots that were left as 0 in pass 1.
+`overflow_raw` contains `n_overflow` entries of `OVERFLOW_ENTRY_SIZE` bytes each, sorted by slot. The sort invariant is established at `close()`/`freeze()` time.
 
-Previous implementation: `pred(self.get(s))` for every slot → O(n log k) due to binary search in overflow. New: O(n) + O(k).
+| Method | Cost |
+|---|---|
+| `len()`, `is_empty()` | O(1) |
+| `primary_bytes()` | O(1) |
+| `overflow_entries() -> impl Iterator<Item = (usize, u32)>` | O(n_overflow) iteration |
+| `get(slot)` | O(1) primary; binary search O(log k) for overflow slots |
+| `iter() -> IntSliceViewIter<'a>` | merge scan, O(n + k) |
+| `sum()` | byte scan + overflow, O(n + k) |
+| `count_nonzero()` | byte scan, O(n) |
+| Distance methods (`bray_dist`, `euclidean_dist`, `jaccard_dist`, …) | O(n + k) |
 
----
+`IntSliceViewIter<'a>`: merge scan using `overflow_pos` index. Requires sorted overflow — guaranteed by the construction lifecycle.
 
-### IntSliceMut: IntSlice (mutable)
-
-Required:
-- `set(slot, value: u32)` — writes primary byte (or 255 + overflow entry if value ≥ 255); removes stale overflow entry if value drops below 255
-- `primary_bytes_mut() -> &mut [u8]` — direct mutable access to the primary array
-- `clear_overflow()` — empties the entire overflow store
-
-The required methods expose the encoding internals. All provided methods are implemented in terms of these three + the `IntSlice` required methods.
-
-| Provided method | Hot path | Overflow case | Cost |
-|---|---|---|---|
-| `inc(slot)` | `get` + `set` | — | O(1) or O(log k) |
-| `dec(slot)` | `get` + `set` (saturating) | — | O(1) or O(log k) |
-| `add_at(slot, delta)` | `get` + `set` (saturating) | — | O(1) or O(log k) |
-| `copy_from(src)` | `copy_from_slice` + `clear_overflow` + replay overflows | — | O(n) + O(k) |
-| `min(other)` | byte-level min, O(n) | both-overflow fixup, O(k) | O(n) |
-| `max(other)` | byte-level max, O(n) | pre-pass on other's overflows, O(k) | O(n) |
-| `add(other)` | byte add when both < 255, O(n) | `get` + `+` when either = 255 | O(n) |
-| `diff(other)` | byte saturating_sub when self < 255, O(n) | `get` + `saturating_sub` when self = 255 | O(n) |
-| `count_bits(bits)` | iterate set bits via word scan | — | O(n_ones) |
-| `cmp_scalar` | inherited from IntSlice | — | O(n) + O(k) |
-
-**`min` algorithm:**
-
-Exploits 255 = +∞: `u8::min(a, 255) = a` and `u8::min(255, b) = b`. Only the case where both sides are ≥ 255 needs actual overflow values.
-
-```mermaid
-flowchart TD
-    A["min(self, other)"] --> B["snapshot self_ov: Vec&lt;(slot,val)&gt;\nsnapshot other_ov: HashMap&lt;slot,val&gt;"]
-    B --> C["clear_overflow()"]
-    C --> D["Pass 1 — byte min, SIMD-vectorizable\nprimary[s] = min(self[s], other[s])  ∀s"]
-    D --> E["Pass 2 — both-overflow fixup\nfor (slot, self_val) in self_ov"]
-    E --> F{"slot ∈ other_ov?"}
-    F -->|yes| G["set(slot, min(self_val, other_ov[slot]))"]
-    F -->|no| H["byte pass wrote other.primary &lt; 255\nclear_overflow removed stale entry\nno action"]
-    G --> I[done]
-    H --> I
-```
-
-Overflow entries where only self was overflow are correctly handled: after `clear_overflow` + byte pass, `self.primary[slot] = min(255, other.primary[slot]) = other.primary[slot]` (which is < 255). No overflow entry — correct.
-
-**`max` algorithm:**
-
-Exploits 255 = +∞: `u8::max(a, 255) = 255` → any slot where either side is overflow will have sentinel 255 in the primary after the byte pass. The byte pass cannot distinguish "self had overflow and other did not" from "self was just written to 255 by the byte pass".
-
-Solution: read and update self's original value at other's overflow slots *before* the byte pass overwrites them.
-
-```mermaid
-flowchart TD
-    A["max(self, other)"] --> B["Pre-pass O(k_other)\nfor (slot, other_val) in other.overflow_entries()"]
-    B --> C["self_val = self.get(slot)\nself.set(slot, max(self_val, other_val))"]
-    C --> D["Pass 1 — byte max, SIMD-vectorizable\nprimary[s] = max(self[s], other[s])  ∀s"]
-    D --> E["Overflow slots: max(255,255)=255\nprimary unchanged\noverflow entry from pre-pass preserved"]
-    E --> F[done]
-```
-
-After the pre-pass, self.primary[slot] = 255 for all slots in other's overflow. The byte pass leaves those 255s intact. Self's own overflow slots not in other's overflow are also 255 in primary — byte max(255, b < 255) = 255, unchanged. Correct in all cases.
-
-**`add` algorithm:**
-
-No sentinel property useful for add: any pair (sb, ob) with sb + ob ≥ 255 creates a new overflow entry, even when neither input was overflow. Cannot simplify via byte arithmetic.
-
-```
-for s in 0..n:
-  sb = self.primary[s]
-  ob = other.primary[s]
-  if sb < 255 AND ob < 255:      // hot path: no HashMap
-    sum = sb as u32 + ob as u32
-    if sum < 255: self.primary[s] = sum as u8   // direct byte write
-    else:         self.set(s, sum)               // creates overflow if needed
-  else:                           // at least one is overflow
-    self.set(s, self.get(s) + other.get(s))
-```
-
-```mermaid
-flowchart TD
-    A["add(self, other)"] --> B{"sb &lt; 255\nAND ob &lt; 255"}
-    B -->|"yes — hot path\nno HashMap"| C{"sb + ob &lt; 255"}
-    C -->|yes| D["primary[s] = sum as u8\nsingle byte write"]
-    C -->|no| E["set(s, sum)\ncreates overflow entry"]
-    B -->|"no — ≥1 side is overflow"| F["self_val = self.get(s)\nother_val = other.get(s)\nset(s, self_val + other_val)"]
-    D --> Z[next slot]
-    E --> Z
-    F --> Z
-```
-
-The `+` on `u32` values is exact (no `saturating_add`). Overflow at u32 level panics in debug — not a real risk for kmer counts. The hot path (both < 255, sum < 255) is a single byte write with no HashMap access.
-
-**`diff` (saturating sub) algorithm:**
-
-`saturating_sub(a, b) = a − min(a, b) = max(0, a − b)`. Key insight: if self's primary byte < 255, the result is always < 255 (result ≤ a), so no new overflow entries are created and no overflow lookup is needed for self. Only self's overflow slots (primary = 255) need `get()`.
-
-| sb | ob | result | get() needed |
-|----|----|--------|-------------|
-| < 255 | < 255 | `sb.saturating_sub(ob)` < 255 | none |
-| < 255 | 255 | 0 (b ≥ 255 > a) | none |
-| 255 | < 255 | `self.get(s) − ob` | self only |
-| 255 | 255 | `self.get(s) − other.get(s)` | both |
-
-```mermaid
-flowchart TD
-    A["diff(self, other)"] --> B{"sb &lt; 255\nself not overflow"}
-    B -->|"yes — hot path O(n)"| C{"ob &lt; 255"}
-    C -->|yes| D["primary[s] = sb.saturating_sub(ob)\nbyte write, no HashMap"]
-    C -->|"no: b ≥ 255 > a"| E["primary[s] = 0"]
-    B -->|"no — cold path O(k_self)"| F["self_val = self.get(s)"]
-    F --> G{"ob &lt; 255"}
-    G -->|yes| H["other_val = ob as u32"]
-    G -->|no| I["other_val = other.get(s)"]
-    H --> J["set(s, self_val.saturating_sub(other_val))"]
-    I --> J
-    D --> Z[next slot]
-    E --> Z
-    J --> Z
-```
-
-Overflow entries that drop below 255 (case sb=255, result < 255) are removed by `set()`. Overflow entries that remain ≥ 255 are updated. Correct in all four cases.
-
-**`count_bits` algorithm:**
-
-Increments self at each slot where the corresponding bit in `bits` is set. Iterates `bits.words()` and skips zero words entirely — O(n_ones) rather than O(n).
-
-```
-for (w_idx, word) in bits.words():
-  if word == 0: continue
-  base = w_idx * 64
-  while word != 0:
-    bit = trailing_zeros(word)
-    self.inc(base + bit)
-    word &= word − 1        // clear lowest set bit
-```
+**Builder `view()` vs reader `view()`:** `PersistentCompactIntVecBuilder` stores overflow as an unsorted `HashMap`, not raw bytes. Its `view()` returns an `IntSliceView` with `overflow_raw = &[]` and `n_overflow = 0`. This is intentional — the view is primarily useful after `freeze()`. During building, callers that need overflow use `overflow_entries()` directly.
 
 ---
 
@@ -360,142 +144,149 @@ for (w_idx, word) in bits.words():
 
 ```mermaid
 classDiagram
-    class MemoryBitVec {
-        -words: Vec~u64~
-        -n: usize
-        +iter() BitIter
-        +ones(n) Self
-        +persist(path) Builder
+    class BitSliceView {
+        +words: &[u64]
+        +n: usize
+        +get(slot) bool
+        +count_ones() u64
+        +iter() BitSliceIter
+        +jaccard_dist/hamming_dist(other: BitSliceView)
     }
-    class MemoryIntVec {
-        -primary: Vec~u8~
-        -overflow: HashMap~usize,u32~
-        -n: usize
-        +iter() MemoryIntIter
-        +filled(n, value) Self
-        +persist(path) Builder
+    class IntSliceView {
+        +primary: &[u8]
+        +overflow_raw: &[u8]
+        +n_overflow: usize
+        +n: usize
+        +get(slot) u32
+        +iter() IntSliceViewIter
+        +overflow_entries() Iterator
+        +bray_dist/euclidean_dist/…(other: IntSliceView)
     }
     class PersistentBitVec {
         -mmap: Mmap
         -n: usize
+        +view() BitSliceView
+        +get(slot) bool
+        +count_ones/zeros() u64
         +iter() BitIter
-        +count_ones() u64
+        +partial_jaccard_dist(&Self) (u64,u64)
+        +jaccard_dist/hamming_dist(&Self) …
     }
     class PersistentBitVecBuilder {
         -mmap: MmapMut
         -n: usize
-        +close()
-        +build_from(src, path)
-        +build_from_counts(src, t, path)
+        +view() BitSliceView
+        +set(slot, bool)
+        +or/and/xor/not(BitSliceView)
+        +copy_from(BitSliceView)
+        +close() / finish() → PersistentBitVec
     }
     class PersistentCompactIntVec {
         -mmap: Mmap
-        -n usize
-        -n_overflow usize
-        -step usize
+        -n: usize
+        -n_overflow: usize
+        -step: usize
         -index: Vec~(usize,usize)~
-        +iter() Iter
+        +view() IntSliceView
         +get(slot) u32
-        +sum() u64
+        +iter() Iter
+        +sum/count_nonzero() u64
+        +bray_dist/euclidean_dist/… (&Self)
     }
     class PersistentCompactIntVecBuilder {
         -mmap: MmapMut
         -n: usize
         -overflow: HashMap~usize,u32~
-        +set(slot, value)
-        +close()
-        +build_from(src, path)
+        +view() IntSliceView
+        +set(slot, u32) / get(slot) u32
+        +inc / inc_present / inc_present_fast
+        +inc_predicate / inc_predicate_fast
+        +add/min/max/diff/mask_with(…View)
+        +primary_bytes/primary_bytes_mut()
+        +close() / finish() → PersistentCompactIntVec
     }
 
-    MemoryBitVec ..|> BitSlice
-    MemoryBitVec ..|> BitSliceMut
-    PersistentBitVec ..|> BitSlice
-    PersistentBitVecBuilder ..|> BitSlice
-    PersistentBitVecBuilder ..|> BitSliceMut
-    MemoryIntVec ..|> IntSlice
-    MemoryIntVec ..|> IntSliceMut
-    PersistentCompactIntVec ..|> IntSlice
-    PersistentCompactIntVecBuilder ..|> IntSlice
-    PersistentCompactIntVecBuilder ..|> IntSliceMut
-
+    PersistentBitVec --> BitSliceView : view()
+    PersistentBitVecBuilder --> BitSliceView : view()
+    PersistentCompactIntVec --> IntSliceView : view()
+    PersistentCompactIntVecBuilder --> IntSliceView : view() (primary only)
     PersistentBitVecBuilder --> PersistentBitVec : close() then open()
     PersistentCompactIntVecBuilder --> PersistentCompactIntVec : close() then open()
 ```
 
-### Memory types
+### `PersistentBitVec` / `PersistentBitVecBuilder`
 
-**`MemoryBitVec`**
+`PersistentBitVec` is the read-only type. `view()` returns a `BitSliceView<'_>` over the mmap word array. Direct inherent methods delegate to the view: `count_ones()`, `count_zeros()`, `partial_jaccard_dist(&Self)`, `jaccard_dist(&Self)`, `hamming_dist(&Self)`.
 
-```rust
-struct MemoryBitVec { words: Vec<u64>, n: usize }
-```
-
-Implements `BitSlice` + `BitSliceMut`. Owns its word array. Used as the result type of `cmp_scalar` / filter operations and as an intermediate for bit-level computations.
-
-Std ops: `BitAnd`, `BitOr`, `BitXor`, `Not` (owned and borrowed), `BitAndAssign`, `BitOrAssign`, `BitXorAssign` — all delegate to `BitSliceMut` methods.
-
-`iter()` returns a `BitIter<'_>` (word-level, see below).
-
-**`MemoryIntVec`**
-
-```rust
-struct MemoryIntVec {
-    primary:  Vec<u8>,
-    overflow: HashMap<usize, u32>,
-    n:        usize,
-}
-```
-
-Implements `IntSlice` + `IntSliceMut`. Overrides: `iter()` → inherent `iter()` (merge-scan), `sum()`, `count_nonzero()`.
-
-`IntSlice` required impls: `primary_bytes()` → `&self.primary`; `overflow_entries()` → `self.overflow.iter().map(...)`.
-
-`IntSliceMut` required impls: `set()` writes to `self.primary[slot]` and inserts/removes from `self.overflow`; `primary_bytes_mut()` → `&mut self.primary`; `clear_overflow()` → `self.overflow.clear()`.
-
-Std ops: `Add<&B>`, `Sub<&B>` (owned and borrowed), `AddAssign<&B>`, `SubAssign<&B>` — delegate to `IntSliceMut::add` / `diff`.
-
-`From<&S: IntSlice>`: copies primary bytes + overflow entries. O(n) + O(k).
-
----
-
-### Persistent types
-
-**`PersistentBitVec` / `PersistentBitVecBuilder`**
-
-See `persistent_bit_vec.md`. `PersistentBitVec` is read-only (implements `BitSlice`). `PersistentBitVecBuilder` is read-write (implements `BitSlice` + `BitSliceMut`).
-
-`BitIter<'a>` — shared iterator type for both `MemoryBitVec` and `PersistentBitVec`:
+`BitIter<'a>` — exported iterator for `PersistentBitVec::iter()`:
 
 ```rust
 pub struct BitIter<'a> { pub(crate) words: &'a [u64], pub(crate) slot: usize, pub(crate) n: usize }
 ```
 
-Word-level scan: `(words[slot >> 6] >> (slot & 63)) & 1 != 0`. One word serves 64 iterations. `pub type MemoryBitIter<'a> = BitIter<'a>` preserves the public API name.
+`PersistentBitVecBuilder` is the read-write type. Mutation operations accept `BitSliceView<'_>`:
 
-**`PersistentCompactIntVec` / `PersistentCompactIntVecBuilder`**
+| Method | Cost |
+|---|---|
+| `set(slot, bool)` | O(1) |
+| `view() -> BitSliceView<'_>` | O(1) |
+| `or/and/xor(BitSliceView)` | word-level, O(n/64), SIMD-friendly |
+| `not()` | `w ^= u64::MAX` per word, re-masks last word, O(n/64) |
+| `copy_from(BitSliceView)` | `copy_from_slice`, O(n/64) |
 
-See `persistent_compact_int_vec.md` for file format and lifecycle.
+### `PersistentCompactIntVec` / `PersistentCompactIntVecBuilder`
 
-`PersistentCompactIntVec` implements `IntSlice`. Overrides: `iter()` → inherent merge-scan `Iter`; `sum()`; `count_nonzero()`. `overflow_entries()` returns a sequential scan `(0..n_overflow).map(|i| (data_slot(i), data_value(i)))` — no binary search since entries are stored sorted.
+`PersistentCompactIntVec` is the read-only type. `view()` returns an `IntSliceView<'_>` over the mmap primary and overflow arrays. Inherent `iter()` is a merge scan (`Iter` struct). Inherent `sum()` and `count_nonzero()` use fast byte-scan helpers.
 
-`PersistentCompactIntVecBuilder` implements `IntSlice` + `IntSliceMut`. `iter()` is NOT overridden (default `get`-per-slot) because the overflow `HashMap` is unsorted. `sum()` and `count_nonzero()` are overridden using `byte_sum` / `byte_count_nonzero` on the mmap primary slice — avoids per-slot overhead.
+`PersistentCompactIntVecBuilder` is the read-write type. Mutation methods on the builder fall into two categories:
 
-**Override rationale:** the default `iter()`, `sum()`, `count_nonzero()` on `IntSlice` call `self.get(s)` per slot, which is O(log k) binary search for `PersistentCompactIntVec`. Overrides provide O(n + k) merge-scan or O(n) byte scan instead.
+**Point mutations:**
 
----
+| Method | Note |
+|---|---|
+| `set(slot, u32)` | writes primary[slot] or 255+overflow |
+| `get(slot) -> u32` | reads primary byte or HashMap |
+| `inc(slot)` | `get` + `set`, O(1) |
 
-### IntSlice implementors — override summary
+**Bulk computation methods** — accept view arguments:
 
-| Type | `iter()` | `sum()` | `count_nonzero()` |
-|------|----------|---------|-------------------|
-| `MemoryIntVec` | inherent merge-scan ✓ | `byte_sum` ✓ | `byte_count_nonzero` ✓ |
-| `PersistentCompactIntVecBuilder` | default (get-per-slot) | `byte_sum` on mmap ✓ | `byte_count_nonzero` on mmap ✓ |
-| `PersistentCompactIntVec` | inherent merge-scan Iter ✓ | inherent `sum()` ✓ | inherent `count_nonzero()` ✓ |
-| `TempCompactIntVec` | delegates to inner `PersistentCompactIntVec` | delegates | delegates |
-| `TempCompactIntVecBuilder` | default (get-per-slot) | delegates to builder | delegates to builder |
-| `PackedIntCol<'a>` | inherent PackedIntColIter ✓ | byte_sum ✓ | byte_count_nonzero ✓ |
+| Method | Semantics | Overflow |
+|---|---|---|
+| `inc_present(BitSliceView)` | `+= 1` at each 1-bit | via `inc`, safe for any group size |
+| `inc_present_fast(BitSliceView)` | same, raw u8 `+= 1` | `debug_assert` no 255 reached |
+| `inc_predicate(IntSliceView, pred)` | `+= 1` where `pred(col[s])` | two-pass, safe |
+| `inc_predicate_fast(IntSliceView, pred)` | same, raw u8 | `debug_assert` no 255 reached |
+| `add(IntSliceView)` | `self[s] += other[s]` | primary fast path + overflow fallback |
+| `min(IntSliceView)` | byte min + both-overflow fixup | see algorithm below |
+| `max(IntSliceView)` | pre-pass + byte max | see algorithm below |
+| `diff(IntSliceView)` | saturating sub | self<255 hot path |
+| `mask_with(BitSliceView)` | zeros slots where mask bit = 0 | O(n_zeros) |
 
-`PackedIntCol` is used internally by `PersistentCompactIntMatrix` (packed format) for column views.
+**`inc_present_fast` / `inc_predicate_fast` invariant:** caller guarantees no counter reaches 255 during the operation (group size < 255 for `inc_present_fast`, or chunk size < 255 for `inc_predicate_fast`). Violation is caught by `debug_assert` in dev builds.
+
+**`min` algorithm:**
+
+Exploits 255 = +∞: byte-level min is correct unless both sides are overflow.
+
+```
+snapshot self_ov: Vec<(slot,val)>
+snapshot other_ov: HashMap<slot,val>
+clear_overflow()
+Pass 1 — byte min, SIMD-vectorizable, O(n)
+Pass 2 — both-overflow fixup, O(k_self):
+  for (slot, self_val) in self_ov:
+    if slot ∈ other_ov: set(slot, min(self_val, other_ov[slot]))
+```
+
+**`max` algorithm:**
+
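+Cannot do byte max first — wherever other is overflow, `max(a<255, 255) = 255` overwrites self's primary byte with the sentinel, so self's original value can no longer be read. The pre-pass reads self's value at other's overflow slots and stores the true maximum before the byte pass, which then leaves those 255s intact.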
+
+```
+Pre-pass O(k_other): for (slot, other_val) in other.overflow_entries():
+  set(slot, max(self.get(slot), other_val))
+Pass 1 — byte max, SIMD-vectorizable, O(n)
+```
 
 ---
 
@@ -505,30 +296,22 @@ Four matrix types, two encodings × two formats:
 
 | | Columnar format | Packed format |
 |---|---|---|
-| **Bit** | `PersistentBitMatrix` | — |
-| **Int** | `PersistentCompactIntMatrix` (columnar) | `PersistentCompactIntMatrix` (packed) |
+| **Bit** | `PersistentBitMatrix` (Columnar variant) | `PersistentBitMatrix` (Packed variant) |
+| **Int** | `PersistentCompactIntMatrix` (Columnar variant) | `PersistentCompactIntMatrix` (Packed variant) |
 
-`PersistentCompactIntMatrix` is an enum behind a transparent API — the caller does not see whether the on-disk format is columnar (one `.pciv` per column) or packed (one `.pcmx` file interleaving all columns). `col(c)` and `col_slice(c)` return column views that implement `IntSlice`.
+Both matrix types are enums (`Columnar` / `Packed`, plus `Implicit` for the bit matrix) behind a transparent API. `col_view(c)` returns the appropriate view directly:
 
-`pack_compact_int_matrix` and `pack_bit_matrix` convert a columnar matrix to packed format.
+```rust
+// PersistentBitMatrix
+pub fn col_view(&self, c: usize) -> BitSliceView<'_>
 
-For details see `persistent_compact_int_vec.md` and `persistent_bit_vec.md`.
+// PersistentCompactIntMatrix
+pub fn col_view(&self, c: usize) -> IntSliceView<'_>
+```
 
----
+No wrapper enums (`BitColView`, `IntColView`): the caller receives a `Copy` view struct immediately usable with any view method or bulk builder method.
 
-## Conversion traits
-
-Four blanket-impl traits on top of `BitSlice` / `IntSlice`:
-
-**`IntToBit: IntSlice`**
-- `to_bitvec(threshold: u32) -> MemoryBitVec` — bit set iff value ≥ threshold (delegates to `geq`)
-- `to_presence() -> MemoryBitVec` — bit set iff value ≥ 1 (delegates to `geq(1)`)
-
-**`BitToInt: BitSlice`**
-- `to_intvec() -> MemoryIntVec` — expands each bit to a `u8` (0 or 1) in a new primary array
-- Uses a `static EXPAND_BYTE: [[u8; 8]; 256]` lookup table — 8 bits expanded per byte, word-level outer loop
-
-Both `IntToBit` and `BitToInt` are implemented for all `T: IntSlice` / `T: BitSlice` via blanket impls.
+`pack_compact_int_matrix` and `pack_bit_matrix` convert columnar → packed format.
 
 ---
 
@@ -549,37 +332,37 @@ trait ColumnWeights: Send + Sync {
 
 Abstract required methods: `partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`, `partial_relfreq_bray`, `partial_relfreq_euclidean`, `partial_hellinger`.
 
-**Additivity rule:** self-contained partials (`partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`) can be element-wise summed across all `(partition, layer)` pairs before applying the finalisation. Normalised partials (`partial_relfreq_*`, `partial_hellinger`) require the **global** `col_weights` (accumulated across all layers and all partitions) as parameter — not per-layer or per-partition weights.
+**Additivity rule:** self-contained partials (`partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`) can be element-wise summed across all `(partition, layer)` pairs. Normalised partials (`partial_relfreq_*`, `partial_hellinger`) require the **global** `col_weights` (accumulated across all layers and all partitions) as parameter.
 
-**`partial_threshold_jaccard` returns `(inter, union)`**, not a single matrix, because `union[i,j]` depends on both columns simultaneously and cannot be reconstructed from per-column statistics.
+**`partial_threshold_jaccard` returns `(inter, union)`** because `union[i,j]` depends on both columns simultaneously.
 
-Provided finalisations (default implementations):
+Provided finalisations:
 
 | Finalisation | Formula |
 |---|---|
 | `bray_dist_matrix()` | `1 − 2·partial_bray[i,j] / (w[i] + w[j])` |
 | `euclidean_dist_matrix()` | `√partial_euclidean[i,j]` |
 | `threshold_jaccard_dist_matrix(t)` | `1 − inter[i,j] / union[i,j]` |
-| `relfreq_bray_dist_matrix()` | `1 − partial_relfreq_bray[i,j]` (two-pass: col_weights then partial) |
+| `relfreq_bray_dist_matrix()` | `1 − partial_relfreq_bray[i,j]` |
 | `relfreq_euclidean_dist_matrix()` | `√partial_relfreq_euclidean[i,j]` |
 | `hellinger_dist_matrix()` | `√partial_hellinger[i,j] / √2` |
 | `hellinger_euclidean_dist_matrix()` | `√partial_hellinger[i,j]` |
 
 ### BitPartials
 
-Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)` (inter, union), `partial_hamming() -> Array2<u64>`. Both additive across layers and partitions.
+Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)`, `partial_hamming() -> Array2<u64>`. Both additive across layers and partitions.
 
 ---
 
 ## Temp-file-backed types
 
-`MemoryBitVec` and `MemoryIntVec` are reserved for truly transient intra-method intermediates (e.g. a single `cmp_scalar` result that lives for one loop iteration). **All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory.
+**All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory.
 
 ### Lifecycle
 
 ```
 TempCompactIntVecBuilder::new(n)   →  writable mmap in TempDir
-     ↓  (set / add / count_bits / mask_with / …)
+     ↓  (inc_present_fast / inc_predicate_fast / add / mask_with / …)
  .freeze()                          →  TempCompactIntVec  (read-only mmap + TempDir)
      ↓  (optional)
  .make_persistent(path)             →  PersistentCompactIntVec  (permanent file)
@@ -587,7 +370,7 @@ TempCompactIntVecBuilder::new(n)   →  writable mmap in TempDir
 
 Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`.
 
-**Drop order**: in `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }`, Rust drops fields in declaration order — `vec` (mmap) is released before `_temp` (directory) is deleted. No explicit `drop()` needed.
+**Drop order**: in `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }`, Rust drops fields in declaration order, so `vec` (mmap) is released before `_temp` (directory) is deleted. No explicit `drop()` needed.
 
 ### TempCompactIntVec / TempCompactIntVecBuilder
 
@@ -603,9 +386,9 @@ pub(crate) struct TempCompactIntVecBuilder {
 }
 ```
 
-`TempCompactIntVec` implements `IntSlice` (full delegation to inner `PersistentCompactIntVec`).  
-`TempCompactIntVecBuilder` implements `IntSlice` + `IntSliceMut` (delegation to inner builder).  
-`make_persistent(path)` copies the temp file to `path` and opens it as `PersistentCompactIntVec`.
+`TempCompactIntVec`: read access via `get(slot)`, `sum()`, `iter()`, `view() -> IntSliceView<'_>`.
+
+`TempCompactIntVecBuilder`: full delegation to inner `PersistentCompactIntVecBuilder` — all bulk computation methods (`inc_present_fast`, `inc_predicate_fast`, `add`, `min`, `max`, `diff`, `mask_with`) are exposed as `pub(crate)`.
 
 ### TempBitVec / TempBitVecBuilder
 
@@ -621,9 +404,26 @@ pub(crate) struct TempBitVecBuilder {
 }
 ```
 
-`TempBitVec` implements `BitSlice`.  
-`TempBitVecBuilder` implements `BitSlice` + `BitSliceMut`.  
-`make_persistent(path)` copies the temp file and opens as `PersistentBitVec`.
+`TempBitVec`: read access via `get(slot)`, `count_ones()`, `view() -> BitSliceView<'_>`, `iter()`.
+
+`TempBitVecBuilder`: exposes `set(slot, bool)`, `or(BitSliceView)`, and:
+
+```rust
+pub(crate) fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool)
+```
+
+`or_where` — two passes, no intermediate allocation:
+
+```
+Pass 1 — primary bytes, O(n):
+  for slot in 0..n:
+    b = col.primary_bytes()[slot]
+    if b < 255 AND pred(b as u32): self.set(slot, true)
+
+Pass 2 — overflow, O(k):
+  for (slot, val) in col.overflow_entries():
+    if pred(val): self.set(slot, true)
+```
 
 ---
 
@@ -635,18 +435,16 @@ pub(crate) struct TempBitVecBuilder {
 pub struct ColGroup { pub name: String, pub indices: Vec<usize> }
 ```
 
-Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions because column structure is identical across the entire hierarchy (same samples/genomes everywhere; only rows = kmer slots are partitioned).
-
-`ColGroup` is passed by reference unchanged to any matrix — no index translation.
+Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions — column structure is identical across the entire hierarchy; only rows (kmer slots) are partitioned.
 
 ### Composition axis
 
-- **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges).
-- **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.).
+- **Across partitions**: kmer space is partitioned → partial results **concatenated** (disjoint kmer ranges).
+- **Across layers**: same kmer space, different counts → partial results **aggregated** (add, OR, etc.).
 
 ### MatrixGroupOps
 
-Group operations live on the matrix and expose only **additive intermediates** backed by temp files. Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
+Group operations expose only **additive intermediates** backed by temp files. Final predicates are applied at the index level after accumulation.
 
 ```rust
 pub trait MatrixGroupOps {
@@ -661,48 +459,59 @@ pub trait MatrixGroupOps {
 }
 ```
 
-Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`. For bit matrices, `partial_group_sum` delegates to `partial_group_presence_count(g, 1)` since values are 0/1.
+Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`. For bit matrices, `partial_group_sum` delegates to `partial_group_presence_count(g, 1)`.
 
 **`partial_group_presence_count` — chunking for large groups:**
 
-When `g.indices.len() < 255`, per-slot counts fit in a raw `u8` — fast path: accumulate directly into `primary_bytes_mut()` using `inc_primary_bits`, then `freeze()`. No overflow map needed.
+When `g.indices.len() < 255`: per-slot counts stay within `u8` range. Use `inc_present_fast` (bit matrix) or `inc_predicate_fast(col_view(c), |v| v >= threshold)` (int matrix) — raw u8 increment, no overflow map written.
 
-When `g.indices.len() ≥ 255`, process in chunks of 254 columns — each chunk stays within `u8` range — then add chunks into a running `TempCompactIntVecBuilder` accumulator via `IntSliceMut::add`. This keeps peak memory proportional to one partition, not the number of columns × partitions.
+When `g.indices.len() ≥ 255`: process in chunks of 254 columns (each chunk stays within u8 range), accumulate into a running builder via `.add(chunk_frozen.view())`.
 
 ```
 fast path (< 255 cols):
   builder = TempCompactIntVecBuilder::new(n)
   for c in group:
-    mask = col_view(c).cmp_scalar(|v| v >= threshold)  // MemoryBitVec
-    inc_primary_bits(primary_bytes_mut, mask)           // u8 safe
+    builder.inc_predicate_fast(matrix.col_view(c), |v| v >= threshold)
   builder.freeze()
 
 slow path (≥ 255 cols):
   result = TempCompactIntVecBuilder::new(n)
   for chunk in group.chunks(254):
-    chunk_builder = TempCompactIntVecBuilder::new(n)
-    inc_primary_bits(chunk_builder, …)
-    chunk_frozen = chunk_builder.freeze()
-    IntSliceMut::add(&mut result, &chunk_frozen)
+    chunk_b = TempCompactIntVecBuilder::new(n)
+    for c in chunk:
+      chunk_b.inc_predicate_fast(matrix.col_view(c), |v| v >= threshold)
+    frozen = chunk_b.freeze()
+    result.add(frozen.view())
   result.freeze()
 ```
 
-Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — composed at the index level:
+**`partial_group_any`** uses `or_where` on `TempBitVecBuilder`:
 
 ```
+result = TempBitVecBuilder::new(n)
+for c in group:
+  result.or_where(matrix.col_view(c), |v| v >= threshold)
+result.freeze()
+```
+
+**Non-additive predicates** (`group_all`, `group_at_least(k)`) are composed at the index level:
+
+```rust
 // "present in >= 2 ingroup columns with count >= 3, absent from all outgroup"
 let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)?).add_all()?;
-let in_mask  = presence.geq(2);
+let in_mask  = presence.view().geq(2);   // IntSliceView method
 
 let out_sum  = layers.map(|l| l.partial_group_sum(&outgroup)?).add_all()?;
-let out_mask = out_sum.leq(0);
+let out_mask = out_sum.view().leq(0);
 
-let mask = in_mask & &out_mask;    // BitSliceMut::and — O(n/64)
+let mut mask_b = TempBitVecBuilder::new(n)?;
+mask_b.copy_from(in_mask);
+mask_b.and(out_mask);
 ```
 
-### mask_with (IntSliceMut)
+### mask_with
 
-Provided method on `IntSliceMut`. Zeros every slot where the corresponding mask bit is 0. Iterates only zero bits — O(n_zeros), O(1) when mask is all-ones.
+Direct method on `PersistentCompactIntVecBuilder` (and delegation via `TempCompactIntVecBuilder`). Zeros every slot where the corresponding mask bit is 0. Iterates only zero bits — O(n_zeros), O(1) when mask is all-ones.
 
 ```
 for (w_idx, word) in mask.words():
@@ -711,7 +520,7 @@ for (w_idx, word) in mask.words():
   while zeros != 0:
     bit = trailing_zeros(zeros)
     s = w_idx * 64 + bit
-    if primary[s] != 0: self.set(s, 0)   // clears overflow entry too
+    if primary[s] != 0: set(s, 0)   // clears overflow entry too
     zeros &= zeros − 1
 ```
 

From 4c4524766c63db809a029f1365c59fff4cbd9c36 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 18 Jun 2026 07:34:29 +0200
Subject: [PATCH 17/24] feat(matrix): add partial group reductions and column
 persistence

Expands MatrixGroupOps with per-slot partial_group_min/max reductions (AND/OR for bit matrices) and default partial_group_all/none methods derived from partial_group_presence_count, and introduces add_col_from methods to persist temp-file vectors as matrix columns. Refactors column aggregation in the partitioner to use these group operations directly, replacing row-by-row processing and per-column builders with matrix builders whose close() writes meta.json (no separate metadata write step).
---
 docmd/implementation/obicompactvec.md    |  71 +++++-----
 src/obicompactvec/src/bitmatrix.rs       |  40 ++++++
 src/obicompactvec/src/colgroup.rs        |  45 +++++-
 src/obicompactvec/src/intmatrix.rs       |  32 +++++
 src/obikpartitionner/src/select_layer.rs | 170 +++++++++--------------
 5 files changed, 206 insertions(+), 152 deletions(-)

diff --git a/docmd/implementation/obicompactvec.md b/docmd/implementation/obicompactvec.md
index e3c8568..301b021 100644
--- a/docmd/implementation/obicompactvec.md
+++ b/docmd/implementation/obicompactvec.md
@@ -444,71 +444,64 @@ Defined **once at the index level** from column metadata. Valid in all matrices
 
 ### MatrixGroupOps
 
-Group operations expose only **additive intermediates** backed by temp files. Final predicates are applied at the index level after accumulation.
+Five required primitives + two default methods derived from them. All return temp-file-backed types.
 
 ```rust
 pub trait MatrixGroupOps {
+    // required
     fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32)
         -> io::Result<TempCompactIntVec>;
-
     fn partial_group_sum(&self, g: &ColGroup)
         -> io::Result<TempCompactIntVec>;
-
     fn partial_group_any(&self, g: &ColGroup, threshold: u32)
         -> io::Result<TempBitVec>;
+    fn partial_group_min(&self, g: &ColGroup)
+        -> io::Result<TempCompactIntVec>;
+    fn partial_group_max(&self, g: &ColGroup)
+        -> io::Result<TempCompactIntVec>;
+
+    // defaults derived from partial_group_presence_count
+    fn partial_group_all(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempBitVec>;   // slot=1 iff count == g.indices.len()
+    fn partial_group_none(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempBitVec>;   // slot=1 iff count == 0
 }
 ```
 
-Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`. For bit matrices, `partial_group_sum` delegates to `partial_group_presence_count(g, 1)`.
+Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
+
+For **bit matrices**: values are 0/1, so `partial_group_sum` = `partial_group_presence_count(g, 1)`; `partial_group_min` is AND (set first column then mask-with remaining); `partial_group_max` is OR via `partial_group_any` + `inc_present`.
 
 **`partial_group_presence_count` — chunking for large groups:**
 
-When `g.indices.len() < 255`: per-slot counts stay within `u8` range. Use `inc_present_fast` (bit matrix) or `inc_predicate_fast(col_view(c), |v| v >= threshold)` (int matrix) — raw u8 increment, no overflow map written.
+When `g.indices.len() < 255`: per-slot counts stay within `u8` range. Use `inc_present_fast` (bit) or `inc_predicate_fast(col_view(c), |v| v >= threshold)` (int) — raw u8 increment, no overflow entry written.
 
-When `g.indices.len() ≥ 255`: process in chunks of 254 columns (each chunk stays within u8 range), accumulate into a running builder via `.add(chunk_frozen.view())`.
+When `g.indices.len() ≥ 255`: process in chunks of 254 columns, accumulate via `.add(chunk_frozen.view())`.
 
-```
-fast path (< 255 cols):
-  builder = TempCompactIntVecBuilder::new(n)
-  for c in group:
-    builder.inc_predicate_fast(matrix.col_view(c), |v| v >= threshold)
-  builder.freeze()
+**`partial_group_min` (int matrix)**: copy first column via `.add(col_view(first))` (start from 0 ⇒ copy), then `.min(col_view(c))` for remaining.
 
-slow path (≥ 255 cols):
-  result = TempCompactIntVecBuilder::new(n)
-  for chunk in group.chunks(254):
-    chunk_b = TempCompactIntVecBuilder::new(n)
-    for c in chunk:
-      chunk_b.inc_predicate_fast(matrix.col_view(c), |v| v >= threshold)
-    frozen = chunk_b.freeze()
-    result.add(frozen.view())
-  result.freeze()
-```
+**`partial_group_max` (int matrix)**: `.max(col_view(c))` for all columns (start from 0 ⇒ first column acts as copy).
 
-**`partial_group_any`** uses `or_where` on `TempBitVecBuilder`:
+**`partial_group_any`** uses `or_where` on `TempBitVecBuilder` (two-pass: primary bytes then overflow entries).
 
-```
-result = TempBitVecBuilder::new(n)
-for c in group:
-  result.or_where(matrix.col_view(c), |v| v >= threshold)
-result.freeze()
-```
+**`partial_group_all` / `partial_group_none`** (default): call `partial_group_presence_count`, then iterate slots to produce the bit result. O(n) extra pass, not chunked.
 
-**Non-additive predicates** (`group_all`, `group_at_least(k)`) are composed at the index level:
+### add_col_from — matrix builder integration
+
+Both matrix builders accept temp-file results directly:
 
 ```rust
-// "present in >= 2 ingroup columns with count >= 3, absent from all outgroup"
-let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)?).add_all()?;
-let in_mask  = presence.view().geq(2);   // IntSliceView method
+// PersistentBitMatrixBuilder
+fn add_col_from(&mut self, src: &TempBitVec)         -> io::Result<()>
+fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()>  // nonzero → 1
 
-let out_sum  = layers.map(|l| l.partial_group_sum(&outgroup)?).add_all()?;
-let out_mask = out_sum.view().leq(0);
-
-let mut mask_b = TempBitVecBuilder::new(n)?;
-mask_b.copy_from(in_mask);
-mask_b.and(out_mask);
+// PersistentCompactIntMatrixBuilder
+fn add_col_from(&mut self, src: &TempCompactIntVec)  -> io::Result<()>
+fn add_col_from_bit(&mut self, src: &TempBitVec)     -> io::Result<()>  // bit → 0/1 u32
 ```
 
+`add_col_from` copies the temp file to the matrix directory and increments `n_cols`; `close()` writes `meta.json` with the final column count. No separate `write_meta` step needed.
+
 ### mask_with
 
 Direct method on `PersistentCompactIntVecBuilder` (and delegation via `TempCompactIntVecBuilder`). Zeros every slot where the corresponding mask bit is 0. Iterates only zero bits — O(n_zeros), O(1) when mask is all-ones.
diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index 2717174..55a4561 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -402,6 +402,26 @@ impl PersistentBitMatrixBuilder {
         PersistentBitVecBuilder::new(self.n, &path)
     }
 
+    pub fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> {
+        src.make_persistent(&col_path(&self.dir, self.n_cols))?;
+        self.n_cols += 1;
+        Ok(())
+    }
+
+    pub fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
+        let path = col_path(&self.dir, self.n_cols);
+        self.n_cols += 1;
+        let mut b = PersistentBitVecBuilder::new(self.n, &path)?;
+        let view = src.view();
+        for slot in 0..self.n {
+            if view.primary_bytes()[slot] > 0 { b.set(slot, true); }
+        }
+        for (slot, _) in view.overflow_entries() {
+            b.set(slot, true);
+        }
+        b.close()
+    }
+
     pub fn close(self) -> io::Result<()> {
         MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
     }
@@ -446,6 +466,26 @@ impl MatrixGroupOps for PersistentBitMatrix {
         }
         result.freeze()
     }
+
+    fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        // min of 0/1 values = AND: 1 only if ALL columns are 1
+        let n = self.n();
+        let mut result = TempCompactIntVecBuilder::new(n)?;
+        if let Some((&first, rest)) = g.indices.split_first() {
+            result.inc_present_fast(self.col_view(first));
+            for &c in rest { result.mask_with(self.col_view(c)); }
+        }
+        result.freeze()
+    }
+
+    fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        // max of 0/1 values = OR: 1 if any column is 1
+        let any = self.partial_group_any(g, 1)?;
+        let n = any.len();
+        let mut result = TempCompactIntVecBuilder::new(n)?;
+        result.inc_present(any.view());
+        result.freeze()
+    }
 }
 
 // ── Shared matrix helpers (also used by intmatrix.rs) ─────────────────────────
diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs
index c238a62..fa70830 100644
--- a/src/obicompactvec/src/colgroup.rs
+++ b/src/obicompactvec/src/colgroup.rs
@@ -1,6 +1,6 @@
 use std::io;
 
-use crate::tempbitvec::TempBitVec;
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
 use crate::tempintvec::TempCompactIntVec;
 
 // ── ColGroup ──────────────────────────────────────────────────────────────────
@@ -23,12 +23,14 @@ impl ColGroup {
 
 // ── MatrixGroupOps ────────────────────────────────────────────────────────────
 
-/// Per-matrix group aggregations that return **additive intermediates**.
+/// Per-matrix group aggregations.
 ///
-/// Results must be composed by the caller (concat across partitions, add across
-/// layers) before applying final predicates (`geq`, `leq`, …).  Non-additive
-/// predicates like `group_all` or `group_at_least(k)` are intentionally absent
-/// — they are derived at the index level from these intermediates.
+/// `partial_group_presence_count`, `partial_group_sum`, `partial_group_any`,
+/// `partial_group_min`, `partial_group_max` are the primitives; each impl must
+/// provide all five.
+///
+/// `partial_group_all` and `partial_group_none` have default implementations
+/// derived from `partial_group_presence_count` and should rarely need overriding.
 pub trait MatrixGroupOps {
     /// Per-slot count of group columns whose value ≥ `threshold`.
     fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
@@ -36,6 +38,35 @@ pub trait MatrixGroupOps {
     /// Per-slot sum of values across all group columns.
     fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
 
-    /// Per-slot OR: true if any group column has value ≥ `threshold`.
+    /// Per-slot OR: 1 if any group column has value ≥ `threshold`.
     fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
+
+    /// Per-slot min value across all group columns (0 if group is empty).
+    fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
+
+    /// Per-slot max value across all group columns (0 if group is empty).
+    fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
+
+    /// Per-slot AND: 1 if ALL group columns have value ≥ `threshold`.
+    /// An empty group yields all zeros rather than matching vacuously.
+    fn partial_group_all(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
+        let counts = self.partial_group_presence_count(g, threshold)?;
+        let n_required = g.indices.len() as u32;
+        let mut b = TempBitVecBuilder::new(counts.len())?;
+        for slot in 0..counts.len() {
+            if n_required > 0 && counts.get(slot) >= n_required { b.set(slot, true); }
+        }
+        b.freeze()
+    }
+
+    /// Per-slot NOR: 1 if NO group column has value ≥ `threshold`.
+    fn partial_group_none(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
+        let counts = self.partial_group_presence_count(g, threshold)?;
+        let n = counts.len();
+        let mut b = TempBitVecBuilder::new(n)?;
+        for slot in 0..n {
+            if counts.get(slot) == 0 { b.set(slot, true); }
+        }
+        b.freeze()
+    }
 }
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index f3486d6..b2fa97e 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -386,6 +386,21 @@ impl PersistentCompactIntMatrixBuilder {
         self.n_cols += 1;
         PersistentCompactIntVecBuilder::new(self.n, &path)
     }
+
+    pub fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
+        src.make_persistent(&col_path(&self.dir, self.n_cols))?;
+        self.n_cols += 1;
+        Ok(())
+    }
+
+    pub fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> {
+        let path = col_path(&self.dir, self.n_cols);
+        self.n_cols += 1;
+        let mut b = PersistentCompactIntVecBuilder::new(self.n, &path)?;
+        b.inc_present(src.view());
+        b.close()
+    }
+
     pub fn close(self) -> io::Result<()> {
         MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
     }
@@ -431,4 +446,21 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
         }
         result.freeze()
     }
+
+    fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        let n = self.n();
+        let mut result = TempCompactIntVecBuilder::new(n)?;
+        if let Some((&first, rest)) = g.indices.split_first() {
+            result.add(self.col_view(first));
+            for &c in rest { result.min(self.col_view(c)); }
+        }
+        result.freeze()
+    }
+
+    fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        let n = self.n();
+        let mut result = TempCompactIntVecBuilder::new(n)?;
+        for &c in &g.indices { result.max(self.col_view(c)); }
+        result.freeze()
+    }
 }
diff --git a/src/obikpartitionner/src/select_layer.rs b/src/obikpartitionner/src/select_layer.rs
index 36286c0..c7f45e4 100644
--- a/src/obikpartitionner/src/select_layer.rs
+++ b/src/obikpartitionner/src/select_layer.rs
@@ -3,8 +3,9 @@ use std::io;
 use std::path::{Path, PathBuf};
 
 use obicompactvec::{
-    PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
-    PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
+    ColGroup, MatrixGroupOps,
+    PersistentBitMatrix, PersistentBitMatrixBuilder,
+    PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
 };
 use obilayeredmap::meta::PartitionMeta;
 use obilayeredmap::OLMError;
@@ -40,52 +41,6 @@ pub struct OutputCol {
     pub op:      AggOp,
 }
 
-// ── Aggregation ───────────────────────────────────────────────────────────────
-
-#[inline]
-fn aggregate(op: AggOp, indices: &[usize], src_row: &[u32], threshold: u32) -> u32 {
-    match op {
-        AggOp::Any => {
-            if indices.iter().any(|&i| src_row[i] > threshold) { 1 } else { 0 }
-        }
-        AggOp::All => {
-            if indices.is_empty() { return 0; }
-            if indices.iter().all(|&i| src_row[i] > threshold) { 1 } else { 0 }
-        }
-        AggOp::None => {
-            if indices.iter().all(|&i| src_row[i] <= threshold) { 1 } else { 0 }
-        }
-        AggOp::Sum => {
-            indices.iter().map(|&i| src_row[i]).fold(0u32, |a, b| a.saturating_add(b))
-        }
-        AggOp::Min => indices.iter().map(|&i| src_row[i]).min().unwrap_or(0),
-        AggOp::Max => indices.iter().map(|&i| src_row[i]).max().unwrap_or(0),
-    }
-}
-
-// ── ColBuilder ────────────────────────────────────────────────────────────────
-
-enum ColBuilder {
-    Bit(PersistentBitVecBuilder),
-    Int(PersistentCompactIntVecBuilder),
-}
-
-impl ColBuilder {
-    fn set_val(&mut self, slot: usize, value: u32) {
-        match self {
-            ColBuilder::Bit(b) => b.set(slot, value > 0),
-            ColBuilder::Int(b) => b.set(slot, value),
-        }
-    }
-
-    fn close(self) -> SKResult<()> {
-        match self {
-            ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
-            ColBuilder::Int(b) => b.close().map_err(SKError::Io),
-        }
-    }
-}
-
 // ── Helpers ───────────────────────────────────────────────────────────────────
 
 fn olm_to_sk(e: OLMError) -> SKError {
@@ -95,21 +50,6 @@ fn olm_to_sk(e: OLMError) -> SKError {
     }
 }
 
-fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
-    dir.join(format!("col_{col:06}.pbiv"))
-}
-
-fn col_path_int(dir: &Path, col: usize) -> PathBuf {
-    dir.join(format!("col_{col:06}.pciv"))
-}
-
-fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
-    fs::write(
-        dir.join("meta.json"),
-        format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"),
-    )
-}
-
 /// Copy all plain files (not subdirectories) from `src_dir` to `dst_dir`.
 fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
     for entry in fs::read_dir(src_dir)? {
@@ -125,30 +65,64 @@ fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
 // ── fill_builders ─────────────────────────────────────────────────────────────
 
 fn fill_builders(
-    builders: &mut [ColBuilder],
     specs: &[OutputCol],
-    n: usize,
-    n_src: usize,
     src_layer_dir: &Path,
     src_is_count: bool,
     threshold: u32,
+    output_presence: bool,
+    mut dst_bit: Option<&mut PersistentBitMatrixBuilder>,
+    mut dst_int: Option<&mut PersistentCompactIntMatrixBuilder>,
 ) -> SKResult<()> {
-    let mut src_buf = vec![0u32; n_src];
-
     if src_is_count {
         let mat = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?;
-        for slot in 0..n {
-            mat.fill_row(slot, &mut src_buf);
-            for (col, spec) in specs.iter().enumerate() {
-                builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
+        for spec in specs {
+            let g = ColGroup::new(&spec.label, spec.indices.clone());
+            if output_presence {
+                let b = dst_bit.as_deref_mut().unwrap();
+                match spec.op {
+                    AggOp::Any  => b.add_col_from    (&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
+                    AggOp::All  => b.add_col_from    (&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
+                    AggOp::None => b.add_col_from    (&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
+                    AggOp::Sum  => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
+                    AggOp::Min  => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
+                    AggOp::Max  => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
+                }.map_err(SKError::Io)?;
+            } else {
+                let b = dst_int.as_deref_mut().unwrap();
+                match spec.op {
+                    AggOp::Sum  => b.add_col_from    (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
+                    AggOp::Min  => b.add_col_from    (&mat.partial_group_min (&g).map_err(SKError::Io)?),
+                    AggOp::Max  => b.add_col_from    (&mat.partial_group_max (&g).map_err(SKError::Io)?),
+                    AggOp::Any  => b.add_col_from_bit(&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
+                    AggOp::All  => b.add_col_from_bit(&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
+                    AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
+                }.map_err(SKError::Io)?;
             }
         }
     } else {
         let mat = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?;
-        for slot in 0..n {
-            mat.fill_row(slot, &mut src_buf);
-            for (col, spec) in specs.iter().enumerate() {
-                builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
+        for spec in specs {
+            let g = ColGroup::new(&spec.label, spec.indices.clone());
+            if output_presence {
+                let b = dst_bit.as_deref_mut().unwrap();
+                match spec.op {
+                    AggOp::Any  => b.add_col_from    (&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
+                    AggOp::All  => b.add_col_from    (&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
+                    AggOp::None => b.add_col_from    (&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
+                    AggOp::Sum  => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
+                    AggOp::Min  => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
+                    AggOp::Max  => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
+                }.map_err(SKError::Io)?;
+            } else {
+                let b = dst_int.as_deref_mut().unwrap();
+                match spec.op {
+                    AggOp::Sum  => b.add_col_from    (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
+                    AggOp::Min  => b.add_col_from    (&mat.partial_group_min (&g).map_err(SKError::Io)?),
+                    AggOp::Max  => b.add_col_from    (&mat.partial_group_max (&g).map_err(SKError::Io)?),
+                    AggOp::Any  => b.add_col_from_bit(&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
+                    AggOp::All  => b.add_col_from_bit(&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
+                    AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
+                }.map_err(SKError::Io)?;
             }
         }
     }
@@ -168,7 +142,7 @@ impl KmerPartition {
         src: &KmerPartition,
         i: usize,
         specs: &[OutputCol],
-        n_src_genomes: usize,
+        _n_src_genomes: usize,
         threshold: u32,
         output_presence: bool,
         in_place: bool,
@@ -188,7 +162,6 @@ impl KmerPartition {
             fs::create_dir_all(&dst_index_dir)?;
         }
 
-        let n_out = specs.len();
         let data_subdir = if output_presence { "presence" } else { "counts" };
 
         for l in 0..src_meta.n_layers {
@@ -201,7 +174,7 @@ impl KmerPartition {
             let presence_dir = src_layer_dir.join("presence");
             let src_is_count = counts_dir.exists() && !presence_dir.exists();
 
-            // Determine number of slots from the source matrix.
+            // Determine number of slots and detect implicit layers.
             let n = if counts_dir.exists() {
                 PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
             } else if presence_dir.exists() {
@@ -216,7 +189,7 @@ impl KmerPartition {
             };
 
             // Choose the output data directory (temp name for in-place).
-            let (dst_data_dir, final_data_dir) = if in_place {
+            let (dst_data_dir, final_data_dir): (PathBuf, PathBuf) = if in_place {
                 let tmp  = dst_layer_dir.join(format!("{data_subdir}_new"));
                 let perm = dst_layer_dir.join(data_subdir);
                 (tmp, perm)
@@ -231,37 +204,22 @@ impl KmerPartition {
             }
             fs::create_dir_all(&dst_data_dir)?;
 
-            // Initialise packed-format skeleton.
-            if output_presence {
-                PersistentBitMatrixBuilder::new(n, &dst_data_dir)
-                    .map_err(SKError::Io)?.close().map_err(SKError::Io)?;
+            let (mut dst_bit, mut dst_int) = if output_presence {
+                (Some(PersistentBitMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?), None)
             } else {
-                PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir)
-                    .map_err(SKError::Io)?.close().map_err(SKError::Io)?;
-            }
-
-            // Create column builders.
-            let mut builders: Vec<ColBuilder> = (0..n_out)
-                .map(|col| -> SKResult<ColBuilder> {
-                    if output_presence {
-                        Ok(ColBuilder::Bit(PersistentBitVecBuilder::new(
-                            n, &col_path_bit(&dst_data_dir, col),
-                        )?))
-                    } else {
-                        Ok(ColBuilder::Int(PersistentCompactIntVecBuilder::new(
-                            n, &col_path_int(&dst_data_dir, col),
-                        )?))
-                    }
-                })
-                .collect::<SKResult<_>>()?;
+                (None, Some(PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?))
+            };
 
             fill_builders(
-                &mut builders, specs, n, n_src_genomes,
-                &src_layer_dir, src_is_count, threshold,
+                specs, &src_layer_dir, src_is_count, threshold, output_presence,
+                dst_bit.as_mut(), dst_int.as_mut(),
             )?;
 
-            for b in builders { b.close()?; }
-            write_matrix_meta(&dst_data_dir, n, n_out).map_err(SKError::Io)?;
+            if output_presence {
+                dst_bit.unwrap().close().map_err(SKError::Io)?;
+            } else {
+                dst_int.unwrap().close().map_err(SKError::Io)?;
+            }
 
             // In-place: swap old data dir for new.
             if in_place {

From 7c1efa9cbbe5ea6b2703b92c3b2efdab76c2316b Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Fri, 19 Jun 2026 09:12:07 +0200
Subject: [PATCH 18/24] feat: add vectorized column filters and optimize
 partitioner iteration

Adds `FilterMask` and conditional bitwise methods (`*_where`) to `obicompactvec` for composable column-based slot filtering. Extends `obikpartitionner` with a `MatrixGroupOps` trait and `column_mask_expr` method to express aggregate constraints as vectorized masks. Refactors matrix builder management into a unified `Builders` enum and introduces `try_compute_combined_mask`, enabling O(1) slot checks and skipping unnecessary row reads during partitioning and rebuilding passes.
---
 src/obicompactvec/src/bitvec.rs           |  68 +++++++-
 src/obicompactvec/src/colgroup.rs         |  70 ++++++++
 src/obicompactvec/src/lib.rs              |   2 +-
 src/obicompactvec/src/tempbitvec.rs       |  37 +++--
 src/obikpartitionner/src/filter.rs        | 120 +++++++++++++
 src/obikpartitionner/src/merge_layer.rs   |  36 ++++
 src/obikpartitionner/src/rebuild_layer.rs | 194 ++++++++++++++++------
 7 files changed, 462 insertions(+), 65 deletions(-)

diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index 8cde36b..966d57f 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -5,7 +5,7 @@ use std::path::{Path, PathBuf};
 use memmap2::{Mmap, MmapMut};
 
 use crate::reader::PersistentCompactIntVec;
-use crate::views::{BitSliceView, BitSliceIter};
+use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
 
 const MAGIC: [u8; 4] = *b"PBIV";
 
@@ -241,6 +241,72 @@ impl PersistentBitVecBuilder {
         }
     }
 
+    /// OR in bits at slots where `pred(col[slot])` is true.
+    pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let n = self.n;
+        let primary = col.primary_bytes();
+        let words = self.data_words_mut();
+        let nw = n_words(n);
+        for wi in 0..nw {
+            let base  = wi * 64;
+            let limit = (base + 64).min(n);
+            let mut mask = 0u64;
+            for bit in 0..(limit - base) {
+                let b = primary[base + bit];
+                if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
+            }
+            words[wi] |= mask;
+        }
+        for (slot, val) in col.overflow_entries() {
+            if pred(val) { words[slot >> 6] |= 1u64 << (slot & 63); }
+        }
+    }
+
+    /// Clear bits at slots where `pred(col[slot])` is false.
+    pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let n = self.n;
+        let primary = col.primary_bytes();
+        let words = self.data_words_mut();
+        let nw = n_words(n);
+        for wi in 0..nw {
+            let base  = wi * 64;
+            let limit = (base + 64).min(n);
+            let mut mask = 0u64;
+            for bit in 0..(limit - base) {
+                let b = primary[base + bit];
+                if b < 255 && !pred(b as u32) { mask |= 1u64 << bit; }
+            }
+            words[wi] &= !mask;
+        }
+        for (slot, val) in col.overflow_entries() {
+            if !pred(val) { words[slot >> 6] &= !(1u64 << (slot & 63)); }
+        }
+    }
+
+    /// Toggle bits at slots where `pred(col[slot])` is true.
+    pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let n = self.n;
+        let primary = col.primary_bytes();
+        let words = self.data_words_mut();
+        let nw = n_words(n);
+        for wi in 0..nw {
+            let base  = wi * 64;
+            let limit = (base + 64).min(n);
+            let mut mask = 0u64;
+            for bit in 0..(limit - base) {
+                let b = primary[base + bit];
+                if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
+            }
+            words[wi] ^= mask;
+        }
+        for (slot, val) in col.overflow_entries() {
+            if pred(val) { words[slot >> 6] ^= 1u64 << (slot & 63); }
+        }
+    }
+
     pub fn iter(&self) -> BitSliceIter<'_> {
         self.view().iter()
     }
diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs
index fa70830..b8c5ec8 100644
--- a/src/obicompactvec/src/colgroup.rs
+++ b/src/obicompactvec/src/colgroup.rs
@@ -70,3 +70,73 @@ pub trait MatrixGroupOps {
         b.freeze()
     }
 }
+
+// ── FilterMask — expression tree for column-based slot filters ────────────────
+
+/// A composable filter expression that can be evaluated against a matrix
+/// using only column operations (no MPHF lookup per kmer).
+///
+/// `threshold` semantics follow [`MatrixGroupOps::partial_group_presence_count`]:
+/// a slot contributes to the count when its value is **≥ threshold**.
+/// To match the row-level filter (`value > t`), callers should pass `t + 1`.
+#[derive(Debug, Clone)]
+pub enum FilterMask {
+    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≥ `min_count`.
+    PresenceGeq { indices: Vec<usize>, threshold: u32, min_count: usize },
+    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≤ `max_count`.
+    PresenceLeq { indices: Vec<usize>, threshold: u32, max_count: usize },
+    /// Slot passes if sum of values across `indices` columns is ≥ `min_sum`.
+    SumGeq { indices: Vec<usize>, min_sum: u32 },
+    /// Slot passes if sum of values across `indices` columns is ≤ `max_sum`.
+    SumLeq { indices: Vec<usize>, max_sum: u32 },
+    /// Slot passes if it passes all sub-expressions. Empty `And` is always true.
+    And(Vec<FilterMask>),
+}
+
+/// Evaluate a [`FilterMask`] against `mat`, returning a per-slot `TempBitVec`
+/// where bit=1 means the slot passes the filter.
+pub fn eval_filter_mask(expr: &FilterMask, mat: &dyn MatrixGroupOps, n: usize) -> io::Result<TempBitVec> {
+    match expr {
+        FilterMask::PresenceGeq { indices, threshold, min_count } => {
+            let g = ColGroup::new("", indices.clone());
+            let counts = mat.partial_group_presence_count(&g, *threshold)?;
+            let mut b = TempBitVecBuilder::new(n)?;
+            let mc = *min_count as u32;
+            b.or_where(counts.view(), |v| v >= mc);
+            b.freeze()
+        }
+        FilterMask::PresenceLeq { indices, threshold, max_count } => {
+            let g = ColGroup::new("", indices.clone());
+            let counts = mat.partial_group_presence_count(&g, *threshold)?;
+            let mut b = TempBitVecBuilder::new(n)?;
+            let mc = *max_count as u32;
+            b.or_where(counts.view(), |v| v <= mc);
+            b.freeze()
+        }
+        FilterMask::SumGeq { indices, min_sum } => {
+            let g = ColGroup::new("", indices.clone());
+            let sums = mat.partial_group_sum(&g)?;
+            let mut b = TempBitVecBuilder::new(n)?;
+            let ms = *min_sum;
+            b.or_where(sums.view(), |v| v >= ms);
+            b.freeze()
+        }
+        FilterMask::SumLeq { indices, max_sum } => {
+            let g = ColGroup::new("", indices.clone());
+            let sums = mat.partial_group_sum(&g)?;
+            let mut b = TempBitVecBuilder::new(n)?;
+            let ms = *max_sum;
+            b.or_where(sums.view(), |v| v <= ms);
+            b.freeze()
+        }
+        FilterMask::And(parts) => {
+            let mut b = TempBitVecBuilder::new(n)?;
+            b.not(); // initialise à tout-1 (tout passe)
+            for part in parts {
+                let m = eval_filter_mask(part, mat, n)?;
+                b.and(m.view());
+            }
+            b.freeze()
+        }
+    }
+}
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index ddd3bdc..25a8032 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -15,7 +15,7 @@ pub mod traits;
 pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
 pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
 pub use builder::PersistentCompactIntVecBuilder;
-pub use colgroup::{ColGroup, MatrixGroupOps};
+pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
 pub use reader::PersistentCompactIntVec;
diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs
index 3024ffb..b8991df 100644
--- a/src/obicompactvec/src/tempbitvec.rs
+++ b/src/obicompactvec/src/tempbitvec.rs
@@ -73,18 +73,31 @@ impl TempBitVecBuilder {
         self.builder.or(other);
     }
 
-    /// Set self[slot] where pred(col[slot]) is true. Two-pass: primary then overflow.
+    pub(crate) fn and(&mut self, other: BitSliceView<'_>) {
+        self.builder.and(other);
+    }
+
+    pub(crate) fn xor(&mut self, other: BitSliceView<'_>) {
+        self.builder.xor(other);
+    }
+
+    pub(crate) fn not(&mut self) {
+        self.builder.not();
+    }
+
+    pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) {
+        self.builder.copy_from(src);
+    }
+
     pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
-        for slot in 0..col.len() {
-            let b = col.primary_bytes()[slot];
-            if b < 255 && pred(b as u32) {
-                self.builder.set(slot, true);
-            }
-        }
-        for (slot, val) in col.overflow_entries() {
-            if pred(val) {
-                self.builder.set(slot, true);
-            }
-        }
+        self.builder.or_where(col, pred);
+    }
+
+    pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.and_where(col, pred);
+    }
+
+    pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.xor_where(col, pred);
     }
 }
diff --git a/src/obikpartitionner/src/filter.rs b/src/obikpartitionner/src/filter.rs
index d5c6346..00f3b03 100644
--- a/src/obikpartitionner/src/filter.rs
+++ b/src/obikpartitionner/src/filter.rs
@@ -1,9 +1,24 @@
+use obicompactvec::FilterMask;
+
 /// Trait for kmer row filters.
 ///
 /// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
 /// `n_genomes` equals `row.len()`.
 pub trait KmerFilter: Send + Sync {
     fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
+
+    /// Express this filter as a [`FilterMask`] column-operation expression.
+    ///
+    /// Returns `Some(expr)` if the filter can be evaluated solely from matrix
+    /// column aggregates (no per-kmer row scan needed).  Returns `None` if the
+    /// filter requires row-level inspection.
+    ///
+    /// `threshold` semantics in the returned mask use `>= threshold`, matching
+    /// [`obicompactvec::MatrixGroupOps`].  Implementations must add 1 to any
+    /// row-level threshold that uses strict `>` comparison.
+    fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
+        None
+    }
 }
 
 /// True when `row` passes every filter in `filters`.
@@ -29,6 +44,16 @@ impl KmerFilter for MinGenomeFraction {
         let p = present_count(row, self.threshold);
         p as f64 / n_genomes as f64 >= self.frac
     }
+
+    fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
+        let t = self.threshold.checked_add(1)?;
+        let min_count = (self.frac * n_genomes as f64).ceil() as usize;
+        Some(FilterMask::PresenceGeq {
+            indices: (0..n_genomes).collect(),
+            threshold: t,
+            min_count,
+        })
+    }
 }
 
 /// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
@@ -42,6 +67,16 @@ impl KmerFilter for MaxGenomeFraction {
         let p = present_count(row, self.threshold);
         p as f64 / n_genomes as f64 <= self.frac
     }
+
+    fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
+        let t = self.threshold.checked_add(1)?;
+        let max_count = (self.frac * n_genomes as f64).floor() as usize;
+        Some(FilterMask::PresenceLeq {
+            indices: (0..n_genomes).collect(),
+            threshold: t,
+            max_count,
+        })
+    }
 }
 
 /// At least `count` genomes contain this kmer (count > `threshold`).
@@ -54,6 +89,15 @@ impl KmerFilter for MinGenomeCount {
     fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
         present_count(row, self.threshold) >= self.count
     }
+
+    fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
+        let t = self.threshold.checked_add(1)?;
+        Some(FilterMask::PresenceGeq {
+            indices: (0..n_genomes).collect(),
+            threshold: t,
+            min_count: self.count,
+        })
+    }
 }
 
 /// At most `count` genomes contain this kmer (count > `threshold`).
@@ -66,6 +110,15 @@ impl KmerFilter for MaxGenomeCount {
     fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
         present_count(row, self.threshold) <= self.count
     }
+
+    fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
+        let t = self.threshold.checked_add(1)?;
+        Some(FilterMask::PresenceLeq {
+            indices: (0..n_genomes).collect(),
+            threshold: t,
+            max_count: self.count,
+        })
+    }
 }
 
 // ── Total-count filters (count indexes only) ───────────────────────────────────
@@ -79,6 +132,13 @@ impl KmerFilter for MinTotalCount {
     fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
         row.iter().sum::<u32>() >= self.total
     }
+
+    fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
+        Some(FilterMask::SumGeq {
+            indices: (0..n_genomes).collect(),
+            min_sum: self.total,
+        })
+    }
 }
 
 /// Sum of counts across all genomes <= `total`.
@@ -90,6 +150,13 @@ impl KmerFilter for MaxTotalCount {
     fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
         row.iter().sum::<u32>() <= self.total
     }
+
+    fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
+        Some(FilterMask::SumLeq {
+            indices: (0..n_genomes).collect(),
+            max_sum: self.total,
+        })
+    }
 }
 
 // ── Group-based quorum filter ─────────────────────────────────────────────────
@@ -113,6 +180,37 @@ pub struct GroupQuorumFilter {
     pub max_outgroup_frac:  f64,
 }
 
+impl GroupQuorumFilter {
+    // Build PresenceGeq/PresenceLeq constraints for one group (ingroup or outgroup).
+    fn group_mask_parts(
+        indices: &[usize],
+        threshold: u32,
+        min_count: usize,
+        max_count: usize,
+        min_frac: f64,
+        max_frac: f64,
+        parts: &mut Vec<FilterMask>,
+    ) {
+        let n = indices.len();
+        let geq = min_count.max((min_frac * n as f64).ceil() as usize);
+        if geq > 0 {
+            parts.push(FilterMask::PresenceGeq {
+                indices: indices.to_vec(),
+                threshold,
+                min_count: geq,
+            });
+        }
+        let leq = max_count.min((max_frac * n as f64).floor() as usize);
+        if leq < n {
+            parts.push(FilterMask::PresenceLeq {
+                indices: indices.to_vec(),
+                threshold,
+                max_count: leq,
+            });
+        }
+    }
+}
+
 impl KmerFilter for GroupQuorumFilter {
     fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
         if !self.ingroup_idx.is_empty() {
@@ -139,4 +237,26 @@ impl KmerFilter for GroupQuorumFilter {
         }
         true
     }
+
+    fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
+        let t = self.threshold.checked_add(1)?;
+        let mut parts: Vec<FilterMask> = Vec::new();
+        if !self.ingroup_idx.is_empty() {
+            Self::group_mask_parts(
+                &self.ingroup_idx, t,
+                self.min_count, self.max_count,
+                self.min_frac, self.max_frac,
+                &mut parts,
+            );
+        }
+        if !self.outgroup_idx.is_empty() {
+            Self::group_mask_parts(
+                &self.outgroup_idx, t,
+                self.min_outgroup_count, self.max_outgroup_count,
+                self.min_outgroup_frac, self.max_outgroup_frac,
+                &mut parts,
+            );
+        }
+        Some(FilterMask::And(parts))
+    }
 }
diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs
index 0701b6d..32750af 100644
--- a/src/obikpartitionner/src/merge_layer.rs
+++ b/src/obikpartitionner/src/merge_layer.rs
@@ -10,6 +10,7 @@ use obipipeline::{
 };
 
 use obicompactvec::{
+    MatrixGroupOps,
     PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
     PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
 };
@@ -78,6 +79,41 @@ impl SrcLayerData {
         }
         buf
     }
+
+    pub(crate) fn n_slots(&self) -> usize {
+        match self {
+            SrcLayerData::Presence(_, mat) => mat.n(),
+            SrcLayerData::Count(_, mat) => mat.n(),
+        }
+    }
+
+    /// MPHF lookup: returns the slot index for `kmer` (kmer must be in the domain).
+    #[inline]
+    pub(crate) fn slot(&self, kmer: CanonicalKmer) -> usize {
+        match self {
+            SrcLayerData::Presence(mphf, _) => mphf.index(kmer),
+            SrcLayerData::Count(mphf, _) => mphf.index(kmer),
+        }
+    }
+
+    /// Row lookup by slot index, bypassing the MPHF.
+    #[inline]
+    pub(crate) fn fill_row_by_slot(&self, slot: usize, n_genomes: usize) -> Vec<u32> {
+        let mut buf = vec![0u32; n_genomes];
+        match self {
+            SrcLayerData::Presence(_, mat) => mat.fill_row(slot, &mut buf),
+            SrcLayerData::Count(_, mat) => mat.fill_row(slot, &mut buf),
+        }
+        buf
+    }
+
+    /// Call `f` with a reference to the underlying matrix as `&dyn MatrixGroupOps`.
+    pub(crate) fn with_matrix<R>(&self, f: impl FnOnce(&dyn MatrixGroupOps) -> R) -> R {
+        match self {
+            SrcLayerData::Presence(_, mat) => f(mat),
+            SrcLayerData::Count(_, mat) => f(mat),
+        }
+    }
 }
 
 // ── helpers ───────────────────────────────────────────────────────────────────
diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs
index 6bd40f3..b8893ef 100644
--- a/src/obikpartitionner/src/rebuild_layer.rs
+++ b/src/obikpartitionner/src/rebuild_layer.rs
@@ -1,8 +1,9 @@
 use std::path::Path;
 
 use obicompactvec::{
-    PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrixBuilder,
-    PersistentCompactIntVecBuilder,
+    FilterMask, eval_filter_mask,
+    PersistentBitMatrixBuilder, PersistentBitVecBuilder,
+    PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
 };
 use obidebruinj::GraphDeBruijn;
 use obikseq::CanonicalKmer;
@@ -10,18 +11,135 @@ use obilayeredmap::meta::PartitionMeta;
 use obilayeredmap::{IndexMode, MphfLayer};
 use obiskio::{SKError, SKResult, UnitigFileReader};
 
-use crate::common::{ColBuilder, col_path_bit, col_path_int, load_meta, olm_to_sk, write_matrix_meta};
-use crate::filter::{KmerFilter, passes_all};
+use crate::common::{load_meta, olm_to_sk};
+use crate::filter::KmerFilter;
 use crate::graph_pipeline::materialize_layer;
 use crate::merge_layer::{MergeMode, SrcLayerData};
 use crate::partition::KmerPartition;
 
 const INDEX_SUBDIR: &str = "index";
 
-/// Iterate all kmers in `src_index_dir` that pass `filters`, yielding `(kmer, row)`.
+// ── Builders — pair matrix builder + column builders for one mode ─────────────
+
+enum Builders {
+    Presence(PersistentBitMatrixBuilder, Vec<PersistentBitVecBuilder>),
+    Count(PersistentCompactIntMatrixBuilder, Vec<PersistentCompactIntVecBuilder>),
+}
+
+impl Builders {
+    fn new(mode: MergeMode, n: usize, dir: &Path, n_genomes: usize) -> SKResult<Self> {
+        match mode {
+            MergeMode::Presence => {
+                let mut mat = PersistentBitMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
+                let mut cols = Vec::with_capacity(n_genomes);
+                for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
+                Ok(Builders::Presence(mat, cols))
+            }
+            MergeMode::Count => {
+                let mut mat = PersistentCompactIntMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
+                let mut cols = Vec::with_capacity(n_genomes);
+                for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
+                Ok(Builders::Count(mat, cols))
+            }
+        }
+    }
+
+    fn set_val(&mut self, col: usize, slot: usize, value: u32) {
+        match self {
+            Builders::Presence(_, cols) => cols[col].set(slot, value > 0),
+            Builders::Count(_, cols)    => cols[col].set(slot, value),
+        }
+    }
+
+    fn close(self) -> SKResult<()> {
+        match self {
+            Builders::Presence(mat, cols) => {
+                for b in cols { b.close().map_err(SKError::Io)?; }
+                mat.close().map_err(SKError::Io)
+            }
+            Builders::Count(mat, cols) => {
+                for b in cols { b.close().map_err(SKError::Io)?; }
+                mat.close().map_err(SKError::Io)
+            }
+        }
+    }
+}
+
+// ── try_compute_combined_mask ─────────────────────────────────────────────────
+
+/// Build a per-slot `TempBitVec` mask from `filters` using column operations
+/// on the source matrix — no per-kmer MPHF lookup or row read needed.
 ///
-/// Uses [`SrcLayerData`] semantics: counts take priority over presence when
-/// `mode = Count`; presence (or implicit all-ones) is used for `Presence`.
+/// Returns `Some(mask)` when every filter in `filters` can express itself as
+/// a [`FilterMask`] expression.  Returns `None` when any filter requires
+/// row-level inspection (fall back to `passes_all`).
+fn try_compute_combined_mask(
+    filters: &[Box<dyn KmerFilter>],
+    src_data: &SrcLayerData,
+    n_genomes: usize,
+) -> SKResult<Option<obicompactvec::TempBitVec>> {
+    if filters.is_empty() {
+        return Ok(None);
+    }
+    let mut exprs: Vec<FilterMask> = Vec::with_capacity(filters.len());
+    for f in filters {
+        match f.column_mask_expr(n_genomes) {
+            Some(expr) => exprs.push(expr),
+            None => return Ok(None),
+        }
+    }
+    let combined = FilterMask::And(exprs);
+    let n = src_data.n_slots();
+    let mask = src_data
+        .with_matrix(|mat| eval_filter_mask(&combined, mat, n))
+        .map_err(SKError::Io)?;
+    Ok(Some(mask))
+}
+
+// ── iter_src_kmers_masked (pass 1) ────────────────────────────────────────────
+
+/// Iterate all passing kmers in `src_index_dir`, yielding only the kmer value.
+///
+/// When all filters can be expressed as column operations, a per-slot mask is
+/// computed once per layer and used for O(1) slot-check per kmer instead of a
+/// full row read.  Falls back to row-level `passes_all` otherwise.
+fn iter_src_kmers_masked(
+    src_index_dir: &Path,
+    mode: MergeMode,
+    n_genomes: usize,
+    filters: &[Box<dyn KmerFilter>],
+    mut cb: impl FnMut(CanonicalKmer),
+) -> SKResult<()> {
+    let src_meta = load_meta(src_index_dir, "rebuild")?;
+    for l in 0..src_meta.n_layers {
+        let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
+        let unitigs_path = src_layer_dir.join("unitigs.bin");
+        if !unitigs_path.exists() { continue; }
+
+        let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
+        let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
+        let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
+
+        for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
+            let slot = src_data.slot(kmer);
+            let passes = match &mask {
+                Some(m) => m.get(slot),
+                None => {
+                    let row = src_data.fill_row_by_slot(slot, n_genomes);
+                    filters.iter().all(|f| f.passes(&row, n_genomes))
+                }
+            };
+            if passes { cb(kmer); }
+        }
+    }
+    Ok(())
+}
+
+// ── iter_src_layers (pass 2) ──────────────────────────────────────────────────
+
+/// Iterate all passing kmers in `src_index_dir`, yielding `(kmer, row)`.
+///
+/// When the slot mask is available, skips the row read for filtered-out slots.
 fn iter_src_layers(
     src_index_dir: &Path,
     mode: MergeMode,
@@ -33,17 +151,23 @@ fn iter_src_layers(
     for l in 0..src_meta.n_layers {
         let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
         let unitigs_path = src_layer_dir.join("unitigs.bin");
-        if !unitigs_path.exists() {
-            continue;
-        }
+        if !unitigs_path.exists() { continue; }
 
-        let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
         let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
+        let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
+        let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
 
         for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
-            let row = src_data.lookup(kmer, n_genomes);
-            if passes_all(filters, &row, n_genomes) {
+            let slot = src_data.slot(kmer);
+            if let Some(ref m) = mask {
+                if !m.get(slot) { continue; }
+                let row = src_data.fill_row_by_slot(slot, n_genomes);
                 cb(kmer, row.into_boxed_slice());
+            } else {
+                let row = src_data.fill_row_by_slot(slot, n_genomes);
+                if filters.iter().all(|f| f.passes(&row, n_genomes)) {
+                    cb(kmer, row.into_boxed_slice());
+                }
             }
         }
     }
@@ -81,7 +205,7 @@ impl KmerPartition {
 
         // ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
         let mut g = GraphDeBruijn::new();
-        iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, _row| {
+        iter_src_kmers_masked(&src_index_dir, mode, n_genomes, filters, |kmer| {
             g.push(kmer);
         })?;
 
@@ -100,54 +224,22 @@ impl KmerPartition {
         // ── Prepare matrix builders (one column per genome) ───────────────────
         let data_dir = match mode {
             MergeMode::Presence => dst_layer_dir.join("presence"),
-            MergeMode::Count => dst_layer_dir.join("counts"),
+            MergeMode::Count    => dst_layer_dir.join("counts"),
         };
         std::fs::create_dir_all(&data_dir)?;
-
-        let mut builders: Vec<ColBuilder> = match mode {
-            MergeMode::Presence => {
-                PersistentBitMatrixBuilder::new(n_new, &data_dir)
-                    .map_err(SKError::Io)?
-                    .close()
-                    .map_err(SKError::Io)?;
-                (0..n_genomes)
-                    .map(|g| -> SKResult<ColBuilder> {
-                        let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
-                        Ok(ColBuilder::Bit(b))
-                    })
-                    .collect::<SKResult<_>>()?
-            }
-            MergeMode::Count => {
-                PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
-                    .map_err(SKError::Io)?
-                    .close()
-                    .map_err(SKError::Io)?;
-                (0..n_genomes)
-                    .map(|g| -> SKResult<ColBuilder> {
-                        let b = PersistentCompactIntVecBuilder::new(
-                            n_new,
-                            &col_path_int(&data_dir, g),
-                        )?;
-                        Ok(ColBuilder::Int(b))
-                    })
-                    .collect::<SKResult<_>>()?
-            }
-        };
+        let mut builders = Builders::new(mode, n_new, &data_dir, n_genomes)?;
 
         // ── Pass 2: fill builders ─────────────────────────────────────────────
         iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| {
             if let Some(slot) = dst_mphf.find(kmer) {
                 for (col, &value) in row.iter().enumerate() {
-                    builders[col].set_val(slot, value);
+                    builders.set_val(col, slot, value);
                 }
             }
         })?;
 
-        // ── Close builders, write metadata ────────────────────────────────────
-        for b in builders {
-            b.close()?;
-        }
-        write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
+        // ── Close builders and write metadata ─────────────────────────────────
+        builders.close()?;
 
         PartitionMeta {
             n_layers: 1,

From 9abb2db92fc453ddd87da19faadd527730591b8f Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Fri, 19 Jun 2026 09:20:14 +0200
Subject: [PATCH 19/24] refactor: replace explicit bit-setting loops with
 optimized bulk operations

Refactor bitmatrix, colgroup, and layer modules to replace manual iteration with concise `or_where` predicates and bulk inversion calls. This simplifies the codebase and leverages optimized internal implementations for improved performance.
---
 src/obicompactvec/src/bitmatrix.rs | 8 +-------
 src/obicompactvec/src/colgroup.rs  | 8 ++------
 src/obilayeredmap/src/layer.rs     | 4 +---
 3 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index 55a4561..c054ae0 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -412,13 +412,7 @@ impl PersistentBitMatrixBuilder {
         let path = col_path(&self.dir, self.n_cols);
         self.n_cols += 1;
         let mut b = PersistentBitVecBuilder::new(self.n, &path)?;
-        let view = src.view();
-        for slot in 0..self.n {
-            if view.primary_bytes()[slot] > 0 { b.set(slot, true); }
-        }
-        for (slot, _) in view.overflow_entries() {
-            b.set(slot, true);
-        }
+        b.or_where(src.view(), |v| v > 0);
         b.close()
     }
 
diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs
index b8c5ec8..b1545e6 100644
--- a/src/obicompactvec/src/colgroup.rs
+++ b/src/obicompactvec/src/colgroup.rs
@@ -53,9 +53,7 @@ pub trait MatrixGroupOps {
         let n = counts.len();
         let n_required = g.indices.len() as u32;
         let mut b = TempBitVecBuilder::new(n)?;
-        for slot in 0..n {
-            if counts.get(slot) >= n_required { b.set(slot, true); }
-        }
+        b.or_where(counts.view(), |v| v >= n_required);
         b.freeze()
     }
 
@@ -64,9 +62,7 @@ pub trait MatrixGroupOps {
         let counts = self.partial_group_presence_count(g, threshold)?;
         let n = counts.len();
         let mut b = TempBitVecBuilder::new(n)?;
-        for slot in 0..n {
-            if counts.get(slot) == 0 { b.set(slot, true); }
-        }
+        b.or_where(counts.view(), |v| v == 0);
         b.freeze()
     }
 }
diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs
index 72b38ea..241feea 100644
--- a/src/obilayeredmap/src/layer.rs
+++ b/src/obilayeredmap/src/layer.rs
@@ -107,9 +107,7 @@ impl Layer<()> {
         fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
         let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?;
         let mut col = mb.add_col().map_err(OLMError::Io)?;
-        for slot in 0..n_kmers {
-            col.set(slot, true);
-        }
+        col.not();
         col.close().map_err(OLMError::Io)?;
         mb.close().map_err(OLMError::Io)
     }

From 280ca1f5a331363a99021a8faabd5616629663e3 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Fri, 19 Jun 2026 09:23:44 +0200
Subject: [PATCH 20/24] feat: add optimized new_ones constructor for all-ones
 bit vectors

Introduces `new_ones` and `add_col_ones` methods to directly initialize all-ones bit vectors and matrix columns. This replaces redundant initialization sequences that created zero-filled structures and applied bitwise NOT, with a single pass that writes contiguous 0xFF bytes to disk. The change eliminates inversion overhead, streamlines test setup, and improves performance for filter mask intersection logic while preserving identical semantics.
---
 src/obicompactvec/src/bitmatrix.rs      | 10 ++++++---
 src/obicompactvec/src/bitvec.rs         | 27 +++++++++++++++++++++++++
 src/obicompactvec/src/colgroup.rs       |  3 +--
 src/obicompactvec/src/tempbitvec.rs     |  7 +++++++
 src/obicompactvec/src/tests/colgroup.rs |  3 +--
 src/obilayeredmap/src/layer.rs          |  4 +---
 6 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index c054ae0..72f8b05 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -299,9 +299,7 @@ impl PersistentBitMatrix {
             Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
             Self::Packed(m)   => m.col_persist(c, path),
             Self::Implicit { n_rows, .. } => {
-                let mut b = PersistentBitVecBuilder::new(*n_rows, path)?;
-                b.not();
-                Ok(b)
+                PersistentBitVecBuilder::new_ones(*n_rows, path)
             }
         }
     }
@@ -402,6 +400,12 @@ impl PersistentBitMatrixBuilder {
         PersistentBitVecBuilder::new(self.n, &path)
     }
 
+    pub fn add_col_ones(&mut self) -> io::Result<PersistentBitVecBuilder> {
+        let path = col_path(&self.dir, self.n_cols);
+        self.n_cols += 1;
+        PersistentBitVecBuilder::new_ones(self.n, &path)
+    }
+
     pub fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> {
         src.make_persistent(&col_path(&self.dir, self.n_cols))?;
         self.n_cols += 1;
diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index 966d57f..145bd63 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -145,6 +145,33 @@ impl PersistentBitVecBuilder {
         Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
+    /// Create an all-ones bit vector of length `n` at `path`.
+    ///
+    /// More efficient than `new(n, path)` + `not()`: the data is written as
+    /// 0xFF bytes in a single sequential pass, with no intermediate all-zeros state.
+    pub fn new_ones(n: usize, path: &Path) -> io::Result<Self> {
+        let nw        = n_words(n);
+        let file_size = HEADER_SIZE + nw * 8;
+        let mut file  = OpenOptions::new()
+            .read(true).write(true).create(true).truncate(true)
+            .open(path)?;
+        file.write_all(&MAGIC)?;
+        file.write_all(&[0u8; 4])?;
+        file.write_all(&(n as u64).to_le_bytes())?;
+        file.write_all(&vec![0xFFu8; nw * 8])?;
+        file.seek(SeekFrom::Start(0))?;
+        file.set_len(file_size as u64)?;
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        // Clear padding bits in the last word so trailing bits are always 0.
+        let rem = n % 64;
+        if rem != 0 {
+            let ptr   = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
+            let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
+            words[nw - 1] &= (1u64 << rem) - 1;
+        }
+        Ok(Self { mmap, n, path: path.to_path_buf() })
+    }
+
     pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
         fs::copy(source.path(), path)?;
         let file = OpenOptions::new().read(true).write(true).open(path)?;
diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs
index b1545e6..49ca477 100644
--- a/src/obicompactvec/src/colgroup.rs
+++ b/src/obicompactvec/src/colgroup.rs
@@ -126,8 +126,7 @@ pub fn eval_filter_mask(expr: &FilterMask, mat: &dyn MatrixGroupOps, n: usize) -
             b.freeze()
         }
         FilterMask::And(parts) => {
-            let mut b = TempBitVecBuilder::new(n)?;
-            b.not(); // initialise à tout-1 (tout passe)
+            let mut b = TempBitVecBuilder::new_ones(n)?;
             for part in parts {
                 let m = eval_filter_mask(part, mat, n)?;
                 b.and(m.view());
diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs
index b8991df..8bbec16 100644
--- a/src/obicompactvec/src/tempbitvec.rs
+++ b/src/obicompactvec/src/tempbitvec.rs
@@ -56,6 +56,13 @@ impl TempBitVecBuilder {
         Ok(Self { builder, temp })
     }
 
+    pub(crate) fn new_ones(n: usize) -> io::Result<Self> {
+        let temp = TempDir::new()?;
+        let path = temp.path().join("data.pbiv");
+        let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
+        Ok(Self { builder, temp })
+    }
+
     pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
         let Self { builder, temp } = self;
         let vec = builder.finish()?;
diff --git a/src/obicompactvec/src/tests/colgroup.rs b/src/obicompactvec/src/tests/colgroup.rs
index 884450f..d1c7cf1 100644
--- a/src/obicompactvec/src/tests/colgroup.rs
+++ b/src/obicompactvec/src/tests/colgroup.rs
@@ -150,8 +150,7 @@ fn mask_with_all_ones_is_noop() {
     let dir = tempdir().unwrap();
     let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap();
     v.set(0, 300); v.set(1, 1); v.set(2, 0); v.set(3, 42);
-    let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap();
-    mask.not();  // all bits → 1
+    let mask = PersistentBitVecBuilder::new_ones(4, &dir.path().join("m.pbiv")).unwrap();
     v.mask_with(mask.view());
     v.close().unwrap();
     let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap();
diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs
index 241feea..475bca7 100644
--- a/src/obilayeredmap/src/layer.rs
+++ b/src/obilayeredmap/src/layer.rs
@@ -106,9 +106,7 @@ impl Layer<()> {
         let presence_dir = layer_dir.join(PRESENCE_DIR);
         fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
         let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?;
-        let mut col = mb.add_col().map_err(OLMError::Io)?;
-        col.not();
-        col.close().map_err(OLMError::Io)?;
+        mb.add_col_ones().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?;
         mb.close().map_err(OLMError::Io)
     }
 }

From c694e1f2b0f3d8fbc1f3fdeed74093e1b65eb706 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Fri, 19 Jun 2026 09:55:41 +0200
Subject: [PATCH 21/24] feat: add benchmark pipeline, expose APIs, and enforce
 strict paths

Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
---
 .DS_Store                           | Bin 10244 -> 10244 bytes
 .gitignore                          |  10 ++
 .serena/.gitignore                  |   2 +
 .serena/project.yml                 | 133 ++++++++++++++++++
 CLAUDE.md                           |  26 ++++
 Makefile                            |   1 +
 benchmark/Makefile                  | 144 ++++++++++++++++++++
 benchmark/README.md                 | 132 ++++++++++++++++++
 benchmark/aggregate_stats.sh        |  53 ++++++++
 benchmark/build_reference.py        | 137 +++++++++++++++++++
 benchmark/build_reference.sh        |  39 ++++++
 benchmark/deps.mk                   | 199 +++++++++++++++++++++++++++
 benchmark/downloads.sh              |  48 +++++++
 benchmark/filter_one_count.sh       | 108 +++++++++++++++
 benchmark/filter_one_presence.sh    | 108 +++++++++++++++
 benchmark/index_one_count.sh        | 103 ++++++++++++++
 benchmark/index_one_presence.sh     | 102 ++++++++++++++
 benchmark/make_deps.py              | 118 ++++++++++++++++
 benchmark/merge_count.sh            | 103 ++++++++++++++
 benchmark/merge_presence.sh         | 104 ++++++++++++++
 benchmark/simulate.sh               |  12 ++
 benchmark/simulate_one.sh           |  33 +++++
 benchmark/verify_count.py           | 181 +++++++++++++++++++++++++
 benchmark/verify_merge_count.py     | 201 ++++++++++++++++++++++++++++
 benchmark/verify_merge_count.sh     |  27 ++++
 benchmark/verify_merge_presence.py  | 170 +++++++++++++++++++++++
 benchmark/verify_merge_presence.sh  |  27 ++++
 benchmark/verify_one_count.sh       |  30 +++++
 benchmark/verify_one_presence.sh    |  30 +++++
 benchmark/verify_presence.py        | 139 +++++++++++++++++++
 docmd/implementation/filtering.md   |  15 ++-
 mkdocs.yml                          |   1 +
 src/Cargo.lock                      |   4 +
 src/Cargo.toml                      |   2 +-
 src/obicompactvec/src/bitvec.rs     |   8 +-
 src/obicompactvec/src/lib.rs        |   8 +-
 src/obicompactvec/src/tempbitvec.rs |  23 ++--
 src/obicompactvec/src/tempintvec.rs |  36 +++--
 src/obidebruinj/src/debruijn.rs     |   1 +
 src/obikindex/src/merge.rs          |   4 +-
 src/obikmer/src/cmd/predicate.rs    |  19 +--
 test.sk.fasta                       |  28 ----
 42 files changed, 2585 insertions(+), 84 deletions(-)
 create mode 100644 .serena/.gitignore
 create mode 100644 .serena/project.yml
 create mode 100644 benchmark/Makefile
 create mode 100644 benchmark/README.md
 create mode 100755 benchmark/aggregate_stats.sh
 create mode 100755 benchmark/build_reference.py
 create mode 100755 benchmark/build_reference.sh
 create mode 100644 benchmark/deps.mk
 create mode 100755 benchmark/downloads.sh
 create mode 100755 benchmark/filter_one_count.sh
 create mode 100755 benchmark/filter_one_presence.sh
 create mode 100755 benchmark/index_one_count.sh
 create mode 100755 benchmark/index_one_presence.sh
 create mode 100644 benchmark/make_deps.py
 create mode 100755 benchmark/merge_count.sh
 create mode 100755 benchmark/merge_presence.sh
 create mode 100755 benchmark/simulate.sh
 create mode 100644 benchmark/simulate_one.sh
 create mode 100755 benchmark/verify_count.py
 create mode 100755 benchmark/verify_merge_count.py
 create mode 100755 benchmark/verify_merge_count.sh
 create mode 100755 benchmark/verify_merge_presence.py
 create mode 100755 benchmark/verify_merge_presence.sh
 create mode 100755 benchmark/verify_one_count.sh
 create mode 100755 benchmark/verify_one_presence.sh
 create mode 100755 benchmark/verify_presence.py
 delete mode 100644 test.sk.fasta

diff --git a/.DS_Store b/.DS_Store
index 5f0cbf7b1dbcb941bc5c3c51ebdd4b614453d7b7..96c32f71fc181e77027f3582347b8e854e7e5eb5 100644
GIT binary patch
delta 1729
zcmeH{TWl0n7{|~50Od>=yJyRm9cbxpU7)e7r8i1Tp;or5NNZb5YcDh`yPW|h?9R}g
z-9@c+Rd_)q3KNYnYT^?byd@Pi!3!Ei;^je8P0)ym4+axVyaf~EgJ))zhx(!qKKS4~
z%y<6ZneUr3`Tf5+GI(V0<tC2T>`=}0y5P*r#`b8qxqZ`YG(6MM+!zhFwzbaAN~|)n
zp>rs4Ae}MHqxMh15FW5)zDiD8g;*+WrU+ToWpjT(lCM#29~~Q?knNZJmG(nb4YuKr
z6?xv$hO~4trx&!IWKOdJWZ?zr#qwZv?b`YctsPyl?R|suKJKrO1M-T|f^O-lv^H``
zS9ANvlh%}GsJgLlLRU2-r|;FZtmLN+(y)=tXl|9VQmJA*!p8C`eLAn_?wl{W&1#ge
z#80zHRUF_}Q*}yJub@U*!`Ln-AIoau?xp2A<vNKc*|4SG<#m>$a)ZPzY50I{luDcA
z7RFU+B%MrYX|G$G+|GEfG+NNImTnp%S@c&Zo$^-ZM^qY{GBX)*a?5+TTiM3=8MfPb
zd0*1X4II>rtH+8fH#2^oO<4PrS#98up=G_AxYEm5g8SUOQxsSAoLSneQ3e<rciP>k
zs-jZ`_xp;s4Jk^3bKNNDIRU{mc8ZX<_-Bl<$W;=N?s{(-IcuDCSMHZbx*NT;L>O63
znn@4oBYQ}e%#dT`aq<j#g}hJBk<ZCR@*TNMt^lA|f-)>a0F?+rL9rI~SdZ(`fKK!v
zj+-!qU6{mfB%mS#0|zmU0*bg7_v0{*;sk6wipTIYPU2ad!t*$dx9~RJ!MperU*ao#
zjf?mmKjRntO3SE^a_Xm5bPe4^JLqQGNw?5A?Wd!3l4@b^11P*mCQGpt=w8Rmb|SD0
zb0rgpw%@X&w{L-o+x~%xFO)k@TOO!f5ez$CZr{4i_Eis*D=Sx3Rj-yqBf>Y@9rk5K
zDe|R`8`!+FReP-*;!A{$Jh`q;$TDFePp*%KLYxZcda|)8B=beWmy%n%F)AcVqGPix
z^TooU-d@)YLK0^>JRR#1ogBL{R{WQ+Um)L-OXNrLhrnKr<pOpc8qtO=f^k0vu@l1>
z5uEoRg(;V{DM)9r&%)ifN8r9s0Dln2@Gu_1aXf)1@stbwIT!j1conbX4Bo(-IEVB2
z5Fg=Ve1b3V4KCpaT=@&x{)J@mn?rW4CHMiGTN?g-aj#`&b1^fW&lqF+lo$gs*<vpK
Q{_pfZ*#9C}yCwMEp9TSdKL7v#

delta 1538
zcmeHGU1%It82!%8CY|w;?A`3f-QI3uY_w^cP&e3WQ*67*8l%~y+2*HBs-~IEOmT3t
zo9t{_qvERc$*3R$wkV?Zp-KxyNLx{=FCtYC!5YC9M8PM)zk&!x5bw^cc@kfI@xgic
z?woUfzI!<59?u-lywxS#j+RSqCL+62*<2w@RI?*Hm$RKxB5@6@CQw_qPSbDTyk3rt
zA2>KMd02=A#~QJ6M#(;E-EEXC$L$HtyQ8tN)b(=>(fG}`ZoO@L_wL?%2KKE4MW`_x
z;oSW(wz(War>=WcY`v5d3t4-9!7eR|Hcj)`<`&5xFx81e1=BuaTc)_6`Rg~fNO^x{
zCRMalHLDvowM#zc*pFJms4nd2<b3R5J8uf7+A7h-<#^7>TDh_?b-&S$JGeY})G9i5
zAurNZa#w`QiJ8Letd%c0Rc_B-DW@uZ2Mwn*JZI(Aoj$5IPpP>>&O=7g8a|e{idD8>
zv4=&_<13r`OuJO@CpUx|(@Mu=<-8u#^i(v8P*7&%h?|y@%}=|-;aE9u%vx>91bJV?
zAJ!^Jqh?A_FAdNX&Cwz)(Tnsdou><Qkv^nP=_-9q-_dpYm42gD`W*q(BM5;IB-*hB
zThW2-NTLV5NF#$0jA9HAA`251j-Xt?0*>JkEaGWAgJ<y^p2rEihSzZlXYnpB;XS;M
z%eaEiv4XGg6Mn{Ztg-;BWpymb1Z!krwu!Z~1nXwKte<84>oZhk99om({V+ofoJ=yP
zM20&VS>JW;?=u+hzxTd@^x!{auu0diHu$M)x-r^XiQ>-PJ?>!BIO8|DKZIiLQoNz8
z^0y(rk>8@JNQl}M54Q6ynhJxc^I>Ovo2K$7m<M@R{B~WXOw{<WGk&M83HV?aze`tn
z69FGK^Q5i|@<2Ql@6%Ldgr>D6=chfnZAte)IP`C9{+O=Nm-G#NuWbGUHP8@6D>h>r
zc480qDq~aV!#)gSzp}St?i8kBU<NtNqJTLRG5HuC#}g>yDP{7KvbkdNNxX!Y@d{4k
z4ZMjnSjKr=#M^jB+5Q1O!56rOZ#@yTX6BHO|El=K9g4{1Wc0H3!jtvO>snU>`h-1G
W$d9Ub>h6y<{qN9!!XY;q{p?Q;+ev2t

diff --git a/.gitignore b/.gitignore
index 76d17de..ec94743 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,13 @@ data-stress
 ./**/*.json
 *.bin
 Betula_exilis--IGA-24-33
+benchmark/genomes
+benchmark/simulated_data
+benchmark/specimen_index_presence
+benchmark/specimen_index_count
+benchmark/global_index_presence
+benchmark/global_index_count
+benchmark/stats
+benchmark/reference_index
+benchmark/specific_index_count
+benchmark/specific_index_presence
diff --git a/.serena/.gitignore b/.serena/.gitignore
new file mode 100644
index 0000000..2e510af
--- /dev/null
+++ b/.serena/.gitignore
@@ -0,0 +1,2 @@
+/cache
+/project.local.yml
diff --git a/.serena/project.yml b/.serena/project.yml
new file mode 100644
index 0000000..1a35e2f
--- /dev/null
+++ b/.serena/project.yml
@@ -0,0 +1,133 @@
+# the name by which the project can be referenced within Serena
+project_name: "obikmer"
+
+
+# list of languages for which language servers are started; choose from:
+#   al                  angular             ansible             bash                clojure
+#   cpp                 cpp_ccls            crystal             csharp              csharp_omnisharp
+#   dart                elixir              elm                 erlang              fortran
+#   fsharp              go                  groovy              haskell             haxe
+#   hlsl                html                java                json                julia
+#   kotlin              lean4               lua                 luau                markdown
+#   matlab              msl                 nix                 ocaml               pascal
+#   perl                php                 php_phpactor        powershell          python
+#   python_jedi         python_ty           r                   rego                ruby
+#   ruby_solargraph     rust                scala               scss                solidity
+#   svelte              swift               systemverilog       terraform           toml
+#   typescript          typescript_vts      vue                 yaml                zig
+#   (This list may be outdated. For the current list, see values of Language enum here:
+#   https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
+#   For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
+# Note:
+#   - For C, use cpp
+#   - For JavaScript, use typescript
+#   - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
+#   - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
+#   - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
+#   - For Free Pascal/Lazarus, use pascal
+# Special requirements:
+#   Some languages require additional setup/installations.
+#   See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
+# When using multiple languages, the first language server that supports a given file will be used for that file.
+# The first language is the default language and the respective language server will be used as a fallback.
+# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
+languages:
+- rust
+
+# the encoding used by text files in the project
+# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
+encoding: "utf-8"
+
+# line ending convention to use when writing source files.
+# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
+# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
+line_ending:
+
+# The language backend to use for this project.
+# If not set, the global setting from serena_config.yml is used.
+# Valid values: LSP, JetBrains
+# Note: the backend is fixed at startup. If a project with a different backend
+# is activated post-init, an error will be returned.
+language_backend:
+
+# whether to use project's .gitignore files to ignore files
+ignore_all_files_in_gitignore: true
+
+# advanced configuration option allowing to configure language server-specific options.
+# Maps the language key to the options.
+# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
+# No documentation on options means no options are available.
+ls_specific_settings: {}
+
+# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
+# Paths can be absolute or relative to the project root.
+# Each folder is registered as an LSP workspace folder, enabling language servers to discover
+# symbols and references across package boundaries.
+# Currently supported for: TypeScript.
+# Example:
+#   additional_workspace_folders:
+#     - ../sibling-package
+#     - ../shared-lib
+additional_workspace_folders: []
+
+# list of additional paths to ignore in this project.
+# Same syntax as gitignore, so you can use * and **.
+# Note: global ignored_paths from serena_config.yml are also applied additively.
+ignored_paths: []
+
+# whether the project is in read-only mode
+# If set to true, all editing tools will be disabled and attempts to use them will result in an error
+# Added on 2025-04-18
+read_only: false
+
+# list of tool names to exclude.
+# This extends the existing exclusions (e.g. from the global configuration)
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+excluded_tools: []
+
+# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
+# This extends the existing inclusions (e.g. from the global configuration).
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+included_optional_tools: []
+
+# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
+# This cannot be combined with non-empty excluded_tools or included_optional_tools.
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+fixed_tools: []
+
+# list of mode names that are to be activated by default, overriding the setting in the global configuration.
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
+# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
+# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
+# for this project.
+# This setting can, in turn, be overridden by CLI parameters (--mode).
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+default_modes:
+
+# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+added_modes:
+
+# initial prompt for the project. It will always be given to the LLM upon activating the project
+# (contrary to the memories, which are loaded on demand).
+initial_prompt: ""
+
+# time budget (seconds) per tool call for the retrieval of additional symbol information
+# such as docstrings or parameter information.
+# This overrides the corresponding setting in the global configuration; see the documentation there.
+# If null or missing, use the setting from the global configuration.
+symbol_info_budget:
+
+# list of regex patterns which, when matched, mark a memory entry as read‑only.
+# Extends the list from the global configuration, merging the two lists.
+read_only_memory_patterns: []
+
+# list of regex patterns for memories to completely ignore.
+# Matching memories will not appear in list_memories or activate_project output
+# and cannot be accessed via read_memory or write_memory.
+# To access ignored memory files, use the read_file tool on the raw file path.
+# Extends the list from the global configuration, merging the two lists.
+# Example: ["_archive/.*", "_episodes/.*"]
+ignored_memory_patterns: []
diff --git a/CLAUDE.md b/CLAUDE.md
index 6fa8412..c6cac5a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
 ---
 
 Je continue à poser mes questions et à guider la discussion.
+
+---
+
+## MCP Tools
+
+**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
+
+### Hiérarchie des outils pour ce projet Rust
+
+**Navigation et édition de code → serena en priorité**
+- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
+- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
+- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
+- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
+- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
+- Ne pas utiliser `cclsp` quand serena couvre le besoin
+
+**Analyse architecturale → jcodemunch**
+- Hotspots, couplage, dead code, dépendances entre modules
+- Utiliser avant de refactorer une zone critique
+
+**Raisonnement complexe → sequential-thinking**
+- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
+
+**Documentation de crates → context7**
+- Toujours consulter avant d'utiliser une API de bibliothèque externe
diff --git a/Makefile b/Makefile
index e203e6a..0fe6d46 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
 		mkdocs mkdocs-material \
 		mkdocs-mermaid2-plugin \
 		mkdocs-bibtex
+	$(PIP) install --quiet --upgrade InSilicoSeq
 
 # ── obikmer binary ───────────────────────────────────────────────────────────
 
diff --git a/benchmark/Makefile b/benchmark/Makefile
new file mode 100644
index 0000000..5654ecc
--- /dev/null
+++ b/benchmark/Makefile
@@ -0,0 +1,144 @@
+# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
+BINARY  := ../src/target/release/obikmer
+VENV_PY := ../.venv/bin/python3
+
+GENOMES := $(wildcard genomes/*.fna.gz)
+
+# SPECIMENS, SPECIES, and the full dependency graph are generated by
+# make_deps.py from the genome FASTA headers — like .d files in C.
+# Make rebuilds deps.mk whenever genomes/ changes and restarts.
+-include deps.mk
+
+REF_NPZS              := $(SPECIMENS:%=reference_index/%.npz)
+PRESENCE_DONE         := $(SPECIMENS:%=specimen_index_presence/%/index.done)
+PRESENCE_STATS        := $(SPECIMENS:%=stats/indexing_presence/%.stats)
+COUNT_DONE            := $(SPECIMENS:%=specimen_index_count/%/index.done)
+COUNT_STATS           := $(SPECIMENS:%=stats/indexing_count/%.stats)
+VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
+VERIFY_COUNT_STATS    := $(SPECIMENS:%=stats/verify_count/%.stats)
+SPECIFIC_PRESENCE_DONE  := $(SPECIES:%=specific_index_presence/%/index.done)
+SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
+SPECIFIC_COUNT_DONE     := $(SPECIES:%=specific_index_count/%/index.done)
+SPECIFIC_COUNT_STATS    := $(SPECIES:%=stats/specific_kmer_count/%.stats)
+SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
+
+.NOTPARALLEL:
+
+.PHONY: all simulate reference \
+        index_presence index_count \
+        aggregate_index_presence aggregate_index_count \
+        merge_presence merge_count \
+        verify_presence verify_count \
+        aggregate_verify_presence aggregate_verify_count \
+        verify_merge_presence verify_merge_count \
+        filter_presence filter_count \
+        aggregate_filter_presence aggregate_filter_count
+
+verify_merge_presence: stats/verify_merge_presence/current.csv
+verify_merge_count:    stats/verify_merge_count/current.csv
+
+all: aggregate_verify_presence aggregate_verify_count \
+     verify_merge_presence verify_merge_count \
+     aggregate_filter_presence aggregate_filter_count
+
+# ── dependency file ───────────────────────────────────────────────────────────
+
+deps.mk: $(GENOMES)
+	$(VENV_PY) make_deps.py $^ > $@
+
+# ── simulation ────────────────────────────────────────────────────────────────
+# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
+
+$(SIMULATED_READS):
+	bash simulate_one.sh $< $(dir $@)
+
+simulate: $(SIMULATED_READS)
+
+# ── reference kmer sets ───────────────────────────────────────────────────────
+# Prerequisites (reads → npz) are in deps.mk.
+
+reference_index/%.npz:
+	bash build_reference.sh $*
+
+reference: $(REF_NPZS)
+
+# ── per-specimen indexing ─────────────────────────────────────────────────────
+# Prerequisites (reads → index.done + .stats) are in deps.mk.
+
+specimen_index_presence/%/index.done \
+stats/indexing_presence/%.stats &: $(BINARY)
+	bash index_one_presence.sh $*
+
+specimen_index_count/%/index.done \
+stats/indexing_count/%.stats &: $(BINARY)
+	bash index_one_count.sh $*
+
+index_presence: $(PRESENCE_DONE)
+index_count:    $(COUNT_DONE)
+
+# ── indexing stats aggregation ────────────────────────────────────────────────
+
+aggregate_index_presence: $(PRESENCE_STATS)
+	bash aggregate_stats.sh indexing_presence
+
+aggregate_index_count: $(COUNT_STATS)
+	bash aggregate_stats.sh indexing_count
+
+# ── global merge ──────────────────────────────────────────────────────────────
+
+global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
+	bash merge_presence.sh
+
+global_index_count/index.done: $(COUNT_DONE) $(BINARY)
+	bash merge_count.sh
+
+merge_presence: global_index_presence/index.done
+merge_count:    global_index_count/index.done
+
+# ── per-specimen verification ─────────────────────────────────────────────────
+# Prerequisites (index.done + npz → .stats) are in deps.mk.
+
+stats/verify_presence/%.stats:
+	bash verify_one_presence.sh $*
+
+stats/verify_count/%.stats:
+	bash verify_one_count.sh $*
+
+verify_presence: $(VERIFY_PRESENCE_STATS)
+verify_count:    $(VERIFY_COUNT_STATS)
+
+# ── verification stats aggregation ───────────────────────────────────────────
+
+aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
+	bash aggregate_stats.sh verify_presence
+
+aggregate_verify_count: $(VERIFY_COUNT_STATS)
+	bash aggregate_stats.sh verify_count
+
+# ── species-specific indexes ──────────────────────────────────────────────────
+# Prerequisites (global index → specific index) are in deps.mk.
+
+specific_index_presence/%/index.done \
+stats/specific_kmer_presence/%.stats &: $(BINARY)
+	bash filter_one_presence.sh $*
+
+specific_index_count/%/index.done \
+stats/specific_kmer_count/%.stats &: $(BINARY)
+	bash filter_one_count.sh $*
+
+filter_presence: $(SPECIFIC_PRESENCE_DONE)
+filter_count:    $(SPECIFIC_COUNT_DONE)
+
+aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
+	bash aggregate_stats.sh specific_kmer_presence
+
+aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
+	bash aggregate_stats.sh specific_kmer_count
+
+# ── merged index verification ─────────────────────────────────────────────────
+
+stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
+	bash verify_merge_presence.sh
+
+stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
+	bash verify_merge_count.sh
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..04ad741
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,132 @@
+# Benchmark pipeline
+
+Requires **GNU Make ≥ 4.3** (grouped targets `&:`).  On macOS use `gmake`.
+
+```
+gmake all          # full pipeline
+gmake simulate     # simulation only
+gmake reference    # reference kmer sets only
+```
+
+## Pipeline overview
+
+```mermaid
+flowchart TD
+    GENOMES["genomes/*.fna.gz"]
+    BIN["obikmer binary"]
+
+    GENOMES --> simulate
+    simulate --> simdata[("simulated_data/")]
+
+    simdata --> reference
+    reference --> refnpz[("reference_index/*.npz")]
+
+    subgraph presence ["Presence track"]
+        simdata  --> index_presence
+        BIN      --> index_presence
+        index_presence --> pres_done[("specimen_index_presence/")]
+        index_presence --> pres_istats[("stats/indexing_presence/")]
+        pres_istats --> aggregate_index_presence
+
+        pres_done --> merge_presence
+        BIN       --> merge_presence
+        merge_presence --> gpres[("global_index_presence/")]
+
+        refnpz    --> verify_presence
+        pres_done --> verify_presence
+        verify_presence --> vpres_stats[("stats/verify_presence/")]
+        vpres_stats --> aggregate_verify_presence
+
+        gpres --> filter_presence
+        BIN   --> filter_presence
+        filter_presence --> spec_pres[("specific_index_presence/")]
+        filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
+        spec_pres_stats --> aggregate_filter_presence
+
+        refnpz --> verify_merge_presence
+        gpres  --> verify_merge_presence
+        verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
+    end
+
+    subgraph count ["Count track"]
+        simdata --> index_count
+        BIN     --> index_count
+        index_count --> count_done[("specimen_index_count/")]
+        index_count --> count_istats[("stats/indexing_count/")]
+        count_istats --> aggregate_index_count
+
+        count_done --> merge_count
+        BIN        --> merge_count
+        merge_count --> gcount[("global_index_count/")]
+
+        refnpz     --> verify_count
+        count_done --> verify_count
+        verify_count --> vcount_stats[("stats/verify_count/")]
+        vcount_stats --> aggregate_verify_count
+
+        gcount --> filter_count
+        BIN    --> filter_count
+        filter_count --> spec_count[("specific_index_count/")]
+        filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
+        spec_count_stats --> aggregate_filter_count
+
+        refnpz --> verify_merge_count
+        gcount --> verify_merge_count
+        verify_merge_count --> vmc[("stats/verify_merge_count/")]
+    end
+
+    aggregate_verify_presence  --> all
+    aggregate_verify_count     --> all
+    vmp                        --> all
+    vmc                        --> all
+    aggregate_filter_presence  --> all
+    aggregate_filter_count     --> all
+```
+
+## Steps
+
+| Target | Script | Description |
+|---|---|---|
+| `simulate` | `simulate_one.sh` | Simulate sequencing reads from the reference genomes |
+| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
+| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
+| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
+| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
+| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
+| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
+| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
+| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
+| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
+| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
+| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
+| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
+| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
+| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
+| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
+| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
+| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
+
+## Directory layout
+
+```
+benchmark/
+├── genomes/                        # input reference genomes (.fna.gz)
+├── simulated_data/                 # generated by simulate
+│   └── <species>/<specimen>/
+├── reference_index/                # reference kmer sets (.npz)
+├── specimen_index_presence/        # per-specimen presence indexes
+├── specimen_index_count/           # per-specimen count indexes
+├── global_index_presence/          # merged global presence index
+├── global_index_count/             # merged global count index
+├── specific_index_presence/        # species-specific presence indexes
+├── specific_index_count/           # species-specific count indexes
+└── stats/                          # all benchmark statistics
+    ├── indexing_presence/
+    ├── indexing_count/
+    ├── verify_presence/
+    ├── verify_count/
+    ├── specific_kmer_presence/
+    ├── specific_kmer_count/
+    ├── verify_merge_presence/
+    └── verify_merge_count/
+```
diff --git a/benchmark/aggregate_stats.sh b/benchmark/aggregate_stats.sh
new file mode 100755
index 0000000..19901bb
--- /dev/null
+++ b/benchmark/aggregate_stats.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Usage: aggregate_stats.sh TYPE
+# TYPE = indexing_presence | indexing_count | verify_presence | verify_count
+#      | specific_kmer_presence | specific_kmer_count
+# Reads all stats/TYPE/*.stats files (one CSV data row each, no header).
+# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
+# the most recent run CSV (idempotent when nothing changed).
+set -euo pipefail
+# ${1:?...} prints a usage message instead of a bare "unbound variable" error.
+TYPE="${1:?Usage: aggregate_stats.sh TYPE}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
+# CSV header per stats type; the leading "run" column is prepended to every row below.
+case "${TYPE}" in
+    indexing_presence|indexing_count)
+        HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
+        ;;
+    verify_presence)
+        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
+        ;;
+    verify_count)
+        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
+        ;;
+    specific_kmer_presence|specific_kmer_count)
+        HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
+        ;;
+    *)
+        echo "ERROR: unknown stats type '${TYPE}'" >&2
+        exit 1
+        ;;
+esac
+# Without this check, find fails on a missing directory and pipefail aborts silently.
+[[ -d "${STATS_DIR}" ]] || { echo "ERROR: stats directory '${STATS_DIR}' not found" >&2; exit 1; }
+# Find most recent existing run CSV (empty string if none).
+latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1)
+# Check if any .stats file is newer than the latest run CSV.
+if [[ -n "${latest_csv}" ]] && \
+   [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
+    echo "[${TYPE}] stats up to date (${latest_csv})"
+    exit 0
+fi
+# Next run number = how many run CSVs already exist, zero-padded to three digits.
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+echo "${HEADER}" >"${CSV}"
+
+# Sort .stats files by name for reproducible row order.
+while IFS= read -r stats_file; do
+    sed "s/^/${run_n},/" "${stats_file}"
+done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"
+
+echo "[${TYPE}] run ${run_n} → ${CSV}"
diff --git a/benchmark/build_reference.py b/benchmark/build_reference.py
new file mode 100755
index 0000000..eddd3da
--- /dev/null
+++ b/benchmark/build_reference.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""Build a reference kmer index from paired-end FASTQ reads.
+
+Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
+counts their abundances, and saves a sorted numpy pair (kmers, counts).
+
+Output .npz arrays
+  kmers  : uint64, sorted ascending — canonical kmer integers
+  counts : uint32, same order      — raw read abundances
+"""
+import argparse
+import gzip
+import sys
+from collections import defaultdict
+
+import numpy as np
+
+
+# ── encoding ────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,   # 2-bit base codes, case-insensitive
+           'a': 0, 'c': 1, 'g': 2, 't': 3}   # any other character is absent (e.g. N)
+
+# Lookup table: revcomp of one byte (4 bases, 8 bits).
+# Precomputed once at import time.
+_REVCOMP8 = [0] * 256
+for _i in range(256):
+    _rc, _x = 0, _i
+    for _ in range(4):
+        _rc = (_rc << 2) | (3 - (_x & 3))   # complement of code b is 3 - b
+        _x >>= 2                            # lowest base is read first, so order reverses
+    _REVCOMP8[_i] = _rc
+del _i, _rc, _x   # drop the loop temporaries from the module namespace
+
+
+def revcomp_int(kmer: int, k: int) -> int:
+    """Return the reverse complement of `kmer`, a k-base integer (2 bits/base).
+
+    Consumes the low byte (4 bases) of `kmer` per step through `_REVCOMP8`.
+    """
+    rc = 0
+    bits_left = 2 * k
+    while bits_left > 0:
+        chunk = min(8, bits_left)   # the final step may cover fewer than 4 bases
+        rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk)   # shift out bits beyond the kmer
+        rc = (rc << chunk) | rc_byte   # lowest bases of kmer end up highest in rc
+        kmer >>= chunk
+        bits_left -= chunk
+    return rc
+
+
+# ── FASTQ parsing ────────────────────────────────────────────────────────────
+
+def iter_sequences(path: str):
+    """Yield the sequence line of each 4-line record of a FASTQ file (gzip if '.gz')."""
+    opener = gzip.open if path.endswith('.gz') else open   # decided by file name only
+    with opener(path, 'rt') as fh:
+        while True:   # fixed 4-line records; wrapped (multi-line) FASTQ is not handled
+            if not fh.readline():   # '@' header; empty string means end of file
+                break
+            seq = fh.readline().rstrip('\n')
+            fh.readline()           # '+'
+            fh.readline()           # quality
+            yield seq
+
+
+# ── kmer counting ────────────────────────────────────────────────────────────
+
+def count_kmers(paths: list[str], k: int) -> dict[int, int]:
+    """Count canonical kmers of length `k` over all reads of the FASTQ files in `paths`."""
+    mask = (1 << (2 * k)) - 1   # keeps the low 2k bits, i.e. the last k bases
+    counts: dict[int, int] = defaultdict(int)
+    n_reads = 0
+    for path in paths:
+        for seq in iter_sequences(path):
+            n_reads += 1
+            kmer = 0
+            run = 0          # consecutive valid bases
+
+            for c in seq:
+                b = _ENCODE.get(c)
+                if b is None:    # N or unexpected character → reset
+                    kmer = 0
+                    run = 0
+                    continue
+                kmer = ((kmer << 2) | b) & mask   # rolling window: shift in the new base
+                run += 1
+                if run >= k:   # a full window of k valid bases is available
+                    rc = revcomp_int(kmer, k)
+                    counts[kmer if kmer <= rc else rc] += 1   # canonical = min(kmer, rc)
+
+            if n_reads % 100_000 == 0:   # progress report on stderr
+                print(f'  {n_reads:,} reads processed, '
+                      f'{len(counts):,} distinct kmers so far',
+                      file=sys.stderr)
+
+    print(f'  {n_reads:,} reads total, {len(counts):,} distinct kmers',
+          file=sys.stderr)
+    return counts
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reads', nargs='+', metavar='FASTQ',
+                    help='Input reads (FASTQ, gzip OK)')
+    ap.add_argument('-k', '--kmer-size', type=int, default=31,
+                    metavar='K')
+    ap.add_argument('--min-abundance', type=int, default=1,
+                    metavar='N', help='Drop kmers with count < N (default 1)')
+    ap.add_argument('-o', '--output', required=True,
+                    metavar='FILE', help='Output .npz path')
+    args = ap.parse_args()
+
+    print(f'k={args.kmer_size}  files={len(args.reads)}', file=sys.stderr)
+    counts = count_kmers(args.reads, args.kmer_size)
+
+    if args.min_abundance > 1:
+        before = len(counts)
+        counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
+        print(f'  min-abundance={args.min_abundance}: '
+              f'{before - len(counts):,} kmers dropped, '
+              f'{len(counts):,} retained',
+              file=sys.stderr)
+
+    print(f'Sorting and saving → {args.output}', file=sys.stderr)
+    kmers_arr  = np.fromiter(sorted(counts), dtype=np.uint64, count=len(counts))
+    counts_arr = np.array([counts[int(k)] for k in kmers_arr], dtype=np.uint32)
+
+    np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
+    print(f'Done  {len(kmers_arr):,} kmers  →  {args.output}', file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/build_reference.sh b/benchmark/build_reference.sh
new file mode 100755
index 0000000..3d312c1
--- /dev/null
+++ b/benchmark/build_reference.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Builds one reference kmer set (.npz) per simulated specimen with build_reference.py.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+BUILD_PY="${SCRIPT_DIR}/build_reference.py"
+# Both settings below may be overridden from the environment.
+KMER_SIZE="${KMER_SIZE:-31}"
+MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
+
+mkdir -p "${REF_DIR}"
+# Layout walked: simulated_data/<species>/<strain>/reads_R{1,2}.fastq.gz
+for species_dir in "${SIMDATA_DIR}"/*/; do
+    [[ -d "${species_dir}" ]] || continue   # an unmatched glob stays literal; skip it
+    species=$(basename "${species_dir}")
+
+    for strain_dir in "${species_dir}"*/; do
+        [[ -d "${strain_dir}" ]] || continue   # same guard for the inner glob
+        strain=$(basename "${strain_dir}")
+        # strain_dir already ends in '/', so the paths below contain '//' (harmless).
+        r1="${strain_dir}/reads_R1.fastq.gz"
+        r2="${strain_dir}/reads_R2.fastq.gz"
+        if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+            echo "SKIP ${species}--${strain}: reads not found" >&2
+            continue
+        fi
+        # The output name joins species and strain with '--'.
+        out="${REF_DIR}/${species}--${strain}.npz"
+        echo "[${species}--${strain}] → ${out}"
+
+        "${PYTHON}" "${BUILD_PY}" \
+            --kmer-size      "${KMER_SIZE}" \
+            --min-abundance  "${MIN_ABUNDANCE}" \
+            --output         "${out}" \
+            "${r1}" "${r2}"
+    done
+done
diff --git a/benchmark/deps.mk b/benchmark/deps.mk
new file mode 100644
index 0000000..031dd59
--- /dev/null
+++ b/benchmark/deps.mk
@@ -0,0 +1,199 @@
+SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
+SPECIES   := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
+
+# Escherichia_coli--K-12_MG1655
+simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
+reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
+stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
+
+# Escherichia_coli--EDL933
+simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
+reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
+stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
+
+# Salmonella_enterica--LT2
+simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
+reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
+stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
+
+# Escherichia_coli--CFT073
+simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
+reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
+stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
+
+# Bacillus_subtilis--168
+simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
+reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
+stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
+
+# Salmonella_enterica--P125109
+simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
+reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
+stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
+
+# Shouchella_clausii--KSM-K16
+simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
+reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
+stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
+
+# Escherichia_coli--K-12_W3110
+simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
+reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
+stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
+
+# Klebsiella_pneumoniae--MGH_78578
+simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
+stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
+
+# Opitutus_terrae--PB90-1
+simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
+reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
+stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
+
+# Saccharolobus_islandicus--M.16.4
+simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
+reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
+stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
+
+# Acidobacterium_capsulatum--ATCC_51196
+simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
+reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
+stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
+
+# Salmonella_enterica--AKU_12601
+simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
+reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
+stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
+
+# Proteus_mirabilis--HI4320
+simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
+reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
+stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
+
+# Salmonella_enterica--CT18
+simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
+reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
+stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
+
+# Klebsiella_pneumoniae--HS11286
+simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
+stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
+
+# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
+simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
+reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
+stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
+
+# Klebsiella_pneumoniae--ATCC_13883
+simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
+stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
+
+# Yersinia_ruckeri--YRB
+simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
+reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
+stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
+
+# Candidozyma_auris--GCF_003013715.1_ASM301371v2
+simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
+reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
+stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
+
+# Escherichia_coli
+specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
+specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
+# Salmonella_enterica
+specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
+specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
+# Bacillus_subtilis
+specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
+specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
+# Shouchella_clausii
+specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
+specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
+# Klebsiella_pneumoniae
+specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
+specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
+# Opitutus_terrae
+specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
+specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
+# Saccharolobus_islandicus
+specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
+specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
+# Acidobacterium_capsulatum
+specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
+specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
+# Proteus_mirabilis
+specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
+specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
+# Wolbachia_endosymbiont
+specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
+specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
+# Yersinia_ruckeri
+specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
+specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
+# Candidozyma_auris
+specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
+specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
diff --git a/benchmark/downloads.sh b/benchmark/downloads.sh
new file mode 100755
index 0000000..d86111e
--- /dev/null
+++ b/benchmark/downloads.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+assemblies=(
+    GCF_000005845.2
+    GCF_000010245.2
+    GCF_000007445.1
+    GCF_000006665.1
+
+    GCF_000006945.2
+    GCF_000195995.1
+    GCF_000009505.1
+    GCF_000026565.1
+
+    GCF_000016305.1
+    GCF_000019965.1
+    GCF_000240185.1
+    GCF_000742135.1
+
+    GCF_000069965.1
+    GCF_000022565.1
+    GCF_000306885.1
+    GCF_003013715.1
+
+    GCF_000009045.1
+    GCF_000009825.1
+    GCF_000022445.1
+    GCF_000834255.1
+)
+
+mkdir -p genomes
+
+for acc in "${assemblies[@]}"; do
+    echo "Downloading ${acc}"
+    # Download the archive (genome sequences only) with the datasets CLI.
+    datasets download genome accession "${acc}" \
+        --include genome \
+        --filename "${acc}.zip"
+    # Unpack, then pass every extracted FASTA through obiconvert into genomes/.
+    unzip -q "${acc}.zip" -d "${acc}"
+    find "${acc}" -name "*.fna" |
+        while IFS= read -r file; do
+            obiconvert -Z "${file}" >"genomes/$(basename "${file}").gz"
+        done
+    # The archive and its extracted tree are no longer needed.
+    rm -rf "${acc}" "${acc}.zip"
+done
diff --git a/benchmark/filter_one_count.sh b/benchmark/filter_one_count.sh
new file mode 100755
index 0000000..115ed3c
--- /dev/null
+++ b/benchmark/filter_one_count.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Usage: filter_one_count.sh SPECIES
+# Filters global_index_count to keep only kmers specific to SPECIES,
+# then selects the SPECIES column in-place.
+# Outputs:
+#   specific_index_count/SPECIES/index.done  (written by obikmer select)
+#   stats/specific_kmer_count/SPECIES.stats  (one CSV data row, no header)
+set -euo pipefail
+
+SPECIES="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+SOURCE="${SCRIPT_DIR}/global_index_count"
+OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
+STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
+STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIES}] filter (count) → ${OUTPUT}"
+
+LOG_FILTER=$(mktemp)
+LOG_SELECT=$(mktemp)
+trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
+# stderr is kept for stats parsing; replay it on failure, else `set -e` exits silently.
+"${BINARY}" filter \
+    --output "${OUTPUT}" \
+    --force \
+    --ingroup "species=${SPECIES}" \
+    --outgroup all \
+    --min-frac 0.5 \
+    --max-frac 1.0 \
+    --max-outgroup-count 0 \
+    "${SOURCE}" \
+    2>"${LOG_FILTER}" || { rc=$?; cat "${LOG_FILTER}" >&2; exit "${rc}"; }
+
+cat "${LOG_FILTER}" >&2
+
+"${BINARY}" select \
+    --in-place \
+    --group "${SPECIES}:species=${SPECIES}" \
+    --group-op "${SPECIES}:any" \
+    --select "${SPECIES}" \
+    "${OUTPUT}" \
+    2>"${LOG_SELECT}" || { rc=$?; cat "${LOG_SELECT}" >&2; exit "${rc}"; }
+
+cat "${LOG_SELECT}" >&2
+
+python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+def parse_reporter(logfile):
+    stats = {}
+    state = 'scan'
+    with open(logfile, errors='replace') as fh:
+        for raw in fh:
+            line = strip_ansi(raw.rstrip('\n'))
+            s    = line.strip()
+            if state == 'scan':
+                if re.search(r'\bstage\b.*\bwall\b', line):
+                    state = 'in_header'
+            elif state == 'in_header':
+                if is_sep(s): state = 'rows'
+            elif state == 'rows':
+                if is_sep(s): state = 'total'
+                elif s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 4:
+                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+            elif state == 'total':
+                if s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 3:
+                        stats['TOTAL'] = (parse_wall(parts[1]),
+                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
+                break
+    return stats
+
+f = parse_reporter(log_filter)
+s = parse_reporter(log_select)
+
+row = [species]
+for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
+    key = 'TOTAL' if stage.endswith('_total') else stage
+    w, r = d.get(key, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+print(','.join(row))
+PYEOF
diff --git a/benchmark/filter_one_presence.sh b/benchmark/filter_one_presence.sh
new file mode 100755
index 0000000..12099ce
--- /dev/null
+++ b/benchmark/filter_one_presence.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Usage: filter_one_presence.sh SPECIES
+# Filters global_index_presence to keep only kmers specific to SPECIES,
+# then selects the SPECIES column in-place.
+# Outputs:
+#   specific_index_presence/SPECIES/index.done  (written by obikmer select)
+#   stats/specific_kmer_presence/SPECIES.stats  (one CSV data row, no header)
+set -euo pipefail
+
+SPECIES="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+SOURCE="${SCRIPT_DIR}/global_index_presence"
+OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
+STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
+STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIES}] filter (presence) → ${OUTPUT}"
+
+LOG_FILTER=$(mktemp)
+LOG_SELECT=$(mktemp)
+trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
+# stderr is kept for stats parsing; replay it on failure, else `set -e` exits silently.
+"${BINARY}" filter \
+    --output "${OUTPUT}" \
+    --force \
+    --ingroup "species=${SPECIES}" \
+    --outgroup all \
+    --min-frac 0.5 \
+    --max-frac 1.0 \
+    --max-outgroup-count 0 \
+    "${SOURCE}" \
+    2>"${LOG_FILTER}" || { rc=$?; cat "${LOG_FILTER}" >&2; exit "${rc}"; }
+
+cat "${LOG_FILTER}" >&2
+
+"${BINARY}" select \
+    --in-place \
+    --group "${SPECIES}:species=${SPECIES}" \
+    --group-op "${SPECIES}:any" \
+    --select "${SPECIES}" \
+    "${OUTPUT}" \
+    2>"${LOG_SELECT}" || { rc=$?; cat "${LOG_SELECT}" >&2; exit "${rc}"; }
+
+cat "${LOG_SELECT}" >&2
+
+python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+def parse_reporter(logfile):
+    stats = {}
+    state = 'scan'
+    with open(logfile, errors='replace') as fh:
+        for raw in fh:
+            line = strip_ansi(raw.rstrip('\n'))
+            s    = line.strip()
+            if state == 'scan':
+                if re.search(r'\bstage\b.*\bwall\b', line):
+                    state = 'in_header'
+            elif state == 'in_header':
+                if is_sep(s): state = 'rows'
+            elif state == 'rows':
+                if is_sep(s): state = 'total'
+                elif s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 4:
+                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+            elif state == 'total':
+                if s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 3:
+                        stats['TOTAL'] = (parse_wall(parts[1]),
+                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
+                break
+    return stats
+
+f = parse_reporter(log_filter)
+s = parse_reporter(log_select)
+
+row = [species]
+for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
+    key = 'TOTAL' if stage.endswith('_total') else stage
+    w, r = d.get(key, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+print(','.join(row))
+PYEOF
diff --git a/benchmark/index_one_count.sh b/benchmark/index_one_count.sh
new file mode 100755
index 0000000..325ec7f
--- /dev/null
+++ b/benchmark/index_one_count.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Usage: index_one_count.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Outputs:
+#   specimen_index_count/SPECIMEN/index.done  (written by obikmer)
+#   stats/indexing_count/SPECIMEN.stats       (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
+INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+r1="${READS_DIR}/reads_R1.fastq.gz"
+r2="${READS_DIR}/reads_R2.fastq.gz"
+if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+    echo "ERROR: reads not found in ${READS_DIR}" >&2
+    exit 1
+fi
+
+echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+# stderr is kept for stats parsing; replay it on failure, else `set -e` exits silently.
+"${BINARY}" index \
+    --output "${INDEX_PATH}" \
+    --force \
+    --theta 0 \
+    --with-counts \
+    --label "${SPECIMEN}" \
+    --meta  "species=${species}" \
+    "${r1}" "${r2}" \
+    2>"${STDERR_LOG}" || { rc=$?; cat "${STDERR_LOG}" >&2; exit "${rc}"; }
+
+cat "${STDERR_LOG}" >&2
+
+python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s): state = 'rows'
+        elif state == 'rows':
+            if is_sep(s): state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
+row = [species, strain]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
diff --git a/benchmark/index_one_presence.sh b/benchmark/index_one_presence.sh
new file mode 100755
index 0000000..029c537
--- /dev/null
+++ b/benchmark/index_one_presence.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Usage: index_one_presence.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Outputs:
+#   specimen_index_presence/SPECIMEN/index.done  (written by obikmer)
+#   stats/indexing_presence/SPECIMEN.stats       (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
+INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+r1="${READS_DIR}/reads_R1.fastq.gz"
+r2="${READS_DIR}/reads_R2.fastq.gz"
+if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+    echo "ERROR: reads not found in ${READS_DIR}" >&2
+    exit 1
+fi
+
+echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+# stderr is kept for stats parsing; replay it on failure, else `set -e` exits silently.
+"${BINARY}" index \
+    --output "${INDEX_PATH}" \
+    --force \
+    --theta 0 \
+    --label "${SPECIMEN}" \
+    --meta  "species=${species}" \
+    "${r1}" "${r2}" \
+    2>"${STDERR_LOG}" || { rc=$?; cat "${STDERR_LOG}" >&2; exit "${rc}"; }
+
+cat "${STDERR_LOG}" >&2
+
+python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s): state = 'rows'
+        elif state == 'rows':
+            if is_sep(s): state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
+row = [species, strain]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
diff --git a/benchmark/make_deps.py b/benchmark/make_deps.py
new file mode 100644
index 0000000..03f7e2a
--- /dev/null
+++ b/benchmark/make_deps.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
+
+Like C .d files: only target: prerequisites lines, no recipes.
+Recipes stay in the Makefile as generic rules.
+
+Usage:
+    make_deps.py GENOME.fna.gz ...            print the deps.mk content
+    make_deps.py --dir-for GENOME.fna.gz ...  print one simulated-reads
+                                              directory per genome
+"""
+import gzip
+import re
+import sys
+from pathlib import Path
+
+# Tokens that end a strain name inside a sequence definition.
+STOP_WORDS    = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
+                 'endosymbiont', 'of'}
+STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
+
+
+def is_stop(tok):
+    """Return True if *tok* is a stop word or starts with a stop prefix."""
+    t = tok.lower()
+    return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)
+
+
+def sanitize(s):
+    """Replace characters outside [A-Za-z0-9._-] by '_' and trim edge '_'."""
+    return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')
+
+
+def collect_tokens(text):
+    """Join the sanitized words of *text* with '_', stopping at a stop token."""
+    parts = []
+    for tok in text.split():
+        tok = tok.rstrip(',.')
+        if is_stop(tok):
+            break
+        parts.append(sanitize(tok))
+    return '_'.join(filter(None, parts))
+
+
+def parse_organism(defn, gcf_id):
+    """Derive (species, strain) from a sequence definition.
+
+    species is built from the first two words of *defn*.  strain comes, in
+    order of preference, from a "str. X [substr. Y]" clause, from the words
+    following "strain", or from the words after the binomial (minus a leading
+    "subsp. X" / "serovar X").  *gcf_id* replaces any component that would
+    otherwise be empty.
+    """
+    words   = defn.split()
+    # A definition shorter than two words (e.g. a bare accession) used to
+    # raise IndexError here; take what is available, else the assembly id.
+    species = sanitize('_'.join(words[:2])) or gcf_id
+
+    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
+    if m:
+        strain = sanitize(m.group(1))
+        if m.group(2):
+            strain += '_' + sanitize(m.group(2))
+        return species, strain
+
+    m = re.search(r'\bstrain\b\s+(.*)', defn)
+    if m:
+        strain = collect_tokens(m.group(1))
+        if strain:
+            return species, strain
+
+    # Whatever follows the binomial, without subspecies / serovar qualifiers.
+    remainder = ' '.join(words[2:])
+    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
+    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
+    strain    = collect_tokens(remainder)
+    return species, strain if strain else gcf_id
+
+
+def first_definition(path):
+    """Return the definition of the first record of the gzipped FASTA *path*.
+
+    The text is taken from a "definition":"..." annotation on the first header
+    line; without one, the first word of the header is used.  A header with no
+    word, or a file with no header at all, yields the file stem.
+    """
+    with gzip.open(path, 'rt') as fh:
+        for line in fh:
+            if line.startswith('>'):
+                m = re.search(r'"definition":"([^"]*)"', line)
+                if m:
+                    return m.group(1)
+                tokens = line[1:].split()
+                return tokens[0] if tokens else Path(path).stem
+    return Path(path).stem
+
+
+def _describe(path):
+    """Return (specimen, species, sim_dir) for one genome file."""
+    gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
+    sp, st = parse_organism(first_definition(path), gcf_id)
+    return f'{sp}--{st}', sp, f'simulated_data/{sp}/{st}'
+
+
+def main():
+    """Print deps.mk for the genomes on the command line.
+
+    With a leading --dir-for (the form simulate.sh invokes), print only the
+    simulated-reads directory of each genome, one per line, in argument order.
+    """
+    args = sys.argv[1:]
+    if args[:1] == ['--dir-for']:
+        for path in args[1:]:
+            print(_describe(path)[2])
+        return
+
+    entries = []   # (specimen, species, sim_dir, genome_path)
+    species_seen = []
+
+    for path in sorted(args):
+        specimen, sp, sim_dir = _describe(path)
+        entries.append((specimen, sp, sim_dir, path))
+        if sp not in species_seen:
+            species_seen.append(sp)
+
+    specimens = [e[0] for e in entries]
+    print('SPECIMENS :=', ' '.join(specimens))
+    print('SPECIES   :=', ' '.join(species_seen))
+
+    for specimen, species, sim_dir, genome in entries:
+        reads = f'{sim_dir}/reads_R1.fastq.gz'
+        p_done  = f'specimen_index_presence/{specimen}/index.done'
+        p_stats = f'stats/indexing_presence/{specimen}.stats'
+        c_done  = f'specimen_index_count/{specimen}/index.done'
+        c_stats = f'stats/indexing_count/{specimen}.stats'
+        ref     = f'reference_index/{specimen}.npz'
+        vp      = f'stats/verify_presence/{specimen}.stats'
+        vc      = f'stats/verify_count/{specimen}.stats'
+
+        print()
+        print(f'# {specimen}')
+        print(f'{reads}: {genome}')
+        print(f'{ref}: {reads}')
+        print(f'{p_done} {p_stats}: {reads}')
+        print(f'{c_done} {c_stats}: {reads}')
+        print(f'{vp}: {ref} {p_done}')
+        print(f'{vc}: {ref} {c_done}')
+
+    print()
+    for sp in species_seen:
+        sp_done  = f'specific_index_presence/{sp}/index.done'
+        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
+        sc_done  = f'specific_index_count/{sp}/index.done'
+        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
+        print(f'# {sp}')
+        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
+        print(f'{sc_done} {sc_stats}: global_index_count/index.done')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/merge_count.sh b/benchmark/merge_count.sh
new file mode 100755
index 0000000..871b436
--- /dev/null
+++ b/benchmark/merge_count.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
+OUTPUT="${SCRIPT_DIR}/global_index_count"
+STATS_DIR="${SCRIPT_DIR}/stats/merge_count"
+
+mkdir -p "${STATS_DIR}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
+
+parse_reporter() {
+    local run="$1" n_sources="$2" logfile="$3"
+    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
+import sys, re
+
+run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s):
+                state = 'rows'
+        elif state == 'rows':
+            if is_sep(s):
+                state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
+row = [run, n_sources]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+}
+
+mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
+
+if [[ ${#sources[@]} -eq 0 ]]; then
+    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
+    exit 1
+fi
+
+echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
+printf '  %s\n' "${sources[@]}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+# stderr is kept for stats parsing; replay it on failure, else `set -e` exits silently.
+"${BINARY}" merge \
+    --output  "${OUTPUT}" \
+    --force \
+    "${sources[@]}" \
+    2>"${STDERR_LOG}" || { rc=$?; cat "${STDERR_LOG}" >&2; exit "${rc}"; }
+
+cat "${STDERR_LOG}" >&2
+parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
+
+echo "Done. Run ${run_n} → ${CSV}"
diff --git a/benchmark/merge_presence.sh b/benchmark/merge_presence.sh
new file mode 100755
index 0000000..7a816d1
--- /dev/null
+++ b/benchmark/merge_presence.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
+OUTPUT="${SCRIPT_DIR}/global_index_presence"
+STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"
+
+mkdir -p "${STATS_DIR}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
+
+parse_reporter() {
+    local run="$1" n_sources="$2" logfile="$3"
+    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
+import sys, re
+
+run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s):
+                state = 'rows'
+        elif state == 'rows':
+            if is_sep(s):
+                state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
+row = [run, n_sources]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+}
+
+mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
+
+if [[ ${#sources[@]} -eq 0 ]]; then
+    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
+    exit 1
+fi
+
+echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
+printf '  %s\n' "${sources[@]}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+# stderr is kept for stats parsing; replay it on failure, else `set -e` exits silently.
+"${BINARY}" merge \
+    --output          "${OUTPUT}" \
+    --force \
+    --force-presence \
+    "${sources[@]}" \
+    2>"${STDERR_LOG}" || { rc=$?; cat "${STDERR_LOG}" >&2; exit "${rc}"; }
+
+cat "${STDERR_LOG}" >&2
+parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
+
+echo "Done. Run ${run_n} → ${CSV}"
diff --git a/benchmark/simulate.sh b/benchmark/simulate.sh
new file mode 100755
index 0000000..c486255
--- /dev/null
+++ b/benchmark/simulate.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Simulate all genomes. Delegates to simulate_one.sh per genome.
+# Prefer running via `gmake simulate` which handles individual dependencies.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+for genome_file in "${SCRIPT_DIR}"/genomes/*.fna.gz; do
+    out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
+        --dir-for "${genome_file}")
+    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
+done
diff --git a/benchmark/simulate_one.sh b/benchmark/simulate_one.sh
new file mode 100644
index 0000000..d4c4c1a
--- /dev/null
+++ b/benchmark/simulate_one.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Usage: simulate_one.sh genome.fna.gz output_dir
+# Simulates paired-end HiSeq reads for a single genome.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ISS="${SCRIPT_DIR}/../.venv/bin/iss"
+COVERAGE=15
+READ_LENGTH=150
+CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
+
+genome_file="$1"
+out_dir="$2"
+
+mkdir -p "${out_dir}"
+
+tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")  # Xs must be trailing for BSD/macOS mktemp
+trap 'rm -rf "${tmp_dir}"' EXIT
+tmp_fasta="${tmp_dir}/genome.fna"
+gzip -dc "${genome_file}" > "${tmp_fasta}"
+
+genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
+n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
+
+echo "[${out_dir}]  genome=${genome_size} bp  →  ${n_reads} read pairs  (${COVERAGE}x HiSeq)"
+
+"${ISS}" generate \
+    --genomes   "${tmp_fasta}" \
+    --model     HiSeq \
+    --n_reads   "${n_reads}" \
+    --cpus      "${CPUS}" \
+    --compress \
+    --output    "${out_dir}/reads"
diff --git a/benchmark/verify_count.py b/benchmark/verify_count.py
new file mode 100755
index 0000000..0b204e0
--- /dev/null
+++ b/benchmark/verify_count.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+"""Compare an obikmer count index against a reference kmer set (presence + counts).
+
+Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
+streams `obikmer dump` from a --with-counts index, then reports:
+  - false negatives : kmers in reference absent from the index
+  - false positives : kmers in the index absent from the reference
+  - count mismatches: kmers present in both but with differing counts
+
+Output to stdout: one CSV row
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
+  fn_pct,fp_pct,cm_pct
+"""
+import argparse
+import subprocess
+import sys
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── dump parsing ──────────────────────────────────────────────────────────────
+
+def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
+    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32)."""
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+    kmers, counts = [], []
+    next(proc.stdout, None)   # skip the CSV header row
+    for line in proc.stdout:
+        parts = line.rstrip('\n').split(',')
+        kmers.append(encode_kmer(parts[0]))
+        counts.append(int(parts[1]))
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+    # Convert each list exactly once: the kmer array is both the sort key and
+    # the payload, so building it twice only doubled the peak memory.
+    kmer_arr  = np.array(kmers, dtype=np.uint64)
+    count_arr = np.array(counts, dtype=np.uint32)
+    order = np.argsort(kmer_arr, kind='stable')
+    return kmer_arr[order], count_arr[order]
+
+
+# ── comparison ────────────────────────────────────────────────────────────────
+
+def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
+            idx_kmers: np.ndarray, idx_counts: np.ndarray,
+            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).
+
+    All arrays sorted; cm_* cover kmers present in both arrays but with
+    differing counts.  An empty reference or index is handled.
+    """
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    # Count mismatches among shared kmers.  The shared kmers are selected
+    # first, so searchsorted finds every one of them and never points past the
+    # end of idx_kmers (clipping the raw positions failed on an empty index).
+    shared_mask = np.isin(ref_kmers, idx_kmers, assume_unique=True)
+    pos_in_idx  = np.searchsorted(idx_kmers, ref_kmers[shared_mask])
+
+    shared_ref_counts = ref_counts[shared_mask]
+    shared_idx_counts = idx_counts[pos_in_idx]
+    mismatch_mask     = shared_ref_counts != shared_idx_counts
+
+    cm_kmers      = ref_kmers[shared_mask][mismatch_mask]
+    cm_ref_counts = shared_ref_counts[mismatch_mask]
+    cm_idx_counts = shared_idx_counts[mismatch_mask]
+
+    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?',
+                    help='Reference .npz file')
+    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?',
+                    help='obikmer index directory (built with --with-counts)')
+    ap.add_argument('--obikmer',  default='obikmer',
+                    help='Path to obikmer binary')
+    ap.add_argument('--species',  default='')
+    ap.add_argument('--strain',   default='')
+    ap.add_argument('--header',   action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fp',  metavar='FILE',
+                    help='Save false-positive kmer strings to FILE')
+    ap.add_argument('--save-fn',  metavar='FILE',
+                    help='Save false-negative kmer strings to FILE')
+    ap.add_argument('--save-cm',  metavar='FILE',
+                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,count_mismatch,'
+              'fn_pct,fp_pct,cm_pct')
+        return
+
+    # Detect k
+    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
+    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    # Load reference
+    print(f'Loading reference: {args.reference}', file=sys.stderr)
+    npz = np.load(args.reference)
+    ref_kmers  = npz['kmers']    # sorted uint64
+    ref_counts = npz['counts']   # uint32
+
+    # Load index
+    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
+    idx_kmers, idx_counts = load_index(args.obikmer, args.index)
+
+    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
+
+    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
+        ref_kmers, ref_counts, idx_kmers, idx_counts)
+
+    n_shared  = len(ref_kmers) - len(false_neg)
+    fn_pct    = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct    = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+    cm_pct    = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
+
+    print(f'false negatives : {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
+    print(f'false positives : {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
+    print(f'count mismatches: {len(cm_kmers):,}  ({cm_pct:.4f}% of shared)',
+          file=sys.stderr)
+
+    if args.save_fn and len(false_neg):
+        with open(args.save_fn, 'w') as fh:
+            for v in false_neg:
+                fh.write(decode_kmer(int(v), k) + '\n')
+
+    if args.save_fp and len(false_pos):
+        with open(args.save_fp, 'w') as fh:
+            for v in false_pos:
+                fh.write(decode_kmer(int(v), k) + '\n')
+
+    if args.save_cm and len(cm_kmers):
+        with open(args.save_cm, 'w') as fh:
+            fh.write('kmer,ref_count,idx_count\n')
+            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
+                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')
+
+    print(f'{args.species},{args.strain},'
+          f'{len(ref_kmers)},{len(idx_kmers)},'
+          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
+          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/verify_merge_count.py b/benchmark/verify_merge_count.py
new file mode 100755
index 0000000..72518a1
--- /dev/null
+++ b/benchmark/verify_merge_count.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""Verify the merged count index against all per-specimen reference sets.
+
+Streams `obikmer dump` once on the merged index, accumulates per-specimen
+kmer+count pairs from each column, then compares each against its reference .npz.
+
+Output to stdout: one CSV row per specimen (same columns as verify_count.py)
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
+  fn_pct,fp_pct,cm_pct
+"""
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:  # pack a nucleotide string into an int, 2 bits per base
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]  # first base ends up in the highest bits; KeyError if c is not in _ENCODE
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    """Unpack a 2-bit-per-base integer into its k-letter nucleotide string."""
+    # The first base sits in the highest bit pair, so walk the shifts downward:
+    # 2*(k-1), 2*(k-2), ..., 0.  The range is empty when k <= 0.
+    shifts = range(2 * (k - 1), -1, -2)
+    return ''.join(_DECODE[(val >> shift) & 3] for shift in shifts)
+
+
+# ── single-pass dump ──────────────────────────────────────────────────────────
+
+def stream_merged_dump(obikmer_bin: str, index_dir: str,
+                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
+    """Run `obikmer dump` on the merged index and split its rows per specimen.
+
+    Returns:
+        specimen_names : header labels after the first column, in dump order
+        per_specimen   : label → (encoded kmers, counts), keeping only counts > 0
+    """
+    proc = subprocess.Popen([obikmer_bin, 'dump', index_dir], text=True,
+                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+    # The first line is the CSV header; its first field labels the kmer column.
+    specimen_names = proc.stdout.readline().rstrip('\n').split(',')[1:]
+    per_specimen: dict[str, tuple[list[int], list[int]]] = {
+        label: ([], []) for label in specimen_names}
+    # Resolve each column's accumulators once instead of once per row.
+    columns = [per_specimen[label] for label in specimen_names]
+
+    for row in proc.stdout:
+        fields = row.rstrip('\n').split(',')
+        kmer_int = encode_kmer(fields[0])
+        for col, (kmers, counts) in enumerate(columns, start=1):
+            value = int(fields[col])
+            if value > 0:
+                kmers.append(kmer_int)
+                counts.append(value)
+
+    # The exit status is only known once the pipe has been drained.
+    if proc.wait() != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+
+    return specimen_names, per_specimen
+
+
+# ── per-specimen comparison ───────────────────────────────────────────────────
+
+def compare_specimen(name: str,
+                     kmer_list: list[int],
+                     count_list: list[int],
+                     ref_dir: Path,
+                     k: int,
+                     save_fn: Path | None,
+                     save_fp: Path | None,
+                     save_cm: Path | None,
+                     ) -> str:
+    """Compare one dump column with its reference .npz; return a CSV row ('' if none exists)."""
+    ref_path = ref_dir / f'{name}.npz'
+    if not ref_path.exists():
+        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
+        return ''
+
+    species = name.split('--')[0]
+    strain  = name[len(species) + 2:]
+
+    npz        = np.load(ref_path)
+    ref_kmers  = npz['kmers']    # sorted uint64
+    ref_counts = npz['counts']   # uint32
+
+    idx_kmers  = np.array(kmer_list, dtype=np.uint64)   # converted once, sorted just below
+    order      = np.argsort(idx_kmers, kind='stable')
+    idx_kmers, idx_counts = idx_kmers[order], np.array(count_list, dtype=np.uint32)[order]
+
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    # Count mismatches among shared kmers.  intersect1d yields each shared kmer's position
+    # in both arrays and, unlike searchsorted + clip, still works when a side is empty.
+    _, in_ref, in_idx = np.intersect1d(ref_kmers, idx_kmers,
+                                       assume_unique=True, return_indices=True)
+    mismatch = ref_counts[in_ref] != idx_counts[in_idx]
+    cm_kmers = ref_kmers[in_ref][mismatch]
+    cm_ref, cm_idx = ref_counts[in_ref][mismatch], idx_counts[in_idx][mismatch]
+
+    n_shared = len(in_ref)
+    fn_pct   = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct   = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+    cm_pct   = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
+
+    print(f'  {name}: ref={len(ref_kmers):,}  idx={len(idx_kmers):,}  '
+          f'fn={len(false_neg):,} ({fn_pct:.4f}%)  '
+          f'fp={len(false_pos):,} ({fp_pct:.4f}%)  '
+          f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
+          file=sys.stderr)
+
+    if save_fn and len(false_neg):
+        fn_file = save_fn / f'{name}_fn.txt'
+        fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
+
+    if save_fp and len(false_pos):
+        fp_file = save_fp / f'{name}_fp.txt'
+        fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
+
+    if save_cm and len(cm_kmers):
+        cm_file = save_cm / f'{name}_cm.csv'
+        lines = ['kmer,ref_count,idx_count']
+        for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
+            lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
+        cm_file.write_text('\n'.join(lines) + '\n')
+
+    return (f'{species},{strain},'
+            f'{len(ref_kmers)},{len(idx_kmers)},'
+            f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
+            f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    """Parse the command line, stream the merged dump once, print one CSV row per specimen."""
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
+                    help='Merged count index directory')
+    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
+                    help='Directory containing per-specimen .npz reference files')
+    ap.add_argument('--obikmer', default='obikmer')
+    ap.add_argument('--header',  action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fn', metavar='DIR',
+                    help='Directory for false-negative kmer lists')
+    ap.add_argument('--save-fp', metavar='DIR',
+                    help='Directory for false-positive kmer lists')
+    ap.add_argument('--save-cm', metavar='DIR',
+                    help='Directory for count-mismatch CSV files')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,false_neg,false_pos,'
+              'count_mismatch,fn_pct,fp_pct,cm_pct')
+        return
+    if args.index is None or args.ref_dir is None:   # nargs='?' only so --header works alone
+        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')
+
+    ref_dir = Path(args.ref_dir)
+    save_fn = Path(args.save_fn) if args.save_fn else None
+    save_fp = Path(args.save_fp) if args.save_fp else None
+    save_cm = Path(args.save_cm) if args.save_cm else None
+    for d in (save_fn, save_fp, save_cm):
+        if d: d.mkdir(parents=True, exist_ok=True)
+
+    out1 = subprocess.check_output([args.obikmer, 'dump', '--head', '1', args.index],
+                                   stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])   # k = length of the first dumped kmer string
+
+    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
+    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
+    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
+
+    for name in specimen_names:
+        row = compare_specimen(name, *per_specimen[name], ref_dir, k,
+                               save_fn, save_fp, save_cm)
+        if row:
+            print(row)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/verify_merge_count.sh b/benchmark/verify_merge_count.sh
new file mode 100755
index 0000000..ebf4c36
--- /dev/null
+++ b/benchmark/verify_merge_count.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Run verify_merge_count.py on the merged count index and archive the resulting CSV.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+INDEX="${SCRIPT_DIR}/global_index_count"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"
+
+mkdir -p "${STATS_DIR}"
+
+CURRENT="${STATS_DIR}/current.csv"
+# Start current.csv afresh with the CSV header, then append the verification rows.
+"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    "${INDEX}" "${REF_DIR}" \
+    >>"${CURRENT}"
+# Archive as count_NNN.csv, NNN = number of count_*.csv files already present (zero-padded).
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')")
+ARCHIVE="${STATS_DIR}/count_${run_n}.csv"
+cp "${CURRENT}" "${ARCHIVE}"   # NOTE(review): overwrites if that number is already taken (e.g. after deleting an archive)
+
+echo "Done. Results → ${ARCHIVE}"
diff --git a/benchmark/verify_merge_presence.py b/benchmark/verify_merge_presence.py
new file mode 100755
index 0000000..66fc12c
--- /dev/null
+++ b/benchmark/verify_merge_presence.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""Verify the merged presence index against all per-specimen reference sets.
+
+Streams `obikmer dump` once on the merged index, accumulates per-specimen
+kmer sets from each column, then compares each against its reference .npz.
+
+Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
+"""
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:  # pack a nucleotide string into an int, 2 bits per base
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]  # first base ends up in the highest bits; KeyError if c is not in _ENCODE
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    """Unpack a 2-bit-per-base integer into its k-letter nucleotide string."""
+    # The first base sits in the highest bit pair, so walk the shifts downward:
+    # 2*(k-1), 2*(k-2), ..., 0.  The range is empty when k <= 0.
+    shifts = range(2 * (k - 1), -1, -2)
+    return ''.join(_DECODE[(val >> shift) & 3] for shift in shifts)
+
+
+# ── single-pass dump ──────────────────────────────────────────────────────────
+
+def stream_merged_dump(obikmer_bin: str, index_dir: str,
+                       ) -> tuple[list[str], dict[str, list[int]]]:
+    """Run `obikmer dump` on the merged index and split its rows per specimen.
+
+    Returns:
+        specimen_names : header labels after the leading 'kmer' column, in dump order
+        per_specimen   : label → encoded kmers whose value in that column is > 0
+    """
+    proc = subprocess.Popen([obikmer_bin, 'dump', index_dir], text=True,
+                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+    # First line is the CSV header; everything after its first field is a specimen label.
+    specimen_names = proc.stdout.readline().rstrip('\n').split(',')[1:]
+    per_specimen: dict[str, list[int]] = {label: [] for label in specimen_names}
+    # Resolve each column's accumulator once instead of once per row.
+    columns = [per_specimen[label] for label in specimen_names]
+
+    for row in proc.stdout:
+        fields = row.rstrip('\n').split(',')
+        kmer_int = encode_kmer(fields[0])
+        for col, kmers in enumerate(columns, start=1):
+            if int(fields[col]) > 0:
+                kmers.append(kmer_int)
+
+    # The exit status is only known once the pipe has been drained.
+    if proc.wait() != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+
+    return specimen_names, per_specimen
+
+
+# ── per-specimen comparison ───────────────────────────────────────────────────
+
+def compare_specimen(name: str,
+                     kmer_list: list[int],
+                     ref_dir: Path,
+                     k: int,
+                     save_fn: Path | None,
+                     save_fp: Path | None,
+                     ) -> str:
+    """Check one specimen column of the merged dump against `<ref_dir>/<name>.npz`.
+
+    Returns the specimen's CSV row, or '' when no reference file exists.
+    """
+    ref_path = ref_dir / f'{name}.npz'
+    if not ref_path.exists():
+        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
+        return ''
+
+    # Labels have the form "species--strain"; split on the first "--" only.
+    species, _, strain = name.partition('--')
+
+    ref_kmers = np.load(ref_path)['kmers']          # sorted uint64
+    idx_kmers = np.sort(np.array(kmer_list, dtype=np.uint64))
+
+    # assume_unique=True: each array is taken to list every kmer at most once.
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    n_ref, n_idx = len(ref_kmers), len(idx_kmers)
+    n_fn, n_fp = len(false_neg), len(false_pos)
+    fn_pct = 100.0 * n_fn / n_ref if n_ref else 0.0
+    fp_pct = 100.0 * n_fp / n_idx if n_idx else 0.0
+
+    print(f'  {name}: ref={n_ref:,}  idx={n_idx:,}  '
+          f'fn={n_fn:,} ({fn_pct:.4f}%)  '
+          f'fp={n_fp:,} ({fp_pct:.4f}%)',
+          file=sys.stderr)
+
+    # Optionally write the offending kmers as text, one per line.
+    for out_dir, tag, kmers in ((save_fn, 'fn', false_neg), (save_fp, 'fp', false_pos)):
+        if out_dir and len(kmers):
+            text = '\n'.join(decode_kmer(int(v), k) for v in kmers)
+            (out_dir / f'{name}_{tag}.txt').write_text(text + '\n')
+
+    return (f'{species},{strain},{n_ref},{n_idx},'
+            f'{n_fn},{n_fp},'
+            f'{fn_pct:.4f},{fp_pct:.4f}')
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    """Parse the command line, stream the merged dump once, print one CSV row per specimen."""
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
+                    help='Merged presence index directory')
+    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
+                    help='Directory containing per-specimen .npz reference files')
+    ap.add_argument('--obikmer', default='obikmer')
+    ap.add_argument('--header',  action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fn', metavar='DIR',
+                    help='Directory to save false-negative kmer lists')
+    ap.add_argument('--save-fp', metavar='DIR',
+                    help='Directory to save false-positive kmer lists')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct')
+        return
+    if args.index is None or args.ref_dir is None:   # nargs='?' only so --header works alone
+        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')
+
+    ref_dir = Path(args.ref_dir)
+    save_fn, save_fp = (Path(p) if p else None for p in (args.save_fn, args.save_fp))
+    for d in (save_fn, save_fp):
+        if d: d.mkdir(parents=True, exist_ok=True)
+
+    # Detect k: the length of the first kmer string in a one-row dump.
+    out1 = subprocess.check_output([args.obikmer, 'dump', '--head', '1', args.index],
+                                   stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
+    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
+    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
+
+    for name in specimen_names:
+        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
+        if row:
+            print(row)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/verify_merge_presence.sh b/benchmark/verify_merge_presence.sh
new file mode 100755
index 0000000..bea5ddf
--- /dev/null
+++ b/benchmark/verify_merge_presence.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Run verify_merge_presence.py on the merged presence index and archive the resulting CSV.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+INDEX="${SCRIPT_DIR}/global_index_presence"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"
+
+mkdir -p "${STATS_DIR}"
+
+CURRENT="${STATS_DIR}/current.csv"
+# Start current.csv afresh with the CSV header, then append the verification rows.
+"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    "${INDEX}" "${REF_DIR}" \
+    >>"${CURRENT}"
+# Archive as presence_NNN.csv, NNN = number of presence_*.csv files already present (zero-padded).
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')")
+ARCHIVE="${STATS_DIR}/presence_${run_n}.csv"
+cp "${CURRENT}" "${ARCHIVE}"   # NOTE(review): overwrites if that number is already taken (e.g. after deleting an archive)
+
+echo "Done. Results → ${ARCHIVE}"
diff --git a/benchmark/verify_one_count.sh b/benchmark/verify_one_count.sh
new file mode 100755
index 0000000..3dfb8d6
--- /dev/null
+++ b/benchmark/verify_one_count.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Usage: verify_one_count.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="${1:?usage: verify_one_count.sh SPECIMEN}"   # fail with a usage hint when missing or empty
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_count.py"
+
+species="${SPECIMEN%%--*}"   # text before the first "--"
+strain="${SPECIMEN#*--}"     # text after the first "--"
+
+REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
+INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIMEN}] verifying count"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    --species "${species}" \
+    --strain  "${strain}" \
+    "${REF_NPZ}" "${INDEX_DIR}" \
+    >"${STATS_FILE}"
diff --git a/benchmark/verify_one_presence.sh b/benchmark/verify_one_presence.sh
new file mode 100755
index 0000000..252a2c3
--- /dev/null
+++ b/benchmark/verify_one_presence.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Usage: verify_one_presence.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="${1:?usage: verify_one_presence.sh SPECIMEN}"   # fail with a usage hint when missing or empty
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"
+
+species="${SPECIMEN%%--*}"   # text before the first "--"
+strain="${SPECIMEN#*--}"     # text after the first "--"
+
+REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
+INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIMEN}] verifying presence"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    --species "${species}" \
+    --strain  "${strain}" \
+    "${REF_NPZ}" "${INDEX_DIR}" \
+    >"${STATS_FILE}"
diff --git a/benchmark/verify_presence.py b/benchmark/verify_presence.py
new file mode 100755
index 0000000..7041dd5
--- /dev/null
+++ b/benchmark/verify_presence.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""Compare an obikmer index against a reference kmer set (presence/absence).
+
+Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
+streams the output of `obikmer dump`, encodes each kmer string to uint64,
+then reports false negatives and false positives using numpy set operations.
+
+Output to stdout: one CSV row
+  species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
+"""
+import argparse
+import subprocess
+import sys
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:  # pack a nucleotide string into an int, 2 bits per base
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]  # first base ends up in the highest bits; KeyError if c is not in _ENCODE
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    """Unpack a 2-bit-per-base integer into its k-letter nucleotide string."""
+    # The first base sits in the highest bit pair, so walk the shifts downward:
+    # 2*(k-1), 2*(k-2), ..., 0.  The range is empty when k <= 0.
+    shifts = range(2 * (k - 1), -1, -2)
+    return ''.join(_DECODE[(val >> shift) & 3] for shift in shifts)
+
+
+# ── dump parsing ──────────────────────────────────────────────────────────────
+
+def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
+    """Stream `obikmer dump` and return the kmers it lists as a sorted uint64 array.
+
+    Exits the program with status 1 when the dump command reports a failure.
+    """
+    proc = subprocess.Popen([obikmer_bin, 'dump', index_dir], text=True,
+                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+
+    rows = iter(proc.stdout)
+    next(rows, None)            # discard the CSV header line (if there is any output)
+    # The kmer string is the first comma-separated field of every remaining row.
+    kmers = [encode_kmer(row.split(',', 1)[0]) for row in rows]
+
+    # The exit status is only known once the pipe has been drained.
+    if proc.wait() != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+
+    # Sorted output lets callers use it directly with the sorted reference array.
+    return np.sort(np.array(kmers, dtype=np.uint64))
+
+
+# ── comparison ────────────────────────────────────────────────────────────────
+
+def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """Return (kmers only in ref, kmers only in idx); inputs are treated as duplicate-free."""
+    only_in_ref, only_in_idx = (np.setdiff1d(a, b, assume_unique=True)
+                                for a, b in ((ref, idx), (idx, ref)))
+    return only_in_ref, only_in_idx
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    """Parse the command line, compare the index dump with the reference, print one CSV row."""
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?', help='Reference .npz file')
+    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
+    ap.add_argument('--obikmer',  default='obikmer',   help='Path to obikmer binary')
+    ap.add_argument('--species',  default='',          help='Species label for CSV row')
+    ap.add_argument('--strain',   default='',          help='Strain label for CSV row')
+    ap.add_argument('--header',   action='store_true', help='Print CSV header and exit')
+    ap.add_argument('--save-fp',  metavar='FILE',
+                    help='Save false-positive kmer strings to FILE')
+    ap.add_argument('--save-fn',  metavar='FILE',
+                    help='Save false-negative kmer strings to FILE')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct')
+        return
+    if args.reference is None or args.index is None:   # nargs='?' only so --header works alone
+        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')
+
+    # Detect k from the index (one cheap call before the full dump).
+    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
+    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    # Load reference
+    print(f'Loading reference: {args.reference}', file=sys.stderr)
+    ref_kmers = np.load(args.reference)['kmers']   # already sorted uint64
+
+    # Load index
+    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
+    idx_kmers = load_index_kmers(args.obikmer, args.index)
+
+    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
+    false_neg, false_pos = compare(ref_kmers, idx_kmers)
+
+    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+
+    print(f'false negatives: {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
+    print(f'false positives: {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
+
+    if args.save_fn and len(false_neg):
+        with open(args.save_fn, 'w') as fh:
+            for v in false_neg:
+                fh.write(decode_kmer(int(v), k) + '\n')
+        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)
+
+    if args.save_fp and len(false_pos):
+        with open(args.save_fp, 'w') as fh:
+            for v in false_pos:
+                fh.write(decode_kmer(int(v), k) + '\n')
+        print(f'False positives saved → {args.save_fp}', file=sys.stderr)
+
+    print(f'{args.species},{args.strain},'
+          f'{len(ref_kmers)},{len(idx_kmers)},'
+          f'{len(false_neg)},{len(false_pos)},'
+          f'{fn_pct:.4f},{fp_pct:.4f}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/docmd/implementation/filtering.md b/docmd/implementation/filtering.md
index 4dfab31..fe56bc9 100644
--- a/docmd/implementation/filtering.md
+++ b/docmd/implementation/filtering.md
@@ -29,16 +29,17 @@ Multiple values separated by `|` are always OR-ed within the predicate.
 
 ### Path matching (`~` and `!~`)
 
-Metadata values can represent hierarchical taxonomic paths such as
+Metadata values can represent hierarchical concept paths such as
 `/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
 
-- **Absolute pattern** (starts with `/`): the value must start with the pattern
-  at a segment boundary.
-  `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
+**Both the stored metadata value and the pattern must start with `/`.**
+A pattern that does not start with `/` is rejected at parse time with an error.
+
+The value matches the pattern if it equals it exactly or starts with the pattern
+followed by `/` (segment-boundary prefix):
+
+- `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
   `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
-- **Bare segment** (no leading `/`): the value must contain the pattern as an
-  exact path component anywhere.
-  `taxon~Betula` matches any path that has `Betula` as one of its segments.
 
 ### Missing metadata key → NA
 
diff --git a/mkdocs.yml b/mkdocs.yml
index c27d1a9..7973e78 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -53,6 +53,7 @@ nav:
       - Merge parallelism & memory: implementation/merge_parallelism.md
       - Kmer filtering: implementation/filtering.md
       - Select command: implementation/select.md
+      - obitaxonomy crate: implementation/obitaxonomy.md
   - Architecture:
       - Sequences: architecture/sequences/invariant.md
       - Kmer index: architecture/index_architecture.md
diff --git a/src/Cargo.lock b/src/Cargo.lock
index 2983231..a48a7fd 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -1853,6 +1853,10 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "obitaxonomy"
+version = "0.1.0"
+
 [[package]]
 name = "object"
 version = "0.37.3"
diff --git a/src/Cargo.toml b/src/Cargo.toml
index 46a4f87..141df02 100644
--- a/src/Cargo.toml
+++ b/src/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
 resolver = "3"
-members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
+members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
 [profile.release]
 debug = 1
diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs
index 145bd63..ee7d6f7 100644
--- a/src/obicompactvec/src/bitvec.rs
+++ b/src/obicompactvec/src/bitvec.rs
@@ -88,9 +88,9 @@ impl<'a> IntoIterator for &'a PersistentBitVec {
 // ── BitIter ───────────────────────────────────────────────────────────────────
 
 pub struct BitIter<'a> {
-    pub(crate) words: &'a [u64],
-    pub(crate) slot:  usize,
-    pub(crate) n:     usize,
+    words: &'a [u64],
+    slot:  usize,
+    n:     usize,
 }
 
 impl ExactSizeIterator for BitIter<'_> {}
@@ -132,7 +132,7 @@ impl PersistentBitVecBuilder {
         Ok(Self { mmap, n, path: path.to_path_buf() })
     }
 
-    pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
+    pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
         let file_size = HEADER_SIZE + n_bytes_for_words(n);
         let file = OpenOptions::new()
             .read(true).write(true).create(true).truncate(true)
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index 25a8032..9041ab7 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -18,11 +18,11 @@ pub use builder::PersistentCompactIntVecBuilder;
 pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
-pub use reader::PersistentCompactIntVec;
-pub use tempbitvec::TempBitVec;
-pub use tempintvec::TempCompactIntVec;
+pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
+pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
+pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
 pub use traits::{BitPartials, ColumnWeights, CountPartials};
-pub use views::{BitSliceView, IntSliceView};
+pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};
 
 #[cfg(test)]
 #[path = "tests/mod.rs"]
diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs
index 8bbec16..df1d436 100644
--- a/src/obicompactvec/src/tempbitvec.rs
+++ b/src/obicompactvec/src/tempbitvec.rs
@@ -43,27 +43,27 @@ impl TempBitVec {
 
 // ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
 
-pub(crate) struct TempBitVecBuilder {
+pub struct TempBitVecBuilder {
     builder: PersistentBitVecBuilder,
     temp: TempDir,
 }
 
 impl TempBitVecBuilder {
-    pub(crate) fn new(n: usize) -> io::Result<Self> {
+    pub fn new(n: usize) -> io::Result<Self> {
         let temp = TempDir::new()?;
         let path = temp.path().join("data.pbiv");
         let builder = PersistentBitVecBuilder::new(n, &path)?;
         Ok(Self { builder, temp })
     }
 
-    pub(crate) fn new_ones(n: usize) -> io::Result<Self> {
+    pub fn new_ones(n: usize) -> io::Result<Self> {
         let temp = TempDir::new()?;
         let path = temp.path().join("data.pbiv");
         let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
         Ok(Self { builder, temp })
     }
 
-    pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
+    pub fn freeze(self) -> io::Result<TempBitVec> {
         let Self { builder, temp } = self;
         let vec = builder.finish()?;
         Ok(TempBitVec { vec, _temp: temp })
@@ -72,7 +72,8 @@ impl TempBitVecBuilder {
     pub fn set(&mut self, slot: usize, value: bool) {
         self.builder.set(slot, value);
     }
-    pub(crate) fn view(&self) -> BitSliceView<'_> {
+
+    pub fn view(&self) -> BitSliceView<'_> {
         self.builder.view()
     }
 
@@ -80,19 +81,19 @@ impl TempBitVecBuilder {
         self.builder.or(other);
     }
 
-    pub(crate) fn and(&mut self, other: BitSliceView<'_>) {
+    pub fn and(&mut self, other: BitSliceView<'_>) {
         self.builder.and(other);
     }
 
-    pub(crate) fn xor(&mut self, other: BitSliceView<'_>) {
+    pub fn xor(&mut self, other: BitSliceView<'_>) {
         self.builder.xor(other);
     }
 
-    pub(crate) fn not(&mut self) {
+    pub fn not(&mut self) {
         self.builder.not();
     }
 
-    pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) {
+    pub fn copy_from(&mut self, src: BitSliceView<'_>) {
         self.builder.copy_from(src);
     }
 
@@ -100,11 +101,11 @@ impl TempBitVecBuilder {
         self.builder.or_where(col, pred);
     }
 
-    pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
         self.builder.and_where(col, pred);
     }
 
-    pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
         self.builder.xor_where(col, pred);
     }
 }
diff --git a/src/obicompactvec/src/tempintvec.rs b/src/obicompactvec/src/tempintvec.rs
index e5ff848..b0b3492 100644
--- a/src/obicompactvec/src/tempintvec.rs
+++ b/src/obicompactvec/src/tempintvec.rs
@@ -32,60 +32,58 @@ impl TempCompactIntVec {
 
 // ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
 
-pub(crate) struct TempCompactIntVecBuilder {
+pub struct TempCompactIntVecBuilder {
     builder: PersistentCompactIntVecBuilder,
     temp:    TempDir,
 }
 
 impl TempCompactIntVecBuilder {
-    pub(crate) fn new(n: usize) -> io::Result<Self> {
+    pub fn new(n: usize) -> io::Result<Self> {
         let temp = TempDir::new()?;
         let path = temp.path().join("data.pciv");
         let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
         Ok(Self { builder, temp })
     }
 
-    pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
+    pub fn freeze(self) -> io::Result<TempCompactIntVec> {
         let Self { builder, temp } = self;
         let vec = builder.finish()?;
         Ok(TempCompactIntVec { vec, _temp: temp })
     }
 
-    // ── Delegation methods ────────────────────────────────────────────────────
+    pub fn n(&self) -> usize { self.builder.len() }
 
-    pub(crate) fn n(&self) -> usize { self.builder.len() }
+    pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
+    pub fn get(&self, slot: usize) -> u32           { self.builder.get(slot) }
 
-    pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
-    pub(crate) fn get(&self, slot: usize) -> u32           { self.builder.get(slot) }
+    pub fn primary_bytes(&self)         -> &[u8]      { self.builder.primary_bytes() }
+    pub fn primary_bytes_mut(&mut self) -> &mut [u8]  { self.builder.primary_bytes_mut() }
 
-    pub(crate) fn primary_bytes(&self)       -> &[u8]      { self.builder.primary_bytes() }
-    pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
-
-    pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) {
+    pub fn inc_present(&mut self, col: BitSliceView<'_>) {
         self.builder.inc_present(col);
     }
 
-    pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
+    pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
         self.builder.inc_present_fast(col);
     }
 
-    pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
         self.builder.inc_predicate(col, pred);
     }
 
-    pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
         self.builder.inc_predicate_fast(col, pred);
     }
 
-    pub(crate) fn add(&mut self, other: IntSliceView<'_>) {
+    pub fn add(&mut self, other: IntSliceView<'_>) {
         self.builder.add(other);
     }
 
-    pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) {
+    pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
         self.builder.mask_with(mask);
     }
 
-    pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
-    pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
-    pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
+    pub fn min(&mut self, other: IntSliceView<'_>)  { self.builder.min(other); }
+    pub fn max(&mut self, other: IntSliceView<'_>)  { self.builder.max(other); }
+    pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
 }
diff --git a/src/obidebruinj/src/debruijn.rs b/src/obidebruinj/src/debruijn.rs
index 8d300f2..f59f03a 100644
--- a/src/obidebruinj/src/debruijn.rs
+++ b/src/obidebruinj/src/debruijn.rs
@@ -3,6 +3,7 @@ use crossbeam_channel;
 use hashbrown::HashMap;
 use obikseq::k;
 use obikseq::{CanonicalKmer, Sequence, Unitig};
+#[cfg(not(any(test, feature = "test-utils")))]
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use std::cell::RefCell;
 use std::fmt;
diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs
index c637c9b..cbfdaba 100644
--- a/src/obikindex/src/merge.rs
+++ b/src/obikindex/src/merge.rs
@@ -11,7 +11,7 @@ use obilayeredmap::IndexMode;
 use crate::error::{OKIError, OKIResult};
 use crate::index::KmerIndex;
 use crate::meta::{GenomeInfo, IndexMeta};
-use crate::state::IndexState;
+use crate::state::{IndexState, SENTINEL_INDEXED};
 
 pub use obikpartitionner::MergeMode;
 
@@ -263,6 +263,8 @@ impl KmerIndex {
             rep.push(t.stop());
         }
 
+        fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
+
         KmerIndex::open(output)
     }
 }
diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs
index 04678f0..b1183d3 100644
--- a/src/obikmer/src/cmd/predicate.rs
+++ b/src/obikmer/src/cmd/predicate.rs
@@ -49,6 +49,11 @@ impl MetaPred {
         if values.iter().any(|v| v.is_empty()) {
             return Err(format!("empty value in predicate: {s}"));
         }
+        if matches!(op, PredOp::Matches | PredOp::NotMatches) {
+            if let Some(v) = values.iter().find(|v| !v.starts_with('/')) {
+                return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}"));
+            }
+        }
 
         Ok(Self { key, op, values })
     }
@@ -72,16 +77,12 @@ impl MetaPred {
 
 /// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
 ///
-/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
-/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
+/// Both `value` and `pattern` must start with `/`.
+/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`.
 fn path_matches(value: &str, pattern: &str) -> bool {
-    if pattern.starts_with('/') {
-        value == pattern
-            || (value.starts_with(pattern)
-                && value[pattern.len()..].starts_with('/'))
-    } else {
-        value.split('/').any(|seg| seg == pattern)
-    }
+    value == pattern
+        || (value.starts_with(pattern)
+            && value[pattern.len()..].starts_with('/'))
 }
 
 // ── Three-value group evaluation ──────────────────────────────────────────────
diff --git a/test.sk.fasta b/test.sk.fasta
deleted file mode 100644
index ff8e303..0000000
--- a/test.sk.fasta
+++ /dev/null
@@ -1,28 +0,0 @@
->F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"}
-GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA
->C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"}
-AAAAAAATTAAGTCCCATATTGAAACATTAT
->9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"}
-AAAAAATTAAGTCCCATATTGAAACATTATC
->74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"}
-AAAAATTAAGTCCCATATTGAAACATTATCA
->45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"}
-AAAATTAAGTCCCATATTGAAACATTATCAC
->5F44864B90170AF4 {"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"}
-AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT
->8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"}
-AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG
->C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"}
-AATGTGAGTTGTTAATATTACATAATTGGGTT
->933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"}
-TGTGAGTTGTTAATATTACATAATTGGGTTT
->3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"}
-GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT
->1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"}
-TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC
->8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"}
-AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT
->247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"}
-CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT
->11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"}
-GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG

From 9356be4ec0afd7e719ba6cf10e559b75eacad887 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Sun, 21 Jun 2026 10:37:50 +0200
Subject: [PATCH 22/24] feat: introduce obitaxonomy crate for hierarchical
 taxonomy parsing

Adds the `obitaxonomy` crate to parse and validate hierarchical taxonomy paths using a strict `taxonomy:/name@rank/...` syntax. Replaces generic string-based path matching in predicates with structured `TaxPath` and `TaxPattern` types, enforcing explicit anchor constraints and rank-aware semantics. Updates filtering documentation to clarify optional leading slashes and segment-boundary matching rules.
---
 docmd/implementation/filtering.md      |  18 ++--
 docmd/implementation/obitaxonomy.md    | 143 +++++++++++++++++++++++++
 src/Cargo.lock                         |   1 +
 src/obikmer/Cargo.toml                 |   1 +
 src/obikmer/src/cmd/predicate.rs       |  20 ++--
 src/obitaxonomy/Cargo.toml             |   6 ++
 src/obitaxonomy/src/error.rs           |  38 +++++++
 src/obitaxonomy/src/lib.rs             |  11 ++
 src/obitaxonomy/src/path.rs            |  82 ++++++++++++++
 src/obitaxonomy/src/pattern.rs         |  72 +++++++++++++
 src/obitaxonomy/src/segment.rs         |  49 +++++++++
 src/obitaxonomy/src/segment_pattern.rs |  41 +++++++
 12 files changed, 464 insertions(+), 18 deletions(-)
 create mode 100644 docmd/implementation/obitaxonomy.md
 create mode 100644 src/obitaxonomy/Cargo.toml
 create mode 100644 src/obitaxonomy/src/error.rs
 create mode 100644 src/obitaxonomy/src/lib.rs
 create mode 100644 src/obitaxonomy/src/path.rs
 create mode 100644 src/obitaxonomy/src/pattern.rs
 create mode 100644 src/obitaxonomy/src/segment.rs
 create mode 100644 src/obitaxonomy/src/segment_pattern.rs

diff --git a/docmd/implementation/filtering.md b/docmd/implementation/filtering.md
index fe56bc9..ea6d4a2 100644
--- a/docmd/implementation/filtering.md
+++ b/docmd/implementation/filtering.md
@@ -32,14 +32,20 @@ Multiple values separated by `|` are always OR-ed within the predicate.
 Metadata values can represent hierarchical concept paths such as
 `/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
 
-**Both the stored metadata value and the pattern must start with `/`.**
-A pattern that does not start with `/` is rejected at parse time with an error.
+Stored taxonomy values always start with the `taxonomy:/` prefix, whose trailing `/` is the root of the path.
+Query patterns do **not** need to start with `/` — a leading `/` is an optional
+start anchor, not a requirement.
 
-The value matches the pattern if it equals it exactly or starts with the pattern
-followed by `/` (segment-boundary prefix):
+| Pattern form | Semantics |
+|---|---|
+| `A/B` | contiguous sub-path A then B, anywhere in the value |
+| `/A/B` | value starts with A then B |
+| `A/B$` | value ends with A then B |
+| `/A/B$` | value is exactly A then B |
+| `A@x/B` | A with rank `x` followed by B with any rank |
 
-- `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
-  `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
+- `taxon~/Betulaceae/Betula` matches any path that starts with `Betulaceae` then `Betula`.
+- `taxon~Betula` matches any path containing `Betula` as a segment, anywhere.
 
 ### Missing metadata key → NA
 
diff --git a/docmd/implementation/obitaxonomy.md b/docmd/implementation/obitaxonomy.md
new file mode 100644
index 0000000..d8ccd22
--- /dev/null
+++ b/docmd/implementation/obitaxonomy.md
@@ -0,0 +1,143 @@
+# `obitaxonomy` — taxonomy concept paths
+
+`obitaxonomy` is a dependency-free crate that defines a typed representation
+of hierarchical concept paths (taxonomic or otherwise) stored in genome metadata.
+
+---
+
+## Concept path syntax
+
+A concept path is stored as a metadata value with the prefix `taxonomy:/`:
+
+```
+taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species
+```
+
+Structure:
+
+- The `taxonomy:/` prefix is the type discriminator. Any metadata value starting
+  with it is parsed as a `TaxPath`; all others remain plain strings.
+- The remainder is one or more `/`-separated segments.
+- Each segment is `name` or `name@rank`, where `rank` is a label for the
+  taxonomic level (e.g. `family`, `genus`, `species`).
+- Rank annotations are **optional per segment** and can be mixed freely.
+- Spaces are allowed in both names and ranks.
+
+### Reserved character
+
+`@` is reserved throughout the taxonomy system and may **not** appear in:
+
+| Context | Constraint |
+|---------|------------|
+| Segment name | forbidden |
+| Rank/class label | forbidden |
+| Metadata key names | forbidden (used as `key@rank` in predicate syntax) |
+
+`@` is freely allowed in plain-text metadata values (non-taxonomy).
+
+### Parse errors
+
+| Condition | Error |
+|-----------|-------|
+| Value does not start with `taxonomy:/` | `MissingPrefix` |
+| No segments after the prefix | `EmptyPath` |
+| Segment with empty name (e.g. consecutive, leading or trailing `/`, or `@rank` with no name) | `EmptySegmentName` |
+| Segment with trailing `@` and no rank (`name@`) | `EmptyRankName` |
+| Segment with more than one `@` | `AmbiguousRank` |
+
+---
+
+## Public API
+
+### `TaxSegment`
+
+A single node: a name and an optional rank.
+
+```rust
+seg.name()            // &str
+seg.rank()            // Option<&str>
+seg.to_string()       // "name" or "name@rank"
+TaxSegment::parse(s)  // Result<TaxSegment, TaxError>
+```
+
+### `TaxPath`
+
+```rust
+TaxPath::parse(s)               // Result<TaxPath, TaxError>
+path.segments()                 // &[TaxSegment]
+path.depth()                    // usize — number of segments
+path.is_ancestor_of(&other)     // bool — prefix match by name, ranks ignored
+path.name_at_rank("genus")      // Option<&str>
+path.to_string()                // reconstructs "taxonomy:/…"
+```
+
+`is_ancestor_of` compares segment **names** only — rank annotations are
+informational and do not affect the ancestry relation.
+
+```rust
+let a: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus".parse()?;
+let b: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species".parse()?;
+
+assert!(a.is_ancestor_of(&b));   // true
+assert!(b.is_ancestor_of(&a));   // false
+assert!(a.is_ancestor_of(&a));   // true  (equal ⇒ ancestor)
+
+assert_eq!(b.name_at_rank("species"), Some("Escherichia coli"));
+assert_eq!(b.name_at_rank("genus"),   Some("Escherichia"));
+assert_eq!(b.name_at_rank("order"),   None);
+```
+
+---
+
+## Integration with `GenomeInfo`
+
+At index load time, every metadata value is inspected once:
+
+- Starts with `taxonomy:/` → parsed into `TaxPath`, stored in `genome.taxonomy`.
+- Otherwise → kept as-is in `genome.meta`.
+
+```rust
+struct GenomeInfo {
+    label:    String,
+    meta:     HashMap<String, String>,    // plain text metadata
+    taxonomy: HashMap<String, TaxPath>,   // parsed taxonomy metadata
+}
+```
+
+The raw string is not duplicated. `TaxPath::to_string()` reconstructs the
+original value losslessly for serialisation.
+
+---
+
+## Predicate operators (in `filter` / `select`)
+
+Path predicates use the `~` / `!~` operators. The **stored value** always starts
+with `taxonomy:/` (rooted path); the **query pattern** is written without that prefix, and its leading `/` is an optional start anchor.
+
+### Path pattern syntax
+
+| Pattern | Semantics |
+|---------|-----------|
+| `A/B` | contiguous sub-path A then B, anywhere in the value |
+| `/A/B` | value starts with A then B (start-anchored) |
+| `A/B$` | value ends with A then B (end-anchored) |
+| `/A/B$` | value is exactly A then B (fully anchored) |
+| `A@x/B` | A with rank `x` followed by B with any rank |
+| `A@x/B@y` | A with rank `x` followed by B with rank `y` |
+
+A segment pattern without `@` matches the segment name regardless of its stored rank.
+
+### Rank-aware queries
+
+```
+key@rank=value
+```
+
+| Predicate form | Semantics |
+|----------------|-----------|
+| `key@rank=value` | genome's `key` has `value` at rank `rank` |
+| `key@rank!=value` | does not |
+| `key@rank=v1\|v2` | value at `rank` is `v1` or `v2` |
+
+`~` combined with `@rank` on the key (e.g. `key@genus~pattern`) is not defined
+and is rejected at parse time.
diff --git a/src/Cargo.lock b/src/Cargo.lock
index a48a7fd..bdb1caa 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -1722,6 +1722,7 @@ dependencies = [
  "obiskbuilder",
  "obiskio",
  "obisys",
+ "obitaxonomy",
  "pprof",
  "rayon",
  "serde_json",
diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml
index 2dcfb91..4045476 100644
--- a/src/obikmer/Cargo.toml
+++ b/src/obikmer/Cargo.toml
@@ -19,6 +19,7 @@ obikpartitionner = { path = "../obikpartitionner" }
 obisys        = { path = "../obisys" }
 obiskio       = { path = "../obiskio" }
 obikindex     = { path = "../obikindex" }
+obitaxonomy   = { path = "../obitaxonomy" }
 obilayeredmap = { path = "../obilayeredmap" }
 clap          = { version = "4", features = ["derive"] }
 serde_json    = "1"
diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs
index b1183d3..47baab9 100644
--- a/src/obikmer/src/cmd/predicate.rs
+++ b/src/obikmer/src/cmd/predicate.rs
@@ -3,6 +3,7 @@ use std::collections::HashMap;
 use clap::Args;
 use obikindex::GenomeInfo;
 use obikpartitionner::{GroupQuorumFilter, KmerFilter};
+use obitaxonomy::{TaxPath, TaxPattern};
 
 // ── Operator ──────────────────────────────────────────────────────────────────
 
@@ -49,12 +50,6 @@ impl MetaPred {
         if values.iter().any(|v| v.is_empty()) {
             return Err(format!("empty value in predicate: {s}"));
         }
-        if matches!(op, PredOp::Matches | PredOp::NotMatches) {
-            if let Some(v) = values.iter().find(|v| !v.starts_with('/')) {
-                return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}"));
-            }
-        }
-
         Ok(Self { key, op, values })
     }
 
@@ -75,14 +70,15 @@ impl MetaPred {
 
 // ── Path matching ─────────────────────────────────────────────────────────────
 
-/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
+/// True if the stored taxonomy `value` matches `pattern`.
 ///
-/// Both `value` and `pattern` must start with `/`.
-/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`.
+/// `value` must be a valid `TaxPath` (starts with `taxonomy:/`).
+/// `pattern` is a `TaxPattern` query (see `obitaxonomy::TaxPattern` for syntax).
+/// Returns `false` if either fails to parse.
 fn path_matches(value: &str, pattern: &str) -> bool {
-    value == pattern
-        || (value.starts_with(pattern)
-            && value[pattern.len()..].starts_with('/'))
+    let Ok(path) = TaxPath::parse(value)    else { return false };
+    let Ok(pat)  = TaxPattern::parse(pattern) else { return false };
+    pat.matches(&path)
 }
 
 // ── Three-value group evaluation ──────────────────────────────────────────────
diff --git a/src/obitaxonomy/Cargo.toml b/src/obitaxonomy/Cargo.toml
new file mode 100644
index 0000000..b391f4d
--- /dev/null
+++ b/src/obitaxonomy/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "obitaxonomy"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
diff --git a/src/obitaxonomy/src/error.rs b/src/obitaxonomy/src/error.rs
new file mode 100644
index 0000000..5f4f24e
--- /dev/null
+++ b/src/obitaxonomy/src/error.rs
@@ -0,0 +1,38 @@
+use std::fmt;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TaxError {
+    /// Stored value does not start with the `taxonomy:/` prefix.
+    MissingPrefix,
+    /// Stored path contains no segments after the prefix.
+    EmptyPath,
+    /// Query pattern contains no segments (after stripping anchors).
+    EmptyPattern,
+    /// A segment has an empty name (e.g. consecutive `/`).
+    EmptySegmentName,
+    /// A segment has a trailing `@` with no rank name.
+    EmptyRankName { segment: String },
+    /// A segment contains more than one `@`.
+    AmbiguousRank { segment: String },
+}
+
+impl fmt::Display for TaxError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            TaxError::MissingPrefix =>
+                write!(f, "taxonomy path must start with \"taxonomy:/\""),
+            TaxError::EmptyPath =>
+                write!(f, "taxonomy path has no segments"),
+            TaxError::EmptyPattern =>
+                write!(f, "taxonomy query pattern has no segments"),
+            TaxError::EmptySegmentName =>
+                write!(f, "segment has an empty name"),
+            TaxError::EmptyRankName { segment } =>
+                write!(f, "segment has '@' with no rank name: {segment:?}"),
+            TaxError::AmbiguousRank { segment } =>
+                write!(f, "segment contains more than one '@': {segment:?}"),
+        }
+    }
+}
+
+impl std::error::Error for TaxError {}
diff --git a/src/obitaxonomy/src/lib.rs b/src/obitaxonomy/src/lib.rs
new file mode 100644
index 0000000..aea3cff
--- /dev/null
+++ b/src/obitaxonomy/src/lib.rs
@@ -0,0 +1,11 @@
+mod error;
+mod segment;
+mod segment_pattern;
+mod path;
+mod pattern;
+
+pub use error::TaxError;
+pub use segment::TaxSegment;
+pub use segment_pattern::SegmentPattern;
+pub use path::{TaxPath, PREFIX};
+pub use pattern::TaxPattern;
diff --git a/src/obitaxonomy/src/path.rs b/src/obitaxonomy/src/path.rs
new file mode 100644
index 0000000..096c09b
--- /dev/null
+++ b/src/obitaxonomy/src/path.rs
@@ -0,0 +1,82 @@
+use std::fmt;
+use std::str::FromStr;
+
+use crate::error::TaxError;
+use crate::segment::TaxSegment;
+
+/// The prefix that marks a metadata value as a taxonomy path.
+pub const PREFIX: &str = "taxonomy:/";
+
+/// A rooted, `/`-separated taxonomy path with optional per-segment rank annotations.
+///
+/// Stored form: `taxonomy:/seg1@rank1/seg2/seg3@rank3`
+/// The leading `taxonomy:/` is the discriminator; the remainder is one or more
+/// `/`-separated segments, each of the form `name` or `name@rank`.
+///
+/// `@` is reserved and may not appear in segment names or rank names.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TaxPath {
+    segments: Vec<TaxSegment>,
+}
+
+impl TaxPath {
+    pub fn parse(s: &str) -> Result<Self, TaxError> {
+        let tail = s.strip_prefix(PREFIX).ok_or(TaxError::MissingPrefix)?;
+        if tail.is_empty() {
+            return Err(TaxError::EmptyPath);
+        }
+        let segments = tail.split('/')
+            .map(TaxSegment::parse)
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(Self { segments })
+    }
+
+    /// True if `self` is an ancestor of — or equal to — `other`.
+    ///
+    /// Comparison is by segment name only; rank annotations are ignored.
+    /// `self` must be a prefix of `other` at segment granularity.
+    pub fn is_ancestor_of(&self, other: &TaxPath) -> bool {
+        self.segments.len() <= other.segments.len()
+            && self.segments.iter().zip(other.segments.iter())
+                .all(|(a, b)| a.name() == b.name())
+    }
+
+    /// Returns the name of the first segment whose rank equals `rank`, if any.
+    pub fn name_at_rank(&self, rank: &str) -> Option<&str> {
+        self.segments.iter()
+            .find(|s| s.rank() == Some(rank))
+            .map(|s| s.name())
+    }
+
+    /// True if any segment has the given rank.
+    pub fn has_rank(&self, rank: &str) -> bool {
+        self.segments.iter().any(|s| s.rank() == Some(rank))
+    }
+
+    /// True if the path contains a segment with both the given rank and name.
+    pub fn matches_rank(&self, rank: &str, name: &str) -> bool {
+        self.segments.iter().any(|s| s.rank() == Some(rank) && s.name() == name)
+    }
+
+    pub fn segments(&self) -> &[TaxSegment] { &self.segments }
+    pub fn depth(&self)    -> usize          { self.segments.len() }
+    pub fn is_empty(&self) -> bool           { self.segments.is_empty() }
+}
+
+impl fmt::Display for TaxPath {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", PREFIX)?;
+        let mut first = true;
+        for seg in &self.segments {
+            if !first { write!(f, "/")?; }
+            write!(f, "{seg}")?;
+            first = false;
+        }
+        Ok(())
+    }
+}
+
+impl FromStr for TaxPath {
+    type Err = TaxError;
+    fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) }
+}
diff --git a/src/obitaxonomy/src/pattern.rs b/src/obitaxonomy/src/pattern.rs
new file mode 100644
index 0000000..c0474d8
--- /dev/null
+++ b/src/obitaxonomy/src/pattern.rs
@@ -0,0 +1,72 @@
+use crate::error::TaxError;
+use crate::path::TaxPath;
+use crate::segment::TaxSegment;
+use crate::segment_pattern::SegmentPattern;
+
+/// A query pattern for matching against stored `TaxPath` values.
+///
+/// Syntax:
+///
+/// | Form     | Semantics |
+/// |----------|-----------|
+/// | `A/B`    | A then B as a contiguous sub-path, anywhere in the value |
+/// | `/A/B`   | value starts with A then B (start-anchored) |
+/// | `A/B$`   | value ends with A then B (end-anchored) |
+/// | `/A/B$`  | value is exactly A then B (fully anchored) |
+/// | `A@x/B`  | A with rank `x`, followed by B with any rank |
+///
+/// A segment pattern without `@` matches any segment with that name regardless
+/// of its stored rank.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TaxPattern {
+    start_anchored: bool,
+    end_anchored:   bool,
+    segments:       Vec<SegmentPattern>,
+}
+
+impl TaxPattern {
+    pub fn parse(s: &str) -> Result<Self, TaxError> {
+        let s = s.trim();
+
+        let start_anchored = s.starts_with('/');
+        let s = if start_anchored { &s[1..] } else { s };
+
+        let end_anchored = s.ends_with('$');
+        let s = if end_anchored { &s[..s.len() - 1] } else { s };
+
+        if s.is_empty() {
+            return Err(TaxError::EmptyPattern);
+        }
+
+        let segments = s.split('/')
+            .map(SegmentPattern::parse)
+            .collect::<Result<Vec<_>, _>>()?;
+
+        Ok(Self { start_anchored, end_anchored, segments })
+    }
+
+    /// True if this pattern matches `path` according to the anchor flags.
+    ///
+    /// The pattern must match a contiguous run of segments in the path.
+    /// Start/end anchors restrict where that run may begin or end.
+    pub fn matches(&self, path: &TaxPath) -> bool {
+        let n = self.segments.len();
+        let m = path.depth();
+
+        if n > m { return false; }
+
+        let segs = path.segments();
+        match (self.start_anchored, self.end_anchored) {
+            (true,  true)  => n == m && self.window_matches(segs, 0),
+            (true,  false) => self.window_matches(segs, 0),
+            (false, true)  => self.window_matches(segs, m - n),
+            (false, false) => (0..=(m - n)).any(|i| self.window_matches(segs, i)),
+        }
+    }
+
+    fn window_matches(&self, segs: &[TaxSegment], start: usize) -> bool {
+        self.segments.iter()
+            .zip(segs[start..start + self.segments.len()].iter())
+            .all(|(pat, seg)| pat.matches(seg))
+    }
+}
diff --git a/src/obitaxonomy/src/segment.rs b/src/obitaxonomy/src/segment.rs
new file mode 100644
index 0000000..b06436d
--- /dev/null
+++ b/src/obitaxonomy/src/segment.rs
@@ -0,0 +1,49 @@
+use std::fmt;
+
+use crate::error::TaxError;
+
+/// A single node in a taxonomy path: a name and an optional rank.
+///
+/// Neither `name` nor `rank` may contain `@` (reserved separator).
+/// Serialised form: `name` or `name@rank`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TaxSegment {
+    name: String,
+    rank: Option<String>,
+}
+
+impl TaxSegment {
+    pub fn parse(raw: &str) -> Result<Self, TaxError> {
+        let parts: Vec<&str> = raw.splitn(3, '@').collect();
+
+        let (name_raw, rank_raw) = match parts.as_slice() {
+            [name]        => (*name, None),
+            [name, rank]  => (*name, Some(*rank)),
+            _             => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }),
+        };
+
+        if name_raw.is_empty() {
+            return Err(TaxError::EmptySegmentName);
+        }
+
+        let rank = match rank_raw {
+            None     => None,
+            Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }),
+            Some(r)  => Some(r.to_string()),
+        };
+
+        Ok(Self { name: name_raw.to_string(), rank })
+    }
+
+    pub fn name(&self) -> &str { &self.name }
+    pub fn rank(&self) -> Option<&str> { self.rank.as_deref() }
+}
+
+impl fmt::Display for TaxSegment {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.rank {
+            None    => write!(f, "{}", self.name),
+            Some(r) => write!(f, "{}@{}", self.name, r),
+        }
+    }
+}
diff --git a/src/obitaxonomy/src/segment_pattern.rs b/src/obitaxonomy/src/segment_pattern.rs
new file mode 100644
index 0000000..13895ed
--- /dev/null
+++ b/src/obitaxonomy/src/segment_pattern.rs
@@ -0,0 +1,41 @@
+use crate::error::TaxError;
+use crate::segment::TaxSegment;
+
+/// A single segment in a query pattern: a required name and an optional rank filter.
+///
+/// If `rank` is `None`, the pattern matches any segment with the given name,
+/// regardless of its stored rank. If `rank` is `Some(r)`, both name and rank
+/// must match exactly.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SegmentPattern {
+    name: String,
+    rank: Option<String>,
+}
+
+impl SegmentPattern {
+    pub fn parse(raw: &str) -> Result<Self, TaxError> {
+        let parts: Vec<&str> = raw.splitn(3, '@').collect();
+        let (name_raw, rank_raw) = match parts.as_slice() {
+            [name]       => (*name, None),
+            [name, rank] => (*name, Some(*rank)),
+            _            => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }),
+        };
+        if name_raw.is_empty() {
+            return Err(TaxError::EmptySegmentName);
+        }
+        let rank = match rank_raw {
+            None     => None,
+            Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }),
+            Some(r)  => Some(r.to_string()),
+        };
+        Ok(Self { name: name_raw.to_string(), rank })
+    }
+
+    /// True if this pattern matches `seg`.
+    /// Name must match exactly. If a rank is specified in the pattern, the
+    /// segment's rank must match; otherwise any rank (or no rank) is accepted.
+    pub fn matches(&self, seg: &TaxSegment) -> bool {
+        self.name == seg.name()
+            && self.rank.as_deref().map_or(true, |r| seg.rank() == Some(r))
+    }
+}

From c1d6f277ced705b30b8d624325ef86ff8ca9a5d3 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Sun, 21 Jun 2026 19:07:15 +0200
Subject: [PATCH 23/24] feat(select): add metrics reporting to selection
 methods

Integrates an obisys::Reporter across indexing and command modules to capture execution metrics. Replaces discarded timer stops with explicit rep.push() calls, adds timing instrumentation for the pack stage, and prints collected reports after each selection branch.
---
 src/obikindex/src/select.rs   | 16 +++++++++-------
 src/obikmer/src/cmd/select.rs |  9 +++++++--
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/obikindex/src/select.rs b/src/obikindex/src/select.rs
index 1db57bd..a27125b 100644
--- a/src/obikindex/src/select.rs
+++ b/src/obikindex/src/select.rs
@@ -3,7 +3,7 @@ use std::io;
 use std::path::Path;
 
 use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
-use obisys::{Stage, progress_bar};
+use obisys::{Reporter, Stage, progress_bar};
 use tracing::info;
 
 use crate::error::{OKIError, OKIResult};
@@ -25,6 +25,7 @@ impl KmerIndex {
         threshold: u32,
         output_presence: bool,
         force: bool,
+        rep: &mut Reporter,
     ) -> OKIResult<Self> {
         let output = output.as_ref();
 
@@ -80,13 +81,14 @@ impl KmerIndex {
         ).map_err(OKIError::Partition)?;
 
         pb.finish_and_clear();
-
-        let _ = t.stop();
+        rep.push(t.stop());
 
         fs::File::create(output.join(SENTINEL_INDEXED))?;
 
         let idx = KmerIndex::open(output)?;
+        let t_pack = Stage::start("pack");
         idx.pack_matrices()?;
+        rep.push(t_pack.stop());
         Ok(idx)
     }
 
@@ -98,6 +100,7 @@ impl KmerIndex {
         specs: &[OutputCol],
         threshold: u32,
         output_presence: bool,
+        rep: &mut Reporter,
     ) -> OKIResult<()> {
         if self.state() != IndexState::Indexed {
             return Err(OKIError::NotIndexed(self.root_path.clone()));
@@ -106,7 +109,6 @@ impl KmerIndex {
         let n_src_genomes = self.meta.genomes.len();
         let n_partitions  = self.partition.n_partitions();
 
-        // Open a second handle to the same path so we can borrow src and dst simultaneously.
         let src_partition = KmerPartition::open_with_config(
             &self.root_path,
             self.meta.config.kmer_size,
@@ -132,17 +134,17 @@ impl KmerIndex {
         ).map_err(OKIError::Partition)?;
 
         pb.finish_and_clear();
+        rep.push(t.stop());
 
-        let _ = t.stop();
-
-        // Update index.meta with new genome list and with_counts flag.
         self.meta.config.with_counts = !output_presence;
         self.meta.genomes = specs.iter()
             .map(|s| GenomeInfo::new(s.label.clone()))
             .collect();
         self.meta.write(&self.root_path)?;
 
+        let t_pack = Stage::start("pack");
         self.pack_matrices()?;
+        rep.push(t_pack.stop());
         Ok(())
     }
 }
diff --git a/src/obikmer/src/cmd/select.rs b/src/obikmer/src/cmd/select.rs
index e021b36..35719e8 100644
--- a/src/obikmer/src/cmd/select.rs
+++ b/src/obikmer/src/cmd/select.rs
@@ -4,6 +4,7 @@ use std::path::PathBuf;
 use clap::{Args, ValueEnum};
 use obikindex::{GenomeInfo, KmerIndex};
 use obikpartitionner::{AggOp, OutputCol};
+use obisys::Reporter;
 use tracing::info;
 
 use super::predicate::matching_genome_indices;
@@ -229,20 +230,24 @@ pub fn run(args: SelectArgs) {
         if output_presence { "presence" } else { "count" },
     );
 
+    let mut rep = Reporter::new();
+
     if args.in_place {
-        src.select_in_place(&specs, args.presence_threshold, output_presence)
+        src.select_in_place(&specs, args.presence_threshold, output_presence, &mut rep)
             .unwrap_or_else(|e| {
                 eprintln!("select error: {e}");
                 std::process::exit(1);
             });
+        rep.print();
         info!("selected in-place → {}", args.source.display());
     } else {
         let output = args.output.unwrap();
-        KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force)
+        KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force, &mut rep)
             .unwrap_or_else(|e| {
                 eprintln!("select error: {e}");
                 std::process::exit(1);
             });
+        rep.print();
         info!("selected index → {}", output.display());
     }
 }

From a522c0907e99312d4ac09f259383eb2b0ec60e84 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Mon, 22 Jun 2026 10:33:16 +0200
Subject: [PATCH 24/24] feat: add CI/CD workflows, release automation, and CLI
 version flag

Adds Gitea Actions for continuous integration and tagged releases, including static musl binary compilation and artifact upload. Introduces a Makefile target to automate semantic version bumping and publishing. Bumps the package version to 0.1.1 and enables automatic `--version` output via Clap.
---
 .gitea/workflows/ci.yml      | 32 ++++++++++++++++++++++++
 .gitea/workflows/release.yml | 47 ++++++++++++++++++++++++++++++++++++
 Makefile                     | 25 +++++++++++++++++++
 src/Cargo.lock               |  2 +-
 src/obikmer/Cargo.toml       |  2 +-
 src/obikmer/src/main.rs      |  2 +-
 6 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 .gitea/workflows/ci.yml
 create mode 100644 .gitea/workflows/release.yml

diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml
new file mode 100644
index 0000000..3bcc128
--- /dev/null
+++ b/.gitea/workflows/ci.yml
@@ -0,0 +1,32 @@
+name: CI  # build and test the workspace on every push and pull request
+
+on:
+  push:
+    branches: ['**']
+  pull_request:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    container: rust:latest  # official image; it sets CARGO_HOME=/usr/local/cargo
+    defaults:
+      run:
+        working-directory: src  # Cargo.lock and the crates live under src/
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:  # registry/git paths follow the container's CARGO_HOME, not ~/.cargo
+          path: |
+            /usr/local/cargo/registry
+            /usr/local/cargo/git
+            src/target
+          key: ${{ runner.os }}-cargo-${{ hashFiles('src/Cargo.lock') }}
+          restore-keys: ${{ runner.os }}-cargo-
+
+      - name: Build
+        run: cargo build --release
+
+      - name: Test
+        run: cargo test --release
diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml
new file mode 100644
index 0000000..d74c374
--- /dev/null
+++ b/.gitea/workflows/release.yml
@@ -0,0 +1,47 @@
+name: Release  # builds a static Linux binary whenever a v* tag is pushed
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+jobs:
+  build-linux-static:
+    runs-on: ubuntu-latest
+    container: rust:latest  # official image; it sets CARGO_HOME=/usr/local/cargo
+    defaults:
+      run:
+        working-directory: src  # Cargo.lock and the crates live under src/
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:  # registry/git paths follow the container's CARGO_HOME, not ~/.cargo
+          path: |
+            /usr/local/cargo/registry
+            /usr/local/cargo/git
+            src/target
+          key: linux-musl-cargo-${{ hashFiles('src/Cargo.lock') }}
+          restore-keys: linux-musl-cargo-
+
+      - name: Install musl toolchain
+        run: |
+          apt-get update -qq && apt-get install -y -qq musl-tools
+          rustup target add x86_64-unknown-linux-musl
+
+      - name: Build static binary
+        run: cargo build --release --target x86_64-unknown-linux-musl
+
+      - name: Prepare artifact
+        run: |
+          mkdir -p /tmp/dist
+          cp target/x86_64-unknown-linux-musl/release/obikmer /tmp/dist/obikmer-linux-x86_64
+          strip /tmp/dist/obikmer-linux-x86_64
+
+      - name: Upload release asset
+        uses: actions/upload-artifact@v3  # v4 rejects non-GitHub servers (GHES check), which includes Gitea
+        with:
+          name: obikmer-linux-x86_64
+          path: /tmp/dist/obikmer-linux-x86_64
+          if-no-files-found: error
diff --git a/Makefile b/Makefile
index 0fe6d46..04942bf 100644
--- a/Makefile
+++ b/Makefile
@@ -63,3 +63,28 @@ clean-doc:
 .PHONY: clean
 clean: clean-doc
 	rm -rf $(VENV)
+
+# ── release ───────────────────────────────────────────────────────────────────
+
+CARGO_TOML := $(CARGO_DIR)/obikmer/Cargo.toml
+# bump-version: rewrite the version in $(CARGO_TOML): to RELEASE when it is set, otherwise to the current version with its patch number incremented.
+.PHONY: bump-version
+bump-version:
+	@current=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
+	if [ -n "$(RELEASE)" ]; then \
+		new_version="$(RELEASE)"; \
+	else \
+		major=$$(echo $$current | cut -d. -f1); \
+		minor=$$(echo $$current | cut -d. -f2); \
+		patch=$$(echo $$current | cut -d. -f3); \
+		new_patch=$$((patch + 1)); \
+		new_version="$$major.$$minor.$$new_patch"; \
+	fi; \
+	echo "Version: $$current -> $$new_version"; \
+	sed -i.bak "s/^version = \"$$current\"/version = \"$$new_version\"/" $(CARGO_TOML) && \
+	rm $(CARGO_TOML).bak
+# release: run bump-version, then `jj auto-describe` and `jj git push --change @`. NOTE(review): release.yml fires on v* tags and no tag is created here; confirm tagging happens elsewhere.
+.PHONY: release
+release: bump-version
+	@jj auto-describe
+	@jj git push --change @
diff --git a/src/Cargo.lock b/src/Cargo.lock
index bdb1caa..bb91a4d 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -1704,7 +1704,7 @@ dependencies = [
 
 [[package]]
 name = "obikmer"
-version = "0.1.0"
+version = "0.1.1"
 dependencies = [
  "clap",
  "csv",
diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml
index 4045476..07239a0 100644
--- a/src/obikmer/Cargo.toml
+++ b/src/obikmer/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "obikmer"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2024"
 
 [[bin]]
diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs
index fdcf69c..a0b270b 100644
--- a/src/obikmer/src/main.rs
+++ b/src/obikmer/src/main.rs
@@ -6,7 +6,7 @@ use clap::{Parser, Subcommand};
 use tracing_subscriber::{EnvFilter, fmt};
 
 #[derive(Parser)]
-#[command(name = "obikmer", about = "DNA k-mer tools")]
+#[command(name = "obikmer", about = "DNA k-mer tools", version)]
 struct Cli {
     #[command(subcommand)]
     command: Commands,