refactor: extract obikindex crate and remove deprecated CLI commands

Extracted core indexing logic, state tracking, and metadata management into a new `obikindex` crate. Refactored the `index` and `unitig` commands to leverage the `KmerIndex` abstraction and state-driven pipeline transitions. Removed obsolete CLI subcommands (`count`, `fasta`, `longtig`, `partition`) and their associated pipeline steps. Updated FASTA writing utilities for single-line output and deterministic identifiers, and refreshed workspace dependencies.
2026-05-20 18:21:05 +02:00
parent f8cfb493b8
commit 17c9e076bd
24 changed files with 792 additions and 1593 deletions
@@ -1,4 +1,28 @@
-## Dans OBILayeredMap
+## Chose à vérifier suite à la commande index

- Est-ce que CachelineEfVec est vraiment justifier dans notre cas. vu les contraintes sur la distribution des valeurs imposées par CachelineEfVec en terme d'ordre, de grandeur et de dispersion ?
- Il semble que le count de kmer soit stocké, ce qui doit-être une possibilité pas une obligation.
+- partition.meta ne devrait plus exister
+- les spectrums globaux devrait etre identifier par génome
+  - regrouper dans un sous-dossier spectrums à la racine de l'index avec un nom basé sur le génome
+- les spectrum patiels ont-ils vocation à être conserver ?
+- l'étape de déreplication dure quasiment autant de temps que le comptage mais ne laisse aucune trace de progression à l'utilisateur
+
+## commandes à ajouter
+
+- merge : pour construire un index à partir d'index existants
+  - deux modes : count et presence/absence. count exige que tous les index mergés soient déjà en mode count. mode presence/absence par defaut. Si passage de mode count à mode presence/absence, par defaut presence = count >= 1. Possibilité de spécifier un seuil personnalisé.
+
+- filter : produit un nouvel index filtré à partir d'un index existant en verifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés
+  - quorum de presence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être défini par un seuil personnalisé minimum et maximum
+
+- aggregate : aggrege toutes les colonnes d'une matrice d'index en une seule colonne.
+
+- query : scan un fichier de sequences et retourne pour chaque sequence quels kmer sont présents dans l'index et dans quel genomes
+
+- distance : calcule la matrice de distance entre les genomes
+  - proposer une option pour chaque distance à calculer
+  - un possibité de récuperer la matrice des kmer communs
+  - un possibité de calculer l'arbre nj 
+  - les matrices sont sauvegardées en CSV
+  - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche
+
+- dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence.
@@ -1,5 +1,5 @@
 [workspace]
 resolver = "3"
-members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys"]
+members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
 [profile.release]
 debug = 1
@@ -1,12 +1,10 @@
 //use ahash::RandomState;
 use hashbrown::HashMap;
-use obifastwrite::write_unitig;
 use obikseq::k;
 use obikseq::unitig::Unitig;
 use obikseq::{CanonicalKmer, Kmer, Sequence};
 use std::cell::Cell;
 use std::fmt;
-use std::io;
 use xxhash_rust::xxh3::Xxh3Builder;

 // ── Types ─────────────────────────────────────────────────────────────────────
@@ -293,59 +291,6 @@ impl GraphDeBruijn {
        Some(oriented)
    }

-    fn next_longtig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
-        let canonical = kmer.canonical();
-        let node = self.nodes.get(&canonical)?.get();
-
-        let direct = kmer.raw() == canonical.raw();
-
-        if (direct && node.n_right_neighbours() == 0) || (!direct && node.n_left_neighbours() == 0)
-        {
-            return None;
-        }
-
-        let next_c: CanonicalKmer = if direct {
-            if node.can_extend_right() {
-                canonical
-                    .into_kmer()
-                    .push_right(node.right_nuc())
-                    .canonical()
-            } else {
-                self.iter_right_neighbors(canonical)
-                    .filter(|n| !self.is_visited(n).unwrap_or(true))
-                    .next()?
-            }
-        } else {
-            if node.can_extend_left() {
-                canonical.into_kmer().push_left(node.left_nuc()).canonical()
-            } else {
-                self.iter_left_neighbors(canonical)
-                    .filter(|n| !self.is_visited(n).unwrap_or(true))
-                    .next()?
-            }
-        };
-
-        let cell = self.nodes.get(&next_c)?;
-        let next_node = cell.get();
-        if next_node.is_visited() {
-            return None;
-        }
-
-        let oriented = oriented_next(kmer, next_c);
-        let ndirect = oriented.raw() == next_c.raw();
-
-        if (ndirect && next_node.n_right_neighbours() > 1)
-            || (!ndirect && next_node.n_left_neighbours() > 1)
-        {
-            return None;
-        }
-
-        let mut updated = next_node;
-        updated.set_visited();
-        cell.set(updated);
-        Some(oriented)
-    }
-
    fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
        UnitigIter {
            graph: self,
@@ -353,13 +298,6 @@ impl GraphDeBruijn {
        }
    }

-    fn iter_longtig_kmers(&self, start: Kmer) -> LongtigIter<'_> {
-        LongtigIter {
-            graph: self,
-            current: Some(start),
-        }
-    }
-
    pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
        let k = k();
        self.start_iter().map(move |(start, first_next)| {
@@ -373,36 +311,6 @@ impl GraphDeBruijn {
        })
    }

-    pub fn iter_longtig(&self) -> impl Iterator<Item = Unitig> + '_ {
-        let k = k();
-        self.start_iter().map(move |(start, first_next)| {
-            let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
-            if let Some(next_c) = first_next {
-                for kmer in self.iter_longtig_kmers(next_c) {
-                    nucs.push(kmer.nucleotide(k - 1));
-                }
-            }
-            Unitig::from_nucleotides(&nucs)
-        })
-    }
-
-    /// Write all unitigs to `out` in FASTA format.
-    ///
-    /// Calls [`obifastwrite::write_unitig`] for each unitig produced by
-    /// [`iter_unitig`]. Stops and returns the first I/O error encountered.
-    pub fn write_fasta<W: io::Write>(&self, out: &mut W, unitig: bool) -> io::Result<()> {
-        if unitig {
-            for unitig in self.iter_unitig() {
-                write_unitig(&unitig, k(), out)?;
-            }
-        } else {
-            for unitig in self.iter_longtig() {
-                write_unitig(&unitig, k(), out)?;
-            }
-        }
-        Ok(())
-    }
-
    pub fn len(&self) -> usize {
        self.nodes.len()
    }
@@ -516,23 +424,6 @@ impl Iterator for UnitigIter<'_> {
    }
 }

-// ── UnitigIter ────────────────────────────────────────────────────────────────
-
-struct LongtigIter<'a> {
-    graph: &'a GraphDeBruijn,
-    current: Option<Kmer>,
-}
-
-impl Iterator for LongtigIter<'_> {
-    type Item = Kmer;
-
-    fn next(&mut self) -> Option<Kmer> {
-        let current = self.current?;
-        self.current = self.graph.next_longtig_kmer(current);
-        Some(current)
-    }
-}
-
 // ── helpers ───────────────────────────────────────────────────────────────────

 fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer {
@@ -1,9 +1,11 @@
 use std::fmt;
 use std::io::{self, Write};
-use xxhash_rust::xxh64::xxh64;

-pub(crate) enum JsonVal<'a> {
+/// A JSON value that is either a number or a quoted string.
+pub enum JsonVal<'a> {
+    /// Integer value, serialised without quotes.
    Num(u64),
+    /// String value, serialised with double quotes.
    Str(&'a str),
 }

@@ -16,11 +18,8 @@ impl fmt::Display for JsonVal<'_> {
    }
 }

-pub(crate) fn seq_id(ascii: &[u8]) -> String {
-    format!("{:016X}", xxh64(ascii, 0))
-}
-
-pub(crate) fn annotation<W: Write>(
+/// Write a JSON object `{"k1":v1,"k2":v2,...}` to `writer`.
+pub fn annotation<W: Write>(
    writer: &mut W,
    fields: &[(&str, JsonVal<'_>)],
 ) -> io::Result<()> {
@@ -34,10 +33,29 @@ pub(crate) fn annotation<W: Write>(
    write!(writer, "}}")
 }

-pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
-    for chunk in seq.chunks(width) {
-        // SAFETY: seq is valid UTF-8; any contiguous slice of ASCII bytes is too
-        writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(chunk) })?;
-    }
-    Ok(())
+/// xxHash-64 of `ascii`, formatted as 16 uppercase hex digits.
+pub fn seq_id(ascii: &[u8]) -> String {
+    use xxhash_rust::xxh64::xxh64;
+    format!("{:016X}", xxh64(ascii, 0))
+}
+
+/// Write `seq` as one line of ASCII DNA, followed by a newline.
+pub fn write_sequence<W: Write>(writer: &mut W, seq: &[u8]) -> io::Result<()> {
+    // SAFETY: seq is valid ASCII DNA (A/C/G/T).
+    writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(seq) })
+}
+
+/// Core FASTA record writer.
+///
+/// Writes `>{id} {annotation}\n{sequence}\n`.
+pub fn write_record<W: Write>(
+    seq: &[u8],
+    id: &str,
+    fields: &[(&str, JsonVal<'_>)],
+    out: &mut W,
+) -> io::Result<()> {
+    write!(out, ">{id} ")?;
+    annotation(out, fields)?;
+    writeln!(out)?;
+    write_sequence(out, seq)
 }
@@ -1,32 +1,20 @@
-//! FASTA serialisation of [`SuperKmer`] values.
+//! FASTA serialisation for obikmer sequence types.
 //!
-//! Two functions cover the two phases of the scatter pipeline:
+//! Three public functions cover the main output cases:
 //!
-//! - [`write_scatter`]: scatter phase (before routing). The header annotation
-//!   contains the minimizer sequence decoded from [`SuperKmer::minimizer_pos`].
+//! - [`write_scatter`]: super-kmers in scatter phase (minimizer annotation)
+//! - [`write_count`]: super-kmers in count phase (occurrence count annotation)
+//! - [`write_unitig`]: unitigs from the layered index (partition + index annotation)
 //!
-//! - [`write_count`]: count phase (after deduplication). The header annotation
-//!   contains the occurrence count from [`SuperKmer::count`].
-//!
-//! Both functions write standard OBITools-compatible FASTA:
+//! All produce OBITools-compatible FASTA:
 //!
 //! ```text
-//! >ID {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
-//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
+//! >ID {"key":value,...}
+//! SEQUENCE
 //! ```
 //!
-//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
-//! a 16-digit uppercase hexadecimal string. xxHash-64 is collision-resistant
-//! enough for debugging identifiers (collision probability < 1e-9 for billions
-//! of distinct super-kmers).
-//!
-//! # Phase contract
-//!
-//! `write_scatter` reads [`SuperKmer::minimizer_pos`], which is only valid
-//! **before** [`SuperKmer::init_count`] is called.  `write_count` reads
-//! [`SuperKmer::count`], which is only meaningful **after** `init_count`.
-//! Mixing the two functions in the wrong phase produces silently wrong output;
-//! this is enforced by pipeline structure, not by the type system.
+//! The lower-level primitive [`write_record`] and the [`JsonVal`] type are also
+//! public for callers that need custom annotations.

 #![deny(missing_docs)]

@@ -35,22 +23,15 @@ mod fasta;
 use std::io::{self, Write};

 use obikseq::{Minimizer, SuperKmer, Unitig};
-use xxhash_rust::xxh64::xxh64;
+
+pub use fasta::{JsonVal, annotation, seq_id, write_record};

 // ── public API ────────────────────────────────────────────────────────────────

 /// Write one super-kmer in FASTA format — **scatter phase**.
 ///
-/// The `minimizer` field in the JSON annotation contains the ASCII sequence of
-/// the minimizer, decoded from [`SuperKmer::minimizer_pos`] (scatter-phase
-/// value of the payload field).
-///
-/// # Parameters
-/// - `sk`: the super-kmer to serialise (must be in scatter phase)
-/// - `out`: destination writer
-/// - `k`: k-mer size used to build `sk`
-/// - `m`: minimizer size
-/// - `partition`: partition index computed from the minimizer hash
+/// ID is the xxHash-64 of the sequence.  JSON annotation includes
+/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `minimizer`.
 pub fn write_scatter<W: Write>(
    sk: &SuperKmer,
    out: &mut W,
@@ -61,37 +42,26 @@ pub fn write_scatter<W: Write>(
 ) -> io::Result<()> {
    let ascii = sk.to_ascii();
    let id = seq_id(&ascii);
-    let seq_len = ascii.len();
    let min_seq = minimizer.to_ascii();
-
-    writeln!(
+    let min_str = unsafe { std::str::from_utf8_unchecked(&min_seq) };
+    write_record(
+        &ascii,
+        &id,
+        &[
+            ("seq_length",    JsonVal::Num(ascii.len() as u64)),
+            ("kmer_size",     JsonVal::Num(k as u64)),
+            ("minimizer_size",JsonVal::Num(m as u64)),
+            ("partition",     JsonVal::Num(partition as u64)),
+            ("minimizer",     JsonVal::Str(min_str)),
+        ],
        out,
-        ">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
-         \"minimizer_size\":{m},\"partition\":{partition},\
-         \"minimizer\":\"{min}\"}}",
-        id = id,
-        seq_len = seq_len,
-        k = k,
-        m = m,
-        partition = partition,
-        min = unsafe { std::str::from_utf8_unchecked(&min_seq) },
-    )?;
-    out.write_all(&ascii)?;
-    out.write_all(b"\n")
+    )
 }

 /// Write one super-kmer in FASTA format — **count phase**.
 ///
-/// The `count` field in the JSON annotation contains the occurrence count from
-/// [`SuperKmer::count`] (count-phase value of the payload field).
-///
-/// # Parameters
-/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
-///   [`SuperKmer::init_count`] has been called)
-/// - `out`: destination writer
-/// - `k`: k-mer size
-/// - `m`: minimizer size
-/// - `partition`: partition index
+/// ID is the xxHash-64 of the sequence.  JSON annotation includes
+/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `count`.
 pub fn write_count<W: Write>(
    sk: &SuperKmer,
    out: &mut W,
@@ -101,52 +71,47 @@ pub fn write_count<W: Write>(
 ) -> io::Result<()> {
    let ascii = sk.to_ascii();
    let id = seq_id(&ascii);
-    let seq_len = ascii.len();
-    let count = sk.count();
-
-    writeln!(
+    write_record(
+        &ascii,
+        &id,
+        &[
+            ("seq_length",    JsonVal::Num(ascii.len() as u64)),
+            ("kmer_size",     JsonVal::Num(k as u64)),
+            ("minimizer_size",JsonVal::Num(m as u64)),
+            ("partition",     JsonVal::Num(partition as u64)),
+            ("count",         JsonVal::Num(sk.count() as u64)),
+        ],
        out,
-        ">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
-         \"minimizer_size\":{m},\"partition\":{partition},\
-         \"count\":{count}}}",
-        id = id,
-        seq_len = seq_len,
-        k = k,
-        m = m,
-        partition = partition,
-        count = count,
-    )?;
-    out.write_all(&ascii)?;
-    out.write_all(b"\n")
+    )
 }

 /// Write one unitig in FASTA format.
 ///
-/// Header annotation (JSON):
-/// ```text
-/// >HASH {"seq_length":<seql>,"kmer_size":<k>,"n_kmers":<seql-k+1>}
-/// ```
-///
-/// `HASH` is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).
-/// `n_kmers` is the number of distinct k-mers covered by this unitig.
-pub fn write_unitig<W: Write>(unitig: &Unitig, k: usize, out: &mut W) -> io::Result<()> {
+/// ID is `part_PPPPP_unitig_IIIIII` where `P` is the partition index and `I`
+/// is the unitig index within that partition.  JSON annotation includes
+/// `seq_length`, `kmer_size`, `n_kmers`, `partition`, `unitig_index`.
+pub fn write_unitig<W: Write>(
+    unitig: &Unitig,
+    k: usize,
+    partition: usize,
+    index: usize,
+    out: &mut W,
+) -> io::Result<()> {
    let ascii = unitig.to_ascii();
-    let id = seq_id(&ascii);
    let seql = unitig.seql();
-    let n_kmers = seql - k + 1;
-    writeln!(
+    let id = format!("part_{partition:05}_unitig_{index:06}");
+    write_record(
+        &ascii,
+        &id,
+        &[
+            ("seq_length",   JsonVal::Num(seql as u64)),
+            ("kmer_size",    JsonVal::Num(k as u64)),
+            ("n_kmers",      JsonVal::Num((seql - k + 1) as u64)),
+            ("partition",    JsonVal::Num(partition as u64)),
+            ("unitig_index", JsonVal::Num(index as u64)),
+        ],
        out,
-        ">{id} {{\"seq_length\":{seql},\"kmer_size\":{k},\"n_kmers\":{n_kmers}}}",
-    )?;
-    out.write_all(&ascii)?;
-    out.write_all(b"\n")
-}
-
-// ── internal helpers ──────────────────────────────────────────────────────────
-
-/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
-fn seq_id(ascii: &[u8]) -> String {
-    format!("{:016X}", xxh64(ascii, 0))
+    )
 }

 // ── tests ─────────────────────────────────────────────────────────────────────
@@ -178,9 +143,6 @@ mod tests {

    #[test]
    fn scatter_minimizer_decoded_from_hash() {
-        // "ACG" right-aligned: A=00, C=01, G=10 → 0b000110 = 6
-        // Left-aligned for m=3: shift by 64 − 2·3 = 58.
-        // set_m(3) so that Minimizer::to_ascii() decodes exactly 3 bases.
        obikseq::params::set_m(3);
        let sk = make(b"ACGTACGTACGT");
        let minimizer = Minimizer::from_raw_unchecked(6u64 << (64 - 2 * 3));
@@ -230,13 +192,34 @@ mod tests {

    #[test]
    fn count_sequence_line_correct() {
-        // TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
        let sk = make(b"TTTTACGT");
        let out = capture(|w| write_count(&sk, w, 4, 2, 0));
        let lines: Vec<&str> = out.lines().collect();
        assert_eq!(lines[1], "ACGTAAAA");
    }

+    // ── write_unitig ──────────────────────────────────────────────────────────
+
+    #[test]
+    fn unitig_id_format() {
+        obikseq::params::set_k(4);
+        let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
+        let out = capture(|w| write_unitig(&unitig, 4, 3, 17, w));
+        let id = out.lines().next().unwrap();
+        assert!(id.starts_with(">part_00003_unitig_000017"), "got: {id}");
+    }
+
+    #[test]
+    fn unitig_annotation_fields() {
+        obikseq::params::set_k(4);
+        let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
+        let out = capture(|w| write_unitig(&unitig, 4, 2, 5, w));
+        assert!(out.contains("\"partition\":2"));
+        assert!(out.contains("\"unitig_index\":5"));
+        assert!(out.contains("\"n_kmers\":5"));
+        assert!(out.contains("\"kmer_size\":4"));
+    }
+
    // ── ID stability ──────────────────────────────────────────────────────────

    #[test]
@@ -260,7 +243,7 @@ mod tests {
            .next()
            .unwrap()[1..]
            .to_string();
-        assert_eq!(id1, id2, "same sequence must produce same ID");
+        assert_eq!(id1, id2);
    }

    #[test]
@@ -269,21 +252,11 @@ mod tests {
        let sk2 = make(b"TTTTTTTT");

        let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
-            .lines()
-            .next()
-            .unwrap()
-            .split_whitespace()
-            .next()
-            .unwrap()[1..]
-            .to_string();
+            .lines().next().unwrap()
+            .split_whitespace().next().unwrap()[1..].to_string();
        let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
-            .lines()
-            .next()
-            .unwrap()
-            .split_whitespace()
-            .next()
-            .unwrap()[1..]
-            .to_string();
+            .lines().next().unwrap()
+            .split_whitespace().next().unwrap()[1..].to_string();
        assert_ne!(id1, id2);
    }

@@ -291,7 +264,7 @@ mod tests {
    fn id_is_16_hex_digits() {
        let sk = make(b"ACGTACGT");
        let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
-        let id = &out.lines().next().unwrap()[1..17]; // skip '>'
+        let id = &out.lines().next().unwrap()[1..17];
        assert_eq!(id.len(), 16);
        assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
    }
@@ -0,0 +1,21 @@
+[package]
+name = "obikindex"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+obikpartitionner = { path = "../obikpartitionner" }
+obikseq          = { path = "../obikseq" }
+obisys           = { path = "../obisys" }
+obiskio          = { path = "../obiskio" }
+obidebruinj      = { path = "../obidebruinj" }
+obilayeredmap    = { path = "../obilayeredmap" }
+obicompactvec    = { path = "../obicompactvec" }
+cacheline-ef     = "1.1"
+epserde          = "0.8"
+ptr_hash         = "1.1"
+rayon            = "1"
+serde            = { version = "1", features = ["derive"] }
+serde_json       = "1"
+indicatif        = "0.17"
+tracing          = "0.1.44"
@@ -0,0 +1,53 @@
+use std::fmt;
+use std::io;
+
+use obiskio::SKError;
+use obilayeredmap::OLMError;
+
+#[derive(Debug)]
+pub enum OKIError {
+    Io(io::Error),
+    Json(serde_json::Error),
+    Partition(SKError),
+    Layer(OLMError),
+}
+
+pub type OKIResult<T> = Result<T, OKIError>;
+
+impl fmt::Display for OKIError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            OKIError::Io(e)        => write!(f, "I/O error: {e}"),
+            OKIError::Json(e)      => write!(f, "JSON error: {e}"),
+            OKIError::Partition(e) => write!(f, "partition error: {e}"),
+            OKIError::Layer(e)     => write!(f, "layer error: {e}"),
+        }
+    }
+}
+
+impl std::error::Error for OKIError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            OKIError::Io(e)        => Some(e),
+            OKIError::Json(e)      => Some(e),
+            OKIError::Partition(e) => Some(e),
+            OKIError::Layer(e)     => Some(e),
+        }
+    }
+}
+
+impl From<io::Error> for OKIError {
+    fn from(e: io::Error) -> Self { OKIError::Io(e) }
+}
+
+impl From<serde_json::Error> for OKIError {
+    fn from(e: serde_json::Error) -> Self { OKIError::Json(e) }
+}
+
+impl From<SKError> for OKIError {
+    fn from(e: SKError) -> Self { OKIError::Partition(e) }
+}
+
+impl From<OLMError> for OKIError {
+    fn from(e: OLMError) -> Self { OKIError::Layer(e) }
+}
@@ -0,0 +1,301 @@
+use std::fs;
+use std::path::{Path, PathBuf};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex};
+
+use cacheline_ef::{CachelineEf, CachelineEfVec};
+use epserde::prelude::*;
+use indicatif::{ProgressBar, ProgressStyle};
+use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
+use obidebruinj::GraphDeBruijn;
+use obikpartitionner::KmerPartition;
+use obilayeredmap::layer::Layer;
+use obiskio::{SKFileMeta, SKFileReader};
+use obisys::{Reporter, Stage};
+use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
+use rayon::prelude::*;
+use tracing::info;
+
+use crate::error::{OKIError, OKIResult};
+use crate::meta::{IndexConfig, IndexMeta};
+use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED};
+
+type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
+
+pub struct KmerIndex {
+    root_path: PathBuf,
+    meta: IndexMeta,
+    partition: KmerPartition,
+}
+
+impl KmerIndex {
+    /// Create a new index at `path`.
+    ///
+    /// If `genome_label` is `Some`, it is stored immediately.
+    /// If `None`, the label will be derived from the first scatter input path
+    /// when `mark_scattered` is called.
+    pub fn create<P: AsRef<Path>>(
+        path: P,
+        config: IndexConfig,
+        genome_label: Option<String>,
+        force: bool,
+    ) -> OKIResult<Self> {
+        let root_path = path.as_ref().to_owned();
+        let partition = KmerPartition::create(
+            &root_path,
+            config.n_bits,
+            config.kmer_size,
+            config.minimizer_size,
+            force,
+        )?;
+        let mut meta = IndexMeta::new(config);
+        if let Some(label) = genome_label {
+            meta.genomes.push(label);
+        }
+        meta.write(&root_path)?;
+        Ok(Self { root_path, meta, partition })
+    }
+
+    pub fn open<P: AsRef<Path>>(path: P) -> OKIResult<Self> {
+        let root_path = path.as_ref().to_owned();
+        let meta = IndexMeta::read(&root_path).map_err(OKIError::Io)?;
+        let partition = KmerPartition::open(&root_path)?;
+        Ok(Self { root_path, meta, partition })
+    }
+
+    /// Return `true` if `path` contains an `index.meta` file.
+    pub fn exists<P: AsRef<Path>>(path: P) -> bool {
+        IndexMeta::exists(path.as_ref())
+    }
+
+    /// Current construction state, as reported by sentinel files on disk.
+    pub fn state(&self) -> IndexState {
+        IndexState::detect(&self.root_path).unwrap_or(IndexState::Empty)
+    }
+
+    pub fn meta(&self) -> &IndexMeta { &self.meta }
+    pub fn kmer_size(&self) -> usize { self.meta.config.kmer_size }
+    pub fn minimizer_size(&self) -> usize { self.meta.config.minimizer_size }
+    pub fn n_partitions(&self) -> usize { self.partition.n_partitions() }
+
+    /// Expose the inner partition so the caller can run scatter into it.
+    /// Call `mark_scattered` once scatter is complete.
+    pub fn partition_mut(&mut self) -> &mut KmerPartition {
+        &mut self.partition
+    }
+
+    /// Mark scatter as complete and write `scatter.done`.
+    ///
+    /// If no genome label was set at creation time, one is derived from
+    /// `first_scatter_path` (filename stripped of all extensions).
+    /// If `first_scatter_path` is also `None`, the label defaults to `"unknown"`.
+    pub fn mark_scattered(&mut self, first_scatter_path: Option<&Path>) -> OKIResult<()> {
+        if self.meta.genomes.is_empty() {
+            let label = first_scatter_path
+                .map(label_from_path)
+                .unwrap_or_else(|| "unknown".to_string());
+            self.meta.genomes.push(label);
+            self.meta.write(&self.root_path)?;
+        }
+        touch(&self.root_path.join(SENTINEL_SCATTERED))?;
+        Ok(())
+    }
+
+    /// Dereplicate all partitions then compute kmer counts.
+    ///
+    /// Writes `kmer_spectrum_raw.json` at the index root upon completion
+    /// (this file doubles as the `Counted` sentinel).
+    pub fn dereplicate_and_count(&self, rep: &mut Reporter) -> OKIResult<()> {
+        let t = Stage::start("dereplicate");
+        self.partition.dereplicate()?;
+        rep.push(t.stop());
+
+        let t = Stage::start("count_kmer");
+        self.partition.count_kmer()?;
+        rep.push(t.stop());
+        Ok(())
+    }
+
+    /// Build the layered MPHF index for all partitions.
+    ///
+    /// Default mode (`config.with_counts = false`): set membership only.
+    /// With counts: count matrix per kmer.
+    ///
+    /// Writes `index.done` upon completion.
+    /// Path to the unitigs file for partition `part`, layer `layer`.
+    pub fn layer_unitigs_path(&self, part: usize, layer: usize) -> PathBuf {
+        self.partition.part_dir(part)
+            .join("index")
+            .join(format!("layer_{layer}"))
+            .join("unitigs.bin")
+    }
+
+    pub fn build_layers(
+        &self,
+        min_ab: u32,
+        max_ab: Option<u32>,
+        keep_intermediate: bool,
+        rep: &mut Reporter,
+    ) -> OKIResult<()> {
+        let n = self.partition.n_partitions();
+        let t = Stage::start("index");
+        let with_counts = self.meta.config.with_counts;
+        let filter_active = min_ab > 1 || max_ab.is_some();
+        let need_counts = filter_active || with_counts;
+        let total_kmers = AtomicUsize::new(0);
+
+        let partition = &self.partition;
+
+        let pb = Arc::new(Mutex::new(
+            ProgressBar::new(n as u64).with_style(
+                ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(),
+            ),
+        ));
+
+        (0..n).into_par_iter().for_each(|i| {
+            let part_dir = partition.part_dir(i);
+            let dedup_path = part_dir.join("dereplicated.skmer.zst");
+            if !dedup_path.exists() {
+                return;
+            }
+
+            let layer_dir = part_dir.join("index").join("layer_0");
+            if layer_dir.join("mphf.bin").exists() {
+                return;
+            }
+
+            let mphf1_opt: Option<Mphf> = if need_counts {
+                let p = part_dir.join("mphf1.bin");
+                p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
+            } else {
+                None
+            };
+
+            let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
+                let p = part_dir.join("counts1.bin");
+                p.exists()
+                    .then(|| PersistentCompactIntVec::open(&p).ok())
+                    .flatten()
+            } else {
+                None
+            };
+
+            let mut g = GraphDeBruijn::new();
+            let mut reader = SKFileReader::open(&dedup_path).unwrap_or_else(|e| {
+                eprintln!("error opening {}: {e}", dedup_path.display());
+                std::process::exit(1);
+            });
+            for sk in reader.iter() {
+                for kmer in sk.iter_canonical_kmers() {
+                    let accept = if filter_active {
+                        match (&mphf1_opt, &counts1_opt) {
+                            (Some(mphf), Some(counts)) => {
+                                let ab = counts.get(mphf.index(&kmer.raw()));
+                                ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
+                            }
+                            _ => true,
+                        }
+                    } else {
+                        true
+                    };
+                    if accept {
+                        g.push(kmer);
+                    }
+                }
+            }
+
+            let n_kmers = g.len();
+            total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
+            g.compute_degrees();
+
+            fs::create_dir_all(&layer_dir).unwrap_or_else(|e| {
+                eprintln!("error creating {}: {e}", layer_dir.display());
+                std::process::exit(1);
+            });
+            let mut uw = Layer::<()>::unitig_writer(&layer_dir).unwrap_or_else(|e| {
+                eprintln!("error creating unitig writer (partition {i}): {e}");
+                std::process::exit(1);
+            });
+            for unitig in g.iter_unitig() {
+                uw.write(&unitig).unwrap_or_else(|e| {
+                    eprintln!("error writing unitig (partition {i}): {e}");
+                    std::process::exit(1);
+                });
+            }
+            uw.close().unwrap_or_else(|e| {
+                eprintln!("error closing unitig writer (partition {i}): {e}");
+                std::process::exit(1);
+            });
+
+            if with_counts {
+                Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
+                    match (&mphf1_opt, &counts1_opt) {
+                        (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
+                        _ => 1,
+                    }
+                })
+                .unwrap_or_else(|e| {
+                    eprintln!("error building count layer (partition {i}): {e}");
+                    std::process::exit(1);
+                });
+            } else {
+                Layer::<()>::build(&layer_dir).unwrap_or_else(|e| {
+                    eprintln!("error building set layer (partition {i}): {e}");
+                    std::process::exit(1);
+                });
+            }
+
+            let pb = pb.lock().unwrap();
+            pb.inc(1);
+            pb.set_message(format!("{i}: {n_kmers} kmers"));
+        });
+
+        pb.lock().unwrap().finish_and_clear();
+        info!(
+            "done — {} total kmers indexed",
+            total_kmers.load(Ordering::Relaxed)
+        );
+
+        if !keep_intermediate {
+            for i in 0..n {
+                let part_dir = partition.part_dir(i);
+                remove_if_exists(&part_dir.join("dereplicated.skmer.zst"));
+                remove_if_exists(&SKFileMeta::sidecar_path(
+                    &part_dir.join("dereplicated.skmer.zst"),
+                ));
+                remove_if_exists(&part_dir.join("mphf1.bin"));
+                remove_if_exists(&part_dir.join("counts1.bin"));
+            }
+        }
+
+        touch(&self.root_path.join(SENTINEL_INDEXED))?;
+        rep.push(t.stop());
+        Ok(())
+    }
+}
+
+/// Derive a genome label from a file path: filename stripped of all extensions.
+fn label_from_path(path: &Path) -> String {
+    let name = path
+        .file_name()
+        .unwrap_or(path.as_os_str())
+        .to_string_lossy()
+        .into_owned();
+    let mut s = name;
+    while let Some(pos) = s.rfind('.') {
+        s.truncate(pos);
+    }
+    if s.is_empty() { "unknown".to_string() } else { s }
+}
+
+fn touch(path: &Path) -> Result<(), std::io::Error> {
+    fs::File::create(path).map(|_| ())
+}
+
+fn remove_if_exists(path: &Path) {
+    if let Err(e) = fs::remove_file(path) {
+        if e.kind() != std::io::ErrorKind::NotFound {
+            eprintln!("warning: could not remove {}: {e}", path.display());
+        }
+    }
+}
@@ -0,0 +1,9 @@
+pub mod error;
+pub mod meta;
+pub mod state;
+mod index;
+
+pub use error::{OKIError, OKIResult};
+pub use index::KmerIndex;
+pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
+pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
@@ -0,0 +1,45 @@
+use std::fs;
+use std::io;
+use std::path::Path;
+
+use serde::{Deserialize, Serialize};
+
+pub const META_FILENAME: &str = "index.meta";
+const META_VERSION: u32 = 1;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct IndexConfig {
+    pub kmer_size: usize,
+    pub minimizer_size: usize,
+    pub n_bits: usize,
+    pub with_counts: bool,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct IndexMeta {
+    pub version: u32,
+    pub config: IndexConfig,
+    /// Ordered list of genome labels indexed here.
+    /// Element 0 is the initial genome; subsequent entries come from merges.
+    pub genomes: Vec<String>,
+}
+
+impl IndexMeta {
+    pub fn new(config: IndexConfig) -> Self {
+        Self { version: META_VERSION, config, genomes: Vec::new() }
+    }
+
+    pub fn write(&self, root: &Path) -> io::Result<()> {
+        let file = fs::File::create(root.join(META_FILENAME))?;
+        serde_json::to_writer_pretty(file, self).map_err(io::Error::other)
+    }
+
+    pub fn read(root: &Path) -> io::Result<Self> {
+        let file = fs::File::open(root.join(META_FILENAME))?;
+        serde_json::from_reader(file).map_err(io::Error::other)
+    }
+
+    pub fn exists(root: &Path) -> bool {
+        root.join(META_FILENAME).exists()
+    }
+}
@@ -0,0 +1,45 @@
+use std::path::Path;
+
+use crate::meta::META_FILENAME;
+
+pub const SENTINEL_SCATTERED: &str = "scatter.done";
+pub const SENTINEL_COUNTED:   &str = "kmer_spectrum_raw.json";
+pub const SENTINEL_INDEXED:   &str = "index.done";
+
+/// Progression state of a `KmerIndex`.
+///
+/// Variants are ordered: `Empty < Scattered < Counted < Indexed`.
+/// A state is reported only when its sentinel file is fully present —
+/// partial states (e.g. scatter interrupted mid-way) are not accepted.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub enum IndexState {
+    /// `index.meta` present; scatter not yet completed.
+    Empty,
+    /// `scatter.done` sentinel present — all super-kmers have been routed.
+    Scattered,
+    /// `kmer_spectrum_raw.json` present — dereplicate + count complete.
+    Counted,
+    /// `index.done` sentinel present — layered MPHF index fully built.
+    Indexed,
+}
+
+impl IndexState {
+    /// Detect the state of the index at `root`.
+    ///
+    /// Returns `None` if `index.meta` is absent (not an obikindex directory).
+    pub fn detect(root: &Path) -> Option<Self> {
+        if !root.join(META_FILENAME).exists() {
+            return None;
+        }
+        if root.join(SENTINEL_INDEXED).exists() {
+            return Some(Self::Indexed);
+        }
+        if root.join(SENTINEL_COUNTED).exists() {
+            return Some(Self::Counted);
+        }
+        if root.join(SENTINEL_SCATTERED).exists() {
+            return Some(Self::Scattered);
+        }
+        Some(Self::Empty)
+    }
+}
@@ -13,21 +13,13 @@ obiread      = { path = "../obiread" }
 obiskbuilder  = { path = "../obiskbuilder" }
 obifastwrite  = { path = "../obifastwrite" }
 obipipeline   = { path = "../obipipeline" }
-obidebruinj  = { path = "../obidebruinj" }
-clap         = { version = "4", features = ["derive"] }
 obikrope      = { path = "../obikrope" }
 obikpartitionner = { path = "../obikpartitionner" }
 obisys        = { path = "../obisys" }
 obiskio       = { path = "../obiskio" }
-obicompactvec     = { path = "../obicompactvec" }
-obilayeredmap     = { path = "../obilayeredmap" }
-niffler      = "3"
+obikindex     = { path = "../obikindex" }
+clap          = { version = "4", features = ["derive"] }
 rayon         = "1"
-ph           = "0.11"
-memmap2      = "0.9"
-epserde      = "0.8"
-ptr_hash     = "1.1"
-cacheline-ef = "1.1"
 indicatif     = "0.17"
 tracing       = "0.1.44"
 tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
@@ -1,24 +0,0 @@
-use clap::Args;
-use obikpartitionner::KmerPartition;
-use std::path::PathBuf;
-use tracing::info;
-
-#[derive(Args)]
-pub struct CountArgs {
-    /// Partition directory produced by the `partition` command
-    #[arg(short, long)]
-    pub partition: PathBuf,
-}
-
-pub fn run(args: CountArgs) {
-    let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
-        eprintln!("error: {e}");
-        std::process::exit(1)
-    });
-
-    info!("counting kmers in {}", args.partition.display());
-    kp.count_kmer().unwrap_or_else(|e| {
-        eprintln!("error: {e}");
-        std::process::exit(1)
-    });
-}
@@ -1,84 +0,0 @@
-use std::fs::File;
-use std::path::PathBuf;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use clap::Args;
-use niffler::Level;
-use niffler::send::compression::Format;
-use obifastwrite::write_count;
-use obikpartitionner::KmerPartition;
-use obiskio::SKFileReader;
-use rayon::prelude::*;
-use tracing::info;
-
-#[derive(Args)]
-pub struct FastaArgs {
-    /// Root of the k-mer partition directory (produced by the `partition` command)
-    pub partition: PathBuf,
-
-    /// Dump dereplicated super-kmers as FASTA (→ <partition>/dereplicated.skmer.fasta.gz)
-    #[arg(long)]
-    pub super_kmers: bool,
-}
-
-pub fn run(args: FastaArgs) {
-    if !args.super_kmers {
-        eprintln!("error: specify at least one output mode (--super-kmers)");
-        std::process::exit(1);
-    }
-
-    let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
-        eprintln!("error opening partition: {e}");
-        std::process::exit(1)
-    });
-
-    if args.super_kmers {
-        dump_super_kmers(&kp, &args.partition);
-    }
-}
-
-fn dump_super_kmers(kp: &KmerPartition, _partition_dir: &PathBuf) {
-    let k = kp.kmer_size();
-    let m = kp.minimizer_size();
-    let n = kp.n_partitions();
-
-    info!("writing {n} partition FASTA files (parallel)");
-
-    let total = AtomicUsize::new(0);
-
-    (0..n).into_par_iter().for_each(|i| {
-        let part_dir = kp.part_dir(i);
-        let in_path = part_dir.join("dereplicated.skmer.zst");
-        if !in_path.exists() {
-            return;
-        }
-        let out_path = part_dir.join("dereplicated.skmer.fasta.gz");
-
-        let file = File::create(&out_path).unwrap_or_else(|e| {
-            eprintln!("error creating {}: {e}", out_path.display());
-            std::process::exit(1)
-        });
-        let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
-            .unwrap_or_else(|e| {
-                eprintln!("error creating gzip writer: {e}");
-                std::process::exit(1)
-            });
-
-        let mut reader = SKFileReader::open(&in_path).unwrap_or_else(|e| {
-            eprintln!("error opening {}: {e}", in_path.display());
-            std::process::exit(1)
-        });
-        let mut count = 0usize;
-        for sk in reader.iter() {
-            write_count(&sk, &mut writer, k, m, i as u32).unwrap_or_else(|e| {
-                eprintln!("write error: {e}");
-                std::process::exit(1)
-            });
-            count += 1;
-        }
-        info!("partition {i}: {count} super-kmers → {}", out_path.display());
-        total.fetch_add(count, Ordering::Relaxed);
-    });
-
-    info!("wrote {} super-kmers total", total.load(Ordering::Relaxed));
-}
@@ -1,17 +1,17 @@
 use std::path::PathBuf;

 use clap::Args;
-use obikpartitionner::KmerPartition;
+use obikindex::{IndexConfig, IndexState, KmerIndex};
 use obikseq::{set_k, set_m};
 use obisys::Reporter;
 use tracing::info;

 use crate::cli::CommonArgs;
-use crate::steps::{build_index, dereplicate_and_count, scatter};
+use crate::steps::scatter;

 #[derive(Args)]
 pub struct IndexArgs {
-    /// Output partition directory
+    /// Output index directory
    #[arg(short, long)]
    pub output: PathBuf,

@@ -19,6 +19,10 @@ pub struct IndexArgs {
    #[arg(long, default_value_t = false)]
    pub force: bool,

+    /// Genome label (default: input filename without path/extension)
+    #[arg(long)]
+    pub label: Option<String>,
+
    /// Minimum kmer abundance (inclusive)
    #[arg(long, default_value_t = 1)]
    pub min_abundance: u32,
@@ -43,53 +47,71 @@ pub fn run(args: IndexArgs) {
    let output = args.output.clone();
    let mut rep = Reporter::new();

-    // ── Stage 1: scatter (skipped if partition already exists) ───────────────
-    let kp = if output.join("partition.meta").exists() {
-        info!("resuming from existing partition at {}", output.display());
-        let kp = KmerPartition::open(&output).unwrap_or_else(|e| {
-            eprintln!("error opening partition: {e}");
+    // ── Open or create the index ─────────────────────────────────────────────
+    let mut idx = if KmerIndex::exists(&output) {
+        info!("resuming from existing index at {}", output.display());
+        KmerIndex::open(&output).unwrap_or_else(|e| {
+            eprintln!("error opening index: {e}");
            std::process::exit(1);
-        });
-        set_k(kp.kmer_size());
-        set_m(kp.minimizer_size());
-        kp
+        })
    } else {
-        let k = args.common.kmer_size;
-        set_k(k);
-        let m = args.common.minimizer_size;
-        set_m(m);
-        let theta = args.common.theta;
+        let config = IndexConfig {
+            kmer_size:      args.common.kmer_size,
+            minimizer_size: args.common.minimizer_size,
+            n_bits:         args.common.partition_bits,
+            with_counts:    args.with_counts,
+        };
+        KmerIndex::create(&output, config, args.label.clone(), args.force).unwrap_or_else(|e| {
+            eprintln!("error creating index: {e}");
+            std::process::exit(1);
+        })
+    };
+
+    set_k(idx.kmer_size());
+    set_m(idx.minimizer_size());
+
+    // ── Stage 1: scatter ─────────────────────────────────────────────────────
+    if idx.state() < IndexState::Scattered {
+        let first_path = args.common.inputs.first().map(PathBuf::from);
+        let k         = idx.kmer_size();
        let level_max = args.common.level_max;
+        let theta     = args.common.theta;
        let n_workers = args.common.threads.max(1);

-        let mut kp =
-            KmerPartition::create(&output, args.common.partition_bits, k, m, args.force)
-                .unwrap_or_else(|e| {
+        scatter(idx.partition_mut(), args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep);
+
+        idx.mark_scattered(first_path.as_deref()).unwrap_or_else(|e| {
+            eprintln!("error marking scatter done: {e}");
+            std::process::exit(1);
+        });
+    } else {
+        info!("scatter already done, skipping");
+    }
+
+    // ── Stage 2: dereplicate + count ─────────────────────────────────────────
+    if idx.state() < IndexState::Counted {
+        idx.dereplicate_and_count(&mut rep).unwrap_or_else(|e| {
            eprintln!("error: {e}");
            std::process::exit(1);
        });
-
-        scatter(&mut kp, args.common.seqfile_paths(), k, level_max, theta, n_workers, &mut rep);
-        kp
-    };
-
-
-    // ── Stage 2: dereplicate + count (skipped if already done) ───────────────
-    if !output.join("kmer_spectrum_raw.json").exists() {
-        dereplicate_and_count(&kp, &mut rep);
    } else {
-        info!("kmer counts already present, skipping dereplicate + count");
+        info!("dereplicate+count already done, skipping");
    }

    // ── Stage 3: build layered index ─────────────────────────────────────────
-    build_index(
-        &kp,
+    if idx.state() < IndexState::Indexed {
+        idx.build_layers(
            args.min_abundance,
            args.max_abundance,
-        args.with_counts,
            args.keep_intermediate,
            &mut rep,
-    );
+        ).unwrap_or_else(|e| {
+            eprintln!("error: {e}");
+            std::process::exit(1);
+        });
+    } else {
+        info!("index already built, skipping");
+    }

    rep.print();
 }
@@ -1,143 +0,0 @@
-use std::fs::File;
-use std::path::PathBuf;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use clap::Args;
-use niffler::Level;
-use niffler::send::compression::Format;
-use obidebruinj::GraphDeBruijn;
-use obikpartitionner::KmerPartition;
-use obikseq::set_k;
-use obiskio::SKFileReader;
-use ph::fmph::GOFunction;
-use rayon::prelude::*;
-use tracing::info;
-
-#[derive(Args)]
-pub struct LongtigArgs {
-    /// Root of the k-mer partition directory (produced by the `partition` command)
-    pub partition: PathBuf,
-
-    /// Minimum kmer abundance (inclusive); kmers below this threshold are excluded
-    #[arg(long, default_value_t = 1)]
-    pub min_abundance: u32,
-
-    /// Maximum kmer abundance (inclusive); kmers above this threshold are excluded
-    #[arg(long)]
-    pub max_abundance: Option<u32>,
-}
-
-pub fn run(args: LongtigArgs) {
-    let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
-        eprintln!("error opening partition: {e}");
-        std::process::exit(1)
-    });
-
-    let k = kp.kmer_size();
-    set_k(k);
-    let n = kp.n_partitions();
-    info!("building longtigs from {n} partitions (k={k}, parallel)");
-
-    let total_kmers = AtomicUsize::new(0);
-
-    (0..n).into_par_iter().for_each(|i| {
-        let part_dir = kp.part_dir(i);
-        let in_path = part_dir.join("dereplicated.skmer.zst");
-        if !in_path.exists() {
-            return;
-        }
-        let out_path = part_dir.join("longtig.fasta.gz");
-
-        let mut g = GraphDeBruijn::new();
-
-        let mphf_path = part_dir.join("mphf1.bin");
-        let counts_path = part_dir.join("counts1.bin");
-        let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some())
-            && mphf_path.exists()
-            && counts_path.exists();
-
-        let mphf_opt: Option<GOFunction> = if filter_active {
-            let mut f = File::open(&mphf_path).unwrap_or_else(|e| {
-                eprintln!("error opening {}: {e}", mphf_path.display());
-                std::process::exit(1)
-            });
-            Some(GOFunction::read(&mut f).unwrap_or_else(|e| {
-                eprintln!("error reading MPHF {}: {e}", mphf_path.display());
-                std::process::exit(1)
-            }))
-        } else {
-            None
-        };
-
-        let counts_mmap_opt = if filter_active {
-            let cf = File::open(&counts_path).unwrap_or_else(|e| {
-                eprintln!("error opening {}: {e}", counts_path.display());
-                std::process::exit(1)
-            });
-            Some(unsafe {
-                memmap2::Mmap::map(&cf).unwrap_or_else(|e| {
-                    eprintln!("error mmapping {}: {e}", counts_path.display());
-                    std::process::exit(1)
-                })
-            })
-        } else {
-            None
-        };
-
-        let counts_slice: Option<&[u32]> = counts_mmap_opt
-            .as_ref()
-            .map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) });
-
-        let mut reader = SKFileReader::open(&in_path).unwrap_or_else(|e| {
-            eprintln!("error opening {}: {e}", in_path.display());
-            std::process::exit(1)
-        });
-        for sk in reader.iter() {
-            for kmer in sk.iter_canonical_kmers() {
-                let accept = match (&mphf_opt, counts_slice) {
-                    (Some(mphf), Some(counts)) => {
-                        if let Some(slot) = mphf.get(&kmer) {
-                            let ab = counts[slot as usize];
-                            ab >= args.min_abundance
-                                && args.max_abundance.map_or(true, |max| ab <= max)
-                        } else {
-                            false
-                        }
-                    }
-                    _ => true,
-                };
-                if accept {
-                    g.push(kmer);
-                }
-            }
-        }
-
-        let n_kmers = g.len();
-        total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
-        info!(
-            "partition {i}/{n}: {n_kmers} canonical k-mers → {}",
-            out_path.display()
-        );
-
-        g.compute_degrees();
-
-        let file = File::create(&out_path).unwrap_or_else(|e| {
-            eprintln!("error creating {}: {e}", out_path.display());
-            std::process::exit(1)
-        });
-        let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
-            .unwrap_or_else(|e| {
-                eprintln!("error creating gzip writer: {e}");
-                std::process::exit(1)
-            });
-        g.write_fasta(&mut writer, false).unwrap_or_else(|e| {
-            eprintln!("write error on partition {i}: {e}");
-            std::process::exit(1)
-        });
-    });
-
-    info!(
-        "done — {} total canonical k-mers across all partitions",
-        total_kmers.load(Ordering::Relaxed)
-    );
-}
@@ -1,7 +1,3 @@
-pub mod count;
-pub mod fasta;
 pub mod index;
-pub mod longtig;
-pub mod partition;
 pub mod superkmer;
 pub mod unitig;
@@ -1,49 +0,0 @@
-use std::path::PathBuf;
-
-use clap::Args;
-use obikpartitionner::KmerPartition;
-use obisys::Reporter;
-use obikseq::{set_k, set_m};
-
-use crate::cli::CommonArgs;
-use crate::steps::{dereplicate_and_count, scatter};
-
-#[derive(Args)]
-pub struct PartitionArgs {
-    /// Output partition directory
-    #[arg(short, long)]
-    pub output: PathBuf,
-
-    /// Overwrite output directory if it already exists
-    #[arg(long, default_value_t = false)]
-    pub force: bool,
-
-    #[command(flatten)]
-    pub common: CommonArgs,
-}
-
-// ── Entry point ───────────────────────────────────────────────────────────────
-
-pub fn run(args: PartitionArgs) {
-    let k = args.common.kmer_size;
-    set_k(k);
-    let m = args.common.minimizer_size;
-    set_m(m);
-    let theta = args.common.theta;
-    let level_max = args.common.level_max;
-    let n_workers = args.common.threads.max(1);
-
-    let mut kp = KmerPartition::create(&args.output, args.common.partition_bits, k, m, args.force)
-        .unwrap_or_else(|e| {
-            eprintln!("error: {e}");
-            std::process::exit(1)
-        });
-
-    let path_source = args.common.seqfile_paths();
-    let mut rep = Reporter::new();
-
-    scatter(&mut kp, path_source, k, level_max, theta, n_workers, &mut rep);
-    dereplicate_and_count(&kp, &mut rep);
-
-    rep.print();
-}
@@ -1,143 +1,54 @@
-use std::fs::File;
+use std::io::{self, BufWriter, Write};
 use std::path::PathBuf;
-use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Mutex;

 use clap::Args;
-use niffler::Level;
-use niffler::send::compression::Format;
-use obidebruinj::GraphDeBruijn;
-use obikpartitionner::KmerPartition;
+use obifastwrite::write_unitig;
+use obikindex::KmerIndex;
 use obikseq::set_k;
-use obiskio::SKFileReader;
-use ph::fmph::GOFunction;
+use obiskio::UnitigFileReader;
 use rayon::prelude::*;
 use tracing::info;

 #[derive(Args)]
 pub struct UnitigArgs {
-    /// Root of the k-mer partition directory (produced by the `partition` command)
-    pub partition: PathBuf,
-
-    /// Minimum kmer abundance (inclusive); kmers below this threshold are excluded
-    #[arg(long, default_value_t = 1)]
-    pub min_abundance: u32,
-
-    /// Maximum kmer abundance (inclusive); kmers above this threshold are excluded
-    #[arg(long)]
-    pub max_abundance: Option<u32>,
+    /// Index directory produced by the `index` command
+    pub index: PathBuf,
 }

 pub fn run(args: UnitigArgs) {
-    let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
-        eprintln!("error opening partition: {e}");
+    let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
+        eprintln!("error opening index: {e}");
        std::process::exit(1)
    });

-    let k = kp.kmer_size();
+    let k = idx.kmer_size();
    set_k(k);
-    let n = kp.n_partitions();
-    info!("building unitigs from {n} partitions (k={k}, parallel)");
+    let n = idx.n_partitions();
+    info!("dumping unitigs from {n} partitions (k={k})");

-    let total_kmers = AtomicUsize::new(0);
+    let stdout = Mutex::new(BufWriter::new(io::stdout()));

    (0..n).into_par_iter().for_each(|i| {
-        let part_dir = kp.part_dir(i);
-        let in_path = part_dir.join("dereplicated.skmer.zst");
-        if !in_path.exists() {
+        let path = idx.layer_unitigs_path(i, 0);
+        if !path.exists() {
            return;
        }
-        let out_path = part_dir.join("unitig.fasta.gz");

-        let mut g = GraphDeBruijn::new();
-
-        let mphf_path = part_dir.join("mphf1.bin");
-        let counts_path = part_dir.join("counts1.bin");
-        let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some())
-            && mphf_path.exists()
-            && counts_path.exists();
-
-        let mphf_opt: Option<GOFunction> = if filter_active {
-            let mut f = File::open(&mphf_path).unwrap_or_else(|e| {
-                eprintln!("error opening {}: {e}", mphf_path.display());
+        let reader = UnitigFileReader::open(&path).unwrap_or_else(|e| {
+            eprintln!("error opening unitigs (partition {i}): {e}");
            std::process::exit(1)
        });
-            Some(GOFunction::read(&mut f).unwrap_or_else(|e| {
-                eprintln!("error reading MPHF {}: {e}", mphf_path.display());
-                std::process::exit(1)
-            }))
-        } else {
-            None
-        };

-        let counts_mmap_opt = if filter_active {
-            let cf = File::open(&counts_path).unwrap_or_else(|e| {
-                eprintln!("error opening {}: {e}", counts_path.display());
+        for j in 0..reader.len() {
+            let unitig = reader.unitig(j);
+            let mut out = stdout.lock().unwrap();
+            write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| {
+                eprintln!("write error: {e}");
                std::process::exit(1)
            });
-            Some(unsafe {
-                memmap2::Mmap::map(&cf).unwrap_or_else(|e| {
-                    eprintln!("error mmapping {}: {e}", counts_path.display());
-                    std::process::exit(1)
-                })
-            })
-        } else {
-            None
-        };
-
-        let counts_slice: Option<&[u32]> = counts_mmap_opt
-            .as_ref()
-            .map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) });
-
-        let mut reader = SKFileReader::open(&in_path).unwrap_or_else(|e| {
-            eprintln!("error opening {}: {e}", in_path.display());
-            std::process::exit(1)
-        });
-        for sk in reader.iter() {
-            for kmer in sk.iter_canonical_kmers() {
-                let accept = match (&mphf_opt, counts_slice) {
-                    (Some(mphf), Some(counts)) => {
-                        if let Some(slot) = mphf.get(&kmer) {
-                            let ab = counts[slot as usize];
-                            ab >= args.min_abundance
-                                && args.max_abundance.map_or(true, |max| ab <= max)
-                        } else {
-                            false
        }
-                    }
-                    _ => true,
-                };
-                if accept {
-                    g.push(kmer);
-                }
-            }
-        }
-
-        let n_kmers = g.len();
-        total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
-        info!(
-            "partition {i}/{n}: {n_kmers} canonical k-mers → {}",
-            out_path.display()
-        );
-
-        g.compute_degrees();
-
-        let file = File::create(&out_path).unwrap_or_else(|e| {
-            eprintln!("error creating {}: {e}", out_path.display());
-            std::process::exit(1)
-        });
-        let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
-            .unwrap_or_else(|e| {
-                eprintln!("error creating gzip writer: {e}");
-                std::process::exit(1)
-            });
-        g.write_fasta(&mut writer, true).unwrap_or_else(|e| {
-            eprintln!("write error on partition {i}: {e}");
-            std::process::exit(1)
-        });
    });

-    info!(
-        "done — {} total canonical k-mers across all partitions",
-        total_kmers.load(Ordering::Relaxed)
-    );
+    stdout.into_inner().unwrap().flush().expect("flush error");
 }
@@ -14,20 +14,12 @@ struct Cli {

 #[derive(Subcommand)]
 enum Commands {
-    /// Extract super-kmers from a sequence file (scatter phase)
+    /// Extract super-kmers from a sequence file and write to stdout
    Superkmer(cmd::superkmer::SuperkmerArgs),
-    /// Partition super-kmers on disk by minimizer
-    Partition(cmd::partition::PartitionArgs),
-    /// Count kmers from an existing dereplicated partition directory
-    Count(cmd::count::CountArgs),
-    /// Export partition data to FASTA (--super-kmers: dereplicated super-kmers)
-    Fasta(cmd::fasta::FastaArgs),
-    /// Build de Bruijn unitigs for all partitions and write to unitig.fasta.gz
-    Unitig(cmd::unitig::UnitigArgs),
-    /// Build de Bruijn longtigs for all partitions and write to longtig.fasta.gz
-    Longtig(cmd::longtig::LongtigArgs),
    /// Build the complete genome index (scatter → dereplicate → count → layered MPHF)
    Index(cmd::index::IndexArgs),
+    /// Dump unitigs from a built index to stdout (debug)
+    Unitig(cmd::unitig::UnitigArgs),
 }

 fn main() {
@@ -50,12 +42,8 @@ fn main() {
    let cli = Cli::parse();
    match cli.command {
        Commands::Superkmer(args) => cmd::superkmer::run(args),
-        Commands::Partition(args) => cmd::partition::run(args),
-        Commands::Count(args) => cmd::count::run(args),
-        Commands::Fasta(args) => cmd::fasta::run(args),
-        Commands::Unitig(args) => cmd::unitig::run(args),
-        Commands::Longtig(args) => cmd::longtig::run(args),
        Commands::Index(args)     => cmd::index::run(args),
+        Commands::Unitig(args)    => cmd::unitig::run(args),
    }

    #[cfg(feature = "profiling")]
@@ -1,177 +0,0 @@
-use std::fs;
-use std::path::Path;
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::{Arc, Mutex};
-
-use cacheline_ef::{CachelineEf, CachelineEfVec};
-use epserde::prelude::*;
-use indicatif::{ProgressBar, ProgressStyle};
-use obicompactvec::PersistentCompactIntMatrix;
-use obicompactvec::PersistentCompactIntVec;
-use obidebruinj::GraphDeBruijn;
-use obikpartitionner::KmerPartition;
-use obilayeredmap::layer::Layer;
-use obiskio::{SKFileMeta, SKFileReader};
-use obisys::{Reporter, Stage};
-use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
-use rayon::prelude::*;
-use tracing::info;
-
-type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
-
-/// Build the layered MPHF index for all partitions in parallel.
-///
-/// Default mode (with_counts = false): set membership only (`Layer<()>`).
-/// With counts (with_counts = true): count matrix per kmer (`Layer<PersistentCompactIntMatrix>`).
-///
-/// Skips any partition whose `index/layer_0/mphf.bin` already exists (resume).
-/// Reports the "index" stage to `rep`.
-pub fn build_index(
-    kp: &KmerPartition,
-    min_ab: u32,
-    max_ab: Option<u32>,
-    with_counts: bool,
-    keep_intermediate: bool,
-    rep: &mut Reporter,
-) {
-    let n = kp.n_partitions();
-    let t = Stage::start("index");
-    let total_kmers = AtomicUsize::new(0);
-    let filter_active = min_ab > 1 || max_ab.is_some();
-    let need_counts = filter_active || with_counts;
-
-    let pb = Arc::new(Mutex::new(
-        ProgressBar::new(n as u64).with_style(
-            ProgressStyle::with_template("index — [{bar:20}] {pos}/{len} | {msg}").unwrap(),
-        ),
-    ));
-
-    (0..n).into_par_iter().for_each(|i| {
-        let part_dir = kp.part_dir(i);
-        let dedup_path = part_dir.join("dereplicated.skmer.zst");
-        if !dedup_path.exists() {
-            return;
-        }
-
-        let layer_dir = part_dir.join("index").join("layer_0");
-        if layer_dir.join("mphf.bin").exists() {
-            return;
-        }
-
-        // Load partition MPHF + counts when needed for filtering or count payload
-        let mphf1_opt: Option<Mphf> = if need_counts {
-            let p = part_dir.join("mphf1.bin");
-            p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
-        } else {
-            None
-        };
-
-        let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
-            let p = part_dir.join("counts1.bin");
-            p.exists()
-                .then(|| PersistentCompactIntVec::open(&p).ok())
-                .flatten()
-        } else {
-            None
-        };
-
-        // Build de Bruijn graph with optional abundance filter
-        let mut g = GraphDeBruijn::new();
-        let mut reader = SKFileReader::open(&dedup_path).unwrap_or_else(|e| {
-            eprintln!("error opening {}: {e}", dedup_path.display());
-            std::process::exit(1);
-        });
-        for sk in reader.iter() {
-            for kmer in sk.iter_canonical_kmers() {
-                let accept = if filter_active {
-                    match (&mphf1_opt, &counts1_opt) {
-                        (Some(mphf), Some(counts)) => {
-                            let ab = counts.get(mphf.index(&kmer.raw()));
-                            ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
-                        }
-                        _ => true,
-                    }
-                } else {
-                    true
-                };
-                if accept {
-                    g.push(kmer);
-                }
-            }
-        }
-
-        let n_kmers = g.len();
-        total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
-        g.compute_degrees();
-
-        // Write unitigs to layer_0/unitigs.bin
-        fs::create_dir_all(&layer_dir).unwrap_or_else(|e| {
-            eprintln!("error creating {}: {e}", layer_dir.display());
-            std::process::exit(1);
-        });
-        let mut uw = Layer::<()>::unitig_writer(&layer_dir).unwrap_or_else(|e| {
-            eprintln!("error creating unitig writer (partition {i}): {e}");
-            std::process::exit(1);
-        });
-        for unitig in g.iter_unitig() {
-            uw.write(&unitig).unwrap_or_else(|e| {
-                eprintln!("error writing unitig (partition {i}): {e}");
-                std::process::exit(1);
-            });
-        }
-        uw.close().unwrap_or_else(|e| {
-            eprintln!("error closing unitig writer (partition {i}): {e}");
-            std::process::exit(1);
-        });
-
-        // Build MPHF layer — mode depends on --with-counts
-        if with_counts {
-            Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
-                match (&mphf1_opt, &counts1_opt) {
-                    (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
-                    _ => 1,
-                }
-            })
-            .unwrap_or_else(|e| {
-                eprintln!("error building count layer (partition {i}): {e}");
-                std::process::exit(1);
-            });
-        } else {
-            Layer::<()>::build(&layer_dir).unwrap_or_else(|e| {
-                eprintln!("error building set layer (partition {i}): {e}");
-                std::process::exit(1);
-            });
-        }
-
-        let pb = pb.lock().unwrap();
-        pb.inc(1);
-        pb.set_message(format!("{i}: {n_kmers} kmers"));
-    });
-
-    pb.lock().unwrap().finish_and_clear();
-    info!(
-        "done — {} total kmers indexed",
-        total_kmers.load(Ordering::Relaxed)
-    );
-
-    // ── Cleanup intermediate build files ──────────────────────────────────────
-    if !keep_intermediate {
-        for i in 0..n {
-            let part_dir = kp.part_dir(i);
-            remove_if_exists(&part_dir.join("dereplicated.skmer.zst"));
-            remove_if_exists(&SKFileMeta::sidecar_path(&part_dir.join("dereplicated.skmer.zst")));
-            remove_if_exists(&part_dir.join("mphf1.bin"));
-            remove_if_exists(&part_dir.join("counts1.bin"));
-        }
-    }
-
-    rep.push(t.stop());
-}
-
-fn remove_if_exists(path: &Path) {
-    if let Err(e) = fs::remove_file(path) {
-        if e.kind() != std::io::ErrorKind::NotFound {
-            eprintln!("warning: could not remove {}: {e}", path.display());
-        }
-    }
-}
@@ -1,13 +0,0 @@
-use obikpartitionner::KmerPartition;
-use obisys::{Reporter, Stage};
-
-/// Dereplicate then count kmers. Reports each stage to `rep`.
-pub fn dereplicate_and_count(kp: &KmerPartition, rep: &mut Reporter) {
-    let t = Stage::start("dereplicate");
-    kp.dereplicate().expect("dereplicate error");
-    rep.push(t.stop());
-
-    let t = Stage::start("count_kmer");
-    kp.count_kmer().expect("count kmer error");
-    rep.push(t.stop());
-}
@@ -1,7 +1,3 @@
-mod build_index;
-mod dereplicate_and_count;
 mod scatter;

-pub use build_index::build_index;
-pub use dereplicate_and_count::dereplicate_and_count;
 pub use scatter::scatter;