refactor: extract obikindex crate and remove deprecated CLI commands

Extracted core indexing logic, state tracking, and metadata management into a new `obikindex` crate. Refactored the `index` and `unitig` commands to leverage the `KmerIndex` abstraction and state-driven pipeline transitions. Removed obsolete CLI subcommands (`count`, `fasta`, `longtig`, `partition`) and their associated pipeline steps. Updated FASTA writing utilities for single-line output and deterministic identifiers, and refreshed workspace dependencies.
2026-05-20 18:21:05 +02:00
parent f8cfb493b8
commit 17c9e076bd
24 changed files with 792 additions and 1593 deletions
@@ -1,9 +1,11 @@
 use std::fmt;
 use std::io::{self, Write};
-use xxhash_rust::xxh64::xxh64;

-pub(crate) enum JsonVal<'a> {
+/// A JSON value that is either a number or a quoted string.
+pub enum JsonVal<'a> {
+    /// Integer value, serialised without quotes.
    Num(u64),
+    /// String value, serialised with double quotes.
    Str(&'a str),
 }

@@ -16,11 +18,8 @@ impl fmt::Display for JsonVal<'_> {
    }
 }

-pub(crate) fn seq_id(ascii: &[u8]) -> String {
-    format!("{:016X}", xxh64(ascii, 0))
-}
-
-pub(crate) fn annotation<W: Write>(
+/// Write a JSON object `{"k1":v1,"k2":v2,...}` to `writer`.
+pub fn annotation<W: Write>(
    writer: &mut W,
    fields: &[(&str, JsonVal<'_>)],
 ) -> io::Result<()> {
@@ -34,10 +33,29 @@ pub(crate) fn annotation<W: Write>(
    write!(writer, "}}")
 }

-pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
-    for chunk in seq.chunks(width) {
-        // SAFETY: seq is valid UTF-8; any contiguous slice of ASCII bytes is too
-        writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(chunk) })?;
-    }
-    Ok(())
+/// xxHash-64 of `ascii`, formatted as 16 uppercase hex digits.
+pub fn seq_id(ascii: &[u8]) -> String {
+    use xxhash_rust::xxh64::xxh64;
+    format!("{:016X}", xxh64(ascii, 0))
+}
+
+/// Write `seq` as one line of ASCII DNA, followed by a newline.
+pub fn write_sequence<W: Write>(writer: &mut W, seq: &[u8]) -> io::Result<()> {
+    // SAFETY: assumes `seq` is ASCII DNA (A/C/G/T) and therefore valid UTF-8; this is not checked here.
+    writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(seq) })
+}
+
+/// Core FASTA record writer.
+///
+/// Writes `>{id} {annotation}\n{sequence}\n`.
+pub fn write_record<W: Write>(
+    seq: &[u8],
+    id: &str,
+    fields: &[(&str, JsonVal<'_>)],
+    out: &mut W,
+) -> io::Result<()> {
+    write!(out, ">{id} ")?;
+    annotation(out, fields)?;
+    writeln!(out)?;
+    write_sequence(out, seq)
 }
@@ -1,32 +1,20 @@
-//! FASTA serialisation of [`SuperKmer`] values.
+//! FASTA serialisation for obikmer sequence types.
 //!
-//! Two functions cover the two phases of the scatter pipeline:
+//! Three public functions cover the main output cases:
 //!
-//! - [`write_scatter`]: scatter phase (before routing). The header annotation
-//!   contains the minimizer sequence decoded from [`SuperKmer::minimizer_pos`].
+//! - [`write_scatter`]: super-kmers in scatter phase (minimizer annotation)
+//! - [`write_count`]: super-kmers in count phase (occurrence count annotation)
+//! - [`write_unitig`]: unitigs from the layered index (partition + index annotation)
 //!
-//! - [`write_count`]: count phase (after deduplication). The header annotation
-//!   contains the occurrence count from [`SuperKmer::count`].
-//!
-//! Both functions write standard OBITools-compatible FASTA:
+//! All produce OBITools-compatible FASTA:
 //!
 //! ```text
-//! >ID {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
-//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
+//! >ID {"key":value,...}
+//! SEQUENCE
 //! ```
 //!
-//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
-//! a 16-digit uppercase hexadecimal string. xxHash-64 is collision-resistant
-//! enough for debugging identifiers (collision probability < 1e-9 for billions
-//! of distinct super-kmers).
-//!
-//! # Phase contract
-//!
-//! `write_scatter` reads [`SuperKmer::minimizer_pos`], which is only valid
-//! **before** [`SuperKmer::init_count`] is called.  `write_count` reads
-//! [`SuperKmer::count`], which is only meaningful **after** `init_count`.
-//! Mixing the two functions in the wrong phase produces silently wrong output;
-//! this is enforced by pipeline structure, not by the type system.
+//! The lower-level primitive [`write_record`] and the [`JsonVal`] type are also
+//! public for callers that need custom annotations.

 #![deny(missing_docs)]

@@ -35,22 +23,15 @@ mod fasta;
 use std::io::{self, Write};

 use obikseq::{Minimizer, SuperKmer, Unitig};
-use xxhash_rust::xxh64::xxh64;
+
+pub use fasta::{JsonVal, annotation, seq_id, write_record};

 // ── public API ────────────────────────────────────────────────────────────────

 /// Write one super-kmer in FASTA format — **scatter phase**.
 ///
-/// The `minimizer` field in the JSON annotation contains the ASCII sequence of
-/// the minimizer, decoded from [`SuperKmer::minimizer_pos`] (scatter-phase
-/// value of the payload field).
-///
-/// # Parameters
-/// - `sk`: the super-kmer to serialise (must be in scatter phase)
-/// - `out`: destination writer
-/// - `k`: k-mer size used to build `sk`
-/// - `m`: minimizer size
-/// - `partition`: partition index computed from the minimizer hash
+/// ID is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).  JSON
+/// annotation includes `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `minimizer`.
 pub fn write_scatter<W: Write>(
    sk: &SuperKmer,
    out: &mut W,
@@ -61,37 +42,26 @@ pub fn write_scatter<W: Write>(
 ) -> io::Result<()> {
    let ascii = sk.to_ascii();
    let id = seq_id(&ascii);
-    let seq_len = ascii.len();
    let min_seq = minimizer.to_ascii();
-
-    writeln!(
+    let min_str = unsafe { std::str::from_utf8_unchecked(&min_seq) };
+    write_record(
+        &ascii,
+        &id,
+        &[
+            ("seq_length",    JsonVal::Num(ascii.len() as u64)),
+            ("kmer_size",     JsonVal::Num(k as u64)),
+            ("minimizer_size",JsonVal::Num(m as u64)),
+            ("partition",     JsonVal::Num(partition as u64)),
+            ("minimizer",     JsonVal::Str(min_str)),
+        ],
        out,
-        ">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
-         \"minimizer_size\":{m},\"partition\":{partition},\
-         \"minimizer\":\"{min}\"}}",
-        id = id,
-        seq_len = seq_len,
-        k = k,
-        m = m,
-        partition = partition,
-        min = unsafe { std::str::from_utf8_unchecked(&min_seq) },
-    )?;
-    out.write_all(&ascii)?;
-    out.write_all(b"\n")
+    )
 }

 /// Write one super-kmer in FASTA format — **count phase**.
 ///
-/// The `count` field in the JSON annotation contains the occurrence count from
-/// [`SuperKmer::count`] (count-phase value of the payload field).
-///
-/// # Parameters
-/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
-///   [`SuperKmer::init_count`] has been called)
-/// - `out`: destination writer
-/// - `k`: k-mer size
-/// - `m`: minimizer size
-/// - `partition`: partition index
+/// ID is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).  JSON
+/// annotation includes `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `count`.
 pub fn write_count<W: Write>(
    sk: &SuperKmer,
    out: &mut W,
@@ -101,52 +71,47 @@ pub fn write_count<W: Write>(
 ) -> io::Result<()> {
    let ascii = sk.to_ascii();
    let id = seq_id(&ascii);
-    let seq_len = ascii.len();
-    let count = sk.count();
-
-    writeln!(
+    write_record(
+        &ascii,
+        &id,
+        &[
+            ("seq_length",    JsonVal::Num(ascii.len() as u64)),
+            ("kmer_size",     JsonVal::Num(k as u64)),
+            ("minimizer_size",JsonVal::Num(m as u64)),
+            ("partition",     JsonVal::Num(partition as u64)),
+            ("count",         JsonVal::Num(sk.count() as u64)),
+        ],
        out,
-        ">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
-         \"minimizer_size\":{m},\"partition\":{partition},\
-         \"count\":{count}}}",
-        id = id,
-        seq_len = seq_len,
-        k = k,
-        m = m,
-        partition = partition,
-        count = count,
-    )?;
-    out.write_all(&ascii)?;
-    out.write_all(b"\n")
+    )
 }

 /// Write one unitig in FASTA format.
 ///
-/// Header annotation (JSON):
-/// ```text
-/// >HASH {"seq_length":<seql>,"kmer_size":<k>,"n_kmers":<seql-k+1>}
-/// ```
-///
-/// `HASH` is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).
-/// `n_kmers` is the number of distinct k-mers covered by this unitig.
-pub fn write_unitig<W: Write>(unitig: &Unitig, k: usize, out: &mut W) -> io::Result<()> {
+/// ID is `part_PPPPP_unitig_IIIIII`: the partition index zero-padded to at least
+/// 5 digits, then the unitig index within that partition zero-padded to at least 6.
+/// JSON annotation includes `seq_length`, `kmer_size`, `n_kmers`, `partition`, `unitig_index`.
+pub fn write_unitig<W: Write>(
+    unitig: &Unitig,
+    k: usize,
+    partition: usize,
+    index: usize,
+    out: &mut W,
+) -> io::Result<()> {
    let ascii = unitig.to_ascii();
-    let id = seq_id(&ascii);
    let seql = unitig.seql();
-    let n_kmers = seql - k + 1;
-    writeln!(
+    let id = format!("part_{partition:05}_unitig_{index:06}");
+    write_record(
+        &ascii,
+        &id,
+        &[
+            ("seq_length",   JsonVal::Num(seql as u64)),
+            ("kmer_size",    JsonVal::Num(k as u64)),
+            ("n_kmers",      JsonVal::Num((seql - k + 1) as u64)),
+            ("partition",    JsonVal::Num(partition as u64)),
+            ("unitig_index", JsonVal::Num(index as u64)),
+        ],
        out,
-        ">{id} {{\"seq_length\":{seql},\"kmer_size\":{k},\"n_kmers\":{n_kmers}}}",
-    )?;
-    out.write_all(&ascii)?;
-    out.write_all(b"\n")
-}
-
-// ── internal helpers ──────────────────────────────────────────────────────────
-
-/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
-fn seq_id(ascii: &[u8]) -> String {
-    format!("{:016X}", xxh64(ascii, 0))
+    )
 }

 // ── tests ─────────────────────────────────────────────────────────────────────
@@ -178,9 +143,6 @@ mod tests {

    #[test]
    fn scatter_minimizer_decoded_from_hash() {
-        // "ACG" right-aligned: A=00, C=01, G=10 → 0b000110 = 6
-        // Left-aligned for m=3: shift by 64 − 2·3 = 58.
-        // set_m(3) so that Minimizer::to_ascii() decodes exactly 3 bases.
        obikseq::params::set_m(3);
        let sk = make(b"ACGTACGTACGT");
        let minimizer = Minimizer::from_raw_unchecked(6u64 << (64 - 2 * 3));
@@ -230,13 +192,34 @@ mod tests {

    #[test]
    fn count_sequence_line_correct() {
-        // TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
        let sk = make(b"TTTTACGT");
        let out = capture(|w| write_count(&sk, w, 4, 2, 0));
        let lines: Vec<&str> = out.lines().collect();
        assert_eq!(lines[1], "ACGTAAAA");
    }

+    // ── write_unitig ──────────────────────────────────────────────────────────
+
+    #[test]
+    fn unitig_id_format() {
+        obikseq::params::set_k(4);
+        let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
+        let out = capture(|w| write_unitig(&unitig, 4, 3, 17, w));
+        let id = out.lines().next().unwrap();
+        assert!(id.starts_with(">part_00003_unitig_000017"), "got: {id}");
+    }
+
+    #[test]
+    fn unitig_annotation_fields() {
+        obikseq::params::set_k(4);
+        let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
+        let out = capture(|w| write_unitig(&unitig, 4, 2, 5, w));
+        assert!(out.contains("\"partition\":2"));
+        assert!(out.contains("\"unitig_index\":5"));
+        assert!(out.contains("\"n_kmers\":5"));
+        assert!(out.contains("\"kmer_size\":4"));
+    }
+
    // ── ID stability ──────────────────────────────────────────────────────────

    #[test]
@@ -260,7 +243,7 @@ mod tests {
            .next()
            .unwrap()[1..]
            .to_string();
-        assert_eq!(id1, id2, "same sequence must produce same ID");
+        assert_eq!(id1, id2);
    }

    #[test]
@@ -269,21 +252,11 @@ mod tests {
        let sk2 = make(b"TTTTTTTT");

        let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
-            .lines()
-            .next()
-            .unwrap()
-            .split_whitespace()
-            .next()
-            .unwrap()[1..]
-            .to_string();
+            .lines().next().unwrap()
+            .split_whitespace().next().unwrap()[1..].to_string();
        let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
-            .lines()
-            .next()
-            .unwrap()
-            .split_whitespace()
-            .next()
-            .unwrap()[1..]
-            .to_string();
+            .lines().next().unwrap()
+            .split_whitespace().next().unwrap()[1..].to_string();
        assert_ne!(id1, id2);
    }

@@ -291,7 +264,7 @@ mod tests {
    fn id_is_16_hex_digits() {
        let sk = make(b"ACGTACGT");
        let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
-        let id = &out.lines().next().unwrap()[1..17]; // skip '>'
+        let id = &out.lines().next().unwrap()[1..17];
        assert_eq!(id.len(), 16);
        assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
    }