refactor: extract obikindex crate and remove deprecated CLI commands

Extracted core indexing logic, state tracking, and metadata management into a new `obikindex` crate. Refactored the `index` and `unitig` commands to leverage the `KmerIndex` abstraction and state-driven pipeline transitions. Removed obsolete CLI subcommands (`count`, `fasta`, `longtig`, `partition`) and their associated pipeline steps. Updated FASTA writing utilities for single-line output and deterministic identifiers, and refreshed workspace dependencies.
This commit is contained in:
Eric Coissac
2026-05-20 18:21:05 +02:00
parent f8cfb493b8
commit 17c9e076bd
24 changed files with 792 additions and 1593 deletions
+31 -13
View File
@@ -1,9 +1,11 @@
use std::fmt;
use std::io::{self, Write};
use xxhash_rust::xxh64::xxh64;
pub(crate) enum JsonVal<'a> {
/// A JSON value that is either a number or a quoted string.
///
/// The string variant borrows its text for `'a`, so building a value never
/// allocates. Both payloads are `Copy`, which lets the enum itself be `Copy`
/// and cheaply comparable in tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JsonVal<'a> {
    /// Integer value, serialised without quotes.
    Num(u64),
    /// String value, serialised with double quotes.
    Str(&'a str),
}
@@ -16,11 +18,8 @@ impl fmt::Display for JsonVal<'_> {
}
}
pub(crate) fn seq_id(ascii: &[u8]) -> String {
format!("{:016X}", xxh64(ascii, 0))
}
pub(crate) fn annotation<W: Write>(
/// Write a JSON object `{"k1":v1,"k2":v2,...}` to `writer`.
pub fn annotation<W: Write>(
writer: &mut W,
fields: &[(&str, JsonVal<'_>)],
) -> io::Result<()> {
@@ -34,10 +33,29 @@ pub(crate) fn annotation<W: Write>(
write!(writer, "}}")
}
pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
for chunk in seq.chunks(width) {
// SAFETY: seq is valid UTF-8; any contiguous slice of ASCII bytes is too
writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(chunk) })?;
}
Ok(())
/// Deterministic record identifier for a sequence.
///
/// Hashes `ascii` with xxHash-64 (seed 0) and renders the 64-bit digest as
/// 16 zero-padded uppercase hexadecimal digits.
pub fn seq_id(ascii: &[u8]) -> String {
    let digest = xxhash_rust::xxh64::xxh64(ascii, 0);
    format!("{digest:016X}")
}
/// Write `seq` as one line, followed by a newline.
///
/// The bytes are copied to `writer` verbatim. No UTF-8 assumption is made, so
/// this function is sound for any input; callers are expected to pass ASCII
/// DNA, but a stray byte can no longer trigger undefined behaviour.
///
/// # Errors
/// Returns any I/O error reported by `writer`.
pub fn write_sequence<W: Write>(writer: &mut W, seq: &[u8]) -> io::Result<()> {
    writer.write_all(seq)?;
    writer.write_all(b"\n")
}
/// Core FASTA record writer.
///
/// Emits a header line made of `>`, the identifier, a single space and the
/// JSON annotation built from `fields`, then the sequence on its own line:
/// `>{id} {annotation}\n{sequence}\n`.
///
/// # Errors
/// Returns the first I/O error reported by `out`.
pub fn write_record<W: Write>(
    seq: &[u8],
    id: &str,
    fields: &[(&str, JsonVal<'_>)],
    out: &mut W,
) -> io::Result<()> {
    // Header line: '>' + identifier + one space + JSON annotation.
    out.write_all(b">")?;
    out.write_all(id.as_bytes())?;
    out.write_all(b" ")?;
    annotation(out, fields)?;
    out.write_all(b"\n")?;
    // Sequence line; the helper appends the terminating newline.
    write_sequence(out, seq)
}
+89 -116
View File
@@ -1,32 +1,20 @@
//! FASTA serialisation of [`SuperKmer`] values.
//! FASTA serialisation for obikmer sequence types.
//!
//! Two functions cover the two phases of the scatter pipeline:
//! Three public functions cover the main output cases:
//!
//! - [`write_scatter`]: scatter phase (before routing). The header annotation
//! contains the minimizer sequence decoded from [`SuperKmer::minimizer_pos`].
//! - [`write_scatter`]: super-kmers in scatter phase (minimizer annotation)
//! - [`write_count`]: super-kmers in count phase (occurrence count annotation)
//! - [`write_unitig`]: unitigs from the layered index (partition + index annotation)
//!
//! - [`write_count`]: count phase (after deduplication). The header annotation
//! contains the occurrence count from [`SuperKmer::count`].
//!
//! Both functions write standard OBITools-compatible FASTA:
//! All produce OBITools-compatible FASTA:
//!
//! ```text
//! >ID {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
//! >ID {"key":value,...}
//! SEQUENCE
//! ```
//!
//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
//! a 16-digit uppercase hexadecimal string. xxHash-64 is collision-resistant
//! enough for debugging identifiers (collision probability < 1e-9 for billions
//! of distinct super-kmers).
//!
//! # Phase contract
//!
//! `write_scatter` reads [`SuperKmer::minimizer_pos`], which is only valid
//! **before** [`SuperKmer::init_count`] is called. `write_count` reads
//! [`SuperKmer::count`], which is only meaningful **after** `init_count`.
//! Mixing the two functions in the wrong phase produces silently wrong output;
//! this is enforced by pipeline structure, not by the type system.
//! The lower-level primitive [`write_record`] and the [`JsonVal`] type are also
//! public for callers that need custom annotations.
#![deny(missing_docs)]
@@ -35,22 +23,15 @@ mod fasta;
use std::io::{self, Write};
use obikseq::{Minimizer, SuperKmer, Unitig};
use xxhash_rust::xxh64::xxh64;
pub use fasta::{JsonVal, annotation, seq_id, write_record};
// ── public API ────────────────────────────────────────────────────────────────
/// Write one super-kmer in FASTA format — **scatter phase**.
///
/// The `minimizer` field in the JSON annotation contains the ASCII sequence of
/// the minimizer, decoded from [`SuperKmer::minimizer_pos`] (scatter-phase
/// value of the payload field).
///
/// # Parameters
/// - `sk`: the super-kmer to serialise (must be in scatter phase)
/// - `out`: destination writer
/// - `k`: k-mer size used to build `sk`
/// - `m`: minimizer size
/// - `partition`: partition index computed from the minimizer hash
/// ID is the xxHash-64 of the sequence. JSON annotation includes
/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `minimizer`.
pub fn write_scatter<W: Write>(
sk: &SuperKmer,
out: &mut W,
@@ -61,37 +42,26 @@ pub fn write_scatter<W: Write>(
) -> io::Result<()> {
let ascii = sk.to_ascii();
let id = seq_id(&ascii);
let seq_len = ascii.len();
let min_seq = minimizer.to_ascii();
writeln!(
let min_str = unsafe { std::str::from_utf8_unchecked(&min_seq) };
write_record(
&ascii,
&id,
&[
("seq_length", JsonVal::Num(ascii.len() as u64)),
("kmer_size", JsonVal::Num(k as u64)),
("minimizer_size",JsonVal::Num(m as u64)),
("partition", JsonVal::Num(partition as u64)),
("minimizer", JsonVal::Str(min_str)),
],
out,
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
\"minimizer_size\":{m},\"partition\":{partition},\
\"minimizer\":\"{min}\"}}",
id = id,
seq_len = seq_len,
k = k,
m = m,
partition = partition,
min = unsafe { std::str::from_utf8_unchecked(&min_seq) },
)?;
out.write_all(&ascii)?;
out.write_all(b"\n")
)
}
/// Write one super-kmer in FASTA format — **count phase**.
///
/// The `count` field in the JSON annotation contains the occurrence count from
/// [`SuperKmer::count`] (count-phase value of the payload field).
///
/// # Parameters
/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
/// [`SuperKmer::init_count`] has been called)
/// - `out`: destination writer
/// - `k`: k-mer size
/// - `m`: minimizer size
/// - `partition`: partition index
/// ID is the xxHash-64 of the sequence. JSON annotation includes
/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `count`.
pub fn write_count<W: Write>(
sk: &SuperKmer,
out: &mut W,
@@ -101,52 +71,47 @@ pub fn write_count<W: Write>(
) -> io::Result<()> {
let ascii = sk.to_ascii();
let id = seq_id(&ascii);
let seq_len = ascii.len();
let count = sk.count();
writeln!(
write_record(
&ascii,
&id,
&[
("seq_length", JsonVal::Num(ascii.len() as u64)),
("kmer_size", JsonVal::Num(k as u64)),
("minimizer_size",JsonVal::Num(m as u64)),
("partition", JsonVal::Num(partition as u64)),
("count", JsonVal::Num(sk.count() as u64)),
],
out,
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
\"minimizer_size\":{m},\"partition\":{partition},\
\"count\":{count}}}",
id = id,
seq_len = seq_len,
k = k,
m = m,
partition = partition,
count = count,
)?;
out.write_all(&ascii)?;
out.write_all(b"\n")
)
}
/// Write one unitig in FASTA format.
///
/// Header annotation (JSON):
/// ```text
/// >HASH {"seq_length":<seql>,"kmer_size":<k>,"n_kmers":<seql-k+1>}
/// ```
///
/// `HASH` is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).
/// `n_kmers` is the number of distinct k-mers covered by this unitig.
pub fn write_unitig<W: Write>(unitig: &Unitig, k: usize, out: &mut W) -> io::Result<()> {
/// ID is `part_PPPPP_unitig_IIIIII` where `P` is the partition index and `I`
/// is the unitig index within that partition. JSON annotation includes
/// `seq_length`, `kmer_size`, `n_kmers`, `partition`, `unitig_index`.
pub fn write_unitig<W: Write>(
unitig: &Unitig,
k: usize,
partition: usize,
index: usize,
out: &mut W,
) -> io::Result<()> {
let ascii = unitig.to_ascii();
let id = seq_id(&ascii);
let seql = unitig.seql();
let n_kmers = seql - k + 1;
writeln!(
let id = format!("part_{partition:05}_unitig_{index:06}");
write_record(
&ascii,
&id,
&[
("seq_length", JsonVal::Num(seql as u64)),
("kmer_size", JsonVal::Num(k as u64)),
("n_kmers", JsonVal::Num((seql - k + 1) as u64)),
("partition", JsonVal::Num(partition as u64)),
("unitig_index", JsonVal::Num(index as u64)),
],
out,
">{id} {{\"seq_length\":{seql},\"kmer_size\":{k},\"n_kmers\":{n_kmers}}}",
)?;
out.write_all(&ascii)?;
out.write_all(b"\n")
}
// ── internal helpers ──────────────────────────────────────────────────────────
/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
fn seq_id(ascii: &[u8]) -> String {
format!("{:016X}", xxh64(ascii, 0))
)
}
// ── tests ─────────────────────────────────────────────────────────────────────
@@ -178,9 +143,6 @@ mod tests {
#[test]
fn scatter_minimizer_decoded_from_hash() {
// "ACG" right-aligned: A=00, C=01, G=10 → 0b000110 = 6
// Left-aligned for m=3: shift by 64 - 2·3 = 58.
// set_m(3) so that Minimizer::to_ascii() decodes exactly 3 bases.
obikseq::params::set_m(3);
let sk = make(b"ACGTACGTACGT");
let minimizer = Minimizer::from_raw_unchecked(6u64 << (64 - 2 * 3));
@@ -230,13 +192,34 @@ mod tests {
#[test]
fn count_sequence_line_correct() {
    // The input TTTTACGT is expected in canonical form: its reverse
    // complement ACGTAAAA sorts before it, so that is what line 2 must show.
    let sk = make(b"TTTTACGT");
    let fasta = capture(|w| write_count(&sk, w, 4, 2, 0));
    let sequence_line = fasta.lines().nth(1);
    assert_eq!(sequence_line, Some("ACGTAAAA"));
}
// ── write_unitig ──────────────────────────────────────────────────────────
#[test]
fn unitig_id_format() {
    obikseq::params::set_k(4);
    let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
    let out = capture(|w| write_unitig(&unitig, 4, 3, 17, w));
    let header = out.lines().next().unwrap();
    // Compare the whole ID token: a prefix test would also accept an
    // over-long index such as `>part_00003_unitig_0000170`.
    let id = header.split_whitespace().next().unwrap();
    assert_eq!(id, ">part_00003_unitig_000017", "got: {header}");
}
#[test]
fn unitig_annotation_fields() {
    obikseq::params::set_k(4);
    let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
    let out = capture(|w| write_unitig(&unitig, 4, 2, 5, w));
    // Every field named in the `write_unitig` docs must be present.
    // 8 bases with k = 4 give 8 - 4 + 1 = 5 k-mers.
    for field in [
        "\"seq_length\":8",
        "\"kmer_size\":4",
        "\"n_kmers\":5",
        "\"partition\":2",
        "\"unitig_index\":5",
    ] {
        assert!(out.contains(field), "missing {field} in: {out}");
    }
}
// ── ID stability ──────────────────────────────────────────────────────────
#[test]
@@ -260,7 +243,7 @@ mod tests {
.next()
.unwrap()[1..]
.to_string();
assert_eq!(id1, id2, "same sequence must produce same ID");
assert_eq!(id1, id2);
}
#[test]
@@ -269,21 +252,11 @@ mod tests {
let sk2 = make(b"TTTTTTTT");
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
.split_whitespace()
.next()
.unwrap()[1..]
.to_string();
.lines().next().unwrap()
.split_whitespace().next().unwrap()[1..].to_string();
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
.split_whitespace()
.next()
.unwrap()[1..]
.to_string();
.lines().next().unwrap()
.split_whitespace().next().unwrap()[1..].to_string();
assert_ne!(id1, id2);
}
@@ -291,7 +264,7 @@ mod tests {
fn id_is_16_hex_digits() {
    let sk = make(b"ACGTACGT");
    let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
    let header = out.lines().next().unwrap();
    // Extract the real ID token (text between '>' and the first whitespace).
    // A fixed `[1..17]` slice always has length 16, which would make the
    // length assertion below vacuous.
    let id = header
        .split_whitespace()
        .next()
        .and_then(|token| token.strip_prefix('>'))
        .expect("header must start with '>' followed by an ID");
    assert_eq!(id.len(), 16, "got: {id}");
    assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
}