refactor: extract obikindex crate and remove deprecated CLI commands
Extracted core indexing logic, state tracking, and metadata management into a new `obikindex` crate. Refactored the `index` and `unitig` commands to leverage the `KmerIndex` abstraction and state-driven pipeline transitions. Removed obsolete CLI subcommands (`count`, `fasta`, `longtig`, `partition`) and their associated pipeline steps. Updated FASTA writing utilities for single-line output and deterministic identifiers, and refreshed workspace dependencies.
This commit is contained in:
@@ -1,9 +1,11 @@
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
pub(crate) enum JsonVal<'a> {
|
||||
/// A JSON value that is either a number or a quoted string.
///
/// The type is a cheap, borrow-only view (`u64` or `&str`), so it is `Copy`;
/// `Debug` and equality are derived so values can be inspected and compared
/// in callers and tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JsonVal<'a> {
    /// Integer value, serialised without quotes.
    Num(u64),
    /// String value, serialised with double quotes.
    Str(&'a str),
}
|
||||
|
||||
@@ -16,11 +18,8 @@ impl fmt::Display for JsonVal<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
|
||||
pub(crate) fn annotation<W: Write>(
|
||||
/// Write a JSON object `{"k1":v1,"k2":v2,...}` to `writer`.
|
||||
pub fn annotation<W: Write>(
|
||||
writer: &mut W,
|
||||
fields: &[(&str, JsonVal<'_>)],
|
||||
) -> io::Result<()> {
|
||||
@@ -34,10 +33,29 @@ pub(crate) fn annotation<W: Write>(
|
||||
write!(writer, "}}")
|
||||
}
|
||||
|
||||
pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
|
||||
for chunk in seq.chunks(width) {
|
||||
// SAFETY: seq is valid UTF-8; any contiguous slice of ASCII bytes is too
|
||||
writeln!(writer, "{}", unsafe { std::str::from_utf8_unchecked(chunk) })?;
|
||||
}
|
||||
Ok(())
|
||||
/// xxHash-64 of `ascii`, formatted as 16 uppercase hex digits.
|
||||
pub fn seq_id(ascii: &[u8]) -> String {
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
|
||||
/// Write `seq` as one line, followed by a newline.
///
/// The bytes are copied to `writer` verbatim. Callers are expected to pass
/// ASCII DNA, but nothing here relies on that: the slice is never
/// reinterpreted as `str`, so arbitrary input cannot cause undefined behaviour.
///
/// # Errors
///
/// Returns any I/O error reported by `writer`.
pub fn write_sequence<W: Write>(writer: &mut W, seq: &[u8]) -> io::Result<()> {
    // Raw byte writes replace the former `from_utf8_unchecked` + `writeln!`:
    // identical output, but sound for any `&[u8]` a safe caller can pass in.
    writer.write_all(seq)?;
    writer.write_all(b"\n")
}
|
||||
|
||||
/// Core FASTA record writer.
|
||||
///
|
||||
/// Writes `>{id} {annotation}\n{sequence}\n`.
|
||||
pub fn write_record<W: Write>(
|
||||
seq: &[u8],
|
||||
id: &str,
|
||||
fields: &[(&str, JsonVal<'_>)],
|
||||
out: &mut W,
|
||||
) -> io::Result<()> {
|
||||
write!(out, ">{id} ")?;
|
||||
annotation(out, fields)?;
|
||||
writeln!(out)?;
|
||||
write_sequence(out, seq)
|
||||
}
|
||||
|
||||
+89
-116
@@ -1,32 +1,20 @@
|
||||
//! FASTA serialisation of [`SuperKmer`] values.
|
||||
//! FASTA serialisation for obikmer sequence types.
|
||||
//!
|
||||
//! Two functions cover the two phases of the scatter pipeline:
|
||||
//! Three public functions cover the main output cases:
|
||||
//!
|
||||
//! - [`write_scatter`]: scatter phase (before routing). The header annotation
|
||||
//! contains the minimizer sequence decoded from [`SuperKmer::minimizer_pos`].
|
||||
//! - [`write_scatter`]: super-kmers in scatter phase (minimizer annotation)
|
||||
//! - [`write_count`]: super-kmers in count phase (occurrence count annotation)
|
||||
//! - [`write_unitig`]: unitigs from the layered index (partition + index annotation)
|
||||
//!
|
||||
//! - [`write_count`]: count phase (after deduplication). The header annotation
|
||||
//! contains the occurrence count from [`SuperKmer::count`].
|
||||
//!
|
||||
//! Both functions write standard OBITools-compatible FASTA:
|
||||
//! All produce OBITools-compatible FASTA:
|
||||
//!
|
||||
//! ```text
|
||||
//! >ID {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
|
||||
//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
|
||||
//! >ID {"key":value,...}
|
||||
//! SEQUENCE
|
||||
//! ```
|
||||
//!
|
||||
//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
|
||||
//! a 16-digit uppercase hexadecimal string. xxHash-64 is a non-cryptographic
|
||||
//! hash, adequate for debugging identifiers but not collision-free: among
|
||||
//! billions of distinct super-kmers a collision is plausible (birthday bound).
|
||||
//!
|
||||
//! # Phase contract
|
||||
//!
|
||||
//! `write_scatter` reads [`SuperKmer::minimizer_pos`], which is only valid
|
||||
//! **before** [`SuperKmer::init_count`] is called. `write_count` reads
|
||||
//! [`SuperKmer::count`], which is only meaningful **after** `init_count`.
|
||||
//! Mixing the two functions in the wrong phase produces silently wrong output;
|
||||
//! this is enforced by pipeline structure, not by the type system.
|
||||
//! The lower-level primitive [`write_record`] and the [`JsonVal`] type are also
|
||||
//! public for callers that need custom annotations.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
@@ -35,22 +23,15 @@ mod fasta;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::{Minimizer, SuperKmer, Unitig};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
pub use fasta::{JsonVal, annotation, seq_id, write_record};
|
||||
|
||||
// ── public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Write one super-kmer in FASTA format — **scatter phase**.
|
||||
///
|
||||
/// The `minimizer` field in the JSON annotation contains the ASCII sequence of
|
||||
/// the minimizer, decoded from [`SuperKmer::minimizer_pos`] (scatter-phase
|
||||
/// value of the payload field).
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `sk`: the super-kmer to serialise (must be in scatter phase)
|
||||
/// - `out`: destination writer
|
||||
/// - `k`: k-mer size used to build `sk`
|
||||
/// - `m`: minimizer size
|
||||
/// - `partition`: partition index computed from the minimizer hash
|
||||
/// ID is the xxHash-64 of the sequence. JSON annotation includes
|
||||
/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `minimizer`.
|
||||
pub fn write_scatter<W: Write>(
|
||||
sk: &SuperKmer,
|
||||
out: &mut W,
|
||||
@@ -61,37 +42,26 @@ pub fn write_scatter<W: Write>(
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seq_len = ascii.len();
|
||||
let min_seq = minimizer.to_ascii();
|
||||
|
||||
writeln!(
|
||||
let min_str = unsafe { std::str::from_utf8_unchecked(&min_seq) };
|
||||
write_record(
|
||||
&ascii,
|
||||
&id,
|
||||
&[
|
||||
("seq_length", JsonVal::Num(ascii.len() as u64)),
|
||||
("kmer_size", JsonVal::Num(k as u64)),
|
||||
("minimizer_size",JsonVal::Num(m as u64)),
|
||||
("partition", JsonVal::Num(partition as u64)),
|
||||
("minimizer", JsonVal::Str(min_str)),
|
||||
],
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
|
||||
\"minimizer_size\":{m},\"partition\":{partition},\
|
||||
\"minimizer\":\"{min}\"}}",
|
||||
id = id,
|
||||
seq_len = seq_len,
|
||||
k = k,
|
||||
m = m,
|
||||
partition = partition,
|
||||
min = unsafe { std::str::from_utf8_unchecked(&min_seq) },
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
)
|
||||
}
|
||||
|
||||
/// Write one super-kmer in FASTA format — **count phase**.
|
||||
///
|
||||
/// The `count` field in the JSON annotation contains the occurrence count from
|
||||
/// [`SuperKmer::count`] (count-phase value of the payload field).
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
|
||||
/// [`SuperKmer::init_count`] has been called)
|
||||
/// - `out`: destination writer
|
||||
/// - `k`: k-mer size
|
||||
/// - `m`: minimizer size
|
||||
/// - `partition`: partition index
|
||||
/// ID is the xxHash-64 of the sequence. JSON annotation includes
|
||||
/// `seq_length`, `kmer_size`, `minimizer_size`, `partition`, `count`.
|
||||
pub fn write_count<W: Write>(
|
||||
sk: &SuperKmer,
|
||||
out: &mut W,
|
||||
@@ -101,52 +71,47 @@ pub fn write_count<W: Write>(
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seq_len = ascii.len();
|
||||
let count = sk.count();
|
||||
|
||||
writeln!(
|
||||
write_record(
|
||||
&ascii,
|
||||
&id,
|
||||
&[
|
||||
("seq_length", JsonVal::Num(ascii.len() as u64)),
|
||||
("kmer_size", JsonVal::Num(k as u64)),
|
||||
("minimizer_size",JsonVal::Num(m as u64)),
|
||||
("partition", JsonVal::Num(partition as u64)),
|
||||
("count", JsonVal::Num(sk.count() as u64)),
|
||||
],
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
|
||||
\"minimizer_size\":{m},\"partition\":{partition},\
|
||||
\"count\":{count}}}",
|
||||
id = id,
|
||||
seq_len = seq_len,
|
||||
k = k,
|
||||
m = m,
|
||||
partition = partition,
|
||||
count = count,
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
)
|
||||
}
|
||||
|
||||
/// Write one unitig in FASTA format.
|
||||
///
|
||||
/// Header annotation (JSON):
|
||||
/// ```text
|
||||
/// >HASH {"seq_length":<seql>,"kmer_size":<k>,"n_kmers":<seql-k+1>}
|
||||
/// ```
|
||||
///
|
||||
/// `HASH` is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).
|
||||
/// `n_kmers` is the number of distinct k-mers covered by this unitig.
|
||||
pub fn write_unitig<W: Write>(unitig: &Unitig, k: usize, out: &mut W) -> io::Result<()> {
|
||||
/// ID is `part_PPPPP_unitig_IIIIII` where `P` is the partition index and `I`
|
||||
/// is the unitig index within that partition. JSON annotation includes
|
||||
/// `seq_length`, `kmer_size`, `n_kmers`, `partition`, `unitig_index`.
|
||||
pub fn write_unitig<W: Write>(
|
||||
unitig: &Unitig,
|
||||
k: usize,
|
||||
partition: usize,
|
||||
index: usize,
|
||||
out: &mut W,
|
||||
) -> io::Result<()> {
|
||||
let ascii = unitig.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seql = unitig.seql();
|
||||
let n_kmers = seql - k + 1;
|
||||
writeln!(
|
||||
let id = format!("part_{partition:05}_unitig_{index:06}");
|
||||
write_record(
|
||||
&ascii,
|
||||
&id,
|
||||
&[
|
||||
("seq_length", JsonVal::Num(seql as u64)),
|
||||
("kmer_size", JsonVal::Num(k as u64)),
|
||||
("n_kmers", JsonVal::Num((seql - k + 1) as u64)),
|
||||
("partition", JsonVal::Num(partition as u64)),
|
||||
("unitig_index", JsonVal::Num(index as u64)),
|
||||
],
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seql},\"kmer_size\":{k},\"n_kmers\":{n_kmers}}}",
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
}
|
||||
|
||||
// ── internal helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
|
||||
fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
)
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
@@ -178,9 +143,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_minimizer_decoded_from_hash() {
|
||||
// "ACG" right-aligned: A=00, C=01, G=10 → 0b000110 = 6
|
||||
// Left-aligned for m=3: shift by 64 − 2·3 = 58.
|
||||
// set_m(3) so that Minimizer::to_ascii() decodes exactly 3 bases.
|
||||
obikseq::params::set_m(3);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let minimizer = Minimizer::from_raw_unchecked(6u64 << (64 - 2 * 3));
|
||||
@@ -230,13 +192,34 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_sequence_line_correct() {
|
||||
// TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
|
||||
let sk = make(b"TTTTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 2, 0));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "ACGTAAAA");
|
||||
}
|
||||
|
||||
// ── write_unitig ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn unitig_id_format() {
|
||||
obikseq::params::set_k(4);
|
||||
let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
|
||||
let out = capture(|w| write_unitig(&unitig, 4, 3, 17, w));
|
||||
let id = out.lines().next().unwrap();
|
||||
assert!(id.starts_with(">part_00003_unitig_000017"), "got: {id}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unitig_annotation_fields() {
|
||||
obikseq::params::set_k(4);
|
||||
let unitig = obikseq::packed_seq::PackedSeq::from_ascii(b"ACGTACGT");
|
||||
let out = capture(|w| write_unitig(&unitig, 4, 2, 5, w));
|
||||
assert!(out.contains("\"partition\":2"));
|
||||
assert!(out.contains("\"unitig_index\":5"));
|
||||
assert!(out.contains("\"n_kmers\":5"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
}
|
||||
|
||||
// ── ID stability ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
@@ -260,7 +243,7 @@ mod tests {
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
assert_eq!(id1, id2, "same sequence must produce same ID");
|
||||
assert_eq!(id1, id2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -269,21 +252,11 @@ mod tests {
|
||||
let sk2 = make(b"TTTTTTTT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
.lines().next().unwrap()
|
||||
.split_whitespace().next().unwrap()[1..].to_string();
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
.lines().next().unwrap()
|
||||
.split_whitespace().next().unwrap()[1..].to_string();
|
||||
assert_ne!(id1, id2);
|
||||
}
|
||||
|
||||
@@ -291,7 +264,7 @@ mod tests {
|
||||
fn id_is_16_hex_digits() {
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
|
||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||
let id = &out.lines().next().unwrap()[1..17];
|
||||
assert_eq!(id.len(), 16);
|
||||
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user