refactor: implement RoutableSuperKmer and update k-mer indexing pipeline

Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise the SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
Eric Coissac
2026-04-29 22:52:42 +02:00
parent 4e26e3bd40
commit 27f5e88a7b
72 changed files with 10093 additions and 1626 deletions
+43
View File
@@ -0,0 +1,43 @@
use std::fmt;
use std::io::{self, Write};
use xxhash_rust::xxh64::xxh64;
/// A scalar value that can appear on the right-hand side of a JSON field.
///
/// Formatting a `JsonVal` with [`fmt::Display`] yields its JSON encoding:
/// numbers are written as bare decimal literals and strings are written
/// between double quotes with the mandatory JSON escapes applied.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum JsonVal<'a> {
    /// An unsigned integer, rendered as a decimal JSON number.
    Num(u64),
    /// A borrowed string, rendered as a quoted and escaped JSON string.
    Str(&'a str),
}

impl fmt::Display for JsonVal<'_> {
    /// Writes the JSON encoding of the value.
    ///
    /// For `Str`, the characters `"` and `\` and every control character
    /// below U+0020 are escaped so that the output is always a valid JSON
    /// string literal; all other characters are emitted unchanged.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            JsonVal::Num(n) => write!(f, "{n}"),
            JsonVal::Str(s) => {
                f.write_str("\"")?;
                for ch in s.chars() {
                    match ch {
                        '"' => f.write_str("\\\"")?,
                        '\\' => f.write_str("\\\\")?,
                        '\n' => f.write_str("\\n")?,
                        '\r' => f.write_str("\\r")?,
                        '\t' => f.write_str("\\t")?,
                        // Remaining C0 controls have no short form we use:
                        // fall back to the generic \uXXXX escape.
                        c if u32::from(c) < 0x20 => write!(f, "\\u{:04x}", u32::from(c))?,
                        // Everything else is legal inside a JSON string as-is.
                        c => f.write_str(c.encode_utf8(&mut [0u8; 4]))?,
                    }
                }
                f.write_str("\"")
            }
        }
    }
}
/// Builds a sequence identifier from the raw bytes of a sequence.
///
/// The bytes are hashed with `xxh64` using seed `0`, and the resulting
/// 64-bit value is rendered as exactly 16 upper-case hexadecimal digits
/// (zero-padded on the left), so equal inputs always yield equal ids.
pub(crate) fn seq_id(ascii: &[u8]) -> String {
    let digest = xxh64(ascii, 0);
    format!("{digest:016X}")
}
/// Serialises `fields` as a single-line JSON object into `writer`.
///
/// Each `(key, value)` pair becomes `"key":value`, pairs are separated by a
/// comma with no surrounding whitespace, and the whole list is wrapped in
/// braces. An empty slice produces `{}`. Keys are written verbatim between
/// double quotes; values are rendered through their `Display` impl.
///
/// # Errors
///
/// Returns the first I/O error reported by `writer`.
pub(crate) fn annotation<W: Write>(
    writer: &mut W,
    fields: &[(&str, JsonVal<'_>)],
) -> io::Result<()> {
    writer.write_all(b"{")?;
    // Empty before the first pair, a comma before every later one.
    let mut separator = "";
    for (key, value) in fields {
        write!(writer, "{separator}\"{key}\":{value}")?;
        separator = ",";
    }
    writer.write_all(b"}")
}
/// Writes `seq` to `writer` as lines of at most `width` bytes each.
///
/// Every line, including the last (possibly shorter) one, is terminated by
/// a single `\n`. An empty `seq` writes nothing. The bytes are copied to the
/// writer unchanged, so the function is sound for any input and does not
/// rely on `seq` being valid UTF-8.
///
/// # Errors
///
/// Returns an error of kind [`io::ErrorKind::InvalidInput`] when `width` is
/// zero (a zero-length line cannot make progress), and otherwise the first
/// I/O error reported by `writer`.
pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
    // `slice::chunks` panics on a zero chunk size; report it as an error instead.
    if width == 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "sequence line width must be non-zero",
        ));
    }
    for line in seq.chunks(width) {
        writer.write_all(line)?;
        writer.write_all(b"\n")?;
    }
    Ok(())
}
+15 -24
View File
@@ -30,6 +30,8 @@
#![deny(missing_docs)]
mod fasta;
use std::io::{self, Write};
use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig};
@@ -168,8 +170,7 @@ mod tests {
#[test]
fn scatter_header_contains_minimizer_field() {
let mut sk = make(b"ACGTACGTACGT");
sk.set_minimizer_pos(2);
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Kmer::from_raw(0)));
assert!(out.contains("\"minimizer\":\""));
assert!(!out.contains("\"count\":"));
@@ -178,16 +179,14 @@ mod tests {
#[test]
fn scatter_minimizer_decoded_from_hash() {
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
let mut sk = make(b"ACGTACGTACGT");
sk.set_minimizer_pos(0);
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, Kmer::from_raw_right(6, 3)));
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
}
#[test]
fn scatter_fields_present() {
let mut sk = make(b"ACGTACGTACGT");
sk.set_minimizer_pos(0);
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Kmer::from_raw(0)));
assert!(out.contains("\"seq_length\":12"));
assert!(out.contains("\"kmer_size\":4"));
@@ -197,8 +196,7 @@ mod tests {
#[test]
fn scatter_sequence_line_correct() {
let mut sk = make(b"ACGTACGT");
sk.set_minimizer_pos(0);
let sk = make(b"ACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines[1], "ACGTACGT");
@@ -209,7 +207,6 @@ mod tests {
#[test]
fn count_header_contains_count_field() {
let mut sk = make(b"ACGTACGTACGT");
sk.init_count();
sk.add(49);
let out = capture(|w| write_count(&sk, w, 4, 3, 2));
assert!(out.contains("\"count\":50"));
@@ -218,8 +215,7 @@ mod tests {
#[test]
fn count_fields_present() {
let mut sk = make(b"ACGTACGTACGT");
sk.init_count();
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_count(&sk, w, 4, 3, 9));
assert!(out.contains("\"seq_length\":12"));
assert!(out.contains("\"kmer_size\":4"));
@@ -230,21 +226,19 @@ mod tests {
#[test]
fn count_sequence_line_correct() {
let mut sk = make(b"TTTTACGT");
sk.init_count();
// TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
let sk = make(b"TTTTACGT");
let out = capture(|w| write_count(&sk, w, 4, 2, 0));
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines[1], "TTTTACGT");
assert_eq!(lines[1], "ACGTAAAA");
}
// ── ID stability ──────────────────────────────────────────────────────────
#[test]
fn same_sequence_same_id() {
let mut sk1 = make(b"ACGTACGT");
sk1.set_minimizer_pos(0);
let mut sk2 = make(b"ACGTACGT");
sk2.set_minimizer_pos(4); // different pos, same sequence
let sk1 = make(b"ACGTACGT");
let sk2 = make(b"ACGTACGT");
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
.lines()
@@ -267,10 +261,8 @@ mod tests {
#[test]
fn different_sequences_different_id() {
let mut sk1 = make(b"ACGTACGT");
sk1.set_minimizer_pos(0);
let mut sk2 = make(b"TTTTTTTT");
sk2.set_minimizer_pos(0);
let sk1 = make(b"ACGTACGT");
let sk2 = make(b"TTTTTTTT");
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
.lines()
@@ -293,8 +285,7 @@ mod tests {
#[test]
fn id_is_16_hex_digits() {
let mut sk = make(b"ACGTACGT");
sk.set_minimizer_pos(0);
let sk = make(b"ACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
assert_eq!(id.len(), 16);