refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise the SuperKmer header to store n_kmers instead of seql (avoiding the 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
@@ -0,0 +1,43 @@
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
/// A scalar value that can appear on the right-hand side of a JSON
/// annotation field.
pub(crate) enum JsonVal<'a> {
    /// An unsigned integer, rendered as a bare JSON number.
    Num(u64),
    /// A borrowed string, rendered as a quoted and escaped JSON string.
    Str(&'a str),
}

impl fmt::Display for JsonVal<'_> {
    /// Renders the value as a single JSON token.
    ///
    /// Numbers are printed in decimal. Strings are wrapped in double quotes,
    /// with `"`, `\` and control characters (U+0000..U+001F) escaped so the
    /// output is always a valid JSON string; every other character is
    /// emitted unchanged.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            JsonVal::Num(n) => write!(f, "{n}"),
            JsonVal::Str(s) => {
                f.write_str("\"")?;
                // Emit runs of characters that need no escaping as whole
                // slices; `plain_from` marks the start of the pending run.
                let mut plain_from = 0;
                for (at, c) in s.char_indices() {
                    if c != '"' && c != '\\' && c >= '\u{20}' {
                        continue;
                    }
                    f.write_str(&s[plain_from..at])?;
                    match c {
                        '"' => f.write_str("\\\"")?,
                        '\\' => f.write_str("\\\\")?,
                        '\n' => f.write_str("\\n")?,
                        '\r' => f.write_str("\\r")?,
                        '\t' => f.write_str("\\t")?,
                        // Remaining control characters have no short form.
                        _ => write!(f, "\\u{:04x}", c as u32)?,
                    }
                    plain_from = at + c.len_utf8();
                }
                f.write_str(&s[plain_from..])?;
                f.write_str("\"")
            }
        }
    }
}
|
||||
|
||||
pub(crate) fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
|
||||
pub(crate) fn annotation<W: Write>(
|
||||
writer: &mut W,
|
||||
fields: &[(&str, JsonVal<'_>)],
|
||||
) -> io::Result<()> {
|
||||
write!(writer, "{{")?;
|
||||
for (i, (k, v)) in fields.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(writer, ",")?;
|
||||
}
|
||||
write!(writer, "\"{k}\":{v}")?;
|
||||
}
|
||||
write!(writer, "}}")
|
||||
}
|
||||
|
||||
/// Writes `seq` to `writer` as newline-terminated lines of at most `width`
/// bytes each.
///
/// The bytes are emitted verbatim, so no UTF-8 validity is assumed or
/// required of `seq`. An empty `seq` writes nothing. A `width` of zero
/// disables wrapping: the whole sequence is written on a single line.
///
/// # Errors
///
/// Returns the first I/O error reported by `writer`.
pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
    // `chunks` panics on a zero chunk size; fall back to one line holding the
    // full sequence (`max(1)` keeps the size non-zero for an empty `seq`).
    let line_len = if width == 0 { seq.len().max(1) } else { width };
    for line in seq.chunks(line_len) {
        // Raw byte writes avoid reinterpreting caller-supplied bytes as `str`.
        writer.write_all(line)?;
        writer.write_all(b"\n")?;
    }
    Ok(())
}
|
||||
+15
-24
@@ -30,6 +30,8 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod fasta;
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig};
|
||||
@@ -168,8 +170,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_header_contains_minimizer_field() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(2);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Kmer::from_raw(0)));
|
||||
assert!(out.contains("\"minimizer\":\""));
|
||||
assert!(!out.contains("\"count\":"));
|
||||
@@ -178,16 +179,14 @@ mod tests {
|
||||
#[test]
|
||||
fn scatter_minimizer_decoded_from_hash() {
|
||||
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, Kmer::from_raw_right(6, 3)));
|
||||
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scatter_fields_present() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Kmer::from_raw(0)));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
@@ -197,8 +196,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_sequence_line_correct() {
|
||||
let mut sk = make(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "ACGTACGT");
|
||||
@@ -209,7 +207,6 @@ mod tests {
|
||||
#[test]
|
||||
fn count_header_contains_count_field() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.init_count();
|
||||
sk.add(49);
|
||||
let out = capture(|w| write_count(&sk, w, 4, 3, 2));
|
||||
assert!(out.contains("\"count\":50"));
|
||||
@@ -218,8 +215,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_fields_present() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.init_count();
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 3, 9));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
@@ -230,21 +226,19 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_sequence_line_correct() {
|
||||
let mut sk = make(b"TTTTACGT");
|
||||
sk.init_count();
|
||||
// TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
|
||||
let sk = make(b"TTTTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 2, 0));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "TTTTACGT");
|
||||
assert_eq!(lines[1], "ACGTAAAA");
|
||||
}
|
||||
|
||||
// ── ID stability ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn same_sequence_same_id() {
|
||||
let mut sk1 = make(b"ACGTACGT");
|
||||
sk1.set_minimizer_pos(0);
|
||||
let mut sk2 = make(b"ACGTACGT");
|
||||
sk2.set_minimizer_pos(4); // different pos, same sequence
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"ACGTACGT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
.lines()
|
||||
@@ -267,10 +261,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn different_sequences_different_id() {
|
||||
let mut sk1 = make(b"ACGTACGT");
|
||||
sk1.set_minimizer_pos(0);
|
||||
let mut sk2 = make(b"TTTTTTTT");
|
||||
sk2.set_minimizer_pos(0);
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"TTTTTTTT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
.lines()
|
||||
@@ -293,8 +285,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn id_is_16_hex_digits() {
|
||||
let mut sk = make(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||
assert_eq!(id.len(), 16);
|
||||
|
||||
Reference in New Issue
Block a user