refactor: centralize k-mer config and introduce packed sequences

Centralize k-mer and minimizer configuration using a thread-safe global module, and replace manual bit-packing with a memory-efficient `PackedSeq` type. Refactor core sequence and k-mer types to use compile-time length enforcement and centralized hashing. Introduce a new De Bruijn graph implementation with compact node encoding and traversal iterators. Update I/O, partitioning, and builder modules to align with the new architecture, and add the `xxhash-rust` dependency.
This commit is contained in:
Eric Coissac
2026-05-05 18:08:19 +02:00
parent 602f414957
commit 8c17bf958b
37 changed files with 2641 additions and 2456 deletions
+3
View File
@@ -6,3 +6,6 @@ edition = "2024"
[dependencies]
obikseq = { path = "../obikseq" }
xxhash-rust = { version = "0.8", features = ["xxh64"] }
[dev-dependencies]
obikseq = { path = "../obikseq", features = ["test-utils"] }
+17 -14
View File
@@ -34,7 +34,7 @@ mod fasta;
use std::io::{self, Write};
use obikseq::{kmer::CanonicalKmer, superkmer::SuperKmer, unitig::Unitig};
use obikseq::{Minimizer, SuperKmer, Unitig};
use xxhash_rust::xxh64::xxh64;
// ── public API ────────────────────────────────────────────────────────────────
@@ -57,12 +57,12 @@ pub fn write_scatter<W: Write>(
k: usize,
m: usize,
partition: usize,
minimizer: CanonicalKmer,
minimizer: Minimizer,
) -> io::Result<()> {
let ascii = sk.to_ascii();
let id = seq_id(&ascii);
let seq_len = ascii.len();
let min_seq = minimizer.to_ascii(m);
let min_seq = minimizer.to_ascii();
writeln!(
out,
@@ -154,7 +154,6 @@ fn seq_id(ascii: &[u8]) -> String {
#[cfg(test)]
mod tests {
use super::*;
use obikseq::kmer::Kmer;
use obikseq::superkmer::SuperKmer;
fn make(seq: &[u8]) -> SuperKmer {
@@ -172,23 +171,27 @@ mod tests {
#[test]
fn scatter_header_contains_minimizer_field() {
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, CanonicalKmer::from_raw_unchecked(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Minimizer::from_raw_unchecked(0)));
assert!(out.contains("\"minimizer\":\""));
assert!(!out.contains("\"count\":"));
}
#[test]
fn scatter_minimizer_decoded_from_hash() {
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
// "ACG" right-aligned: A=00, C=01, G=10 → 0b000110 = 6
// Left-aligned for m=3: shift by 64 - 2·3 = 58.
// set_m(3) so that Minimizer::to_ascii() decodes exactly 3 bases.
obikseq::params::set_m(3);
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, CanonicalKmer::from_raw_unchecked(Kmer::from_raw_right(6, 3).raw())));
let minimizer = Minimizer::from_raw_unchecked(6u64 << (64 - 2 * 3));
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, minimizer));
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
}
#[test]
fn scatter_fields_present() {
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, CanonicalKmer::from_raw_unchecked(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Minimizer::from_raw_unchecked(0)));
assert!(out.contains("\"seq_length\":12"));
assert!(out.contains("\"kmer_size\":4"));
assert!(out.contains("\"minimizer_size\":3"));
@@ -198,7 +201,7 @@ mod tests {
#[test]
fn scatter_sequence_line_correct() {
let sk = make(b"ACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines[1], "ACGTACGT");
}
@@ -241,7 +244,7 @@ mod tests {
let sk1 = make(b"ACGTACGT");
let sk2 = make(b"ACGTACGT");
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -249,7 +252,7 @@ mod tests {
.next()
.unwrap()[1..]
.to_string();
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -265,7 +268,7 @@ mod tests {
let sk1 = make(b"ACGTACGT");
let sk2 = make(b"TTTTTTTT");
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -273,7 +276,7 @@ mod tests {
.next()
.unwrap()[1..]
.to_string();
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -287,7 +290,7 @@ mod tests {
#[test]
fn id_is_16_hex_digits() {
let sk = make(b"ACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
assert_eq!(id.len(), 16);
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));