refactor: centralize k-mer config and introduce packed sequences
Centralize k-mer and minimizer configuration using a thread-safe global module, and replace manual bit-packing with a memory-efficient `PackedSeq` type. Refactor core sequence and k-mer types to use compile-time length enforcement and centralized hashing. Introduce a new De Bruijn graph implementation with compact node encoding and traversal iterators. Update I/O, partitioning, and builder modules to align with the new architecture, and add the `xxhash-rust` dependency.
This commit is contained in:
@@ -6,3 +6,6 @@ edition = "2024"
|
||||
[dependencies]
|
||||
obikseq = { path = "../obikseq" }
|
||||
xxhash-rust = { version = "0.8", features = ["xxh64"] }
|
||||
|
||||
[dev-dependencies]
|
||||
obikseq = { path = "../obikseq", features = ["test-utils"] }
|
||||
|
||||
+17
-14
@@ -34,7 +34,7 @@ mod fasta;
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::{kmer::CanonicalKmer, superkmer::SuperKmer, unitig::Unitig};
|
||||
use obikseq::{Minimizer, SuperKmer, Unitig};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
// ── public API ────────────────────────────────────────────────────────────────
|
||||
@@ -57,12 +57,12 @@ pub fn write_scatter<W: Write>(
|
||||
k: usize,
|
||||
m: usize,
|
||||
partition: usize,
|
||||
minimizer: CanonicalKmer,
|
||||
minimizer: Minimizer,
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seq_len = ascii.len();
|
||||
let min_seq = minimizer.to_ascii(m);
|
||||
let min_seq = minimizer.to_ascii();
|
||||
|
||||
writeln!(
|
||||
out,
|
||||
@@ -154,7 +154,6 @@ fn seq_id(ascii: &[u8]) -> String {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use obikseq::kmer::Kmer;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
|
||||
fn make(seq: &[u8]) -> SuperKmer {
|
||||
@@ -172,23 +171,27 @@ mod tests {
|
||||
#[test]
|
||||
fn scatter_header_contains_minimizer_field() {
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, CanonicalKmer::from_raw_unchecked(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Minimizer::from_raw_unchecked(0)));
|
||||
assert!(out.contains("\"minimizer\":\""));
|
||||
assert!(!out.contains("\"count\":"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scatter_minimizer_decoded_from_hash() {
|
||||
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
|
||||
// "ACG" right-aligned: A=00, C=01, G=10 → 0b000110 = 6
|
||||
// Left-aligned for m=3: shift by 64 − 2·3 = 58.
|
||||
// set_m(3) so that Minimizer::to_ascii() decodes exactly 3 bases.
|
||||
obikseq::params::set_m(3);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, CanonicalKmer::from_raw_unchecked(Kmer::from_raw_right(6, 3).raw())));
|
||||
let minimizer = Minimizer::from_raw_unchecked(6u64 << (64 - 2 * 3));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, minimizer));
|
||||
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scatter_fields_present() {
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, CanonicalKmer::from_raw_unchecked(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Minimizer::from_raw_unchecked(0)));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
assert!(out.contains("\"minimizer_size\":3"));
|
||||
@@ -198,7 +201,7 @@ mod tests {
|
||||
#[test]
|
||||
fn scatter_sequence_line_correct() {
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "ACGTACGT");
|
||||
}
|
||||
@@ -241,7 +244,7 @@ mod tests {
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"ACGTACGT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -249,7 +252,7 @@ mod tests {
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -265,7 +268,7 @@ mod tests {
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"TTTTTTTT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -273,7 +276,7 @@ mod tests {
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -287,7 +290,7 @@ mod tests {
|
||||
#[test]
|
||||
fn id_is_16_hex_digits() {
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Minimizer::from_raw_unchecked(0)));
|
||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||
assert_eq!(id.len(), 16);
|
||||
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
|
||||
Reference in New Issue
Block a user