refactor: centralize k-mer config and introduce packed sequences

Centralize k-mer and minimizer configuration using a thread-safe global module, and replace manual bit-packing with a memory-efficient `PackedSeq` type. Refactor core sequence and k-mer types to use compile-time length enforcement and centralized hashing. Introduce a new De Bruijn graph implementation with compact node encoding and traversal iterators. Update I/O, partitioning, and builder modules to align with the new architecture, and add the `xxhash-rust` dependency.
This commit is contained in:
Eric Coissac
2026-05-05 18:08:19 +02:00
parent 602f414957
commit 8c17bf958b
37 changed files with 2641 additions and 2456 deletions
+3
View File
@@ -7,3 +7,6 @@ edition = "2024"
obikseq = { path = "../obikseq" }
obikrope = { path = "../obikrope" }
lazy_static = "1.5.0"
[dev-dependencies]
obikseq = { path = "../obikseq", features = ["test-utils"] }
+3 -6
View File
@@ -21,7 +21,6 @@ pub(crate) static LN_CARD_ROT5: LazyLock<[f64; 1024]> =
pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> =
LazyLock::new(|| build_log_class_size::<4096>(&NORMK6));
/// Natural logarithm using the convention `ln0(0) = 0`.
///
/// Any value that compares equal to zero (including `-0.0`) yields `0.0`;
/// every other input is passed to `f64::ln` unchanged.
fn ln0(x: f64) -> f64 {
    if x != 0.0 {
        return x.ln();
    }
    0.0
}
@@ -47,7 +46,7 @@ fn build_normalized_kmer<const N: usize>() -> [u64; N] {
for i in 0..N {
let la = (i as u64) << shift;
let ra = i as u64;
let rc_ra = Kmer::from_raw(la).revcomp(k).raw() >> shift;
let rc_ra = Kmer::from_raw(la).revcomp().raw() >> shift;
let circ = normalize_circular(ra, k);
let circ_rc = normalize_circular(rc_ra, k);
result[i] = circ.min(circ_rc);
@@ -107,12 +106,10 @@ pub(crate) const K_MAX: usize = 32;
pub(crate) const WS_MAX: usize = 6;
/// n·ln(n), with n_log_n[0] = 0. Indexed by n = 0..=K_MAX.
pub(crate) static N_LOG_N: LazyLock<[f64; K_MAX + 1]> =
LazyLock::new(|| build_n_log_n());
pub(crate) static N_LOG_N: LazyLock<[f64; K_MAX + 1]> = LazyLock::new(|| build_n_log_n());
/// H_max[k][ws]: maximum entropy for kmer length k and word size ws.
pub(crate) static EMAX: LazyLock<[[f64; WS_MAX + 1]; K_MAX + 1]> =
LazyLock::new(|| build_emax());
pub(crate) static EMAX: LazyLock<[[f64; WS_MAX + 1]; K_MAX + 1]> = LazyLock::new(|| build_emax());
/// ln(k - ws + 1): log of the number of ws-words in a kmer of length k.
pub(crate) static LOG_NWORDS: LazyLock<[[f64; WS_MAX + 1]; K_MAX + 1]> =
+27 -18
View File
@@ -16,8 +16,8 @@
//! | super-kmer length = 256| k |
use obikrope::{ForwardCursor, Rope, RopeCursor};
use obikseq::kmer::CanonicalKmer;
use obikseq::RoutableSuperKmer;
use obikseq::kmer::Minimizer;
use crate::rolling_stat::RollingStat;
use crate::scratch::SuperKmerScratch;
@@ -26,11 +26,10 @@ use crate::scratch::SuperKmerScratch;
pub struct SuperKmerIter<'a> {
cursor: ForwardCursor<'a>,
k: usize,
m: usize,
theta: f64,
scratch: SuperKmerScratch,
stat: RollingStat,
prev_min: Option<CanonicalKmer>,
prev_min: Option<Minimizer>,
prev_min_pos: usize,
}
@@ -41,14 +40,13 @@ impl<'a> SuperKmerIter<'a> {
/// - `m`: minimizer size (1 < m < k)
/// - `level_max`: maximum sub-word size for entropy (1–6)
/// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected
pub fn new(rope: &'a Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Self {
pub fn new(rope: &'a Rope, k: usize, level_max: usize, theta: f64) -> Self {
Self {
cursor: rope.fw_cursor(),
k,
m,
theta,
scratch: SuperKmerScratch::new(),
stat: RollingStat::new(k, m, level_max),
stat: RollingStat::new(level_max),
prev_min: None,
prev_min_pos: 0,
}
@@ -66,7 +64,7 @@ impl<'a> SuperKmerIter<'a> {
return None;
}
self.prev_min?;
Some(self.scratch.emit(self.prev_min_pos, self.m))
Some(self.scratch.emit(self.prev_min_pos))
}
}
@@ -149,26 +147,31 @@ mod tests {
use super::*;
use obikrope::Rope;
fn setup() {
obikseq::params::set_k(K);
obikseq::params::set_m(5);
}
/// Test helper: creates a fresh `Rope` and pushes one owned copy of `data`
/// into it.
fn make_rope(data: &[u8]) -> Rope {
    let mut rope = Rope::new(None);
    rope.push(Vec::from(data));
    rope
}
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
fn run_nofilter(data: &[u8], k: usize) -> Vec<Vec<u8>> {
let rope = make_rope(data);
SuperKmerIter::new(&rope, k, m, 1, 0.0)
SuperKmerIter::new(&rope, k, 1, 0.0)
.map(|rsk| rsk.superkmer().to_ascii())
.collect()
}
// k=11, m=5 — minimum values used by the project (k ∈ [11,31])
const K: usize = 11;
const M: usize = 5;
#[test]
fn single_segment_one_superkmer() {
let out = run_nofilter(b"ACGTACGTACGTACGTACGT\x00", K, M);
setup();
let out = run_nofilter(b"ACGTACGTACGTACGTACGT\x00", K);
assert!(!out.is_empty());
let total: Vec<u8> = out.into_iter().flatten().collect();
assert!(total.len() >= K);
@@ -176,29 +179,33 @@ mod tests {
#[test]
fn segment_shorter_than_k_emits_nothing() {
let out = run_nofilter(b"ACGTACGT\x00", K, M);
setup();
let out = run_nofilter(b"ACGTACGT\x00", K);
assert_eq!(out, Vec::<Vec<u8>>::new());
}
#[test]
fn empty_input_emits_nothing() {
let out = run_nofilter(b"", K, M);
setup();
let out = run_nofilter(b"", K);
assert_eq!(out, Vec::<Vec<u8>>::new());
}
#[test]
fn two_segments_both_emitted() {
let out = run_nofilter(b"ACGTACGTACGTACGT\x00TGCATGCATGCATGCA\x00", K, M);
setup();
let out = run_nofilter(b"ACGTACGTACGTACGT\x00TGCATGCATGCATGCA\x00", K);
assert!(!out.is_empty());
}
#[test]
fn low_complexity_kmer_is_rejected() {
let out_pass = run_nofilter(b"AAAAAAAAAAAACGTACGTACGT\x00", K, M);
setup();
let out_pass = run_nofilter(b"AAAAAAAAAAAACGTACGTACGT\x00", K);
assert!(!out_pass.is_empty());
let rope = make_rope(b"AAAAAAAAAAAAAAAAAAAA\x00");
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 6, 0.9)
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, 6, 0.9)
.map(|rsk| rsk.superkmer().to_ascii())
.collect();
assert!(out_reject.is_empty());
@@ -206,12 +213,13 @@ mod tests {
#[test]
fn multi_slice_rope() {
setup();
let data = b"ACGTACGTACGTACGTACGT\x00";
let mid = data.len() / 2;
let mut rope = Rope::new(None);
rope.push(data[..mid].to_vec());
rope.push(data[mid..].to_vec());
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 1, 0.0)
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, 1, 0.0)
.map(|rsk| rsk.superkmer().to_ascii())
.collect();
assert!(!out.is_empty());
@@ -219,8 +227,9 @@ mod tests {
#[test]
fn yields_minimizer_value() {
setup();
let rope = make_rope(b"ACGTACGTACGTACGTACGT\x00");
let results: Vec<RoutableSuperKmer> = SuperKmerIter::new(&rope, K, M, 1, 0.0).collect();
let results: Vec<RoutableSuperKmer> = SuperKmerIter::new(&rope, K, 1, 0.0).collect();
assert!(!results.is_empty());
}
}
+7 -2
View File
@@ -19,6 +19,11 @@ use obikrope::Rope;
use obikseq::RoutableSuperKmer;
/// Collect all super-kmers from a normalised rope chunk.
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<RoutableSuperKmer> {
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
pub fn build_superkmers(
rope: Rope,
k: usize,
level_max: usize,
theta: f64,
) -> Vec<RoutableSuperKmer> {
SuperKmerIter::new(&rope, k, level_max, theta).collect()
}
+96 -76
View File
@@ -1,4 +1,5 @@
use obikseq::kmer::{CanonicalKmer, Kmer};
use obikseq::kmer::{Minimizer, hash_kmer};
use obikseq::params;
use crate::encoding::encode_nuc;
use crate::entropy_table::{WS_MAX, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
@@ -13,22 +14,7 @@ struct MmerItem {
hash: u64,
}
/// Bijective 64-bit mixer used to randomise the ordering of minimizers.
///
/// The input is first XORed with the seed 2^64/φ. Without that seed the
/// mix64 finaliser maps 0 to 0, so poly-A/T kmers (canonical value 0) would
/// always win the minimum selection.
#[inline(always)]
fn hash_mmer(canonical: u64) -> u64 {
    const SEED: u64 = 0x9e3779b97f4a7c15;
    const MUL_A: u64 = 0xbf58476d1ce4e5b9;
    const MUL_B: u64 = 0x94d049bb133111eb;

    let mut z = canonical ^ SEED;
    z = (z ^ (z >> 30)).wrapping_mul(MUL_A);
    z = (z ^ (z >> 27)).wrapping_mul(MUL_B);
    z ^ (z >> 31)
}
pub struct RollingStat {
k: usize,
m: usize,
entropy_max_k: usize,
rolling_k: u64,
rolling_rck: u64,
@@ -53,15 +39,15 @@ pub struct RollingStat {
}
impl RollingStat {
pub fn new(k: usize, m: usize, entropy_max_k: usize) -> Self {
pub fn new(entropy_max_k: usize) -> Self {
let k = params::k();
let m = params::m();
Self {
k,
m,
entropy_max_k,
rolling_k: 0,
rolling_rck: 0,
k_mask: (!0) >> (64 - k * 2),
m_mask: (!0) >> (64 - m * 2),
k_mask: (!0u64) >> (64 - k * 2),
m_mask: (!0u64) >> (64 - m * 2),
received: 0,
k1q: std::collections::VecDeque::with_capacity(k),
k2q: std::collections::VecDeque::with_capacity(k - 1),
@@ -85,12 +71,24 @@ impl RollingStat {
self.rolling_k = 0;
self.rolling_rck = 0;
self.received = 0;
for &i in &self.k1q { self.k1c[i as usize] = 0; }
for &i in &self.k2q { self.k2c[i as usize] = 0; }
for &i in &self.k3q { self.k3c[i as usize] = 0; }
for &i in &self.k4q { self.k4c[i as usize] = 0; }
for &i in &self.k5q { self.k5c[i as usize] = 0; }
for &i in &self.k6q { self.k6c[i as usize] = 0; }
for &i in &self.k1q {
self.k1c[i as usize] = 0;
}
for &i in &self.k2q {
self.k2c[i as usize] = 0;
}
for &i in &self.k3q {
self.k3c[i as usize] = 0;
}
for &i in &self.k4q {
self.k4c[i as usize] = 0;
}
for &i in &self.k5q {
self.k5c[i as usize] = 0;
}
for &i in &self.k6q {
self.k6c[i as usize] = 0;
}
self.k1q.clear();
self.k2q.clear();
self.k3q.clear();
@@ -127,12 +125,15 @@ impl RollingStat {
}
pub fn push(&mut self, nuc: u8) {
let k = params::k();
let m = params::m();
let bnuc = encode_nuc(nuc);
let cnuc = bnuc ^ 3;
self.rolling_k = ((self.rolling_k << 2) | (bnuc as u64)) & self.k_mask;
self.rolling_rck =
((self.rolling_rck >> 2) | ((cnuc as u64) << ((self.k - 1) * 2))) & self.k_mask;
((self.rolling_rck >> 2) | ((cnuc as u64) << ((k - 1) * 2))) & self.k_mask;
let canonical_k1 = entropy_norm_kmer(self.rolling_k & 3, 1, false);
let canonical_k2 = entropy_norm_kmer(self.rolling_k & 15, 2, false);
@@ -143,30 +144,37 @@ impl RollingStat {
self.received += 1;
if self.received >= self.m {
if self.received >= m {
let possible_canonical_m =
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
let possible_hash_m = hash_mmer(possible_canonical_m);
let possible_pos_m = self.received - self.m;
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((k - m) * 2));
let possible_hash_m = hash_kmer(possible_canonical_m << 64 - m * 2);
let possible_pos_m = self.received - m;
while self.minimier.back().map_or(false, |it| it.hash >= possible_hash_m) {
while self
.minimier
.back()
.map_or(false, |it| it.hash >= possible_hash_m)
{
self.minimier.pop_back();
}
self.minimier
.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m, hash: possible_hash_m });
self.minimier.push_back(MmerItem {
position: possible_pos_m,
canonical: possible_canonical_m,
hash: possible_hash_m,
});
if self.received > self.k {
if self.received > k {
while self
.minimier
.front()
.map_or(false, |it| it.position + self.k < self.received)
.map_or(false, |it| it.position + k < self.received)
{
self.minimier.pop_front();
}
}
}
if self.received > self.k {
if self.received > k {
let old1 = self.k1q.pop_front().unwrap();
let f1 = self.k1c[old1 as usize];
Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 1, old1, f1);
@@ -199,37 +207,73 @@ impl RollingStat {
}
let g1 = self.k1c[canonical_k1 as usize];
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 1, canonical_k1, g1);
Self::update_sums_increment(
&mut self.sum_f_log_f,
&mut self.sum_f_log_s,
1,
canonical_k1,
g1,
);
self.k1c[canonical_k1 as usize] += 1;
self.k1q.push_back(canonical_k1);
if self.received >= 2 {
let g2 = self.k2c[canonical_k2 as usize];
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 2, canonical_k2, g2);
Self::update_sums_increment(
&mut self.sum_f_log_f,
&mut self.sum_f_log_s,
2,
canonical_k2,
g2,
);
self.k2c[canonical_k2 as usize] += 1;
self.k2q.push_back(canonical_k2);
if self.received >= 3 {
let g3 = self.k3c[canonical_k3 as usize];
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 3, canonical_k3, g3);
Self::update_sums_increment(
&mut self.sum_f_log_f,
&mut self.sum_f_log_s,
3,
canonical_k3,
g3,
);
self.k3c[canonical_k3 as usize] += 1;
self.k3q.push_back(canonical_k3);
if self.received >= 4 {
let g4 = self.k4c[canonical_k4 as usize];
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 4, canonical_k4, g4);
Self::update_sums_increment(
&mut self.sum_f_log_f,
&mut self.sum_f_log_s,
4,
canonical_k4,
g4,
);
self.k4c[canonical_k4 as usize] += 1;
self.k4q.push_back(canonical_k4);
if self.received >= 5 {
let g5 = self.k5c[canonical_k5 as usize];
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 5, canonical_k5, g5);
Self::update_sums_increment(
&mut self.sum_f_log_f,
&mut self.sum_f_log_s,
5,
canonical_k5,
g5,
);
self.k5c[canonical_k5 as usize] += 1;
self.k5q.push_back(canonical_k5);
if self.received >= 6 {
let g6 = self.k6c[canonical_k6 as usize];
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 6, canonical_k6, g6);
Self::update_sums_increment(
&mut self.sum_f_log_f,
&mut self.sum_f_log_s,
6,
canonical_k6,
g6,
);
self.k6c[canonical_k6 as usize] += 1;
self.k6q.push_back(canonical_k6);
}
@@ -240,31 +284,7 @@ impl RollingStat {
}
pub fn ready(&self) -> bool {
self.received >= self.k
}
pub fn kmer(&self) -> Option<Kmer> {
if self.ready() {
Some(Kmer::from_raw_right(self.rolling_k, self.k))
} else {
None
}
}
pub fn revcomp_kmer(&self) -> Option<Kmer> {
if self.ready() {
Some(Kmer::from_raw_right(self.rolling_rck, self.k))
} else {
None
}
}
pub fn canonical_kmer(&self) -> Option<Kmer> {
if self.ready() {
Some(Kmer::from_raw_right(self.rolling_k.min(self.rolling_rck), self.k))
} else {
None
}
self.received >= params::k()
}
pub fn minimizer_position(&self) -> Option<usize> {
@@ -283,22 +303,22 @@ impl RollingStat {
}
}
pub fn canonical_minimizer(&self) -> Option<CanonicalKmer> {
self.canonical_minimizer_raw().map(|raw| {
CanonicalKmer::from_raw_unchecked(Kmer::from_raw_right(raw, self.m).raw())
})
pub fn canonical_minimizer(&self) -> Option<Minimizer> {
self.canonical_minimizer_raw()
.map(|raw| Minimizer::from_raw_unchecked(raw << (64 - params::m() * 2)))
}
pub fn entropy(&self, order: usize) -> Option<f64> {
if !self.ready() {
return None;
}
let em = emax(self.k, order);
let k = params::k();
let em = emax(k, order);
if em <= 0.0 {
return Some(1.0);
}
let nwords = self.k - order + 1;
let log_nw = log_nwords(self.k, order);
let nwords = k - order + 1;
let log_nw = log_nwords(k, order);
let nw_f = nwords as f64;
let h_corr = log_nw + (self.sum_f_log_s[order] - self.sum_f_log_f[order]) / nw_f;
Some((h_corr / em).max(0.0))
+2 -2
View File
@@ -56,14 +56,14 @@ impl SuperKmerScratch {
///
/// The heap allocation (`Box<[u8]>`) is exactly sized to the sequence.
/// Resets the buffer to empty afterward.
pub fn emit(&mut self, min_pos: usize, m: usize) -> RoutableSuperKmer {
pub fn emit(&mut self, min_pos: usize) -> RoutableSuperKmer {
let seql = self.len;
debug_assert!(seql >= 1 && seql <= MAX_SUPERKMER_LEN);
let n = (seql + 3) / 4;
let seq: Box<[u8]> = self.buf[..n].into();
self.buf[..n].fill(0);
self.len = 0;
RoutableSuperKmer::build(min_pos, m, seql as u8, seq)
RoutableSuperKmer::build(min_pos, seql, seq)
}
/// Discard all accumulated nucleotides without producing a [`SuperKmer`].
pub fn reset(&mut self) {