diff --git a/.gitignore b/.gitignore index 2878242..c15af6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .venv/ src/target data-stress +*.fasta diff --git a/src/obiskbuilder/src/entropy.rs b/src/obiskbuilder/src/entropy.rs deleted file mode 100644 index a40dd53..0000000 --- a/src/obiskbuilder/src/entropy.rs +++ /dev/null @@ -1,99 +0,0 @@ -//! Entropy filter for low-complexity k-mer detection. -//! -//! Formula (corrected from the Go version): -//! H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words -//! H_max uses 4^ws raw categories (not the canonical class count) -//! Ĥ(ws) = H_corr / H_max ∈ [0,1] -//! score = min over ws=1..level_max of Ĥ(ws) -//! -//! A kmer is rejected if score ≤ theta. - -use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n}; - -/// Pre-computed entropy filter. One instance per worker thread (not Send/Sync -/// because of the mutable freq scratch buffer; wrap in a closure per thread). -pub struct EntropyFilter { - k: usize, - level_max: usize, - threshold: f64, - /// Reusable frequency buffer per word size (reset before each kmer). - freq_buf: Vec>, -} - -impl EntropyFilter { - pub fn new(k: usize, level_max: usize, threshold: f64) -> Self { - let level_max = level_max.min(k - 1).max(1); - let mut freq_buf: Vec> = vec![vec![]; level_max + 1]; - for ws in 1..=level_max { - freq_buf[ws] = vec![0u32; 1 << (ws * 2)]; - } - Self { k, level_max, threshold, freq_buf } - } - - /// Returns `true` if the kmer's entropy is strictly above the threshold. - pub fn accept(&mut self, kmer: u64) -> bool { - self.entropy(kmer) > self.threshold - } - - /// Compute the minimum normalised entropy across all word sizes. - pub fn entropy(&mut self, kmer: u64) -> f64 { - let k = self.k; - let mut min_entropy = f64::MAX; - - for ws in 1..=self.level_max { - let nwords = k - ws + 1; - let em = emax(k, ws); - if em <= 0.0 { - continue; - } - - let mask = (1usize << (ws * 2)) - 1; - let freq = &mut self.freq_buf[ws]; - - // Slide a ws-mer window; track only written indices (≤ nwords ≤ 31). - let mut dirty = [0u16; 32]; - let mut ndirty = 0usize; - let mut word = 0usize; - for i in 0..ws - 1 { - word = (word << 2) | ((kmer >> (2 * (k - 1 - i))) & 3) as usize; - } - for i in 0..nwords { - let base = ((kmer >> (2 * (k - ws - i))) & 3) as usize; - word = ((word << 2) | base) & mask; - let idx = entropy_norm_kmer(word as u64, ws, false) as usize; - if freq[idx] == 0 { - dirty[ndirty] = idx as u16; - ndirty += 1; - } - freq[idx] += 1; - } - - // H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words - // Reset freq in the same pass to avoid a separate zeroing loop. - let log_nw = log_nwords(k, ws); - let nw_f = nwords as f64; - let mut sum_f_log_f = 0.0f64; - let mut sum_f_log_s = 0.0f64; - for &j in &dirty[..ndirty] { - let j = j as usize; - let f = freq[j] as usize; - sum_f_log_f += n_log_n(f); - sum_f_log_s += f as f64 * ln_class_size(j as u64, ws, false); - freq[j] = 0; - } - - let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f; - let entropy = (h_corr / em).max(0.0); - if entropy < min_entropy { - min_entropy = entropy; - } - - // Early exit: k-mer is already rejected, no need to check further ws. - if min_entropy <= self.threshold { - break; - } - } - - if min_entropy == f64::MAX { 1.0 } else { min_entropy } - } -} diff --git a/src/obiskbuilder/src/entropy_table.rs b/src/obiskbuilder/src/entropy_table.rs index b7f2bfb..6aeb213 100644 --- a/src/obiskbuilder/src/entropy_table.rs +++ b/src/obiskbuilder/src/entropy_table.rs @@ -21,12 +21,6 @@ pub(crate) static LN_CARD_ROT5: LazyLock<[f64; 1024]> = pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> = LazyLock::new(|| build_log_class_size::<4096>(&NORMK6)); -pub(crate) static CANON_K1: LazyLock> = LazyLock::new(|| build_canonical_list::<4>(&NORMK1)); -pub(crate) static CANON_K2: LazyLock> = LazyLock::new(|| build_canonical_list::<16>(&NORMK2)); -pub(crate) static CANON_K3: LazyLock> = LazyLock::new(|| build_canonical_list::<64>(&NORMK3)); -pub(crate) static CANON_K4: LazyLock> = LazyLock::new(|| build_canonical_list::<256>(&NORMK4)); -pub(crate) static CANON_K5: LazyLock> = LazyLock::new(|| build_canonical_list::<1024>(&NORMK5)); -pub(crate) static CANON_K6: LazyLock> = LazyLock::new(|| build_canonical_list::<4096>(&NORMK6)); fn ln0(x: f64) -> f64 { if x == 0.0 { 0.0 } else { x.ln() } @@ -61,10 +55,6 @@ fn build_normalized_kmer() -> [u64; N] { result } -fn build_canonical_list(norm: &[u64; N]) -> Vec { - (0..N).filter(|&i| norm[i] == i as u64).map(|i| i as u64).collect() -} - fn build_log_class_size(norm: &[u64; N]) -> [f64; N] { let mut sizes = [0u32; N]; for &c in norm { @@ -98,18 +88,6 @@ pub(crate) fn entropy_norm_kmer(kmer: u64, k: usize, left: bool) -> u64 { } // right-aligned → left-aligned } -pub(crate) fn canonical_kmers(k: usize) -> &'static [u64] { - match k { - 1 => &CANON_K1, - 2 => &CANON_K2, - 3 => &CANON_K3, - 4 => &CANON_K4, - 5 => &CANON_K5, - 6 => &CANON_K6, - _ => panic!("k must be 1..=6"), - } -} - pub(crate) fn ln_class_size(kmer: u64, k: usize, left: bool) -> f64 { let ra = if left { kmer >> (64 - k * 2) } else { kmer }; // left-aligned → right-aligned index match k { diff --git a/src/obiskbuilder/src/iter.rs b/src/obiskbuilder/src/iter.rs index bf16b91..7f0f9d4 100644 --- a/src/obiskbuilder/src/iter.rs +++ b/src/obiskbuilder/src/iter.rs @@ -5,9 +5,9 @@ //! //! # Cut conditions (checked in order per nucleotide, once the k-mer window is full) //! -//! In all three cases the triggering nucleotide S[j] is **not** added to the -//! current super-kmer; the super-kmer built so far is emitted; then the rope -//! cursor is retreated so the next segment can overlap correctly: +//! In all three cases the triggering nucleotide is **not** added to the current +//! super-kmer; the super-kmer built so far is emitted; then the rope cursor is +//! retreated so the next segment can overlap correctly: //! //! | Condition | cursor retreat | //! |------------------------|----------------| @@ -18,22 +18,18 @@ use obikrope::{ForwardCursor, Rope, RopeCursor}; use obikseq::superkmer::SuperKmer; -use crate::encoding::encode_nuc; -use crate::entropy::EntropyFilter; -use crate::minimizer::MinimizerState; +use crate::rolling_stat::RollingStat; use crate::scratch::SuperKmerScratch; -use crate::window::KmerWindow; -/// Iterator over `(minimizer, SuperKmer)` pairs. +/// Iterator over `(minimizer_hash, SuperKmer)` pairs. pub struct SuperKmerIter<'a> { cursor: ForwardCursor<'a>, k: usize, + theta: f64, scratch: SuperKmerScratch, - window: KmerWindow, - entropy: EntropyFilter, - minimizer: MinimizerState, - prev_minimizer: Option, - prev_minimizer_pos: u8, + stat: RollingStat, + prev_min: Option, + prev_min_pos: usize, } impl<'a> SuperKmerIter<'a> { @@ -41,54 +37,50 @@ impl<'a> SuperKmerIter<'a> { /// /// - `k`: k-mer size (1–31) /// - `m`: minimizer size (1 < m < k) - /// - `level_max`: maximum sub-word size for entropy (typically 6) + /// - `level_max`: maximum sub-word size for entropy (1–6) /// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected pub fn new(rope: &'a Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Self { Self { cursor: rope.fw_cursor(), k, + theta, scratch: SuperKmerScratch::new(), - window: KmerWindow::new(k), - entropy: EntropyFilter::new(k, level_max, theta), - minimizer: MinimizerState::new(k, m), - prev_minimizer: None, - prev_minimizer_pos: 0, + stat: RollingStat::new(k, m, level_max), + prev_min: None, + prev_min_pos: 0, } } - fn reset_state(&mut self) { - self.window.reset(); - self.minimizer.reset(); + fn reset(&mut self) { + self.stat.reset(); self.scratch.reset(); - self.prev_minimizer = None; - self.prev_minimizer_pos = 0; + self.prev_min = None; + self.prev_min_pos = 0; } - fn try_emit(&mut self, min: u64) -> Option<(u64, SuperKmer)> { + fn try_emit(&mut self) -> Option<(u64, SuperKmer)> { if self.scratch.len() < self.k { - self.scratch.reset(); return None; } + let min = self.prev_min?; let mut sk = self.scratch.emit(); - sk.set_minimizer_pos(self.prev_minimizer_pos); + sk.set_minimizer_pos(self.prev_min_pos as u8); Some((min, sk)) } } -impl<'a> Iterator for SuperKmerIter<'a> { +impl Iterator for SuperKmerIter<'_> { type Item = (u64, SuperKmer); fn next(&mut self) -> Option<(u64, SuperKmer)> { loop { let byte = match self.cursor.read_next().ok() { None => { - let min = self.prev_minimizer.unwrap_or(0); - return self.try_emit(min); + return self.try_emit(); } Some(0x00) => { - let min = self.prev_minimizer.unwrap_or(0); - let result = self.try_emit(min); - self.reset_state(); + let result = self.try_emit(); + self.reset(); if result.is_some() { return result; } @@ -97,36 +89,33 @@ impl<'a> Iterator for SuperKmerIter<'a> { Some(b) => b, }; - let base2 = encode_nuc(byte); - let kmer_ready = self.window.push(base2); - let current_min = self.minimizer.push(base2); + self.stat.push(byte); - if !kmer_ready { + if !self.stat.ready() { self.scratch.push(byte); continue; } - let kmer = self.window.kmer_u64(); - let min = current_min.unwrap(); - // ── 1. Entropy check ───────────────────────────────────────────── - if !self.entropy.accept(kmer) { - let prev_min = self.prev_minimizer.unwrap_or(0); - let result = self.try_emit(prev_min); + if self.stat.normalized_entropy().unwrap_or(1.0) <= self.theta { + let result = self.try_emit(); self.cursor.rewind(self.k - 1).ok(); - self.reset_state(); + self.reset(); if result.is_some() { return result; } continue; } + let min = self.stat.canonical_minimizer().map(|k| k.raw()).unwrap_or(0); + let min_pos = self.stat.minimizer_position().unwrap_or(0); + // ── 2. Minimizer change check ───────────────────────────────────── - if let Some(prev) = self.prev_minimizer { + if let Some(prev) = self.prev_min { if min != prev { - let result = self.try_emit(prev); + let result = self.try_emit(); self.cursor.rewind(self.k).ok(); - self.reset_state(); + self.reset(); if result.is_some() { return result; } @@ -136,17 +125,17 @@ impl<'a> Iterator for SuperKmerIter<'a> { // ── 3. Super-kmer length check ──────────────────────────────────── if self.scratch.len() == 256 { - let result = self.try_emit(min); + let result = self.try_emit(); self.cursor.rewind(self.k).ok(); - self.reset_state(); + self.reset(); if result.is_some() { return result; } continue; } - self.prev_minimizer = Some(min); - self.prev_minimizer_pos = self.minimizer.minimizer_pos(); + self.prev_min = Some(min); + self.prev_min_pos = min_pos; self.scratch.push(byte); } } @@ -157,6 +146,7 @@ impl<'a> Iterator for SuperKmerIter<'a> { #[cfg(test)] mod tests { use super::*; + use obikrope::Rope; fn make_rope(data: &[u8]) -> Rope { let mut r = Rope::new(); diff --git a/src/obiskbuilder/src/lib.rs b/src/obiskbuilder/src/lib.rs index 19a235a..88fa27b 100644 --- a/src/obiskbuilder/src/lib.rs +++ b/src/obiskbuilder/src/lib.rs @@ -5,11 +5,8 @@ #![deny(missing_docs)] -mod entropy; pub mod iter; -mod minimizer; mod scratch; -mod window; pub(crate) mod encoding; pub(crate) mod entropy_table; diff --git a/src/obiskbuilder/src/minimizer.rs b/src/obiskbuilder/src/minimizer.rs deleted file mode 100644 index 61f5f79..0000000 --- a/src/obiskbuilder/src/minimizer.rs +++ /dev/null @@ -1,96 +0,0 @@ -//! Sliding-window canonical minimizer via a monotone deque. -//! -//! The minimizer of a k-mer is the smallest canonical m-mer (min of forward -//! and reverse-complement) among the k−m+1 m-mers it contains. - -use std::collections::VecDeque; - -#[derive(Clone, Copy)] -struct MmerItem { - /// 0-based position of this m-mer's first base within the current segment. - position: usize, - canonical: u64, -} - -pub struct MinimizerState { - k: usize, - m: usize, - deque: VecDeque, - fwd_mmer: u64, - rvc_mmer: u64, - m_mask: u64, - rc_shift: u32, - /// Number of bases pushed since the last reset. - local_pos: usize, -} - -impl MinimizerState { - pub fn new(k: usize, m: usize) -> Self { - debug_assert!(m >= 1 && m < k && k <= 31); - Self { - k, - m, - deque: VecDeque::with_capacity(k - m + 2), - fwd_mmer: 0, - rvc_mmer: 0, - m_mask: (1u64 << (m * 2)) - 1, - rc_shift: ((m - 1) * 2) as u32, - local_pos: 0, - } - } - - /// Push a 2-bit base (0–3). - /// - /// Returns `Some(minimizer)` once we have a full k-mer (i.e., after the - /// k-th push), `None` during the warm-up phase. - pub fn push(&mut self, base: u8) -> Option { - let code = base as u64; - - // Update sliding m-mer (fwd + rvc). - self.fwd_mmer = ((self.fwd_mmer << 2) | code) & self.m_mask; - self.rvc_mmer = (self.rvc_mmer >> 2) | ((code ^ 3) << self.rc_shift); - - self.local_pos += 1; - - // Emit canonical m-mer once full. - if self.local_pos >= self.m { - let canonical = self.fwd_mmer.min(self.rvc_mmer); - let mmer_pos = self.local_pos - self.m; // start position of this m-mer - - // Maintain monotone deque: pop back while ≥ new canonical. - while self.deque.back().map_or(false, |it| it.canonical >= canonical) { - self.deque.pop_back(); - } - self.deque.push_back(MmerItem { position: mmer_pos, canonical }); - } - - // Return minimizer once we have a full k-mer. - if self.local_pos >= self.k { - let kmer_start = self.local_pos - self.k; - // Evict m-mers that have slid out of the k-mer window. - while self.deque.front().map_or(false, |it| it.position < kmer_start) { - self.deque.pop_front(); - } - Some(self.deque.front().map_or(0, |it| it.canonical)) - } else { - None - } - } - - /// Position of the current minimizer m-mer within the current superkmer. - /// - /// Valid only when the deque is non-empty (i.e. after at least one full k-mer). - /// Because `reset()` zeroes `local_pos` at the same time as the scratch buffer - /// is reset, `deque.front().position` directly equals the offset from the - /// superkmer start. - pub fn minimizer_pos(&self) -> u8 { - self.deque.front().map_or(0, |it| it.position as u8) - } - - pub fn reset(&mut self) { - self.deque.clear(); - self.fwd_mmer = 0; - self.rvc_mmer = 0; - self.local_pos = 0; - } -} diff --git a/src/obiskbuilder/src/rolling_stat.rs b/src/obiskbuilder/src/rolling_stat.rs index acc1671..232ffed 100644 --- a/src/obiskbuilder/src/rolling_stat.rs +++ b/src/obiskbuilder/src/rolling_stat.rs @@ -1,8 +1,7 @@ use obikseq::kmer::Kmer; use crate::encoding::encode_nuc; -use crate::entropy_table::{canonical_kmers, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n}; -use std::collections::VecDeque; +use crate::entropy_table::{WS_MAX, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n}; #[derive(Clone, Copy)] struct MmerItem { @@ -20,19 +19,21 @@ pub struct RollingStat { k_mask: u64, m_mask: u64, received: usize, - k1q: VecDeque, - k2q: VecDeque, - k3q: VecDeque, - k4q: VecDeque, - k5q: VecDeque, - k6q: VecDeque, - minimier: VecDeque, + k1q: std::collections::VecDeque, + k2q: std::collections::VecDeque, + k3q: std::collections::VecDeque, + k4q: std::collections::VecDeque, + k5q: std::collections::VecDeque, + k6q: std::collections::VecDeque, + minimier: std::collections::VecDeque, k1c: Vec, k2c: Vec, k3c: Vec, k4c: Vec, k5c: Vec, k6c: Vec, + sum_f_log_f: [f64; WS_MAX + 1], + sum_f_log_s: [f64; WS_MAX + 1], } impl RollingStat { @@ -46,19 +47,21 @@ impl RollingStat { k_mask: (!0) >> (64 - k * 2), m_mask: (!0) >> (64 - m * 2), received: 0, - k1q: VecDeque::with_capacity(k), - k2q: VecDeque::with_capacity(k - 1), - k3q: VecDeque::with_capacity(k - 2), - k4q: VecDeque::with_capacity(k - 3), - k5q: VecDeque::with_capacity(k - 4), - k6q: VecDeque::with_capacity(k - 5), - minimier: VecDeque::with_capacity(k-m+2), + k1q: std::collections::VecDeque::with_capacity(k), + k2q: std::collections::VecDeque::with_capacity(k - 1), + k3q: std::collections::VecDeque::with_capacity(k - 2), + k4q: std::collections::VecDeque::with_capacity(k - 3), + k5q: std::collections::VecDeque::with_capacity(k - 4), + k6q: std::collections::VecDeque::with_capacity(k - 5), + minimier: std::collections::VecDeque::with_capacity(k - m + 2), k1c: vec![0; 4_usize.pow(1)], k2c: vec![0; 4_usize.pow(2)], k3c: vec![0; 4_usize.pow(3)], k4c: vec![0; 4_usize.pow(4)], k5c: vec![0; 4_usize.pow(5)], k6c: vec![0; 4_usize.pow(6)], + sum_f_log_f: [0.0; WS_MAX + 1], + sum_f_log_s: [0.0; WS_MAX + 1], } } @@ -66,6 +69,12 @@ impl RollingStat { self.rolling_k = 0; self.rolling_rck = 0; self.received = 0; + for &i in &self.k1q { self.k1c[i as usize] = 0; } + for &i in &self.k2q { self.k2c[i as usize] = 0; } + for &i in &self.k3q { self.k3c[i as usize] = 0; } + for &i in &self.k4q { self.k4c[i as usize] = 0; } + for &i in &self.k5q { self.k5c[i as usize] = 0; } + for &i in &self.k6q { self.k6c[i as usize] = 0; } self.k1q.clear(); self.k2q.clear(); self.k3q.clear(); @@ -73,22 +82,41 @@ impl RollingStat { self.k5q.clear(); self.k6q.clear(); self.minimier.clear(); - self.k1c.fill(0); - self.k2c.fill(0); - self.k3c.fill(0); - self.k4c.fill(0); - self.k5c.fill(0); - self.k6c.fill(0); + self.sum_f_log_f = [0.0; WS_MAX + 1]; + self.sum_f_log_s = [0.0; WS_MAX + 1]; } - pub fn push(&mut self, nuc: char) { - let bnuc = encode_nuc(nuc as u8); + #[inline] + fn update_sums_decrement( + sum_f_log_f: &mut [f64; WS_MAX + 1], + sum_f_log_s: &mut [f64; WS_MAX + 1], + ws: usize, + canonical: u64, + f: usize, + ) { + sum_f_log_f[ws] += n_log_n(f - 1) - n_log_n(f); + sum_f_log_s[ws] -= ln_class_size(canonical, ws, false); + } + + #[inline] + fn update_sums_increment( + sum_f_log_f: &mut [f64; WS_MAX + 1], + sum_f_log_s: &mut [f64; WS_MAX + 1], + ws: usize, + canonical: u64, + g: usize, + ) { + sum_f_log_f[ws] += n_log_n(g + 1) - n_log_n(g); + sum_f_log_s[ws] += ln_class_size(canonical, ws, false); + } + + pub fn push(&mut self, nuc: u8) { + let bnuc = encode_nuc(nuc); let cnuc = bnuc ^ 3; self.rolling_k = ((self.rolling_k << 2) | (bnuc as u64)) & self.k_mask; self.rolling_rck = - ((self.rolling_rck >> 2) | ((cnuc as u64) << ((self.k -1) * 2))) & self.k_mask; - + ((self.rolling_rck >> 2) | ((cnuc as u64) << ((self.k - 1) * 2))) & self.k_mask; let canonical_k1 = entropy_norm_kmer(self.rolling_k & 3, 1, false); let canonical_k2 = entropy_norm_kmer(self.rolling_k & 15, 2, false); @@ -107,39 +135,84 @@ impl RollingStat { while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) { self.minimier.pop_back(); } - self.minimier.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m }); + self.minimier + .push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m }); if self.received > self.k { - while self.minimier.front().map_or(false, |it| it.position + self.k < self.received) { + while self + .minimier + .front() + .map_or(false, |it| it.position + self.k < self.received) + { self.minimier.pop_front(); } } } if self.received > self.k { - self.k1c[self.k1q.pop_front().unwrap() as usize] -= 1; - self.k2c[self.k2q.pop_front().unwrap() as usize] -= 1; - self.k3c[self.k3q.pop_front().unwrap() as usize] -= 1; - self.k4c[self.k4q.pop_front().unwrap() as usize] -= 1; - self.k5c[self.k5q.pop_front().unwrap() as usize] -= 1; - self.k6c[self.k6q.pop_front().unwrap() as usize] -= 1; + let old1 = self.k1q.pop_front().unwrap(); + let f1 = self.k1c[old1 as usize]; + Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 1, old1, f1); + self.k1c[old1 as usize] -= 1; + + let old2 = self.k2q.pop_front().unwrap(); + let f2 = self.k2c[old2 as usize]; + Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 2, old2, f2); + self.k2c[old2 as usize] -= 1; + + let old3 = self.k3q.pop_front().unwrap(); + let f3 = self.k3c[old3 as usize]; + Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 3, old3, f3); + self.k3c[old3 as usize] -= 1; + + let old4 = self.k4q.pop_front().unwrap(); + let f4 = self.k4c[old4 as usize]; + Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 4, old4, f4); + self.k4c[old4 as usize] -= 1; + + let old5 = self.k5q.pop_front().unwrap(); + let f5 = self.k5c[old5 as usize]; + Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 5, old5, f5); + self.k5c[old5 as usize] -= 1; + + let old6 = self.k6q.pop_front().unwrap(); + let f6 = self.k6c[old6 as usize]; + Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 6, old6, f6); + self.k6c[old6 as usize] -= 1; } + let g1 = self.k1c[canonical_k1 as usize]; + Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 1, canonical_k1, g1); self.k1c[canonical_k1 as usize] += 1; self.k1q.push_back(canonical_k1); + if self.received >= 2 { + let g2 = self.k2c[canonical_k2 as usize]; + Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 2, canonical_k2, g2); self.k2c[canonical_k2 as usize] += 1; self.k2q.push_back(canonical_k2); + if self.received >= 3 { + let g3 = self.k3c[canonical_k3 as usize]; + Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 3, canonical_k3, g3); self.k3c[canonical_k3 as usize] += 1; self.k3q.push_back(canonical_k3); + if self.received >= 4 { + let g4 = self.k4c[canonical_k4 as usize]; + Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 4, canonical_k4, g4); self.k4c[canonical_k4 as usize] += 1; self.k4q.push_back(canonical_k4); + if self.received >= 5 { + let g5 = self.k5c[canonical_k5 as usize]; + Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 5, canonical_k5, g5); self.k5c[canonical_k5 as usize] += 1; self.k5q.push_back(canonical_k5); + if self.received >= 6 { + let g6 = self.k6c[canonical_k6 as usize]; + Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 6, canonical_k6, g6); self.k6c[canonical_k6 as usize] += 1; self.k6q.push_back(canonical_k6); } @@ -171,8 +244,7 @@ impl RollingStat { pub fn canonical_kmer(&self) -> Option { if self.ready() { - Some(Kmer::from_raw_right( - self.rolling_k.min(self.rolling_rck), self.k)) + Some(Kmer::from_raw_right(self.rolling_k.min(self.rolling_rck), self.k)) } else { None } @@ -195,37 +267,24 @@ impl RollingStat { } pub fn entropy(&self, order: usize) -> Option { - if !self.ready() { return None; } - let k = self.k; - let em = emax(k, order); - if em <= 0.0 { return Some(1.0); } - let counts = match order { - 1 => &self.k1c, - 2 => &self.k2c, - 3 => &self.k3c, - 4 => &self.k4c, - 5 => &self.k5c, - 6 => &self.k6c, - _ => return None, - }; - let nwords = k - order + 1; - let log_nw = log_nwords(k, order); - let nw_f = nwords as f64; - let mut sum_f_log_f = 0.0f64; - let mut sum_f_log_s = 0.0f64; - for &j in canonical_kmers(order) { - let f = counts[j as usize]; - if f > 0 { - sum_f_log_f += n_log_n(f); - sum_f_log_s += f as f64 * ln_class_size(j, order, false); - } + if !self.ready() { + return None; } - let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f; + let em = emax(self.k, order); + if em <= 0.0 { + return Some(1.0); + } + let nwords = self.k - order + 1; + let log_nw = log_nwords(self.k, order); + let nw_f = nwords as f64; + let h_corr = log_nw + (self.sum_f_log_s[order] - self.sum_f_log_f[order]) / nw_f; Some((h_corr / em).max(0.0)) } pub fn normalized_entropy(&self) -> Option { - if !self.ready() { return None; } + if !self.ready() { + return None; + } let min_e = (1..=self.entropy_max_k) .filter_map(|ws| self.entropy(ws)) .fold(f64::MAX, f64::min); diff --git a/src/obiskbuilder/src/window.rs b/src/obiskbuilder/src/window.rs deleted file mode 100644 index 02e5f11..0000000 --- a/src/obiskbuilder/src/window.rs +++ /dev/null @@ -1,50 +0,0 @@ -//! Sliding k-mer window over 2-bit–encoded nucleotides. - -/// Ring buffer of the last k 2-bit–encoded bases. -/// Used to extract the current k-mer as a `u64` and to feed the entropy and -/// minimizer state machines. -pub struct KmerWindow { - buf: [u8; 32], - start: usize, - len: usize, - k: usize, -} - -impl KmerWindow { - pub fn new(k: usize) -> Self { - debug_assert!(k >= 1 && k <= 31); - Self { buf: [0u8; 32], start: 0, len: 0, k } - } - - pub fn k(&self) -> usize { self.k } - - pub fn is_full(&self) -> bool { self.len == self.k } - - /// Push a 2-bit base (0–3). Returns `true` once the window holds k bases. - #[inline] - pub fn push(&mut self, base: u8) -> bool { - if self.len < self.k { - self.buf[self.len] = base; - self.len += 1; - self.len == self.k - } else { - self.buf[self.start] = base; - self.start = (self.start + 1) % self.k; - true - } - } - - /// Current k-mer as a right-aligned `u64` (first base in MSB of the 2k-bit field). - pub fn kmer_u64(&self) -> u64 { - let mut v = 0u64; - for i in 0..self.k { - v = (v << 2) | self.buf[(self.start + i) % self.k] as u64; - } - v - } - - pub fn reset(&mut self) { - self.start = 0; - self.len = 0; - } -} diff --git a/src/profile.json.gz b/src/profile.json.gz index 20cdc1d..60b1016 100644 Binary files a/src/profile.json.gz and b/src/profile.json.gz differ