diff --git a/src/obiskbuilder/src/entropy_table.rs b/src/obiskbuilder/src/entropy_table.rs index 26bd667..b7f2bfb 100644 --- a/src/obiskbuilder/src/entropy_table.rs +++ b/src/obiskbuilder/src/entropy_table.rs @@ -21,6 +21,13 @@ pub(crate) static LN_CARD_ROT5: LazyLock<[f64; 1024]> = pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> = LazyLock::new(|| build_log_class_size::<4096>(&NORMK6)); +pub(crate) static CANON_K1: LazyLock> = LazyLock::new(|| build_canonical_list::<4>(&NORMK1)); +pub(crate) static CANON_K2: LazyLock> = LazyLock::new(|| build_canonical_list::<16>(&NORMK2)); +pub(crate) static CANON_K3: LazyLock> = LazyLock::new(|| build_canonical_list::<64>(&NORMK3)); +pub(crate) static CANON_K4: LazyLock> = LazyLock::new(|| build_canonical_list::<256>(&NORMK4)); +pub(crate) static CANON_K5: LazyLock> = LazyLock::new(|| build_canonical_list::<1024>(&NORMK5)); +pub(crate) static CANON_K6: LazyLock> = LazyLock::new(|| build_canonical_list::<4096>(&NORMK6)); + fn ln0(x: f64) -> f64 { if x == 0.0 { 0.0 } else { x.ln() } } @@ -54,6 +61,10 @@ fn build_normalized_kmer() -> [u64; N] { result } +fn build_canonical_list(norm: &[u64; N]) -> Vec { + (0..N).filter(|&i| norm[i] == i as u64).map(|i| i as u64).collect() +} + fn build_log_class_size(norm: &[u64; N]) -> [f64; N] { let mut sizes = [0u32; N]; for &c in norm { @@ -87,6 +98,18 @@ pub(crate) fn entropy_norm_kmer(kmer: u64, k: usize, left: bool) -> u64 { } // right-aligned → left-aligned } +pub(crate) fn canonical_kmers(k: usize) -> &'static [u64] { + match k { + 1 => &CANON_K1, + 2 => &CANON_K2, + 3 => &CANON_K3, + 4 => &CANON_K4, + 5 => &CANON_K5, + 6 => &CANON_K6, + _ => panic!("k must be 1..=6"), + } +} + pub(crate) fn ln_class_size(kmer: u64, k: usize, left: bool) -> f64 { let ra = if left { kmer >> (64 - k * 2) } else { kmer }; // left-aligned → right-aligned index match k { diff --git a/src/obiskbuilder/src/rolling_stat.rs b/src/obiskbuilder/src/rolling_stat.rs index e33a9ca..acc1671 100644 --- a/src/obiskbuilder/src/rolling_stat.rs +++ b/src/obiskbuilder/src/rolling_stat.rs @@ -1,7 +1,7 @@ use obikseq::kmer::Kmer; use crate::encoding::encode_nuc; -use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n}; +use crate::entropy_table::{canonical_kmers, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n}; use std::collections::VecDeque; #[derive(Clone, Copy)] @@ -14,11 +14,11 @@ struct MmerItem { pub struct RollingStat { k: usize, m: usize, + entropy_max_k: usize, rolling_k: u64, rolling_rck: u64, k_mask: u64, m_mask: u64, - shift_left: usize, received: usize, k1q: VecDeque, k2q: VecDeque, @@ -36,15 +36,15 @@ pub struct RollingStat { } impl RollingStat { - pub fn new(k: usize, m: usize) -> Self { + pub fn new(k: usize, m: usize, entropy_max_k: usize) -> Self { Self { k, m, + entropy_max_k, rolling_k: 0, rolling_rck: 0, k_mask: (!0) >> (64 - k * 2), m_mask: (!0) >> (64 - m * 2), - shift_left: (m - 1) * 2, received: 0, k1q: VecDeque::with_capacity(k), k2q: VecDeque::with_capacity(k - 1), @@ -110,7 +110,7 @@ impl RollingStat { self.minimier.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m }); if self.received > self.k { - while self.minimier.front().map_or(false, |it| it.position + self.k <= self.received) { + while self.minimier.front().map_or(false, |it| it.position + self.k < self.received) { self.minimier.pop_front(); } } @@ -213,10 +213,11 @@ impl RollingStat { let nw_f = nwords as f64; let mut sum_f_log_f = 0.0f64; let mut sum_f_log_s = 0.0f64; - for (j, &f) in counts.iter().enumerate() { + for &j in canonical_kmers(order) { + let f = counts[j as usize]; if f > 0 { sum_f_log_f += n_log_n(f); - sum_f_log_s += f as f64 * ln_class_size(j as u64, order, false); + sum_f_log_s += f as f64 * ln_class_size(j, order, false); } } let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f; @@ -225,7 +226,7 @@ impl RollingStat { pub fn normalized_entropy(&self) -> Option { if !self.ready() { return None; } - let min_e = (1..=6) + let min_e = (1..=self.entropy_max_k) .filter_map(|ws| self.entropy(ws)) .fold(f64::MAX, f64::min); Some(if min_e == f64::MAX { 1.0 } else { min_e })