[obiskbuilder] Add canonical k-mer tables and refactor entropy computation
Introduce static precomputed lists of canonical k-mers (K1– K6) via build_canonical_list and expose them through a canonical_kmers() helper. Update RollingStat to accept entropy_max_k parameter, remove obsolete shift_left field and fix minimizer window condition. Refactor normalized_entropy() to use entropy_max_k instead of hardcoded 1..=6, and optimize count-based loop in compute_entropy() to iterate only over canonical indices.
This commit is contained in:
@@ -21,6 +21,13 @@ pub(crate) static LN_CARD_ROT5: LazyLock<[f64; 1024]> =
|
|||||||
pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> =
|
pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> =
|
||||||
LazyLock::new(|| build_log_class_size::<4096>(&NORMK6));
|
LazyLock::new(|| build_log_class_size::<4096>(&NORMK6));
|
||||||
|
|
||||||
|
pub(crate) static CANON_K1: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<4>(&NORMK1));
|
||||||
|
pub(crate) static CANON_K2: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<16>(&NORMK2));
|
||||||
|
pub(crate) static CANON_K3: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<64>(&NORMK3));
|
||||||
|
pub(crate) static CANON_K4: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<256>(&NORMK4));
|
||||||
|
pub(crate) static CANON_K5: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<1024>(&NORMK5));
|
||||||
|
pub(crate) static CANON_K6: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<4096>(&NORMK6));
|
||||||
|
|
||||||
fn ln0(x: f64) -> f64 {
|
fn ln0(x: f64) -> f64 {
|
||||||
if x == 0.0 { 0.0 } else { x.ln() }
|
if x == 0.0 { 0.0 } else { x.ln() }
|
||||||
}
|
}
|
||||||
@@ -54,6 +61,10 @@ fn build_normalized_kmer<const N: usize>() -> [u64; N] {
|
|||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn build_canonical_list<const N: usize>(norm: &[u64; N]) -> Vec<u64> {
|
||||||
|
(0..N).filter(|&i| norm[i] == i as u64).map(|i| i as u64).collect()
|
||||||
|
}
|
||||||
|
|
||||||
fn build_log_class_size<const N: usize>(norm: &[u64; N]) -> [f64; N] {
|
fn build_log_class_size<const N: usize>(norm: &[u64; N]) -> [f64; N] {
|
||||||
let mut sizes = [0u32; N];
|
let mut sizes = [0u32; N];
|
||||||
for &c in norm {
|
for &c in norm {
|
||||||
@@ -87,6 +98,18 @@ pub(crate) fn entropy_norm_kmer(kmer: u64, k: usize, left: bool) -> u64 {
|
|||||||
} // right-aligned → left-aligned
|
} // right-aligned → left-aligned
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn canonical_kmers(k: usize) -> &'static [u64] {
|
||||||
|
match k {
|
||||||
|
1 => &CANON_K1,
|
||||||
|
2 => &CANON_K2,
|
||||||
|
3 => &CANON_K3,
|
||||||
|
4 => &CANON_K4,
|
||||||
|
5 => &CANON_K5,
|
||||||
|
6 => &CANON_K6,
|
||||||
|
_ => panic!("k must be 1..=6"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn ln_class_size(kmer: u64, k: usize, left: bool) -> f64 {
|
pub(crate) fn ln_class_size(kmer: u64, k: usize, left: bool) -> f64 {
|
||||||
let ra = if left { kmer >> (64 - k * 2) } else { kmer }; // left-aligned → right-aligned index
|
let ra = if left { kmer >> (64 - k * 2) } else { kmer }; // left-aligned → right-aligned index
|
||||||
match k {
|
match k {
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use obikseq::kmer::Kmer;
|
use obikseq::kmer::Kmer;
|
||||||
|
|
||||||
use crate::encoding::encode_nuc;
|
use crate::encoding::encode_nuc;
|
||||||
use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
use crate::entropy_table::{canonical_kmers, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
@@ -14,11 +14,11 @@ struct MmerItem {
|
|||||||
pub struct RollingStat {
|
pub struct RollingStat {
|
||||||
k: usize,
|
k: usize,
|
||||||
m: usize,
|
m: usize,
|
||||||
|
entropy_max_k: usize,
|
||||||
rolling_k: u64,
|
rolling_k: u64,
|
||||||
rolling_rck: u64,
|
rolling_rck: u64,
|
||||||
k_mask: u64,
|
k_mask: u64,
|
||||||
m_mask: u64,
|
m_mask: u64,
|
||||||
shift_left: usize,
|
|
||||||
received: usize,
|
received: usize,
|
||||||
k1q: VecDeque<u64>,
|
k1q: VecDeque<u64>,
|
||||||
k2q: VecDeque<u64>,
|
k2q: VecDeque<u64>,
|
||||||
@@ -36,15 +36,15 @@ pub struct RollingStat {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl RollingStat {
|
impl RollingStat {
|
||||||
pub fn new(k: usize, m: usize) -> Self {
|
pub fn new(k: usize, m: usize, entropy_max_k: usize) -> Self {
|
||||||
Self {
|
Self {
|
||||||
k,
|
k,
|
||||||
m,
|
m,
|
||||||
|
entropy_max_k,
|
||||||
rolling_k: 0,
|
rolling_k: 0,
|
||||||
rolling_rck: 0,
|
rolling_rck: 0,
|
||||||
k_mask: (!0) >> (64 - k * 2),
|
k_mask: (!0) >> (64 - k * 2),
|
||||||
m_mask: (!0) >> (64 - m * 2),
|
m_mask: (!0) >> (64 - m * 2),
|
||||||
shift_left: (m - 1) * 2,
|
|
||||||
received: 0,
|
received: 0,
|
||||||
k1q: VecDeque::with_capacity(k),
|
k1q: VecDeque::with_capacity(k),
|
||||||
k2q: VecDeque::with_capacity(k - 1),
|
k2q: VecDeque::with_capacity(k - 1),
|
||||||
@@ -110,7 +110,7 @@ impl RollingStat {
|
|||||||
self.minimier.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m });
|
self.minimier.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m });
|
||||||
|
|
||||||
if self.received > self.k {
|
if self.received > self.k {
|
||||||
while self.minimier.front().map_or(false, |it| it.position + self.k <= self.received) {
|
while self.minimier.front().map_or(false, |it| it.position + self.k < self.received) {
|
||||||
self.minimier.pop_front();
|
self.minimier.pop_front();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -213,10 +213,11 @@ impl RollingStat {
|
|||||||
let nw_f = nwords as f64;
|
let nw_f = nwords as f64;
|
||||||
let mut sum_f_log_f = 0.0f64;
|
let mut sum_f_log_f = 0.0f64;
|
||||||
let mut sum_f_log_s = 0.0f64;
|
let mut sum_f_log_s = 0.0f64;
|
||||||
for (j, &f) in counts.iter().enumerate() {
|
for &j in canonical_kmers(order) {
|
||||||
|
let f = counts[j as usize];
|
||||||
if f > 0 {
|
if f > 0 {
|
||||||
sum_f_log_f += n_log_n(f);
|
sum_f_log_f += n_log_n(f);
|
||||||
sum_f_log_s += f as f64 * ln_class_size(j as u64, order, false);
|
sum_f_log_s += f as f64 * ln_class_size(j, order, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
||||||
@@ -225,7 +226,7 @@ impl RollingStat {
|
|||||||
|
|
||||||
pub fn normalized_entropy(&self) -> Option<f64> {
|
pub fn normalized_entropy(&self) -> Option<f64> {
|
||||||
if !self.ready() { return None; }
|
if !self.ready() { return None; }
|
||||||
let min_e = (1..=6)
|
let min_e = (1..=self.entropy_max_k)
|
||||||
.filter_map(|ws| self.entropy(ws))
|
.filter_map(|ws| self.entropy(ws))
|
||||||
.fold(f64::MAX, f64::min);
|
.fold(f64::MAX, f64::min);
|
||||||
Some(if min_e == f64::MAX { 1.0 } else { min_e })
|
Some(if min_e == f64::MAX { 1.0 } else { min_e })
|
||||||
|
|||||||
Reference in New Issue
Block a user