🗑️ Refactor entropy and minimizer logic into RollingStat
- Remove `entropy.rs`, `minimizer.rs` and `window.rs`; consolidate logic into new module - Introduce unified state management in RollingStat with incremental entropy tracking and canonical minimizer computation via monotone deque - Update SuperKmerIter to use RollingStat instead of separate components, simplifying iteration and state transitions - Add `*.fasta` to .gitignore for generated FASTA outputs
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
.venv/
|
||||
src/target
|
||||
data-stress
|
||||
*.fasta
|
||||
|
||||
@@ -1,99 +0,0 @@
|
||||
//! Entropy filter for low-complexity k-mer detection.
|
||||
//!
|
||||
//! Formula (corrected from the Go version):
|
||||
//! H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words
|
||||
//! H_max uses 4^ws raw categories (not the canonical class count)
|
||||
//! Ĥ(ws) = H_corr / H_max ∈ [0,1]
|
||||
//! score = min over ws=1..level_max of Ĥ(ws)
|
||||
//!
|
||||
//! A kmer is rejected if score ≤ theta.
|
||||
|
||||
use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
||||
|
||||
/// Pre-computed entropy filter. One instance per worker thread (not Send/Sync
|
||||
/// because of the mutable freq scratch buffer; wrap in a closure per thread).
|
||||
pub struct EntropyFilter {
|
||||
k: usize,
|
||||
level_max: usize,
|
||||
threshold: f64,
|
||||
/// Reusable frequency buffer per word size (reset before each kmer).
|
||||
freq_buf: Vec<Vec<u32>>,
|
||||
}
|
||||
|
||||
impl EntropyFilter {
|
||||
pub fn new(k: usize, level_max: usize, threshold: f64) -> Self {
|
||||
let level_max = level_max.min(k - 1).max(1);
|
||||
let mut freq_buf: Vec<Vec<u32>> = vec![vec![]; level_max + 1];
|
||||
for ws in 1..=level_max {
|
||||
freq_buf[ws] = vec![0u32; 1 << (ws * 2)];
|
||||
}
|
||||
Self { k, level_max, threshold, freq_buf }
|
||||
}
|
||||
|
||||
/// Returns `true` if the kmer's entropy is strictly above the threshold.
|
||||
pub fn accept(&mut self, kmer: u64) -> bool {
|
||||
self.entropy(kmer) > self.threshold
|
||||
}
|
||||
|
||||
/// Compute the minimum normalised entropy across all word sizes.
|
||||
pub fn entropy(&mut self, kmer: u64) -> f64 {
|
||||
let k = self.k;
|
||||
let mut min_entropy = f64::MAX;
|
||||
|
||||
for ws in 1..=self.level_max {
|
||||
let nwords = k - ws + 1;
|
||||
let em = emax(k, ws);
|
||||
if em <= 0.0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mask = (1usize << (ws * 2)) - 1;
|
||||
let freq = &mut self.freq_buf[ws];
|
||||
|
||||
// Slide a ws-mer window; track only written indices (≤ nwords ≤ 31).
|
||||
let mut dirty = [0u16; 32];
|
||||
let mut ndirty = 0usize;
|
||||
let mut word = 0usize;
|
||||
for i in 0..ws - 1 {
|
||||
word = (word << 2) | ((kmer >> (2 * (k - 1 - i))) & 3) as usize;
|
||||
}
|
||||
for i in 0..nwords {
|
||||
let base = ((kmer >> (2 * (k - ws - i))) & 3) as usize;
|
||||
word = ((word << 2) | base) & mask;
|
||||
let idx = entropy_norm_kmer(word as u64, ws, false) as usize;
|
||||
if freq[idx] == 0 {
|
||||
dirty[ndirty] = idx as u16;
|
||||
ndirty += 1;
|
||||
}
|
||||
freq[idx] += 1;
|
||||
}
|
||||
|
||||
// H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words
|
||||
// Reset freq in the same pass to avoid a separate zeroing loop.
|
||||
let log_nw = log_nwords(k, ws);
|
||||
let nw_f = nwords as f64;
|
||||
let mut sum_f_log_f = 0.0f64;
|
||||
let mut sum_f_log_s = 0.0f64;
|
||||
for &j in &dirty[..ndirty] {
|
||||
let j = j as usize;
|
||||
let f = freq[j] as usize;
|
||||
sum_f_log_f += n_log_n(f);
|
||||
sum_f_log_s += f as f64 * ln_class_size(j as u64, ws, false);
|
||||
freq[j] = 0;
|
||||
}
|
||||
|
||||
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
||||
let entropy = (h_corr / em).max(0.0);
|
||||
if entropy < min_entropy {
|
||||
min_entropy = entropy;
|
||||
}
|
||||
|
||||
// Early exit: k-mer is already rejected, no need to check further ws.
|
||||
if min_entropy <= self.threshold {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if min_entropy == f64::MAX { 1.0 } else { min_entropy }
|
||||
}
|
||||
}
|
||||
@@ -21,12 +21,6 @@ pub(crate) static LN_CARD_ROT5: LazyLock<[f64; 1024]> =
|
||||
pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> =
|
||||
LazyLock::new(|| build_log_class_size::<4096>(&NORMK6));
|
||||
|
||||
pub(crate) static CANON_K1: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<4>(&NORMK1));
|
||||
pub(crate) static CANON_K2: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<16>(&NORMK2));
|
||||
pub(crate) static CANON_K3: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<64>(&NORMK3));
|
||||
pub(crate) static CANON_K4: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<256>(&NORMK4));
|
||||
pub(crate) static CANON_K5: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<1024>(&NORMK5));
|
||||
pub(crate) static CANON_K6: LazyLock<Vec<u64>> = LazyLock::new(|| build_canonical_list::<4096>(&NORMK6));
|
||||
|
||||
fn ln0(x: f64) -> f64 {
|
||||
if x == 0.0 { 0.0 } else { x.ln() }
|
||||
@@ -61,10 +55,6 @@ fn build_normalized_kmer<const N: usize>() -> [u64; N] {
|
||||
result
|
||||
}
|
||||
|
||||
fn build_canonical_list<const N: usize>(norm: &[u64; N]) -> Vec<u64> {
|
||||
(0..N).filter(|&i| norm[i] == i as u64).map(|i| i as u64).collect()
|
||||
}
|
||||
|
||||
fn build_log_class_size<const N: usize>(norm: &[u64; N]) -> [f64; N] {
|
||||
let mut sizes = [0u32; N];
|
||||
for &c in norm {
|
||||
@@ -98,18 +88,6 @@ pub(crate) fn entropy_norm_kmer(kmer: u64, k: usize, left: bool) -> u64 {
|
||||
} // right-aligned → left-aligned
|
||||
}
|
||||
|
||||
pub(crate) fn canonical_kmers(k: usize) -> &'static [u64] {
|
||||
match k {
|
||||
1 => &CANON_K1,
|
||||
2 => &CANON_K2,
|
||||
3 => &CANON_K3,
|
||||
4 => &CANON_K4,
|
||||
5 => &CANON_K5,
|
||||
6 => &CANON_K6,
|
||||
_ => panic!("k must be 1..=6"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn ln_class_size(kmer: u64, k: usize, left: bool) -> f64 {
|
||||
let ra = if left { kmer >> (64 - k * 2) } else { kmer }; // left-aligned → right-aligned index
|
||||
match k {
|
||||
|
||||
@@ -5,9 +5,9 @@
|
||||
//!
|
||||
//! # Cut conditions (checked in order per nucleotide, once the k-mer window is full)
|
||||
//!
|
||||
//! In all three cases the triggering nucleotide S[j] is **not** added to the
|
||||
//! current super-kmer; the super-kmer built so far is emitted; then the rope
|
||||
//! cursor is retreated so the next segment can overlap correctly:
|
||||
//! In all three cases the triggering nucleotide is **not** added to the current
|
||||
//! super-kmer; the super-kmer built so far is emitted; then the rope cursor is
|
||||
//! retreated so the next segment can overlap correctly:
|
||||
//!
|
||||
//! | Condition | cursor retreat |
|
||||
//! |------------------------|----------------|
|
||||
@@ -18,22 +18,18 @@
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
|
||||
use crate::encoding::encode_nuc;
|
||||
use crate::entropy::EntropyFilter;
|
||||
use crate::minimizer::MinimizerState;
|
||||
use crate::rolling_stat::RollingStat;
|
||||
use crate::scratch::SuperKmerScratch;
|
||||
use crate::window::KmerWindow;
|
||||
|
||||
/// Iterator over `(minimizer, SuperKmer)` pairs.
|
||||
/// Iterator over `(minimizer_hash, SuperKmer)` pairs.
|
||||
pub struct SuperKmerIter<'a> {
|
||||
cursor: ForwardCursor<'a>,
|
||||
k: usize,
|
||||
theta: f64,
|
||||
scratch: SuperKmerScratch,
|
||||
window: KmerWindow,
|
||||
entropy: EntropyFilter,
|
||||
minimizer: MinimizerState,
|
||||
prev_minimizer: Option<u64>,
|
||||
prev_minimizer_pos: u8,
|
||||
stat: RollingStat,
|
||||
prev_min: Option<u64>,
|
||||
prev_min_pos: usize,
|
||||
}
|
||||
|
||||
impl<'a> SuperKmerIter<'a> {
|
||||
@@ -41,54 +37,50 @@ impl<'a> SuperKmerIter<'a> {
|
||||
///
|
||||
/// - `k`: k-mer size (1–31)
|
||||
/// - `m`: minimizer size (1 < m < k)
|
||||
/// - `level_max`: maximum sub-word size for entropy (typically 6)
|
||||
/// - `level_max`: maximum sub-word size for entropy (1–6)
|
||||
/// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected
|
||||
pub fn new(rope: &'a Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Self {
|
||||
Self {
|
||||
cursor: rope.fw_cursor(),
|
||||
k,
|
||||
theta,
|
||||
scratch: SuperKmerScratch::new(),
|
||||
window: KmerWindow::new(k),
|
||||
entropy: EntropyFilter::new(k, level_max, theta),
|
||||
minimizer: MinimizerState::new(k, m),
|
||||
prev_minimizer: None,
|
||||
prev_minimizer_pos: 0,
|
||||
stat: RollingStat::new(k, m, level_max),
|
||||
prev_min: None,
|
||||
prev_min_pos: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn reset_state(&mut self) {
|
||||
self.window.reset();
|
||||
self.minimizer.reset();
|
||||
fn reset(&mut self) {
|
||||
self.stat.reset();
|
||||
self.scratch.reset();
|
||||
self.prev_minimizer = None;
|
||||
self.prev_minimizer_pos = 0;
|
||||
self.prev_min = None;
|
||||
self.prev_min_pos = 0;
|
||||
}
|
||||
|
||||
fn try_emit(&mut self, min: u64) -> Option<(u64, SuperKmer)> {
|
||||
fn try_emit(&mut self) -> Option<(u64, SuperKmer)> {
|
||||
if self.scratch.len() < self.k {
|
||||
self.scratch.reset();
|
||||
return None;
|
||||
}
|
||||
let min = self.prev_min?;
|
||||
let mut sk = self.scratch.emit();
|
||||
sk.set_minimizer_pos(self.prev_minimizer_pos);
|
||||
sk.set_minimizer_pos(self.prev_min_pos as u8);
|
||||
Some((min, sk))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SuperKmerIter<'a> {
|
||||
impl Iterator for SuperKmerIter<'_> {
|
||||
type Item = (u64, SuperKmer);
|
||||
|
||||
fn next(&mut self) -> Option<(u64, SuperKmer)> {
|
||||
loop {
|
||||
let byte = match self.cursor.read_next().ok() {
|
||||
None => {
|
||||
let min = self.prev_minimizer.unwrap_or(0);
|
||||
return self.try_emit(min);
|
||||
return self.try_emit();
|
||||
}
|
||||
Some(0x00) => {
|
||||
let min = self.prev_minimizer.unwrap_or(0);
|
||||
let result = self.try_emit(min);
|
||||
self.reset_state();
|
||||
let result = self.try_emit();
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
@@ -97,36 +89,33 @@ impl<'a> Iterator for SuperKmerIter<'a> {
|
||||
Some(b) => b,
|
||||
};
|
||||
|
||||
let base2 = encode_nuc(byte);
|
||||
let kmer_ready = self.window.push(base2);
|
||||
let current_min = self.minimizer.push(base2);
|
||||
self.stat.push(byte);
|
||||
|
||||
if !kmer_ready {
|
||||
if !self.stat.ready() {
|
||||
self.scratch.push(byte);
|
||||
continue;
|
||||
}
|
||||
|
||||
let kmer = self.window.kmer_u64();
|
||||
let min = current_min.unwrap();
|
||||
|
||||
// ── 1. Entropy check ─────────────────────────────────────────────
|
||||
if !self.entropy.accept(kmer) {
|
||||
let prev_min = self.prev_minimizer.unwrap_or(0);
|
||||
let result = self.try_emit(prev_min);
|
||||
if self.stat.normalized_entropy().unwrap_or(1.0) <= self.theta {
|
||||
let result = self.try_emit();
|
||||
self.cursor.rewind(self.k - 1).ok();
|
||||
self.reset_state();
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let min = self.stat.canonical_minimizer().map(|k| k.raw()).unwrap_or(0);
|
||||
let min_pos = self.stat.minimizer_position().unwrap_or(0);
|
||||
|
||||
// ── 2. Minimizer change check ─────────────────────────────────────
|
||||
if let Some(prev) = self.prev_minimizer {
|
||||
if let Some(prev) = self.prev_min {
|
||||
if min != prev {
|
||||
let result = self.try_emit(prev);
|
||||
let result = self.try_emit();
|
||||
self.cursor.rewind(self.k).ok();
|
||||
self.reset_state();
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
@@ -136,17 +125,17 @@ impl<'a> Iterator for SuperKmerIter<'a> {
|
||||
|
||||
// ── 3. Super-kmer length check ────────────────────────────────────
|
||||
if self.scratch.len() == 256 {
|
||||
let result = self.try_emit(min);
|
||||
let result = self.try_emit();
|
||||
self.cursor.rewind(self.k).ok();
|
||||
self.reset_state();
|
||||
self.reset();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
self.prev_minimizer = Some(min);
|
||||
self.prev_minimizer_pos = self.minimizer.minimizer_pos();
|
||||
self.prev_min = Some(min);
|
||||
self.prev_min_pos = min_pos;
|
||||
self.scratch.push(byte);
|
||||
}
|
||||
}
|
||||
@@ -157,6 +146,7 @@ impl<'a> Iterator for SuperKmerIter<'a> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use obikrope::Rope;
|
||||
|
||||
fn make_rope(data: &[u8]) -> Rope {
|
||||
let mut r = Rope::new();
|
||||
|
||||
@@ -5,11 +5,8 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod entropy;
|
||||
pub mod iter;
|
||||
mod minimizer;
|
||||
mod scratch;
|
||||
mod window;
|
||||
|
||||
pub(crate) mod encoding;
|
||||
pub(crate) mod entropy_table;
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
//! Sliding-window canonical minimizer via a monotone deque.
|
||||
//!
|
||||
//! The minimizer of a k-mer is the smallest canonical m-mer (min of forward
|
||||
//! and reverse-complement) among the k−m+1 m-mers it contains.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct MmerItem {
|
||||
/// 0-based position of this m-mer's first base within the current segment.
|
||||
position: usize,
|
||||
canonical: u64,
|
||||
}
|
||||
|
||||
pub struct MinimizerState {
|
||||
k: usize,
|
||||
m: usize,
|
||||
deque: VecDeque<MmerItem>,
|
||||
fwd_mmer: u64,
|
||||
rvc_mmer: u64,
|
||||
m_mask: u64,
|
||||
rc_shift: u32,
|
||||
/// Number of bases pushed since the last reset.
|
||||
local_pos: usize,
|
||||
}
|
||||
|
||||
impl MinimizerState {
|
||||
pub fn new(k: usize, m: usize) -> Self {
|
||||
debug_assert!(m >= 1 && m < k && k <= 31);
|
||||
Self {
|
||||
k,
|
||||
m,
|
||||
deque: VecDeque::with_capacity(k - m + 2),
|
||||
fwd_mmer: 0,
|
||||
rvc_mmer: 0,
|
||||
m_mask: (1u64 << (m * 2)) - 1,
|
||||
rc_shift: ((m - 1) * 2) as u32,
|
||||
local_pos: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Push a 2-bit base (0–3).
|
||||
///
|
||||
/// Returns `Some(minimizer)` once we have a full k-mer (i.e., after the
|
||||
/// k-th push), `None` during the warm-up phase.
|
||||
pub fn push(&mut self, base: u8) -> Option<u64> {
|
||||
let code = base as u64;
|
||||
|
||||
// Update sliding m-mer (fwd + rvc).
|
||||
self.fwd_mmer = ((self.fwd_mmer << 2) | code) & self.m_mask;
|
||||
self.rvc_mmer = (self.rvc_mmer >> 2) | ((code ^ 3) << self.rc_shift);
|
||||
|
||||
self.local_pos += 1;
|
||||
|
||||
// Emit canonical m-mer once full.
|
||||
if self.local_pos >= self.m {
|
||||
let canonical = self.fwd_mmer.min(self.rvc_mmer);
|
||||
let mmer_pos = self.local_pos - self.m; // start position of this m-mer
|
||||
|
||||
// Maintain monotone deque: pop back while ≥ new canonical.
|
||||
while self.deque.back().map_or(false, |it| it.canonical >= canonical) {
|
||||
self.deque.pop_back();
|
||||
}
|
||||
self.deque.push_back(MmerItem { position: mmer_pos, canonical });
|
||||
}
|
||||
|
||||
// Return minimizer once we have a full k-mer.
|
||||
if self.local_pos >= self.k {
|
||||
let kmer_start = self.local_pos - self.k;
|
||||
// Evict m-mers that have slid out of the k-mer window.
|
||||
while self.deque.front().map_or(false, |it| it.position < kmer_start) {
|
||||
self.deque.pop_front();
|
||||
}
|
||||
Some(self.deque.front().map_or(0, |it| it.canonical))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Position of the current minimizer m-mer within the current superkmer.
|
||||
///
|
||||
/// Valid only when the deque is non-empty (i.e. after at least one full k-mer).
|
||||
/// Because `reset()` zeroes `local_pos` at the same time as the scratch buffer
|
||||
/// is reset, `deque.front().position` directly equals the offset from the
|
||||
/// superkmer start.
|
||||
pub fn minimizer_pos(&self) -> u8 {
|
||||
self.deque.front().map_or(0, |it| it.position as u8)
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) {
|
||||
self.deque.clear();
|
||||
self.fwd_mmer = 0;
|
||||
self.rvc_mmer = 0;
|
||||
self.local_pos = 0;
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,7 @@
|
||||
use obikseq::kmer::Kmer;
|
||||
|
||||
use crate::encoding::encode_nuc;
|
||||
use crate::entropy_table::{canonical_kmers, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
||||
use std::collections::VecDeque;
|
||||
use crate::entropy_table::{WS_MAX, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct MmerItem {
|
||||
@@ -20,19 +19,21 @@ pub struct RollingStat {
|
||||
k_mask: u64,
|
||||
m_mask: u64,
|
||||
received: usize,
|
||||
k1q: VecDeque<u64>,
|
||||
k2q: VecDeque<u64>,
|
||||
k3q: VecDeque<u64>,
|
||||
k4q: VecDeque<u64>,
|
||||
k5q: VecDeque<u64>,
|
||||
k6q: VecDeque<u64>,
|
||||
minimier: VecDeque<MmerItem>,
|
||||
k1q: std::collections::VecDeque<u64>,
|
||||
k2q: std::collections::VecDeque<u64>,
|
||||
k3q: std::collections::VecDeque<u64>,
|
||||
k4q: std::collections::VecDeque<u64>,
|
||||
k5q: std::collections::VecDeque<u64>,
|
||||
k6q: std::collections::VecDeque<u64>,
|
||||
minimier: std::collections::VecDeque<MmerItem>,
|
||||
k1c: Vec<usize>,
|
||||
k2c: Vec<usize>,
|
||||
k3c: Vec<usize>,
|
||||
k4c: Vec<usize>,
|
||||
k5c: Vec<usize>,
|
||||
k6c: Vec<usize>,
|
||||
sum_f_log_f: [f64; WS_MAX + 1],
|
||||
sum_f_log_s: [f64; WS_MAX + 1],
|
||||
}
|
||||
|
||||
impl RollingStat {
|
||||
@@ -46,19 +47,21 @@ impl RollingStat {
|
||||
k_mask: (!0) >> (64 - k * 2),
|
||||
m_mask: (!0) >> (64 - m * 2),
|
||||
received: 0,
|
||||
k1q: VecDeque::with_capacity(k),
|
||||
k2q: VecDeque::with_capacity(k - 1),
|
||||
k3q: VecDeque::with_capacity(k - 2),
|
||||
k4q: VecDeque::with_capacity(k - 3),
|
||||
k5q: VecDeque::with_capacity(k - 4),
|
||||
k6q: VecDeque::with_capacity(k - 5),
|
||||
minimier: VecDeque::with_capacity(k-m+2),
|
||||
k1q: std::collections::VecDeque::with_capacity(k),
|
||||
k2q: std::collections::VecDeque::with_capacity(k - 1),
|
||||
k3q: std::collections::VecDeque::with_capacity(k - 2),
|
||||
k4q: std::collections::VecDeque::with_capacity(k - 3),
|
||||
k5q: std::collections::VecDeque::with_capacity(k - 4),
|
||||
k6q: std::collections::VecDeque::with_capacity(k - 5),
|
||||
minimier: std::collections::VecDeque::with_capacity(k - m + 2),
|
||||
k1c: vec![0; 4_usize.pow(1)],
|
||||
k2c: vec![0; 4_usize.pow(2)],
|
||||
k3c: vec![0; 4_usize.pow(3)],
|
||||
k4c: vec![0; 4_usize.pow(4)],
|
||||
k5c: vec![0; 4_usize.pow(5)],
|
||||
k6c: vec![0; 4_usize.pow(6)],
|
||||
sum_f_log_f: [0.0; WS_MAX + 1],
|
||||
sum_f_log_s: [0.0; WS_MAX + 1],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,6 +69,12 @@ impl RollingStat {
|
||||
self.rolling_k = 0;
|
||||
self.rolling_rck = 0;
|
||||
self.received = 0;
|
||||
for &i in &self.k1q { self.k1c[i as usize] = 0; }
|
||||
for &i in &self.k2q { self.k2c[i as usize] = 0; }
|
||||
for &i in &self.k3q { self.k3c[i as usize] = 0; }
|
||||
for &i in &self.k4q { self.k4c[i as usize] = 0; }
|
||||
for &i in &self.k5q { self.k5c[i as usize] = 0; }
|
||||
for &i in &self.k6q { self.k6c[i as usize] = 0; }
|
||||
self.k1q.clear();
|
||||
self.k2q.clear();
|
||||
self.k3q.clear();
|
||||
@@ -73,22 +82,41 @@ impl RollingStat {
|
||||
self.k5q.clear();
|
||||
self.k6q.clear();
|
||||
self.minimier.clear();
|
||||
self.k1c.fill(0);
|
||||
self.k2c.fill(0);
|
||||
self.k3c.fill(0);
|
||||
self.k4c.fill(0);
|
||||
self.k5c.fill(0);
|
||||
self.k6c.fill(0);
|
||||
self.sum_f_log_f = [0.0; WS_MAX + 1];
|
||||
self.sum_f_log_s = [0.0; WS_MAX + 1];
|
||||
}
|
||||
|
||||
pub fn push(&mut self, nuc: char) {
|
||||
let bnuc = encode_nuc(nuc as u8);
|
||||
#[inline]
|
||||
fn update_sums_decrement(
|
||||
sum_f_log_f: &mut [f64; WS_MAX + 1],
|
||||
sum_f_log_s: &mut [f64; WS_MAX + 1],
|
||||
ws: usize,
|
||||
canonical: u64,
|
||||
f: usize,
|
||||
) {
|
||||
sum_f_log_f[ws] += n_log_n(f - 1) - n_log_n(f);
|
||||
sum_f_log_s[ws] -= ln_class_size(canonical, ws, false);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn update_sums_increment(
|
||||
sum_f_log_f: &mut [f64; WS_MAX + 1],
|
||||
sum_f_log_s: &mut [f64; WS_MAX + 1],
|
||||
ws: usize,
|
||||
canonical: u64,
|
||||
g: usize,
|
||||
) {
|
||||
sum_f_log_f[ws] += n_log_n(g + 1) - n_log_n(g);
|
||||
sum_f_log_s[ws] += ln_class_size(canonical, ws, false);
|
||||
}
|
||||
|
||||
pub fn push(&mut self, nuc: u8) {
|
||||
let bnuc = encode_nuc(nuc);
|
||||
let cnuc = bnuc ^ 3;
|
||||
|
||||
self.rolling_k = ((self.rolling_k << 2) | (bnuc as u64)) & self.k_mask;
|
||||
self.rolling_rck =
|
||||
((self.rolling_rck >> 2) | ((cnuc as u64) << ((self.k -1) * 2))) & self.k_mask;
|
||||
|
||||
((self.rolling_rck >> 2) | ((cnuc as u64) << ((self.k - 1) * 2))) & self.k_mask;
|
||||
|
||||
let canonical_k1 = entropy_norm_kmer(self.rolling_k & 3, 1, false);
|
||||
let canonical_k2 = entropy_norm_kmer(self.rolling_k & 15, 2, false);
|
||||
@@ -107,39 +135,84 @@ impl RollingStat {
|
||||
while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) {
|
||||
self.minimier.pop_back();
|
||||
}
|
||||
self.minimier.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m });
|
||||
self.minimier
|
||||
.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m });
|
||||
|
||||
if self.received > self.k {
|
||||
while self.minimier.front().map_or(false, |it| it.position + self.k < self.received) {
|
||||
while self
|
||||
.minimier
|
||||
.front()
|
||||
.map_or(false, |it| it.position + self.k < self.received)
|
||||
{
|
||||
self.minimier.pop_front();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if self.received > self.k {
|
||||
self.k1c[self.k1q.pop_front().unwrap() as usize] -= 1;
|
||||
self.k2c[self.k2q.pop_front().unwrap() as usize] -= 1;
|
||||
self.k3c[self.k3q.pop_front().unwrap() as usize] -= 1;
|
||||
self.k4c[self.k4q.pop_front().unwrap() as usize] -= 1;
|
||||
self.k5c[self.k5q.pop_front().unwrap() as usize] -= 1;
|
||||
self.k6c[self.k6q.pop_front().unwrap() as usize] -= 1;
|
||||
let old1 = self.k1q.pop_front().unwrap();
|
||||
let f1 = self.k1c[old1 as usize];
|
||||
Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 1, old1, f1);
|
||||
self.k1c[old1 as usize] -= 1;
|
||||
|
||||
let old2 = self.k2q.pop_front().unwrap();
|
||||
let f2 = self.k2c[old2 as usize];
|
||||
Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 2, old2, f2);
|
||||
self.k2c[old2 as usize] -= 1;
|
||||
|
||||
let old3 = self.k3q.pop_front().unwrap();
|
||||
let f3 = self.k3c[old3 as usize];
|
||||
Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 3, old3, f3);
|
||||
self.k3c[old3 as usize] -= 1;
|
||||
|
||||
let old4 = self.k4q.pop_front().unwrap();
|
||||
let f4 = self.k4c[old4 as usize];
|
||||
Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 4, old4, f4);
|
||||
self.k4c[old4 as usize] -= 1;
|
||||
|
||||
let old5 = self.k5q.pop_front().unwrap();
|
||||
let f5 = self.k5c[old5 as usize];
|
||||
Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 5, old5, f5);
|
||||
self.k5c[old5 as usize] -= 1;
|
||||
|
||||
let old6 = self.k6q.pop_front().unwrap();
|
||||
let f6 = self.k6c[old6 as usize];
|
||||
Self::update_sums_decrement(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 6, old6, f6);
|
||||
self.k6c[old6 as usize] -= 1;
|
||||
}
|
||||
|
||||
let g1 = self.k1c[canonical_k1 as usize];
|
||||
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 1, canonical_k1, g1);
|
||||
self.k1c[canonical_k1 as usize] += 1;
|
||||
self.k1q.push_back(canonical_k1);
|
||||
|
||||
if self.received >= 2 {
|
||||
let g2 = self.k2c[canonical_k2 as usize];
|
||||
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 2, canonical_k2, g2);
|
||||
self.k2c[canonical_k2 as usize] += 1;
|
||||
self.k2q.push_back(canonical_k2);
|
||||
|
||||
if self.received >= 3 {
|
||||
let g3 = self.k3c[canonical_k3 as usize];
|
||||
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 3, canonical_k3, g3);
|
||||
self.k3c[canonical_k3 as usize] += 1;
|
||||
self.k3q.push_back(canonical_k3);
|
||||
|
||||
if self.received >= 4 {
|
||||
let g4 = self.k4c[canonical_k4 as usize];
|
||||
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 4, canonical_k4, g4);
|
||||
self.k4c[canonical_k4 as usize] += 1;
|
||||
self.k4q.push_back(canonical_k4);
|
||||
|
||||
if self.received >= 5 {
|
||||
let g5 = self.k5c[canonical_k5 as usize];
|
||||
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 5, canonical_k5, g5);
|
||||
self.k5c[canonical_k5 as usize] += 1;
|
||||
self.k5q.push_back(canonical_k5);
|
||||
|
||||
if self.received >= 6 {
|
||||
let g6 = self.k6c[canonical_k6 as usize];
|
||||
Self::update_sums_increment(&mut self.sum_f_log_f, &mut self.sum_f_log_s, 6, canonical_k6, g6);
|
||||
self.k6c[canonical_k6 as usize] += 1;
|
||||
self.k6q.push_back(canonical_k6);
|
||||
}
|
||||
@@ -171,8 +244,7 @@ impl RollingStat {
|
||||
|
||||
pub fn canonical_kmer(&self) -> Option<Kmer> {
|
||||
if self.ready() {
|
||||
Some(Kmer::from_raw_right(
|
||||
self.rolling_k.min(self.rolling_rck), self.k))
|
||||
Some(Kmer::from_raw_right(self.rolling_k.min(self.rolling_rck), self.k))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -195,37 +267,24 @@ impl RollingStat {
|
||||
}
|
||||
|
||||
pub fn entropy(&self, order: usize) -> Option<f64> {
|
||||
if !self.ready() { return None; }
|
||||
let k = self.k;
|
||||
let em = emax(k, order);
|
||||
if em <= 0.0 { return Some(1.0); }
|
||||
let counts = match order {
|
||||
1 => &self.k1c,
|
||||
2 => &self.k2c,
|
||||
3 => &self.k3c,
|
||||
4 => &self.k4c,
|
||||
5 => &self.k5c,
|
||||
6 => &self.k6c,
|
||||
_ => return None,
|
||||
};
|
||||
let nwords = k - order + 1;
|
||||
let log_nw = log_nwords(k, order);
|
||||
let nw_f = nwords as f64;
|
||||
let mut sum_f_log_f = 0.0f64;
|
||||
let mut sum_f_log_s = 0.0f64;
|
||||
for &j in canonical_kmers(order) {
|
||||
let f = counts[j as usize];
|
||||
if f > 0 {
|
||||
sum_f_log_f += n_log_n(f);
|
||||
sum_f_log_s += f as f64 * ln_class_size(j, order, false);
|
||||
}
|
||||
if !self.ready() {
|
||||
return None;
|
||||
}
|
||||
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
||||
let em = emax(self.k, order);
|
||||
if em <= 0.0 {
|
||||
return Some(1.0);
|
||||
}
|
||||
let nwords = self.k - order + 1;
|
||||
let log_nw = log_nwords(self.k, order);
|
||||
let nw_f = nwords as f64;
|
||||
let h_corr = log_nw + (self.sum_f_log_s[order] - self.sum_f_log_f[order]) / nw_f;
|
||||
Some((h_corr / em).max(0.0))
|
||||
}
|
||||
|
||||
pub fn normalized_entropy(&self) -> Option<f64> {
|
||||
if !self.ready() { return None; }
|
||||
if !self.ready() {
|
||||
return None;
|
||||
}
|
||||
let min_e = (1..=self.entropy_max_k)
|
||||
.filter_map(|ws| self.entropy(ws))
|
||||
.fold(f64::MAX, f64::min);
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
//! Sliding k-mer window over 2-bit–encoded nucleotides.
|
||||
|
||||
/// Ring buffer of the last k 2-bit–encoded bases.
|
||||
/// Used to extract the current k-mer as a `u64` and to feed the entropy and
|
||||
/// minimizer state machines.
|
||||
pub struct KmerWindow {
|
||||
buf: [u8; 32],
|
||||
start: usize,
|
||||
len: usize,
|
||||
k: usize,
|
||||
}
|
||||
|
||||
impl KmerWindow {
|
||||
pub fn new(k: usize) -> Self {
|
||||
debug_assert!(k >= 1 && k <= 31);
|
||||
Self { buf: [0u8; 32], start: 0, len: 0, k }
|
||||
}
|
||||
|
||||
pub fn k(&self) -> usize { self.k }
|
||||
|
||||
pub fn is_full(&self) -> bool { self.len == self.k }
|
||||
|
||||
/// Push a 2-bit base (0–3). Returns `true` once the window holds k bases.
|
||||
#[inline]
|
||||
pub fn push(&mut self, base: u8) -> bool {
|
||||
if self.len < self.k {
|
||||
self.buf[self.len] = base;
|
||||
self.len += 1;
|
||||
self.len == self.k
|
||||
} else {
|
||||
self.buf[self.start] = base;
|
||||
self.start = (self.start + 1) % self.k;
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Current k-mer as a right-aligned `u64` (first base in MSB of the 2k-bit field).
|
||||
pub fn kmer_u64(&self) -> u64 {
|
||||
let mut v = 0u64;
|
||||
for i in 0..self.k {
|
||||
v = (v << 2) | self.buf[(self.start + i) % self.k] as u64;
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) {
|
||||
self.start = 0;
|
||||
self.len = 0;
|
||||
}
|
||||
}
|
||||
Binary file not shown.
Reference in New Issue
Block a user