(feat) Add entropy-based filtering and rolling statistics for k-mers
- Introduce lazy_static dependency - Refactor encoding: rename encode_base →encode_nuc and make it pub(crate) - Add from_raw_right/raw Right methods to Kmer for right-aligned handling - Improve error message formatting and code readability in kmod.rs tests - Replace inline entropy computation with precomputed tables (entropy_table module)—using LazyLock for static lookup arrays - Simplify EntropyFilter by removing redundant tables and delegating to new entropy_table API - Add RollingStat module for real-time kmer statistics and minimizer tracking - Reorganize modules: move iter, encoding to pub(crate), add entropy_table and rolling_stat - Update imports across obiskbuilder crate accordingly
This commit is contained in:
Generated
+7
@@ -497,6 +497,12 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lazy_static"
|
||||||
|
version = "1.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.185"
|
version = "0.2.185"
|
||||||
@@ -624,6 +630,7 @@ dependencies = [
|
|||||||
name = "obiskbuilder"
|
name = "obiskbuilder"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"lazy_static",
|
||||||
"obikrope",
|
"obikrope",
|
||||||
"obikseq",
|
"obikseq",
|
||||||
]
|
]
|
||||||
|
|||||||
+49
-13
@@ -4,7 +4,7 @@
|
|||||||
//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
|
//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
|
||||||
//! every operation that needs it, and will be owned by the collection-level indexer.
|
//! every operation that needs it, and will be owned by the collection-level indexer.
|
||||||
|
|
||||||
use crate::encoding::{encode_base, DEC4};
|
use crate::encoding::{DEC4, encode_base};
|
||||||
|
|
||||||
// ── KmerError ─────────────────────────────────────────────────────────────────
|
// ── KmerError ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -30,10 +30,11 @@ pub enum KmerError {
|
|||||||
impl std::fmt::Display for KmerError {
|
impl std::fmt::Display for KmerError {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
KmerError::OutOfBounds { position, k, seql } =>
|
KmerError::OutOfBounds { position, k, seql } => write!(
|
||||||
write!(f, "kmer of length {k} at position {position} exceeds sequence length {seql}"),
|
f,
|
||||||
KmerError::InvalidK { k } =>
|
"kmer of length {k} at position {position} exceeds sequence length {seql}"
|
||||||
write!(f, "k={k} is invalid: must be in 1..=32"),
|
),
|
||||||
|
KmerError::InvalidK { k } => write!(f, "k={k} is invalid: must be in 1..=32"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -55,12 +56,25 @@ impl Kmer {
|
|||||||
Kmer(raw)
|
Kmer(raw)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Wrap a raw right-aligned u64 value as a Kmer.
|
||||||
|
/// The raw value is shifted left by `2 * k` bits to align it with the leftmost position.
|
||||||
|
#[inline]
|
||||||
|
pub fn from_raw_right(raw: u64, k: usize) -> Self {
|
||||||
|
Kmer(raw << (64 - 2 * k))
|
||||||
|
}
|
||||||
|
|
||||||
/// Return the raw left-aligned u64 value.
|
/// Return the raw left-aligned u64 value.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn raw(&self) -> u64 {
|
pub fn raw(&self) -> u64 {
|
||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the raw right-aligned u64 value.
|
||||||
|
#[inline]
|
||||||
|
pub fn raw_right(&self, k: usize) -> u64 {
|
||||||
|
self.0 >> (64 - 2 * k)
|
||||||
|
}
|
||||||
|
|
||||||
/// Encode the first k nucleotides of an ASCII slice into a Kmer.
|
/// Encode the first k nucleotides of an ASCII slice into a Kmer.
|
||||||
/// Zero allocation — result lives on the stack.
|
/// Zero allocation — result lives on the stack.
|
||||||
#[inline]
|
#[inline]
|
||||||
@@ -69,7 +83,11 @@ impl Kmer {
|
|||||||
return Err(KmerError::InvalidK { k });
|
return Err(KmerError::InvalidK { k });
|
||||||
}
|
}
|
||||||
if ascii.len() < k {
|
if ascii.len() < k {
|
||||||
return Err(KmerError::OutOfBounds { position: 0, k, seql: ascii.len() });
|
return Err(KmerError::OutOfBounds {
|
||||||
|
position: 0,
|
||||||
|
k,
|
||||||
|
seql: ascii.len(),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
let mut val = 0u64;
|
let mut val = 0u64;
|
||||||
for i in 0..k {
|
for i in 0..k {
|
||||||
@@ -135,11 +153,16 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||||
seq.iter().rev().map(|&b| match b {
|
seq.iter()
|
||||||
b'A' => b'T', b'T' => b'A',
|
.rev()
|
||||||
b'C' => b'G', b'G' => b'C',
|
.map(|&b| match b {
|
||||||
|
b'A' => b'T',
|
||||||
|
b'T' => b'A',
|
||||||
|
b'C' => b'G',
|
||||||
|
b'G' => b'C',
|
||||||
_ => b'A',
|
_ => b'A',
|
||||||
}).collect()
|
})
|
||||||
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
const K_VALUES: &[usize] = &[1, 2, 3, 4, 8, 11, 16, 31, 32];
|
const K_VALUES: &[usize] = &[1, 2, 3, 4, 8, 11, 16, 31, 32];
|
||||||
@@ -205,7 +228,12 @@ mod tests {
|
|||||||
let k = seq.len();
|
let k = seq.len();
|
||||||
let kmer = Kmer::from_ascii(seq, k).unwrap();
|
let kmer = Kmer::from_ascii(seq, k).unwrap();
|
||||||
let rc = kmer.revcomp(k);
|
let rc = kmer.revcomp(k);
|
||||||
assert_eq!(rc.to_ascii(k), *expected, "revcomp wrong for \"{}\"", std::str::from_utf8(seq).unwrap());
|
assert_eq!(
|
||||||
|
rc.to_ascii(k),
|
||||||
|
*expected,
|
||||||
|
"revcomp wrong for \"{}\"",
|
||||||
|
std::str::from_utf8(seq).unwrap()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -224,7 +252,11 @@ mod tests {
|
|||||||
for &k in K_VALUES {
|
for &k in K_VALUES {
|
||||||
let ascii = make_seq(k);
|
let ascii = make_seq(k);
|
||||||
let kmer = Kmer::from_ascii(&ascii, k).unwrap();
|
let kmer = Kmer::from_ascii(&ascii, k).unwrap();
|
||||||
assert_eq!(kmer.revcomp(k).revcomp(k), kmer, "revcomp∘revcomp≠id for k={k}");
|
assert_eq!(
|
||||||
|
kmer.revcomp(k).revcomp(k),
|
||||||
|
kmer,
|
||||||
|
"revcomp∘revcomp≠id for k={k}"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -257,7 +289,11 @@ mod tests {
|
|||||||
fn canonical_idempotent() {
|
fn canonical_idempotent() {
|
||||||
for &k in K_VALUES {
|
for &k in K_VALUES {
|
||||||
let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
|
let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
|
||||||
assert_eq!(kmer.canonical(k), kmer, "canonical not idempotent for k={k}");
|
assert_eq!(
|
||||||
|
kmer.canonical(k),
|
||||||
|
kmer,
|
||||||
|
"canonical not idempotent for k={k}"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,3 +6,4 @@ edition = "2024"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
obikseq = { path = "../obikseq" }
|
obikseq = { path = "../obikseq" }
|
||||||
obikrope = { path = "../obikrope" }
|
obikrope = { path = "../obikrope" }
|
||||||
|
lazy_static = "1.5.0"
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ pub const BYTE_LEN_MAX: usize = 64;
|
|||||||
|
|
||||||
/// Encode one uppercase ASCII nucleotide to its 2-bit value.
|
/// Encode one uppercase ASCII nucleotide to its 2-bit value.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn encode_base(b: u8) -> u8 {
|
pub(crate) fn encode_nuc(b: u8) -> u8 {
|
||||||
match b {
|
match b {
|
||||||
b'A' => 0b00,
|
b'A' => 0b00,
|
||||||
b'C' => 0b01,
|
b'C' => 0b01,
|
||||||
|
|||||||
@@ -8,23 +8,14 @@
|
|||||||
//!
|
//!
|
||||||
//! A kmer is rejected if score ≤ theta.
|
//! A kmer is rejected if score ≤ theta.
|
||||||
|
|
||||||
|
use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
||||||
|
|
||||||
/// Pre-computed entropy filter. One instance per worker thread (not Send/Sync
|
/// Pre-computed entropy filter. One instance per worker thread (not Send/Sync
|
||||||
/// because of the mutable freq scratch buffer; wrap in a closure per thread).
|
/// because of the mutable freq scratch buffer; wrap in a closure per thread).
|
||||||
pub struct EntropyFilter {
|
pub struct EntropyFilter {
|
||||||
k: usize,
|
k: usize,
|
||||||
level_max: usize,
|
level_max: usize,
|
||||||
threshold: f64,
|
threshold: f64,
|
||||||
/// norm_tables[ws][raw_code] = canonical circular-rotation code
|
|
||||||
norm_tables: Vec<Vec<u16>>,
|
|
||||||
/// log_s_tables[ws][canonical] = ln(class_size), where class_size is the
|
|
||||||
/// number of raw codes mapping to this canonical form.
|
|
||||||
log_s_tables: Vec<Vec<f64>>,
|
|
||||||
/// n_log_n[n] = n · ln(n), n_log_n[0] = 0
|
|
||||||
n_log_n: Vec<f64>,
|
|
||||||
/// Pre-computed H_max per word size.
|
|
||||||
emax: Vec<f64>,
|
|
||||||
/// Pre-computed ln(n_words) per word size.
|
|
||||||
log_nwords: Vec<f64>,
|
|
||||||
/// Reusable frequency buffer per word size (reset before each kmer).
|
/// Reusable frequency buffer per word size (reset before each kmer).
|
||||||
freq_buf: Vec<Vec<u32>>,
|
freq_buf: Vec<Vec<u32>>,
|
||||||
}
|
}
|
||||||
@@ -32,63 +23,11 @@ pub struct EntropyFilter {
|
|||||||
impl EntropyFilter {
|
impl EntropyFilter {
|
||||||
pub fn new(k: usize, level_max: usize, threshold: f64) -> Self {
|
pub fn new(k: usize, level_max: usize, threshold: f64) -> Self {
|
||||||
let level_max = level_max.min(k - 1).max(1);
|
let level_max = level_max.min(k - 1).max(1);
|
||||||
|
|
||||||
let mut n_log_n = vec![0.0f64; k + 1];
|
|
||||||
for n in 1..=k {
|
|
||||||
n_log_n[n] = (n as f64) * (n as f64).ln();
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut norm_tables: Vec<Vec<u16>> = vec![vec![]; level_max + 1];
|
|
||||||
let mut log_s_tables: Vec<Vec<f64>> = vec![vec![]; level_max + 1];
|
|
||||||
let mut emax = vec![0.0f64; level_max + 1];
|
|
||||||
let mut log_nwords = vec![0.0f64; level_max + 1];
|
|
||||||
let mut freq_buf: Vec<Vec<u32>> = vec![vec![]; level_max + 1];
|
let mut freq_buf: Vec<Vec<u32>> = vec![vec![]; level_max + 1];
|
||||||
|
|
||||||
for ws in 1..=level_max {
|
for ws in 1..=level_max {
|
||||||
let table_size = 1usize << (ws * 2); // 4^ws
|
freq_buf[ws] = vec![0u32; 1 << (ws * 2)];
|
||||||
let nwords = k - ws + 1;
|
|
||||||
|
|
||||||
// Build circular-rotation canonical table.
|
|
||||||
let norm: Vec<u16> = (0..table_size)
|
|
||||||
.map(|c| normalize_circular(c as u64, ws) as u16)
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// Count how many raw codes map to each canonical form → class sizes.
|
|
||||||
let mut class_sizes = vec![0u32; table_size];
|
|
||||||
for &c in &norm {
|
|
||||||
class_sizes[c as usize] += 1;
|
|
||||||
}
|
}
|
||||||
|
Self { k, level_max, threshold, freq_buf }
|
||||||
let log_s: Vec<f64> = class_sizes.iter()
|
|
||||||
.map(|&s| if s > 0 { (s as f64).ln() } else { 0.0 })
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
norm_tables[ws] = norm;
|
|
||||||
log_s_tables[ws] = log_s;
|
|
||||||
freq_buf[ws] = vec![0u32; table_size];
|
|
||||||
log_nwords[ws] = (nwords as f64).ln();
|
|
||||||
|
|
||||||
// H_max using 4^ws raw categories.
|
|
||||||
let n_raw = table_size;
|
|
||||||
let c = nwords / n_raw;
|
|
||||||
let r = nwords % n_raw;
|
|
||||||
let nf = nwords as f64;
|
|
||||||
let t1 = if c == 0 || n_raw == r {
|
|
||||||
0.0
|
|
||||||
} else {
|
|
||||||
let f1 = c as f64 / nf;
|
|
||||||
(n_raw - r) as f64 * f1 * f1.ln()
|
|
||||||
};
|
|
||||||
let t2 = if r == 0 {
|
|
||||||
0.0
|
|
||||||
} else {
|
|
||||||
let f2 = (c + 1) as f64 / nf;
|
|
||||||
r as f64 * f2 * f2.ln()
|
|
||||||
};
|
|
||||||
emax[ws] = -(t1 + t2);
|
|
||||||
}
|
|
||||||
|
|
||||||
Self { k, level_max, threshold, norm_tables, log_s_tables, n_log_n, emax, log_nwords, freq_buf }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if the kmer's entropy is strictly above the threshold.
|
/// Returns `true` if the kmer's entropy is strictly above the threshold.
|
||||||
@@ -103,12 +42,12 @@ impl EntropyFilter {
|
|||||||
|
|
||||||
for ws in 1..=self.level_max {
|
for ws in 1..=self.level_max {
|
||||||
let nwords = k - ws + 1;
|
let nwords = k - ws + 1;
|
||||||
let emax = self.emax[ws];
|
let em = emax(k, ws);
|
||||||
if emax <= 0.0 { continue; }
|
if em <= 0.0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
let mask = (1usize << (ws * 2)) - 1;
|
let mask = (1usize << (ws * 2)) - 1;
|
||||||
let norm = &self.norm_tables[ws];
|
|
||||||
let log_s = &self.log_s_tables[ws];
|
|
||||||
let freq = &mut self.freq_buf[ws];
|
let freq = &mut self.freq_buf[ws];
|
||||||
|
|
||||||
// Slide a ws-mer window; track only written indices (≤ nwords ≤ 31).
|
// Slide a ws-mer window; track only written indices (≤ nwords ≤ 31).
|
||||||
@@ -121,7 +60,7 @@ impl EntropyFilter {
|
|||||||
for i in 0..nwords {
|
for i in 0..nwords {
|
||||||
let base = ((kmer >> (2 * (k - ws - i))) & 3) as usize;
|
let base = ((kmer >> (2 * (k - ws - i))) & 3) as usize;
|
||||||
word = ((word << 2) | base) & mask;
|
word = ((word << 2) | base) & mask;
|
||||||
let idx = norm[word] as usize;
|
let idx = entropy_norm_kmer(word as u64, ws, false) as usize;
|
||||||
if freq[idx] == 0 {
|
if freq[idx] == 0 {
|
||||||
dirty[ndirty] = idx as u16;
|
dirty[ndirty] = idx as u16;
|
||||||
ndirty += 1;
|
ndirty += 1;
|
||||||
@@ -131,20 +70,20 @@ impl EntropyFilter {
|
|||||||
|
|
||||||
// H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words
|
// H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words
|
||||||
// Reset freq in the same pass to avoid a separate zeroing loop.
|
// Reset freq in the same pass to avoid a separate zeroing loop.
|
||||||
let log_nw = self.log_nwords[ws];
|
let log_nw = log_nwords(k, ws);
|
||||||
let nw_f = nwords as f64;
|
let nw_f = nwords as f64;
|
||||||
let mut sum_f_log_f = 0.0f64;
|
let mut sum_f_log_f = 0.0f64;
|
||||||
let mut sum_f_log_s = 0.0f64;
|
let mut sum_f_log_s = 0.0f64;
|
||||||
for &j in &dirty[..ndirty] {
|
for &j in &dirty[..ndirty] {
|
||||||
let j = j as usize;
|
let j = j as usize;
|
||||||
let f = freq[j] as usize;
|
let f = freq[j] as usize;
|
||||||
sum_f_log_f += self.n_log_n[f];
|
sum_f_log_f += n_log_n(f);
|
||||||
sum_f_log_s += f as f64 * log_s[j];
|
sum_f_log_s += f as f64 * ln_class_size(j as u64, ws, false);
|
||||||
freq[j] = 0;
|
freq[j] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
||||||
let entropy = (h_corr / emax).max(0.0);
|
let entropy = (h_corr / em).max(0.0);
|
||||||
if entropy < min_entropy {
|
if entropy < min_entropy {
|
||||||
min_entropy = entropy;
|
min_entropy = entropy;
|
||||||
}
|
}
|
||||||
@@ -158,18 +97,3 @@ impl EntropyFilter {
|
|||||||
if min_entropy == f64::MAX { 1.0 } else { min_entropy }
|
if min_entropy == f64::MAX { 1.0 } else { min_entropy }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Lexicographically smallest circular rotation of a ws-mer (right-aligned u64).
|
|
||||||
fn normalize_circular(kmer: u64, ws: usize) -> u64 {
|
|
||||||
let mask = (1u64 << (ws * 2)) - 1;
|
|
||||||
let mut canonical = kmer & mask;
|
|
||||||
let mut current = canonical;
|
|
||||||
for _ in 0..ws - 1 {
|
|
||||||
let top = (current >> ((ws - 1) * 2)) & 3;
|
|
||||||
current = ((current << 2) | top) & mask;
|
|
||||||
if current < canonical {
|
|
||||||
canonical = current;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
canonical
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -0,0 +1,175 @@
|
|||||||
|
use obikseq::kmer::Kmer;
|
||||||
|
use std::sync::LazyLock;
|
||||||
|
|
||||||
|
pub(crate) static NORMK1: LazyLock<[u64; 4]> = LazyLock::new(|| build_normalized_kmer::<4>());
|
||||||
|
pub(crate) static NORMK2: LazyLock<[u64; 16]> = LazyLock::new(|| build_normalized_kmer::<16>());
|
||||||
|
pub(crate) static NORMK3: LazyLock<[u64; 64]> = LazyLock::new(|| build_normalized_kmer::<64>());
|
||||||
|
pub(crate) static NORMK4: LazyLock<[u64; 256]> = LazyLock::new(|| build_normalized_kmer::<256>());
|
||||||
|
pub(crate) static NORMK5: LazyLock<[u64; 1024]> = LazyLock::new(|| build_normalized_kmer::<1024>());
|
||||||
|
pub(crate) static NORMK6: LazyLock<[u64; 4096]> = LazyLock::new(|| build_normalized_kmer::<4096>());
|
||||||
|
|
||||||
|
pub(crate) static LN_CARD_ROT1: LazyLock<[f64; 4]> =
|
||||||
|
LazyLock::new(|| build_log_class_size::<4>(&NORMK1));
|
||||||
|
pub(crate) static LN_CARD_ROT2: LazyLock<[f64; 16]> =
|
||||||
|
LazyLock::new(|| build_log_class_size::<16>(&NORMK2));
|
||||||
|
pub(crate) static LN_CARD_ROT3: LazyLock<[f64; 64]> =
|
||||||
|
LazyLock::new(|| build_log_class_size::<64>(&NORMK3));
|
||||||
|
pub(crate) static LN_CARD_ROT4: LazyLock<[f64; 256]> =
|
||||||
|
LazyLock::new(|| build_log_class_size::<256>(&NORMK4));
|
||||||
|
pub(crate) static LN_CARD_ROT5: LazyLock<[f64; 1024]> =
|
||||||
|
LazyLock::new(|| build_log_class_size::<1024>(&NORMK5));
|
||||||
|
pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> =
|
||||||
|
LazyLock::new(|| build_log_class_size::<4096>(&NORMK6));
|
||||||
|
|
||||||
|
fn ln0(x: f64) -> f64 {
|
||||||
|
if x == 0.0 { 0.0 } else { x.ln() }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn normalize_circular(kmer: u64, ws: usize) -> u64 {
|
||||||
|
let mask = (1u64 << (ws * 2)) - 1;
|
||||||
|
let mut canonical = kmer & mask;
|
||||||
|
let mut current = canonical;
|
||||||
|
for _ in 0..ws - 1 {
|
||||||
|
let top = (current >> ((ws - 1) * 2)) & 3;
|
||||||
|
current = ((current << 2) | top) & mask;
|
||||||
|
if current < canonical {
|
||||||
|
canonical = current;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
canonical
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_normalized_kmer<const N: usize>() -> [u64; N] {
|
||||||
|
let mut result = [0u64; N];
|
||||||
|
let k = N.ilog(4) as usize;
|
||||||
|
let shift = 64 - k * 2;
|
||||||
|
for i in 0..N {
|
||||||
|
let la = (i as u64) << shift;
|
||||||
|
let ra = i as u64;
|
||||||
|
let rc_ra = Kmer::from_raw(la).revcomp(k).raw() >> shift;
|
||||||
|
let circ = normalize_circular(ra, k);
|
||||||
|
let circ_rc = normalize_circular(rc_ra, k);
|
||||||
|
result[i] = circ.min(circ_rc);
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_log_class_size<const N: usize>(norm: &[u64; N]) -> [f64; N] {
|
||||||
|
let mut sizes = [0u32; N];
|
||||||
|
for &c in norm {
|
||||||
|
sizes[c as usize] += 1;
|
||||||
|
}
|
||||||
|
let mut result = [0.0f64; N];
|
||||||
|
for i in 0..N {
|
||||||
|
if sizes[i] > 0 {
|
||||||
|
result[i] = ln0(sizes[i] as f64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn entropy_norm_kmer(kmer: u64, k: usize, left: bool) -> u64 {
|
||||||
|
let shift = 64 - k * 2;
|
||||||
|
let ra = if left { kmer >> shift } else { kmer }; // left-aligned → right-aligned index
|
||||||
|
let canonical_ra = match k {
|
||||||
|
1 => NORMK1[ra as usize],
|
||||||
|
2 => NORMK2[ra as usize],
|
||||||
|
3 => NORMK3[ra as usize],
|
||||||
|
4 => NORMK4[ra as usize],
|
||||||
|
5 => NORMK5[ra as usize],
|
||||||
|
6 => NORMK6[ra as usize],
|
||||||
|
_ => panic!("k must be 1..=6"),
|
||||||
|
};
|
||||||
|
if left {
|
||||||
|
canonical_ra << shift
|
||||||
|
} else {
|
||||||
|
canonical_ra
|
||||||
|
} // right-aligned → left-aligned
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn ln_class_size(kmer: u64, k: usize, left: bool) -> f64 {
|
||||||
|
let ra = if left { kmer >> (64 - k * 2) } else { kmer }; // left-aligned → right-aligned index
|
||||||
|
match k {
|
||||||
|
1 => LN_CARD_ROT1[NORMK1[ra as usize] as usize],
|
||||||
|
2 => LN_CARD_ROT2[NORMK2[ra as usize] as usize],
|
||||||
|
3 => LN_CARD_ROT3[NORMK3[ra as usize] as usize],
|
||||||
|
4 => LN_CARD_ROT4[NORMK4[ra as usize] as usize],
|
||||||
|
5 => LN_CARD_ROT5[NORMK5[ra as usize] as usize],
|
||||||
|
6 => LN_CARD_ROT6[NORMK6[ra as usize] as usize],
|
||||||
|
_ => panic!("k must be 1..=6"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── k-dependent tables (k ≤ K_MAX, ws ≤ WS_MAX) ──────────────────────────────
|
||||||
|
|
||||||
|
pub(crate) const K_MAX: usize = 32;
|
||||||
|
pub(crate) const WS_MAX: usize = 6;
|
||||||
|
|
||||||
|
/// n·ln(n), with n_log_n[0] = 0. Indexed by n = 0..=K_MAX.
|
||||||
|
pub(crate) static N_LOG_N: LazyLock<[f64; K_MAX + 1]> =
|
||||||
|
LazyLock::new(|| build_n_log_n());
|
||||||
|
|
||||||
|
/// H_max[k][ws]: maximum entropy for kmer length k and word size ws.
|
||||||
|
pub(crate) static EMAX: LazyLock<[[f64; WS_MAX + 1]; K_MAX + 1]> =
|
||||||
|
LazyLock::new(|| build_emax());
|
||||||
|
|
||||||
|
/// ln(k − ws + 1): log of the number of ws-words in a kmer of length k.
|
||||||
|
pub(crate) static LOG_NWORDS: LazyLock<[[f64; WS_MAX + 1]; K_MAX + 1]> =
|
||||||
|
LazyLock::new(|| build_log_nwords());
|
||||||
|
|
||||||
|
pub(crate) fn n_log_n(n: usize) -> f64 {
|
||||||
|
N_LOG_N[n]
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn emax(k: usize, ws: usize) -> f64 {
|
||||||
|
EMAX[k][ws]
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn log_nwords(k: usize, ws: usize) -> f64 {
|
||||||
|
LOG_NWORDS[k][ws]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_n_log_n() -> [f64; K_MAX + 1] {
|
||||||
|
let mut result = [0.0f64; K_MAX + 1];
|
||||||
|
for n in 1..=K_MAX {
|
||||||
|
result[n] = (n as f64) * (n as f64).ln();
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_emax() -> [[f64; WS_MAX + 1]; K_MAX + 1] {
|
||||||
|
let mut result = [[0.0f64; WS_MAX + 1]; K_MAX + 1];
|
||||||
|
for k in 2..=K_MAX {
|
||||||
|
for ws in 1..=WS_MAX.min(k - 1) {
|
||||||
|
let n_raw = 1usize << (ws * 2); // 4^ws
|
||||||
|
let nwords = k - ws + 1;
|
||||||
|
let c = nwords / n_raw;
|
||||||
|
let r = nwords % n_raw;
|
||||||
|
let nf = nwords as f64;
|
||||||
|
let t1 = if c == 0 || n_raw == r {
|
||||||
|
0.0
|
||||||
|
} else {
|
||||||
|
let f1 = c as f64 / nf;
|
||||||
|
(n_raw - r) as f64 * f1 * f1.ln()
|
||||||
|
};
|
||||||
|
let t2 = if r == 0 {
|
||||||
|
0.0
|
||||||
|
} else {
|
||||||
|
let f2 = (c + 1) as f64 / nf;
|
||||||
|
r as f64 * f2 * f2.ln()
|
||||||
|
};
|
||||||
|
result[k][ws] = -(t1 + t2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_log_nwords() -> [[f64; WS_MAX + 1]; K_MAX + 1] {
|
||||||
|
let mut result = [[0.0f64; WS_MAX + 1]; K_MAX + 1];
|
||||||
|
for k in 2..=K_MAX {
|
||||||
|
for ws in 1..=WS_MAX.min(k - 1) {
|
||||||
|
result[k][ws] = ((k - ws + 1) as f64).ln();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
@@ -18,7 +18,7 @@
|
|||||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||||
use obikseq::superkmer::SuperKmer;
|
use obikseq::superkmer::SuperKmer;
|
||||||
|
|
||||||
use crate::encoding::encode_base;
|
use crate::encoding::encode_nuc;
|
||||||
use crate::entropy::EntropyFilter;
|
use crate::entropy::EntropyFilter;
|
||||||
use crate::minimizer::MinimizerState;
|
use crate::minimizer::MinimizerState;
|
||||||
use crate::scratch::SuperKmerScratch;
|
use crate::scratch::SuperKmerScratch;
|
||||||
@@ -97,7 +97,7 @@ impl<'a> Iterator for SuperKmerIter<'a> {
|
|||||||
Some(b) => b,
|
Some(b) => b,
|
||||||
};
|
};
|
||||||
|
|
||||||
let base2 = encode_base(byte);
|
let base2 = encode_nuc(byte);
|
||||||
let kmer_ready = self.window.push(base2);
|
let kmer_ready = self.window.push(base2);
|
||||||
let current_min = self.minimizer.push(base2);
|
let current_min = self.minimizer.push(base2);
|
||||||
|
|
||||||
|
|||||||
@@ -5,12 +5,15 @@
|
|||||||
|
|
||||||
#![deny(missing_docs)]
|
#![deny(missing_docs)]
|
||||||
|
|
||||||
mod encoding;
|
|
||||||
mod entropy;
|
mod entropy;
|
||||||
|
pub mod iter;
|
||||||
mod minimizer;
|
mod minimizer;
|
||||||
mod scratch;
|
mod scratch;
|
||||||
mod window;
|
mod window;
|
||||||
pub mod iter;
|
|
||||||
|
pub(crate) mod encoding;
|
||||||
|
pub(crate) mod entropy_table;
|
||||||
|
pub(crate) mod rolling_stat;
|
||||||
|
|
||||||
pub use iter::SuperKmerIter;
|
pub use iter::SuperKmerIter;
|
||||||
pub use scratch::SuperKmerScratch;
|
pub use scratch::SuperKmerScratch;
|
||||||
|
|||||||
@@ -0,0 +1,227 @@
|
|||||||
|
use obikseq::kmer::Kmer;
|
||||||
|
|
||||||
|
use crate::encoding::encode_nuc;
|
||||||
|
use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
||||||
|
use std::collections::VecDeque;
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
struct MmerItem {
|
||||||
|
/// 0-based position of this m-mer's first base within the current segment.
|
||||||
|
position: usize,
|
||||||
|
canonical: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct RollingStat {
|
||||||
|
k: usize,
|
||||||
|
m: usize,
|
||||||
|
rolling_k: u64,
|
||||||
|
rolling_rck: u64,
|
||||||
|
k_mask: u64,
|
||||||
|
m_mask: u64,
|
||||||
|
shift_left: usize,
|
||||||
|
received: usize,
|
||||||
|
k1q: VecDeque<u64>,
|
||||||
|
k2q: VecDeque<u64>,
|
||||||
|
k3q: VecDeque<u64>,
|
||||||
|
k4q: VecDeque<u64>,
|
||||||
|
k5q: VecDeque<u64>,
|
||||||
|
k6q: VecDeque<u64>,
|
||||||
|
minimier: VecDeque<MmerItem>,
|
||||||
|
k1c: Vec<usize>,
|
||||||
|
k2c: Vec<usize>,
|
||||||
|
k3c: Vec<usize>,
|
||||||
|
k4c: Vec<usize>,
|
||||||
|
k5c: Vec<usize>,
|
||||||
|
k6c: Vec<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RollingStat {
|
||||||
|
pub fn new(k: usize, m: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
k,
|
||||||
|
m,
|
||||||
|
rolling_k: 0,
|
||||||
|
rolling_rck: 0,
|
||||||
|
k_mask: (!0) >> (64 - k * 2),
|
||||||
|
m_mask: (!0) >> (64 - m * 2),
|
||||||
|
shift_left: (m - 1) * 2,
|
||||||
|
received: 0,
|
||||||
|
k1q: VecDeque::with_capacity(k),
|
||||||
|
k2q: VecDeque::with_capacity(k - 1),
|
||||||
|
k3q: VecDeque::with_capacity(k - 2),
|
||||||
|
k4q: VecDeque::with_capacity(k - 3),
|
||||||
|
k5q: VecDeque::with_capacity(k - 4),
|
||||||
|
k6q: VecDeque::with_capacity(k - 5),
|
||||||
|
minimier: VecDeque::with_capacity(k-m+2),
|
||||||
|
k1c: vec![0; 4_usize.pow(1)],
|
||||||
|
k2c: vec![0; 4_usize.pow(2)],
|
||||||
|
k3c: vec![0; 4_usize.pow(3)],
|
||||||
|
k4c: vec![0; 4_usize.pow(4)],
|
||||||
|
k5c: vec![0; 4_usize.pow(5)],
|
||||||
|
k6c: vec![0; 4_usize.pow(6)],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn reset(&mut self) {
|
||||||
|
self.rolling_k = 0;
|
||||||
|
self.rolling_rck = 0;
|
||||||
|
self.received = 0;
|
||||||
|
self.k1q.clear();
|
||||||
|
self.k2q.clear();
|
||||||
|
self.k3q.clear();
|
||||||
|
self.k4q.clear();
|
||||||
|
self.k5q.clear();
|
||||||
|
self.k6q.clear();
|
||||||
|
self.minimier.clear();
|
||||||
|
self.k1c.fill(0);
|
||||||
|
self.k2c.fill(0);
|
||||||
|
self.k3c.fill(0);
|
||||||
|
self.k4c.fill(0);
|
||||||
|
self.k5c.fill(0);
|
||||||
|
self.k6c.fill(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn push(&mut self, nuc: char) {
|
||||||
|
let bnuc = encode_nuc(nuc as u8);
|
||||||
|
let cnuc = bnuc ^ 3;
|
||||||
|
|
||||||
|
self.rolling_k = ((self.rolling_k << 2) | (cnuc as u64)) & self.k_mask;
|
||||||
|
self.rolling_rck =
|
||||||
|
((self.rolling_rck >> 2) | ((cnuc as u64) << (self.k * 2))) & self.k_mask;
|
||||||
|
|
||||||
|
|
||||||
|
let canonical_k1 = entropy_norm_kmer(self.rolling_k & 3, 1, false);
|
||||||
|
let canonical_k2 = entropy_norm_kmer(self.rolling_k & 15, 2, false);
|
||||||
|
let canonical_k3 = entropy_norm_kmer(self.rolling_k & 63, 3, false);
|
||||||
|
let canonical_k4 = entropy_norm_kmer(self.rolling_k & 255, 4, false);
|
||||||
|
let canonical_k5 = entropy_norm_kmer(self.rolling_k & 1023, 5, false);
|
||||||
|
let canonical_k6 = entropy_norm_kmer(self.rolling_k & 4095, 6, false);
|
||||||
|
|
||||||
|
self.received += 1;
|
||||||
|
|
||||||
|
if self.received >= self.m {
|
||||||
|
let possible_canonical_m =
|
||||||
|
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
|
||||||
|
let possible_pos_m = self.received - self.m;
|
||||||
|
|
||||||
|
while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) {
|
||||||
|
self.minimier.pop_back();
|
||||||
|
}
|
||||||
|
self.minimier.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m });
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.received > self.k {
|
||||||
|
self.k1c[self.k1q.pop_front().unwrap() as usize] -= 1;
|
||||||
|
self.k2c[self.k2q.pop_front().unwrap() as usize] -= 1;
|
||||||
|
self.k3c[self.k3q.pop_front().unwrap() as usize] -= 1;
|
||||||
|
self.k4c[self.k4q.pop_front().unwrap() as usize] -= 1;
|
||||||
|
self.k5c[self.k5q.pop_front().unwrap() as usize] -= 1;
|
||||||
|
self.k6c[self.k6q.pop_front().unwrap() as usize] -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.k1c[canonical_k1 as usize] += 1;
|
||||||
|
self.k1q.push_back(canonical_k1);
|
||||||
|
if self.received >= 2 {
|
||||||
|
self.k2c[canonical_k2 as usize] += 1;
|
||||||
|
self.k2q.push_back(canonical_k2);
|
||||||
|
if self.received >= 3 {
|
||||||
|
self.k3c[canonical_k3 as usize] += 1;
|
||||||
|
self.k3q.push_back(canonical_k3);
|
||||||
|
if self.received >= 4 {
|
||||||
|
self.k4c[canonical_k4 as usize] += 1;
|
||||||
|
self.k4q.push_back(canonical_k4);
|
||||||
|
if self.received >= 5 {
|
||||||
|
self.k5c[canonical_k5 as usize] += 1;
|
||||||
|
self.k5q.push_back(canonical_k5);
|
||||||
|
if self.received >= 6 {
|
||||||
|
self.k6c[canonical_k6 as usize] += 1;
|
||||||
|
self.k6q.push_back(canonical_k6);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn ready(&self) -> bool {
|
||||||
|
self.received >= self.k
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn kmer(&self) -> Option<Kmer> {
|
||||||
|
if self.ready() {
|
||||||
|
Some(Kmer::from_raw_right(self.rolling_k, self.k))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn revcomp_kmer(&self) -> Option<Kmer> {
|
||||||
|
if self.ready() {
|
||||||
|
Some(Kmer::from_raw_right(self.rolling_rck, self.k))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn canonical_kmer(&self) -> Option<Kmer> {
|
||||||
|
if self.ready() {
|
||||||
|
Some(Kmer::from_raw_right(
|
||||||
|
self.rolling_k.min(self.rolling_rck), self.k))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn minimizer_position(&self) -> Option<usize> {
|
||||||
|
if self.ready() {
|
||||||
|
self.minimier.front().map(|it| it.position)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn minimizer_canonical(&self) -> Option<Kmer> {
|
||||||
|
if self.ready() {
|
||||||
|
self.minimier.front().map(|it| Kmer::from_raw_right(it.canonical, self.k))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn entropy(&self, order: usize) -> Option<f64> {
|
||||||
|
if !self.ready() { return None; }
|
||||||
|
let k = self.k;
|
||||||
|
let em = emax(k, order);
|
||||||
|
if em <= 0.0 { return Some(1.0); }
|
||||||
|
let counts = match order {
|
||||||
|
1 => &self.k1c,
|
||||||
|
2 => &self.k2c,
|
||||||
|
3 => &self.k3c,
|
||||||
|
4 => &self.k4c,
|
||||||
|
5 => &self.k5c,
|
||||||
|
6 => &self.k6c,
|
||||||
|
_ => return None,
|
||||||
|
};
|
||||||
|
let nwords = k - order + 1;
|
||||||
|
let log_nw = log_nwords(k, order);
|
||||||
|
let nw_f = nwords as f64;
|
||||||
|
let mut sum_f_log_f = 0.0f64;
|
||||||
|
let mut sum_f_log_s = 0.0f64;
|
||||||
|
for (j, &f) in counts.iter().enumerate() {
|
||||||
|
if f > 0 {
|
||||||
|
sum_f_log_f += n_log_n(f);
|
||||||
|
sum_f_log_s += f as f64 * ln_class_size(j as u64, order, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
||||||
|
Some((h_corr / em).max(0.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn normalized_entropy(&self) -> Option<f64> {
|
||||||
|
if !self.ready() { return None; }
|
||||||
|
let min_e = (1..=6)
|
||||||
|
.filter_map(|ws| self.entropy(ws))
|
||||||
|
.fold(f64::MAX, f64::min);
|
||||||
|
Some(if min_e == f64::MAX { 1.0 } else { min_e })
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
//! Stack-allocated scratch buffer for building a SuperKmer before heap emission.
|
//! Stack-allocated scratch buffer for building a SuperKmer before heap emission.
|
||||||
|
|
||||||
|
use crate::encoding::{BYTE_LEN_MAX, encode_nuc};
|
||||||
use obikseq::superkmer::SuperKmer;
|
use obikseq::superkmer::SuperKmer;
|
||||||
use crate::encoding::{encode_base, BYTE_LEN_MAX};
|
|
||||||
|
|
||||||
/// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256).
|
/// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256).
|
||||||
pub const MAX_SUPERKMER_LEN: usize = 256;
|
pub const MAX_SUPERKMER_LEN: usize = 256;
|
||||||
@@ -20,7 +20,10 @@ impl SuperKmerScratch {
|
|||||||
/// Create an empty scratch buffer.
|
/// Create an empty scratch buffer.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self { buf: [0u8; BYTE_LEN_MAX], len: 0 }
|
Self {
|
||||||
|
buf: [0u8; BYTE_LEN_MAX],
|
||||||
|
len: 0,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Number of nucleotides accumulated so far.
|
/// Number of nucleotides accumulated so far.
|
||||||
@@ -45,7 +48,7 @@ impl SuperKmerScratch {
|
|||||||
debug_assert!(self.len < MAX_SUPERKMER_LEN, "SuperKmerScratch overflow");
|
debug_assert!(self.len < MAX_SUPERKMER_LEN, "SuperKmerScratch overflow");
|
||||||
let slot = self.len / 4;
|
let slot = self.len / 4;
|
||||||
let shift = 6 - 2 * (self.len % 4);
|
let shift = 6 - 2 * (self.len % 4);
|
||||||
self.buf[slot] |= encode_base(base) << shift;
|
self.buf[slot] |= encode_nuc(base) << shift;
|
||||||
self.len += 1;
|
self.len += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -72,5 +75,7 @@ impl SuperKmerScratch {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SuperKmerScratch {
|
impl Default for SuperKmerScratch {
|
||||||
fn default() -> Self { Self::new() }
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Reference in New Issue
Block a user