diff --git a/src/Cargo.lock b/src/Cargo.lock index 139b17b..ce0c031 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -497,6 +497,12 @@ dependencies = [ "libc", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.185" @@ -624,6 +630,7 @@ dependencies = [ name = "obiskbuilder" version = "0.1.0" dependencies = [ + "lazy_static", "obikrope", "obikseq", ] diff --git a/src/obikseq/src/kmer.rs b/src/obikseq/src/kmer.rs index 2366690..32a769e 100644 --- a/src/obikseq/src/kmer.rs +++ b/src/obikseq/src/kmer.rs @@ -4,7 +4,7 @@ //! The low 64−2k bits are always zero. k is not stored — it is a parameter of //! every operation that needs it, and will be owned by the collection-level indexer. -use crate::encoding::{encode_base, DEC4}; +use crate::encoding::{DEC4, encode_base}; // ── KmerError ───────────────────────────────────────────────────────────────── @@ -30,10 +30,11 @@ pub enum KmerError { impl std::fmt::Display for KmerError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - KmerError::OutOfBounds { position, k, seql } => - write!(f, "kmer of length {k} at position {position} exceeds sequence length {seql}"), - KmerError::InvalidK { k } => - write!(f, "k={k} is invalid: must be in 1..=32"), + KmerError::OutOfBounds { position, k, seql } => write!( + f, + "kmer of length {k} at position {position} exceeds sequence length {seql}" + ), + KmerError::InvalidK { k } => write!(f, "k={k} is invalid: must be in 1..=32"), } } } @@ -55,12 +56,25 @@ impl Kmer { Kmer(raw) } + /// Wrap a raw right-aligned u64 value as a Kmer. + /// The raw value is shifted left by `2 * k` bits to align it with the leftmost position. + #[inline] + pub fn from_raw_right(raw: u64, k: usize) -> Self { + Kmer(raw << (64 - 2 * k)) + } + /// Return the raw left-aligned u64 value. #[inline] pub fn raw(&self) -> u64 { self.0 } + /// Return the raw right-aligned u64 value. + #[inline] + pub fn raw_right(&self, k: usize) -> u64 { + self.0 >> (64 - 2 * k) + } + /// Encode the first k nucleotides of an ASCII slice into a Kmer. /// Zero allocation — result lives on the stack. #[inline] @@ -69,7 +83,11 @@ impl Kmer { return Err(KmerError::InvalidK { k }); } if ascii.len() < k { - return Err(KmerError::OutOfBounds { position: 0, k, seql: ascii.len() }); + return Err(KmerError::OutOfBounds { + position: 0, + k, + seql: ascii.len(), + }); } let mut val = 0u64; for i in 0..k { @@ -98,7 +116,7 @@ impl Kmer { pub fn write_ascii(&self, k: usize, buf: &mut Vec) { let bytes = self.0.to_be_bytes(); let full = k / 4; - let rem = k % 4; + let rem = k % 4; for i in 0..full { buf.extend_from_slice(&DEC4[bytes[i] as usize].to_be_bytes()); } @@ -112,10 +130,10 @@ impl Kmer { /// Zero allocation — result lives on the stack. #[inline] pub fn revcomp(&self, k: usize) -> Self { - let x = !self.0; // complement - let x = x.swap_bytes(); // reverse bytes + let x = !self.0; // complement + let x = x.swap_bytes(); // reverse bytes let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles - let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups + let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups Kmer(x << (64 - 2 * k)) } @@ -135,11 +153,16 @@ mod tests { use super::*; fn ascii_revcomp(seq: &[u8]) -> Vec { - seq.iter().rev().map(|&b| match b { - b'A' => b'T', b'T' => b'A', - b'C' => b'G', b'G' => b'C', - _ => b'A', - }).collect() + seq.iter() + .rev() + .map(|&b| match b { + b'A' => b'T', + b'T' => b'A', + b'C' => b'G', + b'G' => b'C', + _ => b'A', + }) + .collect() } const K_VALUES: &[usize] = &[1, 2, 3, 4, 8, 11, 16, 31, 32]; @@ -194,10 +217,10 @@ mod tests { #[test] fn revcomp_known_values() { let cases: &[(&[u8], &[u8])] = &[ - (b"A", b"T"), - (b"AC", b"GT"), - (b"ACG", b"CGT"), - (b"ACGT", b"ACGT"), // palindrome + (b"A", b"T"), + (b"AC", b"GT"), + (b"ACG", b"CGT"), + (b"ACGT", b"ACGT"), // palindrome (b"AAAA", b"TTTT"), (b"TTTT", b"AAAA"), ]; @@ -205,7 +228,12 @@ mod tests { let k = seq.len(); let kmer = Kmer::from_ascii(seq, k).unwrap(); let rc = kmer.revcomp(k); - assert_eq!(rc.to_ascii(k), *expected, "revcomp wrong for \"{}\"", std::str::from_utf8(seq).unwrap()); + assert_eq!( + rc.to_ascii(k), + *expected, + "revcomp wrong for \"{}\"", + std::str::from_utf8(seq).unwrap() + ); } } @@ -224,7 +252,11 @@ mod tests { for &k in K_VALUES { let ascii = make_seq(k); let kmer = Kmer::from_ascii(&ascii, k).unwrap(); - assert_eq!(kmer.revcomp(k).revcomp(k), kmer, "revcomp∘revcomp≠id for k={k}"); + assert_eq!( + kmer.revcomp(k).revcomp(k), + kmer, + "revcomp∘revcomp≠id for k={k}" + ); } } @@ -248,7 +280,7 @@ mod tests { for &k in K_VALUES { let ascii = make_seq(k); let kmer = Kmer::from_ascii(&ascii, k).unwrap().canonical(k); - let rc = kmer.revcomp(k); + let rc = kmer.revcomp(k); assert!(kmer.0 <= rc.0, "canonical not minimal for k={k}"); } } @@ -257,7 +289,11 @@ mod tests { fn canonical_idempotent() { for &k in K_VALUES { let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k); - assert_eq!(kmer.canonical(k), kmer, "canonical not idempotent for k={k}"); + assert_eq!( + kmer.canonical(k), + kmer, + "canonical not idempotent for k={k}" + ); } } } diff --git a/src/obiskbuilder/Cargo.toml b/src/obiskbuilder/Cargo.toml index 4a4832c..3b2cb51 100644 --- a/src/obiskbuilder/Cargo.toml +++ b/src/obiskbuilder/Cargo.toml @@ -6,3 +6,4 @@ edition = "2024" [dependencies] obikseq = { path = "../obikseq" } obikrope = { path = "../obikrope" } +lazy_static = "1.5.0" diff --git a/src/obiskbuilder/src/encoding.rs b/src/obiskbuilder/src/encoding.rs index 0d33129..728c639 100644 --- a/src/obiskbuilder/src/encoding.rs +++ b/src/obiskbuilder/src/encoding.rs @@ -5,7 +5,7 @@ pub const BYTE_LEN_MAX: usize = 64; /// Encode one uppercase ASCII nucleotide to its 2-bit value. #[inline] -pub fn encode_base(b: u8) -> u8 { +pub(crate) fn encode_nuc(b: u8) -> u8 { match b { b'A' => 0b00, b'C' => 0b01, diff --git a/src/obiskbuilder/src/entropy.rs b/src/obiskbuilder/src/entropy.rs index c263cd4..a40dd53 100644 --- a/src/obiskbuilder/src/entropy.rs +++ b/src/obiskbuilder/src/entropy.rs @@ -8,23 +8,14 @@ //! //! A kmer is rejected if score ≤ theta. +use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n}; + /// Pre-computed entropy filter. One instance per worker thread (not Send/Sync /// because of the mutable freq scratch buffer; wrap in a closure per thread). pub struct EntropyFilter { k: usize, level_max: usize, threshold: f64, - /// norm_tables[ws][raw_code] = canonical circular-rotation code - norm_tables: Vec>, - /// log_s_tables[ws][canonical] = ln(class_size), where class_size is the - /// number of raw codes mapping to this canonical form. - log_s_tables: Vec>, - /// n_log_n[n] = n · ln(n), n_log_n[0] = 0 - n_log_n: Vec, - /// Pre-computed H_max per word size. - emax: Vec, - /// Pre-computed ln(n_words) per word size. - log_nwords: Vec, /// Reusable frequency buffer per word size (reset before each kmer). freq_buf: Vec>, } @@ -32,63 +23,11 @@ pub struct EntropyFilter { impl EntropyFilter { pub fn new(k: usize, level_max: usize, threshold: f64) -> Self { let level_max = level_max.min(k - 1).max(1); - - let mut n_log_n = vec![0.0f64; k + 1]; - for n in 1..=k { - n_log_n[n] = (n as f64) * (n as f64).ln(); - } - - let mut norm_tables: Vec> = vec![vec![]; level_max + 1]; - let mut log_s_tables: Vec> = vec![vec![]; level_max + 1]; - let mut emax = vec![0.0f64; level_max + 1]; - let mut log_nwords = vec![0.0f64; level_max + 1]; let mut freq_buf: Vec> = vec![vec![]; level_max + 1]; - for ws in 1..=level_max { - let table_size = 1usize << (ws * 2); // 4^ws - let nwords = k - ws + 1; - - // Build circular-rotation canonical table. - let norm: Vec = (0..table_size) - .map(|c| normalize_circular(c as u64, ws) as u16) - .collect(); - - // Count how many raw codes map to each canonical form → class sizes. - let mut class_sizes = vec![0u32; table_size]; - for &c in &norm { - class_sizes[c as usize] += 1; - } - - let log_s: Vec = class_sizes.iter() - .map(|&s| if s > 0 { (s as f64).ln() } else { 0.0 }) - .collect(); - - norm_tables[ws] = norm; - log_s_tables[ws] = log_s; - freq_buf[ws] = vec![0u32; table_size]; - log_nwords[ws] = (nwords as f64).ln(); - - // H_max using 4^ws raw categories. - let n_raw = table_size; - let c = nwords / n_raw; - let r = nwords % n_raw; - let nf = nwords as f64; - let t1 = if c == 0 || n_raw == r { - 0.0 - } else { - let f1 = c as f64 / nf; - (n_raw - r) as f64 * f1 * f1.ln() - }; - let t2 = if r == 0 { - 0.0 - } else { - let f2 = (c + 1) as f64 / nf; - r as f64 * f2 * f2.ln() - }; - emax[ws] = -(t1 + t2); + freq_buf[ws] = vec![0u32; 1 << (ws * 2)]; } - - Self { k, level_max, threshold, norm_tables, log_s_tables, n_log_n, emax, log_nwords, freq_buf } + Self { k, level_max, threshold, freq_buf } } /// Returns `true` if the kmer's entropy is strictly above the threshold. @@ -103,12 +42,12 @@ impl EntropyFilter { for ws in 1..=self.level_max { let nwords = k - ws + 1; - let emax = self.emax[ws]; - if emax <= 0.0 { continue; } + let em = emax(k, ws); + if em <= 0.0 { + continue; + } let mask = (1usize << (ws * 2)) - 1; - let norm = &self.norm_tables[ws]; - let log_s = &self.log_s_tables[ws]; let freq = &mut self.freq_buf[ws]; // Slide a ws-mer window; track only written indices (≤ nwords ≤ 31). @@ -121,7 +60,7 @@ impl EntropyFilter { for i in 0..nwords { let base = ((kmer >> (2 * (k - ws - i))) & 3) as usize; word = ((word << 2) | base) & mask; - let idx = norm[word] as usize; + let idx = entropy_norm_kmer(word as u64, ws, false) as usize; if freq[idx] == 0 { dirty[ndirty] = idx as u16; ndirty += 1; @@ -131,20 +70,20 @@ impl EntropyFilter { // H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words // Reset freq in the same pass to avoid a separate zeroing loop. - let log_nw = self.log_nwords[ws]; + let log_nw = log_nwords(k, ws); let nw_f = nwords as f64; let mut sum_f_log_f = 0.0f64; let mut sum_f_log_s = 0.0f64; for &j in &dirty[..ndirty] { let j = j as usize; let f = freq[j] as usize; - sum_f_log_f += self.n_log_n[f]; - sum_f_log_s += f as f64 * log_s[j]; + sum_f_log_f += n_log_n(f); + sum_f_log_s += f as f64 * ln_class_size(j as u64, ws, false); freq[j] = 0; } let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f; - let entropy = (h_corr / emax).max(0.0); + let entropy = (h_corr / em).max(0.0); if entropy < min_entropy { min_entropy = entropy; } @@ -158,18 +97,3 @@ impl EntropyFilter { if min_entropy == f64::MAX { 1.0 } else { min_entropy } } } - -/// Lexicographically smallest circular rotation of a ws-mer (right-aligned u64). -fn normalize_circular(kmer: u64, ws: usize) -> u64 { - let mask = (1u64 << (ws * 2)) - 1; - let mut canonical = kmer & mask; - let mut current = canonical; - for _ in 0..ws - 1 { - let top = (current >> ((ws - 1) * 2)) & 3; - current = ((current << 2) | top) & mask; - if current < canonical { - canonical = current; - } - } - canonical -} diff --git a/src/obiskbuilder/src/entropy_table.rs b/src/obiskbuilder/src/entropy_table.rs new file mode 100644 index 0000000..26bd667 --- /dev/null +++ b/src/obiskbuilder/src/entropy_table.rs @@ -0,0 +1,175 @@ +use obikseq::kmer::Kmer; +use std::sync::LazyLock; + +pub(crate) static NORMK1: LazyLock<[u64; 4]> = LazyLock::new(|| build_normalized_kmer::<4>()); +pub(crate) static NORMK2: LazyLock<[u64; 16]> = LazyLock::new(|| build_normalized_kmer::<16>()); +pub(crate) static NORMK3: LazyLock<[u64; 64]> = LazyLock::new(|| build_normalized_kmer::<64>()); +pub(crate) static NORMK4: LazyLock<[u64; 256]> = LazyLock::new(|| build_normalized_kmer::<256>()); +pub(crate) static NORMK5: LazyLock<[u64; 1024]> = LazyLock::new(|| build_normalized_kmer::<1024>()); +pub(crate) static NORMK6: LazyLock<[u64; 4096]> = LazyLock::new(|| build_normalized_kmer::<4096>()); + +pub(crate) static LN_CARD_ROT1: LazyLock<[f64; 4]> = + LazyLock::new(|| build_log_class_size::<4>(&NORMK1)); +pub(crate) static LN_CARD_ROT2: LazyLock<[f64; 16]> = + LazyLock::new(|| build_log_class_size::<16>(&NORMK2)); +pub(crate) static LN_CARD_ROT3: LazyLock<[f64; 64]> = + LazyLock::new(|| build_log_class_size::<64>(&NORMK3)); +pub(crate) static LN_CARD_ROT4: LazyLock<[f64; 256]> = + LazyLock::new(|| build_log_class_size::<256>(&NORMK4)); +pub(crate) static LN_CARD_ROT5: LazyLock<[f64; 1024]> = + LazyLock::new(|| build_log_class_size::<1024>(&NORMK5)); +pub(crate) static LN_CARD_ROT6: LazyLock<[f64; 4096]> = + LazyLock::new(|| build_log_class_size::<4096>(&NORMK6)); + +fn ln0(x: f64) -> f64 { + if x == 0.0 { 0.0 } else { x.ln() } +} + +fn normalize_circular(kmer: u64, ws: usize) -> u64 { + let mask = (1u64 << (ws * 2)) - 1; + let mut canonical = kmer & mask; + let mut current = canonical; + for _ in 0..ws - 1 { + let top = (current >> ((ws - 1) * 2)) & 3; + current = ((current << 2) | top) & mask; + if current < canonical { + canonical = current; + } + } + canonical +} + +fn build_normalized_kmer() -> [u64; N] { + let mut result = [0u64; N]; + let k = N.ilog(4) as usize; + let shift = 64 - k * 2; + for i in 0..N { + let la = (i as u64) << shift; + let ra = i as u64; + let rc_ra = Kmer::from_raw(la).revcomp(k).raw() >> shift; + let circ = normalize_circular(ra, k); + let circ_rc = normalize_circular(rc_ra, k); + result[i] = circ.min(circ_rc); + } + result +} + +fn build_log_class_size(norm: &[u64; N]) -> [f64; N] { + let mut sizes = [0u32; N]; + for &c in norm { + sizes[c as usize] += 1; + } + let mut result = [0.0f64; N]; + for i in 0..N { + if sizes[i] > 0 { + result[i] = ln0(sizes[i] as f64); + } + } + result +} + +pub(crate) fn entropy_norm_kmer(kmer: u64, k: usize, left: bool) -> u64 { + let shift = 64 - k * 2; + let ra = if left { kmer >> shift } else { kmer }; // left-aligned → right-aligned index + let canonical_ra = match k { + 1 => NORMK1[ra as usize], + 2 => NORMK2[ra as usize], + 3 => NORMK3[ra as usize], + 4 => NORMK4[ra as usize], + 5 => NORMK5[ra as usize], + 6 => NORMK6[ra as usize], + _ => panic!("k must be 1..=6"), + }; + if left { + canonical_ra << shift + } else { + canonical_ra + } // right-aligned → left-aligned +} + +pub(crate) fn ln_class_size(kmer: u64, k: usize, left: bool) -> f64 { + let ra = if left { kmer >> (64 - k * 2) } else { kmer }; // left-aligned → right-aligned index + match k { + 1 => LN_CARD_ROT1[NORMK1[ra as usize] as usize], + 2 => LN_CARD_ROT2[NORMK2[ra as usize] as usize], + 3 => LN_CARD_ROT3[NORMK3[ra as usize] as usize], + 4 => LN_CARD_ROT4[NORMK4[ra as usize] as usize], + 5 => LN_CARD_ROT5[NORMK5[ra as usize] as usize], + 6 => LN_CARD_ROT6[NORMK6[ra as usize] as usize], + _ => panic!("k must be 1..=6"), + } +} + +// ── k-dependent tables (k ≤ K_MAX, ws ≤ WS_MAX) ────────────────────────────── + +pub(crate) const K_MAX: usize = 32; +pub(crate) const WS_MAX: usize = 6; + +/// n·ln(n), with n_log_n[0] = 0. Indexed by n = 0..=K_MAX. +pub(crate) static N_LOG_N: LazyLock<[f64; K_MAX + 1]> = + LazyLock::new(|| build_n_log_n()); + +/// H_max[k][ws]: maximum entropy for kmer length k and word size ws. +pub(crate) static EMAX: LazyLock<[[f64; WS_MAX + 1]; K_MAX + 1]> = + LazyLock::new(|| build_emax()); + +/// ln(k − ws + 1): log of the number of ws-words in a kmer of length k. +pub(crate) static LOG_NWORDS: LazyLock<[[f64; WS_MAX + 1]; K_MAX + 1]> = + LazyLock::new(|| build_log_nwords()); + +pub(crate) fn n_log_n(n: usize) -> f64 { + N_LOG_N[n] +} + +pub(crate) fn emax(k: usize, ws: usize) -> f64 { + EMAX[k][ws] +} + +pub(crate) fn log_nwords(k: usize, ws: usize) -> f64 { + LOG_NWORDS[k][ws] +} + +fn build_n_log_n() -> [f64; K_MAX + 1] { + let mut result = [0.0f64; K_MAX + 1]; + for n in 1..=K_MAX { + result[n] = (n as f64) * (n as f64).ln(); + } + result +} + +fn build_emax() -> [[f64; WS_MAX + 1]; K_MAX + 1] { + let mut result = [[0.0f64; WS_MAX + 1]; K_MAX + 1]; + for k in 2..=K_MAX { + for ws in 1..=WS_MAX.min(k - 1) { + let n_raw = 1usize << (ws * 2); // 4^ws + let nwords = k - ws + 1; + let c = nwords / n_raw; + let r = nwords % n_raw; + let nf = nwords as f64; + let t1 = if c == 0 || n_raw == r { + 0.0 + } else { + let f1 = c as f64 / nf; + (n_raw - r) as f64 * f1 * f1.ln() + }; + let t2 = if r == 0 { + 0.0 + } else { + let f2 = (c + 1) as f64 / nf; + r as f64 * f2 * f2.ln() + }; + result[k][ws] = -(t1 + t2); + } + } + result +} + +fn build_log_nwords() -> [[f64; WS_MAX + 1]; K_MAX + 1] { + let mut result = [[0.0f64; WS_MAX + 1]; K_MAX + 1]; + for k in 2..=K_MAX { + for ws in 1..=WS_MAX.min(k - 1) { + result[k][ws] = ((k - ws + 1) as f64).ln(); + } + } + result +} diff --git a/src/obiskbuilder/src/iter.rs b/src/obiskbuilder/src/iter.rs index 91760aa..bf16b91 100644 --- a/src/obiskbuilder/src/iter.rs +++ b/src/obiskbuilder/src/iter.rs @@ -18,7 +18,7 @@ use obikrope::{ForwardCursor, Rope, RopeCursor}; use obikseq::superkmer::SuperKmer; -use crate::encoding::encode_base; +use crate::encoding::encode_nuc; use crate::entropy::EntropyFilter; use crate::minimizer::MinimizerState; use crate::scratch::SuperKmerScratch; @@ -97,7 +97,7 @@ impl<'a> Iterator for SuperKmerIter<'a> { Some(b) => b, }; - let base2 = encode_base(byte); + let base2 = encode_nuc(byte); let kmer_ready = self.window.push(base2); let current_min = self.minimizer.push(base2); diff --git a/src/obiskbuilder/src/lib.rs b/src/obiskbuilder/src/lib.rs index 8afdee3..19a235a 100644 --- a/src/obiskbuilder/src/lib.rs +++ b/src/obiskbuilder/src/lib.rs @@ -5,12 +5,15 @@ #![deny(missing_docs)] -mod encoding; mod entropy; +pub mod iter; mod minimizer; mod scratch; mod window; -pub mod iter; + +pub(crate) mod encoding; +pub(crate) mod entropy_table; +pub(crate) mod rolling_stat; pub use iter::SuperKmerIter; pub use scratch::SuperKmerScratch; diff --git a/src/obiskbuilder/src/rolling_stat.rs b/src/obiskbuilder/src/rolling_stat.rs new file mode 100644 index 0000000..b3fc6f9 --- /dev/null +++ b/src/obiskbuilder/src/rolling_stat.rs @@ -0,0 +1,227 @@ +use obikseq::kmer::Kmer; + +use crate::encoding::encode_nuc; +use crate::entropy_table::{emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n}; +use std::collections::VecDeque; + +#[derive(Clone, Copy)] +struct MmerItem { + /// 0-based position of this m-mer's first base within the current segment. + position: usize, + canonical: u64, +} + +pub struct RollingStat { + k: usize, + m: usize, + rolling_k: u64, + rolling_rck: u64, + k_mask: u64, + m_mask: u64, + shift_left: usize, + received: usize, + k1q: VecDeque, + k2q: VecDeque, + k3q: VecDeque, + k4q: VecDeque, + k5q: VecDeque, + k6q: VecDeque, + minimier: VecDeque, + k1c: Vec, + k2c: Vec, + k3c: Vec, + k4c: Vec, + k5c: Vec, + k6c: Vec, +} + +impl RollingStat { + pub fn new(k: usize, m: usize) -> Self { + Self { + k, + m, + rolling_k: 0, + rolling_rck: 0, + k_mask: (!0) >> (64 - k * 2), + m_mask: (!0) >> (64 - m * 2), + shift_left: (m - 1) * 2, + received: 0, + k1q: VecDeque::with_capacity(k), + k2q: VecDeque::with_capacity(k - 1), + k3q: VecDeque::with_capacity(k - 2), + k4q: VecDeque::with_capacity(k - 3), + k5q: VecDeque::with_capacity(k - 4), + k6q: VecDeque::with_capacity(k - 5), + minimier: VecDeque::with_capacity(k-m+2), + k1c: vec![0; 4_usize.pow(1)], + k2c: vec![0; 4_usize.pow(2)], + k3c: vec![0; 4_usize.pow(3)], + k4c: vec![0; 4_usize.pow(4)], + k5c: vec![0; 4_usize.pow(5)], + k6c: vec![0; 4_usize.pow(6)], + } + } + + pub fn reset(&mut self) { + self.rolling_k = 0; + self.rolling_rck = 0; + self.received = 0; + self.k1q.clear(); + self.k2q.clear(); + self.k3q.clear(); + self.k4q.clear(); + self.k5q.clear(); + self.k6q.clear(); + self.minimier.clear(); + self.k1c.fill(0); + self.k2c.fill(0); + self.k3c.fill(0); + self.k4c.fill(0); + self.k5c.fill(0); + self.k6c.fill(0); + } + + pub fn push(&mut self, nuc: char) { + let bnuc = encode_nuc(nuc as u8); + let cnuc = bnuc ^ 3; + + self.rolling_k = ((self.rolling_k << 2) | (cnuc as u64)) & self.k_mask; + self.rolling_rck = + ((self.rolling_rck >> 2) | ((cnuc as u64) << (self.k * 2))) & self.k_mask; + + + let canonical_k1 = entropy_norm_kmer(self.rolling_k & 3, 1, false); + let canonical_k2 = entropy_norm_kmer(self.rolling_k & 15, 2, false); + let canonical_k3 = entropy_norm_kmer(self.rolling_k & 63, 3, false); + let canonical_k4 = entropy_norm_kmer(self.rolling_k & 255, 4, false); + let canonical_k5 = entropy_norm_kmer(self.rolling_k & 1023, 5, false); + let canonical_k6 = entropy_norm_kmer(self.rolling_k & 4095, 6, false); + + self.received += 1; + + if self.received >= self.m { + let possible_canonical_m = + (self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2)); + let possible_pos_m = self.received - self.m; + + while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) { + self.minimier.pop_back(); + } + self.minimier.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m }); + } + + if self.received > self.k { + self.k1c[self.k1q.pop_front().unwrap() as usize] -= 1; + self.k2c[self.k2q.pop_front().unwrap() as usize] -= 1; + self.k3c[self.k3q.pop_front().unwrap() as usize] -= 1; + self.k4c[self.k4q.pop_front().unwrap() as usize] -= 1; + self.k5c[self.k5q.pop_front().unwrap() as usize] -= 1; + self.k6c[self.k6q.pop_front().unwrap() as usize] -= 1; + } + + self.k1c[canonical_k1 as usize] += 1; + self.k1q.push_back(canonical_k1); + if self.received >= 2 { + self.k2c[canonical_k2 as usize] += 1; + self.k2q.push_back(canonical_k2); + if self.received >= 3 { + self.k3c[canonical_k3 as usize] += 1; + self.k3q.push_back(canonical_k3); + if self.received >= 4 { + self.k4c[canonical_k4 as usize] += 1; + self.k4q.push_back(canonical_k4); + if self.received >= 5 { + self.k5c[canonical_k5 as usize] += 1; + self.k5q.push_back(canonical_k5); + if self.received >= 6 { + self.k6c[canonical_k6 as usize] += 1; + self.k6q.push_back(canonical_k6); + } + } + } + } + } + } + + pub fn ready(&self) -> bool { + self.received >= self.k + } + + pub fn kmer(&self) -> Option { + if self.ready() { + Some(Kmer::from_raw_right(self.rolling_k, self.k)) + } else { + None + } + } + + pub fn revcomp_kmer(&self) -> Option { + if self.ready() { + Some(Kmer::from_raw_right(self.rolling_rck, self.k)) + } else { + None + } + } + + pub fn canonical_kmer(&self) -> Option { + if self.ready() { + Some(Kmer::from_raw_right( + self.rolling_k.min(self.rolling_rck), self.k)) + } else { + None + } + } + + pub fn minimizer_position(&self) -> Option { + if self.ready() { + self.minimier.front().map(|it| it.position) + } else { + None + } + } + + pub fn minimizer_canonical(&self) -> Option { + if self.ready() { + self.minimier.front().map(|it| Kmer::from_raw_right(it.canonical, self.k)) + } else { + None + } + } + + pub fn entropy(&self, order: usize) -> Option { + if !self.ready() { return None; } + let k = self.k; + let em = emax(k, order); + if em <= 0.0 { return Some(1.0); } + let counts = match order { + 1 => &self.k1c, + 2 => &self.k2c, + 3 => &self.k3c, + 4 => &self.k4c, + 5 => &self.k5c, + 6 => &self.k6c, + _ => return None, + }; + let nwords = k - order + 1; + let log_nw = log_nwords(k, order); + let nw_f = nwords as f64; + let mut sum_f_log_f = 0.0f64; + let mut sum_f_log_s = 0.0f64; + for (j, &f) in counts.iter().enumerate() { + if f > 0 { + sum_f_log_f += n_log_n(f); + sum_f_log_s += f as f64 * ln_class_size(j as u64, order, false); + } + } + let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f; + Some((h_corr / em).max(0.0)) + } + + pub fn normalized_entropy(&self) -> Option { + if !self.ready() { return None; } + let min_e = (1..=6) + .filter_map(|ws| self.entropy(ws)) + .fold(f64::MAX, f64::min); + Some(if min_e == f64::MAX { 1.0 } else { min_e }) + } +} diff --git a/src/obiskbuilder/src/scratch.rs b/src/obiskbuilder/src/scratch.rs index 2540951..ceb3cf7 100644 --- a/src/obiskbuilder/src/scratch.rs +++ b/src/obiskbuilder/src/scratch.rs @@ -1,7 +1,7 @@ //! Stack-allocated scratch buffer for building a SuperKmer before heap emission. +use crate::encoding::{BYTE_LEN_MAX, encode_nuc}; use obikseq::superkmer::SuperKmer; -use crate::encoding::{encode_base, BYTE_LEN_MAX}; /// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256). pub const MAX_SUPERKMER_LEN: usize = 256; @@ -20,7 +20,10 @@ impl SuperKmerScratch { /// Create an empty scratch buffer. #[inline] pub fn new() -> Self { - Self { buf: [0u8; BYTE_LEN_MAX], len: 0 } + Self { + buf: [0u8; BYTE_LEN_MAX], + len: 0, + } } /// Number of nucleotides accumulated so far. @@ -45,7 +48,7 @@ impl SuperKmerScratch { debug_assert!(self.len < MAX_SUPERKMER_LEN, "SuperKmerScratch overflow"); let slot = self.len / 4; let shift = 6 - 2 * (self.len % 4); - self.buf[slot] |= encode_base(base) << shift; + self.buf[slot] |= encode_nuc(base) << shift; self.len += 1; } @@ -72,5 +75,7 @@ impl SuperKmerScratch { } impl Default for SuperKmerScratch { - fn default() -> Self { Self::new() } + fn default() -> Self { + Self::new() + } } diff --git a/src/profile.json.gz b/src/profile.json.gz new file mode 100644 index 0000000..20cdc1d Binary files /dev/null and b/src/profile.json.gz differ