(feat) Add entropy-based filtering and rolling statistics for k-mers
- Introduce lazy_static dependency - Refactor encoding: rename encode_base → encode_nuc and make it pub(crate) - Add from_raw_right/raw_right methods to Kmer for right-aligned handling - Improve error message formatting and code readability in kmod.rs tests - Replace inline entropy computation with precomputed tables (entropy_table module), using LazyLock for static lookup arrays - Simplify EntropyFilter by removing redundant tables and delegating to new entropy_table API - Add RollingStat module for real-time k-mer statistics and minimizer tracking - Reorganize modules: move iter, encoding to pub(crate), add entropy_table and rolling_stat - Update imports across obiskbuilder crate accordingly
This commit is contained in:
+59
-23
@@ -4,7 +4,7 @@
|
||||
//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
|
||||
//! every operation that needs it, and will be owned by the collection-level indexer.
|
||||
|
||||
use crate::encoding::{encode_base, DEC4};
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
|
||||
// ── KmerError ─────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -30,10 +30,11 @@ pub enum KmerError {
|
||||
impl std::fmt::Display for KmerError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
KmerError::OutOfBounds { position, k, seql } =>
|
||||
write!(f, "kmer of length {k} at position {position} exceeds sequence length {seql}"),
|
||||
KmerError::InvalidK { k } =>
|
||||
write!(f, "k={k} is invalid: must be in 1..=32"),
|
||||
KmerError::OutOfBounds { position, k, seql } => write!(
|
||||
f,
|
||||
"kmer of length {k} at position {position} exceeds sequence length {seql}"
|
||||
),
|
||||
KmerError::InvalidK { k } => write!(f, "k={k} is invalid: must be in 1..=32"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -55,12 +56,25 @@ impl Kmer {
|
||||
Kmer(raw)
|
||||
}
|
||||
|
||||
/// Wrap a raw right-aligned u64 value as a Kmer.
|
||||
/// The raw value is shifted left by `64 - 2 * k` bits so that it occupies the most significant `2 * k` bits.
|
||||
#[inline]
|
||||
pub fn from_raw_right(raw: u64, k: usize) -> Self {
|
||||
Kmer(raw << (64 - 2 * k))
|
||||
}
|
||||
|
||||
/// Return the raw left-aligned u64 value.
|
||||
#[inline]
|
||||
pub fn raw(&self) -> u64 {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Return the raw right-aligned u64 value.
|
||||
#[inline]
|
||||
pub fn raw_right(&self, k: usize) -> u64 {
|
||||
self.0 >> (64 - 2 * k)
|
||||
}
|
||||
|
||||
/// Encode the first k nucleotides of an ASCII slice into a Kmer.
|
||||
/// Zero allocation — result lives on the stack.
|
||||
#[inline]
|
||||
@@ -69,7 +83,11 @@ impl Kmer {
|
||||
return Err(KmerError::InvalidK { k });
|
||||
}
|
||||
if ascii.len() < k {
|
||||
return Err(KmerError::OutOfBounds { position: 0, k, seql: ascii.len() });
|
||||
return Err(KmerError::OutOfBounds {
|
||||
position: 0,
|
||||
k,
|
||||
seql: ascii.len(),
|
||||
});
|
||||
}
|
||||
let mut val = 0u64;
|
||||
for i in 0..k {
|
||||
@@ -98,7 +116,7 @@ impl Kmer {
|
||||
pub fn write_ascii(&self, k: usize, buf: &mut Vec<u8>) {
|
||||
let bytes = self.0.to_be_bytes();
|
||||
let full = k / 4;
|
||||
let rem = k % 4;
|
||||
let rem = k % 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[bytes[i] as usize].to_be_bytes());
|
||||
}
|
||||
@@ -112,10 +130,10 @@ impl Kmer {
|
||||
/// Zero allocation — result lives on the stack.
|
||||
#[inline]
|
||||
pub fn revcomp(&self, k: usize) -> Self {
|
||||
let x = !self.0; // complement
|
||||
let x = x.swap_bytes(); // reverse bytes
|
||||
let x = !self.0; // complement
|
||||
let x = x.swap_bytes(); // reverse bytes
|
||||
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
|
||||
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
|
||||
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
|
||||
Kmer(x << (64 - 2 * k))
|
||||
}
|
||||
|
||||
@@ -135,11 +153,16 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||
seq.iter().rev().map(|&b| match b {
|
||||
b'A' => b'T', b'T' => b'A',
|
||||
b'C' => b'G', b'G' => b'C',
|
||||
_ => b'A',
|
||||
}).collect()
|
||||
seq.iter()
|
||||
.rev()
|
||||
.map(|&b| match b {
|
||||
b'A' => b'T',
|
||||
b'T' => b'A',
|
||||
b'C' => b'G',
|
||||
b'G' => b'C',
|
||||
_ => b'A',
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
const K_VALUES: &[usize] = &[1, 2, 3, 4, 8, 11, 16, 31, 32];
|
||||
@@ -194,10 +217,10 @@ mod tests {
|
||||
#[test]
|
||||
fn revcomp_known_values() {
|
||||
let cases: &[(&[u8], &[u8])] = &[
|
||||
(b"A", b"T"),
|
||||
(b"AC", b"GT"),
|
||||
(b"ACG", b"CGT"),
|
||||
(b"ACGT", b"ACGT"), // palindrome
|
||||
(b"A", b"T"),
|
||||
(b"AC", b"GT"),
|
||||
(b"ACG", b"CGT"),
|
||||
(b"ACGT", b"ACGT"), // palindrome
|
||||
(b"AAAA", b"TTTT"),
|
||||
(b"TTTT", b"AAAA"),
|
||||
];
|
||||
@@ -205,7 +228,12 @@ mod tests {
|
||||
let k = seq.len();
|
||||
let kmer = Kmer::from_ascii(seq, k).unwrap();
|
||||
let rc = kmer.revcomp(k);
|
||||
assert_eq!(rc.to_ascii(k), *expected, "revcomp wrong for \"{}\"", std::str::from_utf8(seq).unwrap());
|
||||
assert_eq!(
|
||||
rc.to_ascii(k),
|
||||
*expected,
|
||||
"revcomp wrong for \"{}\"",
|
||||
std::str::from_utf8(seq).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -224,7 +252,11 @@ mod tests {
|
||||
for &k in K_VALUES {
|
||||
let ascii = make_seq(k);
|
||||
let kmer = Kmer::from_ascii(&ascii, k).unwrap();
|
||||
assert_eq!(kmer.revcomp(k).revcomp(k), kmer, "revcomp∘revcomp≠id for k={k}");
|
||||
assert_eq!(
|
||||
kmer.revcomp(k).revcomp(k),
|
||||
kmer,
|
||||
"revcomp∘revcomp≠id for k={k}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,7 +280,7 @@ mod tests {
|
||||
for &k in K_VALUES {
|
||||
let ascii = make_seq(k);
|
||||
let kmer = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
|
||||
let rc = kmer.revcomp(k);
|
||||
let rc = kmer.revcomp(k);
|
||||
assert!(kmer.0 <= rc.0, "canonical not minimal for k={k}");
|
||||
}
|
||||
}
|
||||
@@ -257,7 +289,11 @@ mod tests {
|
||||
fn canonical_idempotent() {
|
||||
for &k in K_VALUES {
|
||||
let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
|
||||
assert_eq!(kmer.canonical(k), kmer, "canonical not idempotent for k={k}");
|
||||
assert_eq!(
|
||||
kmer.canonical(k),
|
||||
kmer,
|
||||
"canonical not idempotent for k={k}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user