First implementation, but far from optimal
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "obiskbuilder"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
obikseq = { path = "../obikseq" }
|
||||
obiread = { path = "../obiread" }
|
||||
bytes = "1"
|
||||
@@ -0,0 +1,16 @@
|
||||
//! 2-bit encoding helpers shared across obiskbuilder.
|
||||
|
||||
/// Maximum bytes needed for a 256-nucleotide super-kmer (256/4 = 64).
|
||||
pub const BYTE_LEN_MAX: usize = 64;
|
||||
|
||||
/// Map an uppercase ASCII nucleotide to its 2-bit code.
///
/// `A`, `C`, `G` and `T` map to `0`, `1`, `2` and `3` respectively. Any other
/// byte falls back to `0`, i.e. it is encoded like `A`.
#[inline]
pub fn encode_base(b: u8) -> u8 {
    // The index of the byte in this alphabet is its 2-bit code.
    const ALPHABET: [u8; 4] = [b'A', b'C', b'G', b'T'];
    ALPHABET
        .iter()
        .position(|&nucleotide| nucleotide == b)
        .map_or(0b00, |code| code as u8)
}
|
||||
@@ -0,0 +1,175 @@
|
||||
//! Entropy filter for low-complexity k-mer detection.
|
||||
//!
|
||||
//! Formula (corrected from the Go version):
|
||||
//! H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words
|
||||
//! H_max uses 4^ws raw categories (not the canonical class count)
|
||||
//! Ĥ(ws) = H_corr / H_max ∈ [0,1]
|
||||
//! score = min over ws=1..level_max of Ĥ(ws)
|
||||
//!
|
||||
//! A kmer is rejected if score ≤ theta.
|
||||
|
||||
/// Pre-computed entropy filter. One instance per worker thread (not Send/Sync
|
||||
/// because of the mutable freq scratch buffer; wrap in a closure per thread).
|
||||
pub struct EntropyFilter {
|
||||
k: usize,
|
||||
level_max: usize,
|
||||
threshold: f64,
|
||||
/// norm_tables[ws][raw_code] = canonical circular-rotation code
|
||||
norm_tables: Vec<Vec<u16>>,
|
||||
/// log_s_tables[ws][canonical] = ln(class_size), where class_size is the
|
||||
/// number of raw codes mapping to this canonical form.
|
||||
log_s_tables: Vec<Vec<f64>>,
|
||||
/// n_log_n[n] = n · ln(n), n_log_n[0] = 0
|
||||
n_log_n: Vec<f64>,
|
||||
/// Pre-computed H_max per word size.
|
||||
emax: Vec<f64>,
|
||||
/// Pre-computed ln(n_words) per word size.
|
||||
log_nwords: Vec<f64>,
|
||||
/// Reusable frequency buffer per word size (reset before each kmer).
|
||||
freq_buf: Vec<Vec<u32>>,
|
||||
}
|
||||
|
||||
impl EntropyFilter {
|
||||
pub fn new(k: usize, level_max: usize, threshold: f64) -> Self {
|
||||
let level_max = level_max.min(k - 1).max(1);
|
||||
|
||||
let mut n_log_n = vec![0.0f64; k + 1];
|
||||
for n in 1..=k {
|
||||
n_log_n[n] = (n as f64) * (n as f64).ln();
|
||||
}
|
||||
|
||||
let mut norm_tables: Vec<Vec<u16>> = vec![vec![]; level_max + 1];
|
||||
let mut log_s_tables: Vec<Vec<f64>> = vec![vec![]; level_max + 1];
|
||||
let mut emax = vec![0.0f64; level_max + 1];
|
||||
let mut log_nwords = vec![0.0f64; level_max + 1];
|
||||
let mut freq_buf: Vec<Vec<u32>> = vec![vec![]; level_max + 1];
|
||||
|
||||
for ws in 1..=level_max {
|
||||
let table_size = 1usize << (ws * 2); // 4^ws
|
||||
let nwords = k - ws + 1;
|
||||
|
||||
// Build circular-rotation canonical table.
|
||||
let norm: Vec<u16> = (0..table_size)
|
||||
.map(|c| normalize_circular(c as u64, ws) as u16)
|
||||
.collect();
|
||||
|
||||
// Count how many raw codes map to each canonical form → class sizes.
|
||||
let mut class_sizes = vec![0u32; table_size];
|
||||
for &c in &norm {
|
||||
class_sizes[c as usize] += 1;
|
||||
}
|
||||
|
||||
let log_s: Vec<f64> = class_sizes.iter()
|
||||
.map(|&s| if s > 0 { (s as f64).ln() } else { 0.0 })
|
||||
.collect();
|
||||
|
||||
norm_tables[ws] = norm;
|
||||
log_s_tables[ws] = log_s;
|
||||
freq_buf[ws] = vec![0u32; table_size];
|
||||
log_nwords[ws] = (nwords as f64).ln();
|
||||
|
||||
// H_max using 4^ws raw categories.
|
||||
let n_raw = table_size;
|
||||
let c = nwords / n_raw;
|
||||
let r = nwords % n_raw;
|
||||
let nf = nwords as f64;
|
||||
let t1 = if c == 0 || n_raw == r {
|
||||
0.0
|
||||
} else {
|
||||
let f1 = c as f64 / nf;
|
||||
(n_raw - r) as f64 * f1 * f1.ln()
|
||||
};
|
||||
let t2 = if r == 0 {
|
||||
0.0
|
||||
} else {
|
||||
let f2 = (c + 1) as f64 / nf;
|
||||
r as f64 * f2 * f2.ln()
|
||||
};
|
||||
emax[ws] = -(t1 + t2);
|
||||
}
|
||||
|
||||
Self { k, level_max, threshold, norm_tables, log_s_tables, n_log_n, emax, log_nwords, freq_buf }
|
||||
}
|
||||
|
||||
/// Returns `true` if the kmer's entropy is strictly above the threshold.
|
||||
pub fn accept(&mut self, kmer: u64) -> bool {
|
||||
self.entropy(kmer) > self.threshold
|
||||
}
|
||||
|
||||
/// Compute the minimum normalised entropy across all word sizes.
|
||||
pub fn entropy(&mut self, kmer: u64) -> f64 {
|
||||
let k = self.k;
|
||||
let mut min_entropy = f64::MAX;
|
||||
|
||||
for ws in 1..=self.level_max {
|
||||
let nwords = k - ws + 1;
|
||||
let emax = self.emax[ws];
|
||||
if emax <= 0.0 { continue; }
|
||||
|
||||
let mask = (1usize << (ws * 2)) - 1;
|
||||
let norm = &self.norm_tables[ws];
|
||||
let log_s = &self.log_s_tables[ws];
|
||||
let freq = &mut self.freq_buf[ws];
|
||||
|
||||
// Slide a ws-mer window; track only written indices (≤ nwords ≤ 31).
|
||||
let mut dirty = [0u16; 32];
|
||||
let mut ndirty = 0usize;
|
||||
let mut word = 0usize;
|
||||
for i in 0..ws - 1 {
|
||||
word = (word << 2) | ((kmer >> (2 * (k - 1 - i))) & 3) as usize;
|
||||
}
|
||||
for i in 0..nwords {
|
||||
let base = ((kmer >> (2 * (k - ws - i))) & 3) as usize;
|
||||
word = ((word << 2) | base) & mask;
|
||||
let idx = norm[word] as usize;
|
||||
if freq[idx] == 0 {
|
||||
dirty[ndirty] = idx as u16;
|
||||
ndirty += 1;
|
||||
}
|
||||
freq[idx] += 1;
|
||||
}
|
||||
|
||||
// H_corr = log(n_words) + (Σ fⱼ·log(sⱼ) − Σ fⱼ·log(fⱼ)) / n_words
|
||||
// Reset freq in the same pass to avoid a separate zeroing loop.
|
||||
let log_nw = self.log_nwords[ws];
|
||||
let nw_f = nwords as f64;
|
||||
let mut sum_f_log_f = 0.0f64;
|
||||
let mut sum_f_log_s = 0.0f64;
|
||||
for &j in &dirty[..ndirty] {
|
||||
let j = j as usize;
|
||||
let f = freq[j] as usize;
|
||||
sum_f_log_f += self.n_log_n[f];
|
||||
sum_f_log_s += f as f64 * log_s[j];
|
||||
freq[j] = 0;
|
||||
}
|
||||
|
||||
let h_corr = log_nw + (sum_f_log_s - sum_f_log_f) / nw_f;
|
||||
let entropy = (h_corr / emax).max(0.0);
|
||||
if entropy < min_entropy {
|
||||
min_entropy = entropy;
|
||||
}
|
||||
|
||||
// Early exit: k-mer is already rejected, no need to check further ws.
|
||||
if min_entropy <= self.threshold {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if min_entropy == f64::MAX { 1.0 } else { min_entropy }
|
||||
}
|
||||
}
|
||||
|
||||
/// Smallest value among all circular rotations of a `ws`-mer.
///
/// The word is read from the low `2 * ws` bits of `kmer` (higher bits are
/// ignored). Because bases are packed most-significant first, the numerically
/// smallest rotation is also the lexicographically smallest one. Expects `ws ≥ 1`.
fn normalize_circular(kmer: u64, ws: usize) -> u64 {
    let field = (1u64 << (ws * 2)) - 1;
    let top_shift = (ws - 1) * 2;
    let start = kmer & field;

    // Rotate left by one base ws-1 times, remembering the smallest value seen.
    let (_, smallest) = (1..ws).fold((start, start), |(rotation, best), _| {
        let leading = (rotation >> top_shift) & 3;
        let rotated = ((rotation << 2) | leading) & field;
        (rotated, best.min(rotated))
    });
    smallest
}
|
||||
@@ -0,0 +1,280 @@
|
||||
//! Iterator that consumes a normalised rope and yields `(minimizer, SuperKmer)` pairs.
|
||||
//!
|
||||
//! The normalised format is `ACGT…\x00ACGT…\x00` — uppercase ACGT segments of
|
||||
//! length ≥ k separated by NUL bytes.
|
||||
//!
|
||||
//! # Cut conditions (checked in order per nucleotide, once the k-mer window is full)
|
||||
//!
|
||||
//! In all three cases the triggering nucleotide S[j] is **not** added to the
|
||||
//! current super-kmer; the super-kmer built so far is emitted; then the rope
|
||||
//! cursor is retreated so the next segment can overlap correctly:
|
||||
//!
|
||||
//! | Condition | cursor retreat |
|
||||
//! |------------------------|----------------|
|
||||
//! | entropy(kmer) ≤ θ | k−1 |
|
||||
//! | minimizer changed | k |
|
||||
//! | super-kmer length = 256| k |
|
||||
|
||||
use bytes::Bytes;
|
||||
use obiread::tape::RopeCursor;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
|
||||
use crate::encoding::encode_base;
|
||||
use crate::entropy::EntropyFilter;
|
||||
use crate::minimizer::MinimizerState;
|
||||
use crate::scratch::SuperKmerScratch;
|
||||
use crate::window::KmerWindow;
|
||||
|
||||
/// Iterator over `(minimizer, SuperKmer)` pairs.
|
||||
pub struct SuperKmerIter {
|
||||
rope: Vec<Bytes>,
|
||||
cursor: RopeCursor,
|
||||
k: usize,
|
||||
scratch: SuperKmerScratch,
|
||||
window: KmerWindow,
|
||||
entropy: EntropyFilter,
|
||||
minimizer: MinimizerState,
|
||||
prev_minimizer: Option<u64>,
|
||||
prev_minimizer_pos: u8,
|
||||
}
|
||||
|
||||
impl SuperKmerIter {
|
||||
/// Build an iterator from a normalised rope chunk.
|
||||
///
|
||||
/// - `k`: k-mer size (1–31)
|
||||
/// - `m`: minimizer size (1 < m < k)
|
||||
/// - `level_max`: maximum sub-word size for entropy (typically 6)
|
||||
/// - `theta`: entropy threshold; k-mers with score ≤ theta are rejected
|
||||
pub fn new(
|
||||
chunks: Vec<Bytes>,
|
||||
k: usize,
|
||||
m: usize,
|
||||
level_max: usize,
|
||||
theta: f64,
|
||||
) -> Self {
|
||||
Self {
|
||||
rope: chunks,
|
||||
cursor: RopeCursor::new(),
|
||||
k,
|
||||
scratch: SuperKmerScratch::new(),
|
||||
window: KmerWindow::new(k),
|
||||
entropy: EntropyFilter::new(k, level_max, theta),
|
||||
minimizer: MinimizerState::new(k, m),
|
||||
prev_minimizer: None,
|
||||
prev_minimizer_pos: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_byte(&mut self) -> Option<u8> {
|
||||
let b = self.cursor.peek(&self.rope)?;
|
||||
self.cursor.advance(&self.rope);
|
||||
Some(b)
|
||||
}
|
||||
|
||||
fn reset_state(&mut self) {
|
||||
self.window.reset();
|
||||
self.minimizer.reset();
|
||||
self.scratch.reset();
|
||||
self.prev_minimizer = None;
|
||||
self.prev_minimizer_pos = 0;
|
||||
}
|
||||
|
||||
/// Emit the current scratch as a super-kmer if it holds at least k nucleotides.
|
||||
/// Returns `None` and silently discards short buffers (< k nt, no complete kmer).
|
||||
fn try_emit(&mut self, min: u64) -> Option<(u64, SuperKmer)> {
|
||||
if self.scratch.len() < self.k {
|
||||
self.scratch.reset();
|
||||
return None;
|
||||
}
|
||||
let mut sk = self.scratch.emit();
|
||||
sk.set_minimizer_pos(self.prev_minimizer_pos);
|
||||
Some((min, sk))
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for SuperKmerIter {
|
||||
type Item = (u64, SuperKmer);
|
||||
|
||||
fn next(&mut self) -> Option<(u64, SuperKmer)> {
|
||||
loop {
|
||||
let byte = match self.read_byte() {
|
||||
None => {
|
||||
// EOF: flush whatever we have.
|
||||
let min = self.prev_minimizer.unwrap_or(0);
|
||||
return self.try_emit(min);
|
||||
}
|
||||
Some(0x00) => {
|
||||
// Segment boundary: flush current super-kmer, then reset.
|
||||
let min = self.prev_minimizer.unwrap_or(0);
|
||||
let result = self.try_emit(min);
|
||||
self.reset_state();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Some(b) => b,
|
||||
};
|
||||
|
||||
let base2 = encode_base(byte);
|
||||
|
||||
// Update sliding windows; kmer_ready is true once we have k bases.
|
||||
let kmer_ready = self.window.push(base2);
|
||||
let current_min = self.minimizer.push(base2);
|
||||
|
||||
if !kmer_ready {
|
||||
// Warm-up phase: just accumulate.
|
||||
self.scratch.push(byte);
|
||||
continue;
|
||||
}
|
||||
|
||||
let kmer = self.window.kmer_u64();
|
||||
let min = current_min.unwrap(); // guaranteed when kmer_ready
|
||||
|
||||
// ── 1. Entropy check ─────────────────────────────────────────────
|
||||
if !self.entropy.accept(kmer) {
|
||||
let prev_min = self.prev_minimizer.unwrap_or(0);
|
||||
let result = self.try_emit(prev_min);
|
||||
self.cursor.retreat(self.k - 1, &self.rope);
|
||||
self.reset_state();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// ── 2. Minimizer change check ─────────────────────────────────────
|
||||
if let Some(prev) = self.prev_minimizer {
|
||||
if min != prev {
|
||||
let result = self.try_emit(prev);
|
||||
self.cursor.retreat(self.k, &self.rope);
|
||||
self.reset_state();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// ── 3. Super-kmer length check ────────────────────────────────────
|
||||
// S[j] would be the 257th nucleotide → don't add it.
|
||||
if self.scratch.len() == 256 {
|
||||
let result = self.try_emit(min);
|
||||
self.cursor.retreat(self.k, &self.rope);
|
||||
self.reset_state();
|
||||
if result.is_some() {
|
||||
return result;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// All checks passed: accept S[j].
|
||||
self.prev_minimizer = Some(min);
|
||||
self.prev_minimizer_pos = self.minimizer.minimizer_pos();
|
||||
self.scratch.push(byte);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use bytes::BytesMut;

    /// Wrap a byte slice into a single-slice rope.
    fn chunks(data: &[u8]) -> Vec<Bytes> {
        vec![BytesMut::from(data).freeze()]
    }

    /// Run the iterator with theta=0 and level_max=1 (minimal entropy computation)
    /// and collect the ASCII form of every emitted super-kmer.
    ///
    /// Note: acceptance is a strict `score > theta`, so theta=0 still rejects
    /// k-mers whose score is exactly 0.
    fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
        let mut collected = Vec::new();
        for (_, sk) in SuperKmerIter::new(chunks(data), k, m, 1, 0.0) {
            collected.push(sk.to_ascii());
        }
        collected
    }

    // ── basic segmentation ───────────────────────────────────────────────────

    #[test]
    fn single_segment_one_superkmer() {
        // k=4, m=2; one segment "ACGTACGT\x00". The segment may or may not be
        // split on minimizer changes; only check that something covering at
        // least one k-mer comes out.
        let superkmers = run_nofilter(b"ACGTACGT\x00", 4, 2);
        assert!(!superkmers.is_empty());
        let total: Vec<u8> = superkmers.into_iter().flatten().collect();
        assert!(total.len() >= 4);
    }

    #[test]
    fn segment_shorter_than_k_emits_nothing() {
        let superkmers = run_nofilter(b"ACG\x00", 4, 2);
        assert_eq!(superkmers, Vec::<Vec<u8>>::new());
    }

    #[test]
    fn empty_input_emits_nothing() {
        let superkmers = run_nofilter(b"", 4, 2);
        assert_eq!(superkmers, Vec::<Vec<u8>>::new());
    }

    #[test]
    fn two_segments_both_emitted() {
        // Only checks that at least one super-kmer is produced overall.
        let superkmers = run_nofilter(b"ACGTACGT\x00TTTTTTTT\x00", 4, 2);
        assert!(!superkmers.is_empty());
    }

    // ── entropy cut ──────────────────────────────────────────────────────────

    #[test]
    fn low_complexity_kmer_is_rejected() {
        // "AAAAAAAAACGT\x00": a run of A's followed by more complex sequence.
        // With theta=0 at least one super-kmer must come out.
        let lenient = run_nofilter(b"AAAAAAAAACGT\x00", 4, 2);
        assert!(!lenient.is_empty());

        // Pure polyA with a high threshold: every k-mer should fail the
        // entropy filter, so nothing is emitted.
        let strict: Vec<Vec<u8>> = SuperKmerIter::new(chunks(b"AAAAAAAA\x00"), 4, 2, 6, 0.9)
            .map(|(_, sk)| sk.to_ascii())
            .collect();
        assert!(strict.is_empty());
    }

    // ── multi-slice rope ─────────────────────────────────────────────────────

    #[test]
    fn multi_slice_rope() {
        // Same segment split across two rope slices.
        let data = b"ACGTACGTACGT\x00";
        let (head, tail) = (&data[..data.len() / 2], &data[data.len() / 2..]);
        let rope = vec![BytesMut::from(head).freeze(), BytesMut::from(tail).freeze()];
        let superkmers: Vec<Vec<u8>> = SuperKmerIter::new(rope, 4, 2, 1, 0.0)
            .map(|(_, sk)| sk.to_ascii())
            .collect();
        assert!(!superkmers.is_empty());
    }

    // ── minimizer is returned ────────────────────────────────────────────────

    #[test]
    fn yields_minimizer_value() {
        let data = b"ACGTACGT\x00";
        let pairs: Vec<(u64, Vec<u8>)> = SuperKmerIter::new(chunks(data), 4, 2, 1, 0.0)
            .map(|(min, sk)| (min, sk.to_ascii()))
            .collect();
        assert!(!pairs.is_empty());
        // The minimizer value is encoding-specific; this only verifies that
        // producing it does not panic.
        for (min, _) in &pairs {
            let _ = min;
        }
    }
}
|
||||
@@ -0,0 +1,16 @@
|
||||
//! Super-kmer construction pipeline.
|
||||
//!
|
||||
//! Consumes normalised rope chunks from `obiread` and produces
|
||||
//! `(minimizer, SuperKmer)` pairs ready for scatter routing.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod encoding;
|
||||
mod entropy;
|
||||
mod minimizer;
|
||||
mod scratch;
|
||||
mod window;
|
||||
pub mod iter;
|
||||
|
||||
pub use iter::SuperKmerIter;
|
||||
pub use scratch::SuperKmerScratch;
|
||||
@@ -0,0 +1,96 @@
|
||||
//! Sliding-window canonical minimizer via a monotone deque.
|
||||
//!
|
||||
//! The minimizer of a k-mer is the smallest canonical m-mer (min of forward
|
||||
//! and reverse-complement) among the k−m+1 m-mers it contains.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
/// One canonical m-mer candidate held in the monotone deque.
#[derive(Clone, Copy)]
struct MmerItem {
    /// 0-based position of this m-mer's first base since the last reset.
    position: usize,
    /// min(forward, reverse-complement) 2-bit packed value of the m-mer.
    canonical: u64,
}

/// Tracks the minimizer (smallest canonical m-mer) of the sliding k-mer.
pub struct MinimizerState {
    k: usize,
    m: usize,
    /// Candidates in increasing position and strictly increasing canonical value;
    /// the front is the current minimizer.
    deque: VecDeque<MmerItem>,
    /// Forward m-mer ending at the last pushed base.
    fwd_mmer: u64,
    /// Reverse complement of `fwd_mmer`.
    rvc_mmer: u64,
    /// Mask keeping the low 2m bits.
    m_mask: u64,
    /// Bit offset of the leading base slot in an m-mer.
    rc_shift: u32,
    /// Number of bases pushed since the last reset.
    local_pos: usize,
}

impl MinimizerState {
    /// Create a tracker for k-mers of length `k` and minimizers of length `m`.
    ///
    /// Debug builds assert `1 ≤ m < k ≤ 31`.
    pub fn new(k: usize, m: usize) -> Self {
        debug_assert!(m >= 1 && m < k && k <= 31);
        let deque = VecDeque::with_capacity(k - m + 2);
        let m_mask = (1u64 << (m * 2)) - 1;
        let rc_shift = ((m - 1) * 2) as u32;
        Self { k, m, deque, fwd_mmer: 0, rvc_mmer: 0, m_mask, rc_shift, local_pos: 0 }
    }

    /// Push a 2-bit base (0–3).
    ///
    /// Returns `None` for the first k−1 pushes after a reset, then
    /// `Some(minimizer)` for the k-mer ending at the pushed base.
    pub fn push(&mut self, base: u8) -> Option<u64> {
        let code = u64::from(base);

        // Roll both strands: forward shifts in at the bottom, the complement
        // enters the reverse strand at the top.
        self.fwd_mmer = ((self.fwd_mmer << 2) | code) & self.m_mask;
        self.rvc_mmer = (self.rvc_mmer >> 2) | ((code ^ 3) << self.rc_shift);
        self.local_pos += 1;
        let seen = self.local_pos;

        // A complete m-mer ends here: register it as a candidate.
        if seen >= self.m {
            let canonical = self.fwd_mmer.min(self.rvc_mmer);
            // Older candidates that are not smaller can never win again.
            while matches!(self.deque.back(), Some(last) if last.canonical >= canonical) {
                self.deque.pop_back();
            }
            self.deque.push_back(MmerItem { position: seen - self.m, canonical });
        }

        if seen < self.k {
            return None;
        }

        // Drop candidates that start before the current k-mer.
        let kmer_start = seen - self.k;
        while matches!(self.deque.front(), Some(first) if first.position < kmer_start) {
            self.deque.pop_front();
        }
        Some(match self.deque.front() {
            Some(first) => first.canonical,
            None => 0,
        })
    }

    /// Start position of the current minimizer m-mer, counted from the last reset.
    ///
    /// Returns 0 when no candidate is held. The position is narrowed to `u8`
    /// with an `as` cast. When the caller resets this state together with its
    /// super-kmer buffer, the value is the offset from the super-kmer start.
    pub fn minimizer_pos(&self) -> u8 {
        match self.deque.front() {
            Some(first) => first.position as u8,
            None => 0,
        }
    }

    /// Clear all state so the next push starts a new segment.
    pub fn reset(&mut self) {
        self.local_pos = 0;
        self.fwd_mmer = 0;
        self.rvc_mmer = 0;
        self.deque.clear();
    }
}
|
||||
@@ -0,0 +1,76 @@
|
||||
//! Stack-allocated scratch buffer for building a SuperKmer before heap emission.
|
||||
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use crate::encoding::{encode_base, BYTE_LEN_MAX};
|
||||
|
||||
/// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256).
|
||||
pub const MAX_SUPERKMER_LEN: usize = 256;
|
||||
|
||||
/// Stack buffer accumulating 2-bit–packed nucleotides for one super-kmer.
|
||||
///
|
||||
/// Nucleotides are packed MSB-first, four per byte, matching `SuperKmer` layout.
|
||||
/// Call [`SuperKmerScratch::push`] to append, [`SuperKmerScratch::emit`] to
|
||||
/// finalise and produce a heap-allocated [`SuperKmer`].
|
||||
pub struct SuperKmerScratch {
|
||||
buf: [u8; BYTE_LEN_MAX],
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl SuperKmerScratch {
|
||||
/// Create an empty scratch buffer.
|
||||
#[inline]
|
||||
pub fn new() -> Self {
|
||||
Self { buf: [0u8; BYTE_LEN_MAX], len: 0 }
|
||||
}
|
||||
|
||||
/// Number of nucleotides accumulated so far.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// `true` when no nucleotides have been accumulated.
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len == 0
|
||||
}
|
||||
|
||||
/// Append one uppercase ASCII nucleotide (A/C/G/T).
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics in debug mode if `base` is not one of `A`, `C`, `G`, `T` or if
|
||||
/// the buffer is full (256 nucleotides).
|
||||
#[inline]
|
||||
pub fn push(&mut self, base: u8) {
|
||||
debug_assert!(self.len < MAX_SUPERKMER_LEN, "SuperKmerScratch overflow");
|
||||
let slot = self.len / 4;
|
||||
let shift = 6 - 2 * (self.len % 4);
|
||||
self.buf[slot] |= encode_base(base) << shift;
|
||||
self.len += 1;
|
||||
}
|
||||
|
||||
/// Consume the accumulated nucleotides and produce a [`SuperKmer`].
|
||||
///
|
||||
/// The heap allocation (`Box<[u8]>`) is exactly sized to the sequence.
|
||||
/// Resets the buffer to empty afterward.
|
||||
pub fn emit(&mut self) -> SuperKmer {
|
||||
let seql = self.len;
|
||||
debug_assert!(seql >= 1 && seql <= MAX_SUPERKMER_LEN);
|
||||
let n = (seql + 3) / 4;
|
||||
let seq: Box<[u8]> = self.buf[..n].into();
|
||||
self.buf[..n].fill(0);
|
||||
self.len = 0;
|
||||
SuperKmer::new(seql as u8, seq)
|
||||
}
|
||||
|
||||
/// Discard all accumulated nucleotides without producing a [`SuperKmer`].
|
||||
pub fn reset(&mut self) {
|
||||
let n = (self.len + 3) / 4;
|
||||
self.buf[..n].fill(0);
|
||||
self.len = 0;
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SuperKmerScratch {
|
||||
fn default() -> Self { Self::new() }
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
//! Sliding k-mer window over 2-bit–encoded nucleotides.
|
||||
|
||||
/// Sliding window over the last k 2-bit–encoded bases.
///
/// The window is kept as a rolling, masked `u64`, so both pushing a base and
/// reading the current k-mer are O(1).
pub struct KmerWindow {
    /// Packed value of the most recent bases (at most k), newest in the low bits.
    kmer: u64,
    /// Mask keeping the low 2k bits.
    mask: u64,
    /// Number of bases held, saturating at k.
    len: usize,
    k: usize,
}

impl KmerWindow {
    /// Create an empty window for k-mers of length `k`.
    ///
    /// Debug builds assert `1 ≤ k ≤ 31`.
    pub fn new(k: usize) -> Self {
        debug_assert!(k >= 1 && k <= 31);
        // Guard the shift so a release build with k ≥ 32 cannot overflow it.
        let mask = if k >= 32 { u64::MAX } else { (1u64 << (2 * k)) - 1 };
        Self { kmer: 0, mask, len: 0, k }
    }

    /// The k-mer length this window was built for.
    pub fn k(&self) -> usize {
        self.k
    }

    /// `true` once k bases have been pushed since the last reset.
    pub fn is_full(&self) -> bool {
        self.len == self.k
    }

    /// Push a 2-bit base (0–3). Returns `true` once the window holds k bases.
    ///
    /// Only the two low bits of `base` are used.
    #[inline]
    pub fn push(&mut self, base: u8) -> bool {
        // Shift the new base in at the bottom; the mask drops the oldest one.
        self.kmer = ((self.kmer << 2) | u64::from(base & 3)) & self.mask;
        if self.len < self.k {
            self.len += 1;
        }
        self.len == self.k
    }

    /// Current k-mer as a right-aligned `u64` (first base in MSB of the 2k-bit field).
    ///
    /// Meaningful only while [`KmerWindow::is_full`] is `true`.
    pub fn kmer_u64(&self) -> u64 {
        self.kmer
    }

    /// Empty the window.
    pub fn reset(&mut self) {
        self.kmer = 0;
        self.len = 0;
    }
}
|
||||
Reference in New Issue
Block a user