🔧 Replace degenerate minimizer logic with hash-based random ordering
- Add `hash` field to MmerItem for stable, randomized minimizer ordering
- Introduce `hash_mmer()` using mix64 with an XOR seed to avoid fixed points (e.g., poly-A/T)
- Remove `is_degenerate()` and `minimizer_worse()`, simplifying comparison to hash-only
- Update push logic: compare hashes instead of canonical values with degeneracy checks
This commit is contained in:
@@ -7,7 +7,23 @@ use crate::entropy_table::{WS_MAX, emax, entropy_norm_kmer, ln_class_size, log_n
|
||||
/// One candidate m-mer, as stored in the `minimier` deque of `RollingStat`.
///
/// All fields are plain integers, so the type is cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct MmerItem {
    /// 0-based position of this m-mer's first base within the current segment.
    position: usize,
    /// Raw canonical m-mer value (right-aligned), used for partition key computation.
    canonical: u64,
    /// mix64 hash of the canonical m-mer, used as the random ordering key.
    hash: u64,
}
|
||||
|
||||
/// Scrambles a canonical m-mer into the key used for minimizer ordering.
///
/// The input is first XOR-ed with a fixed seed (2^64/φ) and then run through
/// three xor-shift steps interleaved with two wrapping multiplications by odd
/// constants. Every step is invertible, so distinct inputs always yield
/// distinct outputs.
///
/// Without the seed, an input of 0 would hash to 0 (each step maps 0 to 0),
/// so poly-A/T m-mers (canonical = 0) would always carry the smallest key.
#[inline(always)]
fn hash_mmer(canonical: u64) -> u64 {
    const SEED: u64 = 0x9e3779b97f4a7c15;
    const MUL_FIRST: u64 = 0xbf58476d1ce4e5b9;
    const MUL_SECOND: u64 = 0x94d049bb133111eb;

    let mut state = canonical ^ SEED;
    state ^= state >> 30;
    state = state.wrapping_mul(MUL_FIRST);
    state ^= state >> 27;
    state = state.wrapping_mul(MUL_SECOND);
    state ^= state >> 31;
    state
}
|
||||
|
||||
pub struct RollingStat {
|
||||
@@ -110,19 +126,6 @@ impl RollingStat {
|
||||
sum_f_log_s[ws] += ln_class_size(canonical, ws, false);
|
||||
}
|
||||
|
||||
/// Reports whether `canonical` is one of the two low-complexity m-mer values
/// that receive special treatment in minimizer comparison.
///
/// The two values are the all-zero word and the repeating `01` bit pattern
/// restricted to the bits selected by `m_mask`.
// NOTE(review): with a 2-bit nucleotide encoding these presumably correspond
// to poly-A/T and poly-C/G runs; confirm against `encode_nuc`.
#[inline]
fn is_degenerate(canonical: u64, m_mask: u64) -> bool {
    let repeating_01 = m_mask & 0x5555555555555555;
    if canonical == 0 {
        return true;
    }
    canonical == repeating_01
}
|
||||
|
||||
#[inline]
|
||||
fn minimizer_worse(existing: u64, candidate: u64, m_mask: u64) -> bool {
|
||||
let ed = Self::is_degenerate(existing, m_mask);
|
||||
let cd = Self::is_degenerate(candidate, m_mask);
|
||||
if ed != cd { return ed; }
|
||||
existing >= candidate
|
||||
}
|
||||
|
||||
pub fn push(&mut self, nuc: u8) {
|
||||
let bnuc = encode_nuc(nuc);
|
||||
let cnuc = bnuc ^ 3;
|
||||
@@ -143,13 +146,14 @@ impl RollingStat {
|
||||
if self.received >= self.m {
|
||||
let possible_canonical_m =
|
||||
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
|
||||
let possible_hash_m = hash_mmer(possible_canonical_m);
|
||||
let possible_pos_m = self.received - self.m;
|
||||
|
||||
while self.minimier.back().map_or(false, |it| Self::minimizer_worse(it.canonical, possible_canonical_m, self.m_mask)) {
|
||||
while self.minimier.back().map_or(false, |it| it.hash >= possible_hash_m) {
|
||||
self.minimier.pop_back();
|
||||
}
|
||||
self.minimier
|
||||
.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m });
|
||||
.push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m, hash: possible_hash_m });
|
||||
|
||||
if self.received > self.k {
|
||||
while self
|
||||
@@ -271,14 +275,6 @@ impl RollingStat {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn canonical_minimizer(&self) -> Option<Kmer> {
|
||||
if self.ready() {
|
||||
self.minimier.front().map(|it| Kmer::from_raw_right(it.canonical, self.m))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn canonical_minimizer_raw(&self) -> Option<u64> {
|
||||
if self.ready() {
|
||||
self.minimier.front().map(|it| it.canonical)
|
||||
|
||||
Reference in New Issue
Block a user