From 58391886a39d8dfcf3addebde3083ea84bcec821 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 27 Apr 2026 16:56:13 +0200 Subject: [PATCH] :wrench: Replace degenerate minimizer logic with hash-based random ordering - Add `hash` field to MmerItem for stable, randomized minimizer ordering - Introduce hash_mMER() using mix64 with XOR seed to avoid fixed points (e.g., poly-A/T) - Remove is_degenerate() and minimizer_worse(), simplifying comparison to hash-only - Update push logic: compare hashes instead of canonical values with degeneracy checks --- .gitignore | 2 + docmd/implementation/superkmer.md | 44 +++++++++++++++++ docmd/theory/minimizer.md | 73 ++++++++++++++++++++++++++++ mkdocs.yml | 4 +- src/obiskbuilder/src/rolling_stat.rs | 42 ++++++++-------- 5 files changed, 141 insertions(+), 24 deletions(-) create mode 100644 docmd/theory/minimizer.md diff --git a/.gitignore b/.gitignore index c15af6f..18703dc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ src/target data-stress *.fasta +*.zst +*.zst.meta diff --git a/docmd/implementation/superkmer.md b/docmd/implementation/superkmer.md index 5a672aa..49ce401 100644 --- a/docmd/implementation/superkmer.md +++ b/docmd/implementation/superkmer.md @@ -82,6 +82,50 @@ bits[len - shift..].fill(false) return seq -- palindrome: either orientation valid ``` +## Minimizer sliding window + +Super-kmers are built by `SuperKmerIter` (crate `obiskbuilder`), which maintains the current minimizer with a **monotonic deque** over a sliding window of W = k − m + 1 m-mer positions. + +Each deque entry stores: + +| Field | Type | Purpose | +|------------|-------|----------------------------------------------| +| `position` | usize | 0-based start of this m-mer in the segment | +| `canonical`| u64 | right-aligned canonical m-mer value (lex-min of fwd and rc); used as partition key | +| `hash` | u64 | $H(\text{canonical})$ — ordering key for random minimizer selection | + +The hash $H$ is the seeded splitmix64 finalizer (see [Minimizer selection](../theory/minimizer.md)): + +```rust +fn hash_mmer(canonical: u64) -> u64 { + let x = canonical ^ 0x9e3779b97f4a7c15; // seed: eliminates fixed point at 0 + let x = x ^ (x >> 30); + let x = x.wrapping_mul(0xbf58476d1ce4e5b9); + let x = x ^ (x >> 27); + let x = x.wrapping_mul(0x94d049bb133111eb); + x ^ (x >> 31) +} +``` + +On each new nucleotide, once the window is full, the deque is updated: + +!!! abstract "Algorithm — minimizer deque update" + ```text + procedure UpdateMinimizer(deque, position, canonical, hash, k, received): + -- pop dominated entries from the back + while deque.back.hash ≥ hash: + deque.pop_back() + deque.push_back({position, canonical, hash}) + + -- evict expired entries from the front + while deque.front.position + k < received: + deque.pop_front() + ``` + +The front of the deque is always the current minimizer. Because the deque is maintained in strictly increasing hash order, each entry is popped at most once — O(1) amortized per nucleotide. + +A super-kmer boundary is emitted when the minimizer changes: `deque.front.hash ≠ prev_hash`. The `canonical` field of the front entry is **not** used for boundary detection — that uses the hash alone. The canonical value is stored so that the partition key $H(\text{canonical})$ can be recomputed independently at routing time from the stored `minimizer_pos`, without inheriting the minimum-order-statistic bias (see [Minimizer selection — partition key independence](../theory/minimizer.md#partition-key-independence)). + ## Kmer extraction A k-mer is extracted from a super-kmer with `SuperKmer::kmer(i, k)`, which returns a `Kmer` — a left-aligned `u64` newtype (see [Kmer implementation](kmer.md)): diff --git a/docmd/theory/minimizer.md b/docmd/theory/minimizer.md new file mode 100644 index 0000000..848346a --- /dev/null +++ b/docmd/theory/minimizer.md @@ -0,0 +1,73 @@ +# Minimizer selection + +## Definition + +A **minimizer** of a k-mer window is the m-mer (m < k) with the smallest value under some total order ≺ among all k − m + 1 overlapping m-mers in the window. The minimizer is always taken in **canonical form** (lexicographic minimum of forward and reverse complement) to ensure strand-independence. + +The minimizer partitions the sequence into **super-kmers**: maximal contiguous runs of overlapping k-mers that share the same minimizer. A single minimizer anchors each super-kmer, enabling partitioned storage and indexing. + +## Lexicographic ordering and its bias + +The classical definition uses lexicographic order on the canonical m-mer value. In 2-bit encoding (A=00, C=01, G=10, T=11), the canonical form is $\min_{\text{lex}}(\text{fwd}, \text{rc})$, so AT-rich m-mers have systematically small values: + +$$\text{canonical}(\text{AAAA}\cdots\text{A}) = \text{canonical}(\text{TTTT}\cdots\text{T}) = 0$$ + +Since small values always win the lex comparison, low-complexity AT-rich m-mers dominate as minimizers across large genomic regions. On real metagenomics data with k=31, m=11 and 256 partitions, this produces a max/min partition ratio of ≈ 2.75 — and a single pathological partition when the hash function has a fixed point at 0. + +## Random minimizer + +A **random minimizer** replaces lex order with a hash order: define $H : \{0,1\}^{2m} \to \{0,1\}^{64}$ and select the m-mer with the **minimum $H$ value** in the window. + +The key property: because $H$ is a bijection with well-distributed outputs, each distinct m-mer in the window has equal probability of holding the minimum hash value. Selection probability is no longer correlated with nucleotide composition. + +## Why the canonical form remains lexicographic + +An apparent alternative is to redefine the canonical form of each m-mer as the strand with the smaller hash value: + +$$\text{canonical}_H(v) = \arg\min(H(\text{fwd}),\ H(\text{rc}))$$ + +This must be rejected. The hash of this new canonical is $\min(H(\text{fwd}), H(\text{rc}))$ — the minimum of two i.i.d. Uniform$[0, 2^{64})$ values. Its distribution is: + +$$F(x) = 1 - \left(1 - \frac{x}{2^{64}}\right)^2$$ + +with density $f(x) = 2(1 - x/2^{64})$, which is approximately **twice as large near 0 than near $2^{64}$**. The low-order partition bits inherit this bias: partition 0 receives roughly twice as many super-kmers as the last partition. + +The lex canonical form does not have this problem: $\text{canonical}_{\text{lex}}(v)$ is a fixed, deterministic representative of each equivalence class, and $H(\text{canonical}_{\text{lex}})$ is uniformly distributed over $[0, 2^{64})$ independently of the min/max relationship between the two strands. + +## Partition key independence + +A further subtlety arises when the selection hash is used directly as the partition key. The selected minimizer is the m-mer with the **minimum** $H$ value in a window of $W = k - m + 1$ positions. The minimum of $W$ i.i.d. Uniform$[0,2^{64})$ values has distribution: + +$$F(x) = 1 - \left(1 - \frac{x}{2^{64}}\right)^W \approx \frac{Wx}{2^{64}}$$ + +concentrated near 0 relative to the full range. Using this minimum-hash directly as the partition key creates the same bias as lex ordering, just distributed differently. + +The correct approach is to decouple selection from partition routing: + +- **Selection** uses $H(\text{canonical}_{\text{lex}}(m\text{-mer}))$ to pick the minimizer in the window. +- **Partition routing** recomputes $H(\text{canonical}_{\text{lex}}(\text{minimizer}))$ from the stored minimizer position. This is the hash of a specific kmer value, not the minimum of a window — it is uniformly distributed over $[0, 2^{64})$. + +## Seed and fixed-point elimination + +The splitmix64 finalizer has a fixed point at 0: + +$$\text{mix64}(0) = 0$$ + +Since $\text{canonical}_{\text{lex}}(\text{AAAA}\cdots\text{A}) = 0$, using unseeded mix64 causes all-A m-mers to win every window comparison, recreating a pathological partition identical to the lex-ordering bias. + +The fix is a non-zero XOR seed applied before mixing: + +$$H(x) = \text{mix64}(x \oplus s), \quad s = \lfloor 2^{64}/\varphi \rfloor = \texttt{0x9e3779b97f4a7c15}$$ + +where $\varphi$ is the golden ratio. This maps 0 to $\text{mix64}(s)$, a well-distributed non-zero value. No canonical m-mer value has a systematically small $H$. + +!!! abstract "Hash function $H$" + ``` + H(x): + x ← x ⊕ 0x9e3779b97f4a7c15 + x ← x ⊕ (x >> 30) + x ← x × 0xbf58476d1ce4e5b9 + x ← x ⊕ (x >> 27) + x ← x × 0x94d049bb133111eb + return x ⊕ (x >> 31) + ``` diff --git a/mkdocs.yml b/mkdocs.yml index 0dfca5e..be83ff1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,7 @@ plugins: - bibtex: bib_file: docmd/references.bib csl_file: docmd/ecology-letters.csl + enable_inline_citations: false markdown_extensions: - admonition @@ -25,9 +26,10 @@ extra_javascript: nav: - Home: index.md - Theory: - - Kmers and super-kmers: theory/kmers.md + - Kmers and super-kmers: kmers.md - DNA encoding: theory/encoding.md - Entropy filter: theory/entropy.md + - Minimizer selection: theory/minimizer.md - Partitioning architecture: theory/indexing.md - Implementation: - SuperKmer: implementation/superkmer.md diff --git a/src/obiskbuilder/src/rolling_stat.rs b/src/obiskbuilder/src/rolling_stat.rs index df0a4d2..02f9f05 100644 --- a/src/obiskbuilder/src/rolling_stat.rs +++ b/src/obiskbuilder/src/rolling_stat.rs @@ -7,7 +7,23 @@ use crate::entropy_table::{WS_MAX, emax, entropy_norm_kmer, ln_class_size, log_n struct MmerItem { /// 0-based position of this m-mer's first base within the current segment. position: usize, + /// Raw canonical m-mer value (right-aligned), used for partition key computation. canonical: u64, + /// mix64 hash of the canonical m-mer, used as the random ordering key. + hash: u64, +} + +/// Bijective hash used to randomise the minimizer ordering. +/// The XOR seed (2^64/φ) breaks the mix64 fixed point at 0, +/// preventing poly-A/T kmers (canonical = 0) from always winning. +#[inline(always)] +fn hash_mmer(canonical: u64) -> u64 { + let x = canonical ^ 0x9e3779b97f4a7c15; + let x = x ^ (x >> 30); + let x = x.wrapping_mul(0xbf58476d1ce4e5b9); + let x = x ^ (x >> 27); + let x = x.wrapping_mul(0x94d049bb133111eb); + x ^ (x >> 31) } pub struct RollingStat { @@ -110,19 +126,6 @@ impl RollingStat { sum_f_log_s[ws] += ln_class_size(canonical, ws, false); } - #[inline] - fn is_degenerate(canonical: u64, m_mask: u64) -> bool { - canonical == 0 || canonical == (0x5555555555555555 & m_mask) - } - - #[inline] - fn minimizer_worse(existing: u64, candidate: u64, m_mask: u64) -> bool { - let ed = Self::is_degenerate(existing, m_mask); - let cd = Self::is_degenerate(candidate, m_mask); - if ed != cd { return ed; } - existing >= candidate - } - pub fn push(&mut self, nuc: u8) { let bnuc = encode_nuc(nuc); let cnuc = bnuc ^ 3; @@ -143,13 +146,14 @@ impl RollingStat { if self.received >= self.m { let possible_canonical_m = (self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2)); + let possible_hash_m = hash_mmer(possible_canonical_m); let possible_pos_m = self.received - self.m; - while self.minimier.back().map_or(false, |it| Self::minimizer_worse(it.canonical, possible_canonical_m, self.m_mask)) { + while self.minimier.back().map_or(false, |it| it.hash >= possible_hash_m) { self.minimier.pop_back(); } self.minimier - .push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m }); + .push_back(MmerItem { position: possible_pos_m, canonical: possible_canonical_m, hash: possible_hash_m }); if self.received > self.k { while self @@ -271,14 +275,6 @@ impl RollingStat { } } - pub fn canonical_minimizer(&self) -> Option { - if self.ready() { - self.minimier.front().map(|it| Kmer::from_raw_right(it.canonical, self.m)) - } else { - None - } - } - pub fn canonical_minimizer_raw(&self) -> Option { if self.ready() { self.minimier.front().map(|it| it.canonical)