📖 Update super-kmer theory and implementation to prefer non-degenerate m-mers

- Update super-kmer definition in `kmERS.md` to specify that non-degenerate m-mers are preferred over degenerate ones (degeneracy = homopolymer).
- Refactor `superkmer.rs`: change `.canonical()` to mutate in-place and return bool.
- Add `m` field & canonical-aware minimizer position calculation to SuperKmerIter in obiskbuilder.
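- Add helper functions `is_degenerate` and `minimizer_worse` to rolling_stat.rs so that the minimizer queue always ranks non-degenerate m-mers ahead of degenerate ones, and falls back to comparing canonical values only when both have the same degeneracy.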
- Minor formatting cleanup in superkmer command and chunk processing.
This commit is contained in:
Eric Coissac
2026-04-20 17:49:52 +02:00
parent b534c693ac
commit 380b5a6f94
5 changed files with 43 additions and 22 deletions
+8 -1
View File
@@ -25,6 +25,7 @@ use crate::scratch::SuperKmerScratch;
pub struct SuperKmerIter<'a> {
cursor: ForwardCursor<'a>,
k: usize,
m: usize,
theta: f64,
scratch: SuperKmerScratch,
stat: RollingStat,
@@ -43,6 +44,7 @@ impl<'a> SuperKmerIter<'a> {
Self {
cursor: rope.fw_cursor(),
k,
m,
theta,
scratch: SuperKmerScratch::new(),
stat: RollingStat::new(k, m, level_max),
@@ -64,7 +66,12 @@ impl<'a> SuperKmerIter<'a> {
}
let min = self.prev_min?;
let mut sk = self.scratch.emit();
sk.set_minimizer_pos(self.prev_min_pos as u8);
let min_pos = if sk.canonical() {
self.prev_min_pos
} else {
sk.seql() - self.m - self.prev_min_pos
};
sk.set_minimizer_pos(min_pos as u8);
Some((min, sk))
}
}
+14 -1
View File
@@ -110,6 +110,19 @@ impl RollingStat {
sum_f_log_s[ws] += ln_class_size(canonical, ws, false);
}
/// Reports whether a canonical m-mer code is one of the two degenerate values.
///
/// The degenerate values are `0` and the pattern `01` repeated in every 2-bit
/// slot (`0x5555…`), truncated by `m_mask`. Any other value is non-degenerate.
///
/// * `canonical` - the canonical m-mer code to classify.
/// * `m_mask` - mask applied to the repeated `01` pattern before comparing
///   (presumably the low 2·m bits of an m-mer; confirm where `m_mask` is built).
#[inline]
fn is_degenerate(canonical: u64, m_mask: u64) -> bool {
    // Repeated `01` pattern cut down to the width selected by the mask.
    let repeated_one = 0x5555_5555_5555_5555_u64 & m_mask;
    canonical == 0 || canonical == repeated_one
}
/// Decides whether `existing` ranks no better than `candidate` as a minimizer.
///
/// Degeneracy is compared first: a degenerate value is always worse than a
/// non-degenerate one, whatever their numeric order. When both values have
/// the same degeneracy, the larger-or-equal numeric value is the worse one,
/// so an exact tie also reports `existing` as worse.
///
/// * `existing` - canonical code already held by the caller.
/// * `candidate` - canonical code being compared against it.
/// * `m_mask` - mask forwarded to `is_degenerate`.
///
/// Returns `true` when `existing` is worse than or equal to `candidate`.
#[inline]
fn minimizer_worse(existing: u64, candidate: u64, m_mask: u64) -> bool {
    match (
        Self::is_degenerate(existing, m_mask),
        Self::is_degenerate(candidate, m_mask),
    ) {
        // Only the existing value is degenerate: it loses outright.
        (true, false) => true,
        // Only the candidate is degenerate: the existing value is kept.
        (false, true) => false,
        // Same degeneracy on both sides: fall back to the numeric order.
        _ => candidate <= existing,
    }
}
pub fn push(&mut self, nuc: u8) {
let bnuc = encode_nuc(nuc);
let cnuc = bnuc ^ 3;
@@ -132,7 +145,7 @@ impl RollingStat {
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
let possible_pos_m = self.received - self.m;
while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) {
while self.minimier.back().map_or(false, |it| Self::minimizer_worse(it.canonical, possible_canonical_m, self.m_mask)) {
self.minimier.pop_back();
}
self.minimier