📖 Update super-kmer theory and implementation to prefer non-degenerate m-mers

- Update super-kmer definition in `kmERS.md` to specify that non-degenerate m-mers are preferred over degenerate ones (degeneracy = homopolymer).
- Refactor `superkmer.rs`: change `.canonical()` to mutate in-place and return bool.
- Add `m` field & canonical-aware minimizer position calculation to SuperKmerIter in obiskbuilder.
- Add helper functions `is_degenerate` and minimizer comparison logic to rolling_stat.rs for consistent tie-breaking.
- Minor formatting cleanup in superkmer command and chunk processing.
This commit is contained in:
Eric Coissac
2026-04-20 17:49:52 +02:00
parent b534c693ac
commit 380b5a6f94
5 changed files with 43 additions and 22 deletions
+1 -1
View File
@@ -9,7 +9,7 @@ A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the ch
## Super-kmers
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd).
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd), with the constraint that **non-degenerate m-mers are always preferred** over degenerate ones. A degenerate m-mer is one composed of a single repeated nucleotide (all-A, all-C, all-G, or all-T); such m-mers are selected only if no non-degenerate candidate exists in the window.
### Canonical super-kmers
+11 -12
View File
@@ -55,8 +55,10 @@ fn detect_format(source: &str, hint: Option<&str>) -> Format {
_ => Format::Fasta,
};
}
if source.ends_with(".fq") || source.ends_with(".fastq")
|| source.ends_with(".fq.gz") || source.ends_with(".fastq.gz")
if source.ends_with(".fq")
|| source.ends_with(".fastq")
|| source.ends_with(".fq.gz")
|| source.ends_with(".fastq.gz")
{
Format::Fastq
} else {
@@ -114,22 +116,20 @@ pub fn run(args: SuperkmerArgs) {
thread::spawn(move || {
for raw_chunk in raw_rx {
let norm = match format {
Format::Fasta => {
obiread::normalize::normalize_fasta_chunk(raw_chunk, k)
}
Format::Fastq => {
obiread::normalize::normalize_fastq_chunk(raw_chunk, k)
}
Format::Fasta => obiread::normalize::normalize_fasta_chunk(raw_chunk, k),
Format::Fastq => obiread::normalize::normalize_fastq_chunk(raw_chunk, k),
};
const BATCH_SIZE: usize = 10_000;
let mut batch = Vec::with_capacity(BATCH_SIZE);
for sk in SuperKmerIter::new(&norm, k, m, level_max, theta) {
batch.push(sk);
if batch.len() == BATCH_SIZE {
sk_tx.send(std::mem::replace(
sk_tx
.send(std::mem::replace(
&mut batch,
Vec::with_capacity(BATCH_SIZE),
)).unwrap();
))
.unwrap();
}
}
if !batch.is_empty() {
@@ -151,8 +151,7 @@ pub fn run(args: SuperkmerArgs) {
for batch in sk_rx {
for (min_hash, sk) in batch {
let partition = (mix64(min_hash) % partitions) as u32;
write_scatter(&sk, &mut out, k, m, partition, min_hash)
.expect("write error");
write_scatter(&sk, &mut out, k, m, partition, min_hash).expect("write error");
}
}
out.flush().expect("flush error");
+7 -5
View File
@@ -286,21 +286,23 @@ impl SuperKmer {
Ok(self.kmer(i, k)?.canonical(k))
}
/// Return this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
pub fn canonical(mut self) -> Self {
/// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
///
/// Returns `true` if already canonical (no change), `false` if revcomp was applied.
pub fn canonical(&mut self) -> bool {
let seql = self.seql();
for i in 0..seql {
let fwd = self.nucleotide(i);
let rev = complement(self.nucleotide(seql - 1 - i));
if fwd < rev {
return self;
return true;
}
if fwd > rev {
self.revcomp();
return self;
return false;
}
}
self
true
}
}
+8 -1
View File
@@ -25,6 +25,7 @@ use crate::scratch::SuperKmerScratch;
pub struct SuperKmerIter<'a> {
cursor: ForwardCursor<'a>,
k: usize,
m: usize,
theta: f64,
scratch: SuperKmerScratch,
stat: RollingStat,
@@ -43,6 +44,7 @@ impl<'a> SuperKmerIter<'a> {
Self {
cursor: rope.fw_cursor(),
k,
m,
theta,
scratch: SuperKmerScratch::new(),
stat: RollingStat::new(k, m, level_max),
@@ -64,7 +66,12 @@ impl<'a> SuperKmerIter<'a> {
}
let min = self.prev_min?;
let mut sk = self.scratch.emit();
sk.set_minimizer_pos(self.prev_min_pos as u8);
let min_pos = if sk.canonical() {
self.prev_min_pos
} else {
sk.seql() - self.m - self.prev_min_pos
};
sk.set_minimizer_pos(min_pos as u8);
Some((min, sk))
}
}
+14 -1
View File
@@ -110,6 +110,19 @@ impl RollingStat {
sum_f_log_s[ws] += ln_class_size(canonical, ws, false);
}
#[inline]
fn is_degenerate(canonical: u64, m_mask: u64) -> bool {
canonical == 0 || canonical == (0x5555555555555555 & m_mask)
}
#[inline]
fn minimizer_worse(existing: u64, candidate: u64, m_mask: u64) -> bool {
let ed = Self::is_degenerate(existing, m_mask);
let cd = Self::is_degenerate(candidate, m_mask);
if ed != cd { return ed; }
existing >= candidate
}
pub fn push(&mut self, nuc: u8) {
let bnuc = encode_nuc(nuc);
let cnuc = bnuc ^ 3;
@@ -132,7 +145,7 @@ impl RollingStat {
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
let possible_pos_m = self.received - self.m;
while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) {
while self.minimier.back().map_or(false, |it| Self::minimizer_worse(it.canonical, possible_canonical_m, self.m_mask)) {
self.minimier.pop_back();
}
self.minimier