From 380b5a6f94e31460fb1f8443b55b2bf66dc46248 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 20 Apr 2026 17:49:52 +0200 Subject: [PATCH] :book: Update super-kmer theory and implementation to prefer non-degenerate m-mers - Update super-kmer definition in `kmERS.md` to specify that non-degenerate m-mers are preferred over degenerate ones (degeneracy = homopolymer). - Refactor `superkmer.rs`: change `.canonical()` to mutate in-place and return bool. - Add `m` field & canonical-aware minimizer position calculation to SuperKmerIter in obiskbuilder. - Add helper functions `is_degenerate` and minimizer comparison logic to rolling_stat.rs for consistent tie-breaking. - Minor formatting cleanup in superkmer command and chunk processing. --- docmd/theory/kmers.md | 2 +- src/obikmer/src/cmd/superkmer.rs | 27 +++++++++++++-------------- src/obikseq/src/superkmer.rs | 12 +++++++----- src/obiskbuilder/src/iter.rs | 9 ++++++++- src/obiskbuilder/src/rolling_stat.rs | 15 ++++++++++++++- 5 files changed, 43 insertions(+), 22 deletions(-) diff --git a/docmd/theory/kmers.md b/docmd/theory/kmers.md index 521afa7..0c44c24 100644 --- a/docmd/theory/kmers.md +++ b/docmd/theory/kmers.md @@ -9,7 +9,7 @@ A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the ch ## Super-kmers -A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd). +A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd), with the constraint that **non-degenerate m-mers are always preferred** over degenerate ones. A degenerate m-mer is one composed of a single repeated nucleotide (all-A, all-C, all-G, or all-T); such m-mers are selected only if no non-degenerate candidate exists in the window. ### Canonical super-kmers diff --git a/src/obikmer/src/cmd/superkmer.rs b/src/obikmer/src/cmd/superkmer.rs index 434ae21..e309122 100644 --- a/src/obikmer/src/cmd/superkmer.rs +++ b/src/obikmer/src/cmd/superkmer.rs @@ -55,8 +55,10 @@ fn detect_format(source: &str, hint: Option<&str>) -> Format { _ => Format::Fasta, }; } - if source.ends_with(".fq") || source.ends_with(".fastq") - || source.ends_with(".fq.gz") || source.ends_with(".fastq.gz") + if source.ends_with(".fq") + || source.ends_with(".fastq") + || source.ends_with(".fq.gz") + || source.ends_with(".fastq.gz") { Format::Fastq } else { @@ -114,22 +116,20 @@ pub fn run(args: SuperkmerArgs) { thread::spawn(move || { for raw_chunk in raw_rx { let norm = match format { - Format::Fasta => { - obiread::normalize::normalize_fasta_chunk(raw_chunk, k) - } - Format::Fastq => { - obiread::normalize::normalize_fastq_chunk(raw_chunk, k) - } + Format::Fasta => obiread::normalize::normalize_fasta_chunk(raw_chunk, k), + Format::Fastq => obiread::normalize::normalize_fastq_chunk(raw_chunk, k), }; const BATCH_SIZE: usize = 10_000; let mut batch = Vec::with_capacity(BATCH_SIZE); for sk in SuperKmerIter::new(&norm, k, m, level_max, theta) { batch.push(sk); if batch.len() == BATCH_SIZE { - sk_tx.send(std::mem::replace( - &mut batch, - Vec::with_capacity(BATCH_SIZE), - )).unwrap(); + sk_tx + .send(std::mem::replace( + &mut batch, + Vec::with_capacity(BATCH_SIZE), + )) + .unwrap(); } } if !batch.is_empty() { @@ -151,8 +151,7 @@ pub fn run(args: SuperkmerArgs) { for batch in sk_rx { for (min_hash, sk) in batch { let partition = (mix64(min_hash) % partitions) as u32; - write_scatter(&sk, &mut out, k, m, partition, min_hash) - .expect("write error"); + write_scatter(&sk, &mut out, k, m, partition, min_hash).expect("write error"); } } out.flush().expect("flush error"); diff --git a/src/obikseq/src/superkmer.rs b/src/obikseq/src/superkmer.rs index 038cf77..c44154c 100644 --- a/src/obikseq/src/superkmer.rs +++ b/src/obikseq/src/superkmer.rs @@ -286,21 +286,23 @@ impl SuperKmer { Ok(self.kmer(i, k)?.canonical(k)) } - /// Return this super-kmer in canonical form (lexicographic minimum of forward and revcomp). - pub fn canonical(mut self) -> Self { + /// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp). + /// + /// Returns `true` if already canonical (no change), `false` if revcomp was applied. + pub fn canonical(&mut self) -> bool { let seql = self.seql(); for i in 0..seql { let fwd = self.nucleotide(i); let rev = complement(self.nucleotide(seql - 1 - i)); if fwd < rev { - return self; + return true; } if fwd > rev { self.revcomp(); - return self; + return false; } } - self + true } } diff --git a/src/obiskbuilder/src/iter.rs b/src/obiskbuilder/src/iter.rs index abb333f..d6a06fb 100644 --- a/src/obiskbuilder/src/iter.rs +++ b/src/obiskbuilder/src/iter.rs @@ -25,6 +25,7 @@ use crate::scratch::SuperKmerScratch; pub struct SuperKmerIter<'a> { cursor: ForwardCursor<'a>, k: usize, + m: usize, theta: f64, scratch: SuperKmerScratch, stat: RollingStat, @@ -43,6 +44,7 @@ impl<'a> SuperKmerIter<'a> { Self { cursor: rope.fw_cursor(), k, + m, theta, scratch: SuperKmerScratch::new(), stat: RollingStat::new(k, m, level_max), @@ -64,7 +66,12 @@ impl<'a> SuperKmerIter<'a> { } let min = self.prev_min?; let mut sk = self.scratch.emit(); - sk.set_minimizer_pos(self.prev_min_pos as u8); + let min_pos = if sk.canonical() { + self.prev_min_pos + } else { + sk.seql() - self.m - self.prev_min_pos + }; + sk.set_minimizer_pos(min_pos as u8); Some((min, sk)) } } diff --git a/src/obiskbuilder/src/rolling_stat.rs b/src/obiskbuilder/src/rolling_stat.rs index 002f52b..df0a4d2 100644 --- a/src/obiskbuilder/src/rolling_stat.rs +++ b/src/obiskbuilder/src/rolling_stat.rs @@ -110,6 +110,19 @@ impl RollingStat { sum_f_log_s[ws] += ln_class_size(canonical, ws, false); } + #[inline] + fn is_degenerate(canonical: u64, m_mask: u64) -> bool { + canonical == 0 || canonical == (0x5555555555555555 & m_mask) + } + + #[inline] + fn minimizer_worse(existing: u64, candidate: u64, m_mask: u64) -> bool { + let ed = Self::is_degenerate(existing, m_mask); + let cd = Self::is_degenerate(candidate, m_mask); + if ed != cd { return ed; } + existing >= candidate + } + pub fn push(&mut self, nuc: u8) { let bnuc = encode_nuc(nuc); let cnuc = bnuc ^ 3; @@ -132,7 +145,7 @@ impl RollingStat { (self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2)); let possible_pos_m = self.received - self.m; - while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) { + while self.minimier.back().map_or(false, |it| Self::minimizer_worse(it.canonical, possible_canonical_m, self.m_mask)) { self.minimier.pop_back(); } self.minimier