📖 Update super-kmer theory and implementation to prefer non-degenerate m-mers
- Update super-kmer definition in `kmERS.md` to specify that non-degenerate m-mers are preferred over degenerate ones (degeneracy = homopolymer). - Refactor `superkmer.rs`: change `.canonical()` to mutate in-place and return bool. - Add `m` field & canonical-aware minimizer position calculation to SuperKmerIter in obiskbuilder. - Add helper functions `is_degenerate` and minimizer comparison logic to rolling_stat.rs for consistent tie-breaking. - Minor formatting cleanup in superkmer command and chunk processing.
This commit is contained in:
@@ -9,7 +9,7 @@ A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the ch
|
||||
|
||||
## Super-kmers
|
||||
|
||||
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd).
|
||||
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd), with the constraint that **non-degenerate m-mers are always preferred** over degenerate ones. A degenerate m-mer is one composed of a single repeated nucleotide (all-A, all-C, all-G, or all-T); such m-mers are selected only if no non-degenerate candidate exists in the window.
|
||||
|
||||
### Canonical super-kmers
|
||||
|
||||
|
||||
@@ -55,8 +55,10 @@ fn detect_format(source: &str, hint: Option<&str>) -> Format {
|
||||
_ => Format::Fasta,
|
||||
};
|
||||
}
|
||||
if source.ends_with(".fq") || source.ends_with(".fastq")
|
||||
|| source.ends_with(".fq.gz") || source.ends_with(".fastq.gz")
|
||||
if source.ends_with(".fq")
|
||||
|| source.ends_with(".fastq")
|
||||
|| source.ends_with(".fq.gz")
|
||||
|| source.ends_with(".fastq.gz")
|
||||
{
|
||||
Format::Fastq
|
||||
} else {
|
||||
@@ -114,22 +116,20 @@ pub fn run(args: SuperkmerArgs) {
|
||||
thread::spawn(move || {
|
||||
for raw_chunk in raw_rx {
|
||||
let norm = match format {
|
||||
Format::Fasta => {
|
||||
obiread::normalize::normalize_fasta_chunk(raw_chunk, k)
|
||||
}
|
||||
Format::Fastq => {
|
||||
obiread::normalize::normalize_fastq_chunk(raw_chunk, k)
|
||||
}
|
||||
Format::Fasta => obiread::normalize::normalize_fasta_chunk(raw_chunk, k),
|
||||
Format::Fastq => obiread::normalize::normalize_fastq_chunk(raw_chunk, k),
|
||||
};
|
||||
const BATCH_SIZE: usize = 10_000;
|
||||
let mut batch = Vec::with_capacity(BATCH_SIZE);
|
||||
for sk in SuperKmerIter::new(&norm, k, m, level_max, theta) {
|
||||
batch.push(sk);
|
||||
if batch.len() == BATCH_SIZE {
|
||||
sk_tx.send(std::mem::replace(
|
||||
sk_tx
|
||||
.send(std::mem::replace(
|
||||
&mut batch,
|
||||
Vec::with_capacity(BATCH_SIZE),
|
||||
)).unwrap();
|
||||
))
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
if !batch.is_empty() {
|
||||
@@ -151,8 +151,7 @@ pub fn run(args: SuperkmerArgs) {
|
||||
for batch in sk_rx {
|
||||
for (min_hash, sk) in batch {
|
||||
let partition = (mix64(min_hash) % partitions) as u32;
|
||||
write_scatter(&sk, &mut out, k, m, partition, min_hash)
|
||||
.expect("write error");
|
||||
write_scatter(&sk, &mut out, k, m, partition, min_hash).expect("write error");
|
||||
}
|
||||
}
|
||||
out.flush().expect("flush error");
|
||||
|
||||
@@ -286,21 +286,23 @@ impl SuperKmer {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
}
|
||||
|
||||
/// Return this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
|
||||
pub fn canonical(mut self) -> Self {
|
||||
/// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
|
||||
///
|
||||
/// Returns `true` if already canonical (no change), `false` if revcomp was applied.
|
||||
pub fn canonical(&mut self) -> bool {
|
||||
let seql = self.seql();
|
||||
for i in 0..seql {
|
||||
let fwd = self.nucleotide(i);
|
||||
let rev = complement(self.nucleotide(seql - 1 - i));
|
||||
if fwd < rev {
|
||||
return self;
|
||||
return true;
|
||||
}
|
||||
if fwd > rev {
|
||||
self.revcomp();
|
||||
return self;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ use crate::scratch::SuperKmerScratch;
|
||||
pub struct SuperKmerIter<'a> {
|
||||
cursor: ForwardCursor<'a>,
|
||||
k: usize,
|
||||
m: usize,
|
||||
theta: f64,
|
||||
scratch: SuperKmerScratch,
|
||||
stat: RollingStat,
|
||||
@@ -43,6 +44,7 @@ impl<'a> SuperKmerIter<'a> {
|
||||
Self {
|
||||
cursor: rope.fw_cursor(),
|
||||
k,
|
||||
m,
|
||||
theta,
|
||||
scratch: SuperKmerScratch::new(),
|
||||
stat: RollingStat::new(k, m, level_max),
|
||||
@@ -64,7 +66,12 @@ impl<'a> SuperKmerIter<'a> {
|
||||
}
|
||||
let min = self.prev_min?;
|
||||
let mut sk = self.scratch.emit();
|
||||
sk.set_minimizer_pos(self.prev_min_pos as u8);
|
||||
let min_pos = if sk.canonical() {
|
||||
self.prev_min_pos
|
||||
} else {
|
||||
sk.seql() - self.m - self.prev_min_pos
|
||||
};
|
||||
sk.set_minimizer_pos(min_pos as u8);
|
||||
Some((min, sk))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,6 +110,19 @@ impl RollingStat {
|
||||
sum_f_log_s[ws] += ln_class_size(canonical, ws, false);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_degenerate(canonical: u64, m_mask: u64) -> bool {
|
||||
canonical == 0 || canonical == (0x5555555555555555 & m_mask)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn minimizer_worse(existing: u64, candidate: u64, m_mask: u64) -> bool {
|
||||
let ed = Self::is_degenerate(existing, m_mask);
|
||||
let cd = Self::is_degenerate(candidate, m_mask);
|
||||
if ed != cd { return ed; }
|
||||
existing >= candidate
|
||||
}
|
||||
|
||||
pub fn push(&mut self, nuc: u8) {
|
||||
let bnuc = encode_nuc(nuc);
|
||||
let cnuc = bnuc ^ 3;
|
||||
@@ -132,7 +145,7 @@ impl RollingStat {
|
||||
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
|
||||
let possible_pos_m = self.received - self.m;
|
||||
|
||||
while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) {
|
||||
while self.minimier.back().map_or(false, |it| Self::minimizer_worse(it.canonical, possible_canonical_m, self.m_mask)) {
|
||||
self.minimier.pop_back();
|
||||
}
|
||||
self.minimier
|
||||
|
||||
Reference in New Issue
Block a user