📖 Update super-kmer theory and implementation to prefer non-degenerate m-mers
- Update super-kmer definition in `kmERS.md` to specify that non-degenerate m-mers are preferred over degenerate ones (degeneracy = homopolymer). - Refactor `superkmer.rs`: change `.canonical()` to mutate in-place and return bool. - Add `m` field & canonical-aware minimizer position calculation to SuperKmerIter in obiskbuilder. - Add helper functions `is_degenerate` and minimizer comparison logic to rolling_stat.rs for consistent tie-breaking. - Minor formatting cleanup in superkmer command and chunk processing.
This commit is contained in:
@@ -9,7 +9,7 @@ A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the ch
|
|||||||
|
|
||||||
## Super-kmers
|
## Super-kmers
|
||||||
|
|
||||||
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd).
|
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd), with the constraint that **non-degenerate m-mers are always preferred** over degenerate ones. A degenerate m-mer is one composed of a single repeated nucleotide (all-A, all-C, all-G, or all-T); such m-mers are selected only if no non-degenerate candidate exists in the window.
|
||||||
|
|
||||||
### Canonical super-kmers
|
### Canonical super-kmers
|
||||||
|
|
||||||
|
|||||||
@@ -55,8 +55,10 @@ fn detect_format(source: &str, hint: Option<&str>) -> Format {
|
|||||||
_ => Format::Fasta,
|
_ => Format::Fasta,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if source.ends_with(".fq") || source.ends_with(".fastq")
|
if source.ends_with(".fq")
|
||||||
|| source.ends_with(".fq.gz") || source.ends_with(".fastq.gz")
|
|| source.ends_with(".fastq")
|
||||||
|
|| source.ends_with(".fq.gz")
|
||||||
|
|| source.ends_with(".fastq.gz")
|
||||||
{
|
{
|
||||||
Format::Fastq
|
Format::Fastq
|
||||||
} else {
|
} else {
|
||||||
@@ -114,22 +116,20 @@ pub fn run(args: SuperkmerArgs) {
|
|||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
for raw_chunk in raw_rx {
|
for raw_chunk in raw_rx {
|
||||||
let norm = match format {
|
let norm = match format {
|
||||||
Format::Fasta => {
|
Format::Fasta => obiread::normalize::normalize_fasta_chunk(raw_chunk, k),
|
||||||
obiread::normalize::normalize_fasta_chunk(raw_chunk, k)
|
Format::Fastq => obiread::normalize::normalize_fastq_chunk(raw_chunk, k),
|
||||||
}
|
|
||||||
Format::Fastq => {
|
|
||||||
obiread::normalize::normalize_fastq_chunk(raw_chunk, k)
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
const BATCH_SIZE: usize = 10_000;
|
const BATCH_SIZE: usize = 10_000;
|
||||||
let mut batch = Vec::with_capacity(BATCH_SIZE);
|
let mut batch = Vec::with_capacity(BATCH_SIZE);
|
||||||
for sk in SuperKmerIter::new(&norm, k, m, level_max, theta) {
|
for sk in SuperKmerIter::new(&norm, k, m, level_max, theta) {
|
||||||
batch.push(sk);
|
batch.push(sk);
|
||||||
if batch.len() == BATCH_SIZE {
|
if batch.len() == BATCH_SIZE {
|
||||||
sk_tx.send(std::mem::replace(
|
sk_tx
|
||||||
&mut batch,
|
.send(std::mem::replace(
|
||||||
Vec::with_capacity(BATCH_SIZE),
|
&mut batch,
|
||||||
)).unwrap();
|
Vec::with_capacity(BATCH_SIZE),
|
||||||
|
))
|
||||||
|
.unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !batch.is_empty() {
|
if !batch.is_empty() {
|
||||||
@@ -151,8 +151,7 @@ pub fn run(args: SuperkmerArgs) {
|
|||||||
for batch in sk_rx {
|
for batch in sk_rx {
|
||||||
for (min_hash, sk) in batch {
|
for (min_hash, sk) in batch {
|
||||||
let partition = (mix64(min_hash) % partitions) as u32;
|
let partition = (mix64(min_hash) % partitions) as u32;
|
||||||
write_scatter(&sk, &mut out, k, m, partition, min_hash)
|
write_scatter(&sk, &mut out, k, m, partition, min_hash).expect("write error");
|
||||||
.expect("write error");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out.flush().expect("flush error");
|
out.flush().expect("flush error");
|
||||||
|
|||||||
@@ -286,21 +286,23 @@ impl SuperKmer {
|
|||||||
Ok(self.kmer(i, k)?.canonical(k))
|
Ok(self.kmer(i, k)?.canonical(k))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
|
/// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
|
||||||
pub fn canonical(mut self) -> Self {
|
///
|
||||||
|
/// Returns `true` if already canonical (no change), `false` if revcomp was applied.
|
||||||
|
pub fn canonical(&mut self) -> bool {
|
||||||
let seql = self.seql();
|
let seql = self.seql();
|
||||||
for i in 0..seql {
|
for i in 0..seql {
|
||||||
let fwd = self.nucleotide(i);
|
let fwd = self.nucleotide(i);
|
||||||
let rev = complement(self.nucleotide(seql - 1 - i));
|
let rev = complement(self.nucleotide(seql - 1 - i));
|
||||||
if fwd < rev {
|
if fwd < rev {
|
||||||
return self;
|
return true;
|
||||||
}
|
}
|
||||||
if fwd > rev {
|
if fwd > rev {
|
||||||
self.revcomp();
|
self.revcomp();
|
||||||
return self;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self
|
true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ use crate::scratch::SuperKmerScratch;
|
|||||||
pub struct SuperKmerIter<'a> {
|
pub struct SuperKmerIter<'a> {
|
||||||
cursor: ForwardCursor<'a>,
|
cursor: ForwardCursor<'a>,
|
||||||
k: usize,
|
k: usize,
|
||||||
|
m: usize,
|
||||||
theta: f64,
|
theta: f64,
|
||||||
scratch: SuperKmerScratch,
|
scratch: SuperKmerScratch,
|
||||||
stat: RollingStat,
|
stat: RollingStat,
|
||||||
@@ -43,6 +44,7 @@ impl<'a> SuperKmerIter<'a> {
|
|||||||
Self {
|
Self {
|
||||||
cursor: rope.fw_cursor(),
|
cursor: rope.fw_cursor(),
|
||||||
k,
|
k,
|
||||||
|
m,
|
||||||
theta,
|
theta,
|
||||||
scratch: SuperKmerScratch::new(),
|
scratch: SuperKmerScratch::new(),
|
||||||
stat: RollingStat::new(k, m, level_max),
|
stat: RollingStat::new(k, m, level_max),
|
||||||
@@ -64,7 +66,12 @@ impl<'a> SuperKmerIter<'a> {
|
|||||||
}
|
}
|
||||||
let min = self.prev_min?;
|
let min = self.prev_min?;
|
||||||
let mut sk = self.scratch.emit();
|
let mut sk = self.scratch.emit();
|
||||||
sk.set_minimizer_pos(self.prev_min_pos as u8);
|
let min_pos = if sk.canonical() {
|
||||||
|
self.prev_min_pos
|
||||||
|
} else {
|
||||||
|
sk.seql() - self.m - self.prev_min_pos
|
||||||
|
};
|
||||||
|
sk.set_minimizer_pos(min_pos as u8);
|
||||||
Some((min, sk))
|
Some((min, sk))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,6 +110,19 @@ impl RollingStat {
|
|||||||
sum_f_log_s[ws] += ln_class_size(canonical, ws, false);
|
sum_f_log_s[ws] += ln_class_size(canonical, ws, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn is_degenerate(canonical: u64, m_mask: u64) -> bool {
|
||||||
|
canonical == 0 || canonical == (0x5555555555555555 & m_mask)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn minimizer_worse(existing: u64, candidate: u64, m_mask: u64) -> bool {
|
||||||
|
let ed = Self::is_degenerate(existing, m_mask);
|
||||||
|
let cd = Self::is_degenerate(candidate, m_mask);
|
||||||
|
if ed != cd { return ed; }
|
||||||
|
existing >= candidate
|
||||||
|
}
|
||||||
|
|
||||||
pub fn push(&mut self, nuc: u8) {
|
pub fn push(&mut self, nuc: u8) {
|
||||||
let bnuc = encode_nuc(nuc);
|
let bnuc = encode_nuc(nuc);
|
||||||
let cnuc = bnuc ^ 3;
|
let cnuc = bnuc ^ 3;
|
||||||
@@ -132,7 +145,7 @@ impl RollingStat {
|
|||||||
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
|
(self.rolling_k & self.m_mask).min(self.rolling_rck >> ((self.k - self.m) * 2));
|
||||||
let possible_pos_m = self.received - self.m;
|
let possible_pos_m = self.received - self.m;
|
||||||
|
|
||||||
while self.minimier.back().map_or(false, |it| it.canonical >= possible_canonical_m) {
|
while self.minimier.back().map_or(false, |it| Self::minimizer_worse(it.canonical, possible_canonical_m, self.m_mask)) {
|
||||||
self.minimier.pop_back();
|
self.minimier.pop_back();
|
||||||
}
|
}
|
||||||
self.minimier
|
self.minimier
|
||||||
|
|||||||
Reference in New Issue
Block a user