✨ add PhantomData import for generic type safety
- Added `use std::marker::PhantomData;` to prepare for generic scheduler implementations.
- Ensures type safety and avoids unused lifetime/type parameter warnings.
This commit is contained in:
@@ -333,8 +333,6 @@ impl SuperKmer {
|
||||
|
||||
/// Extract the canonical kmer of length k starting at nucleotide position i (0-based).
|
||||
///
|
||||
/// Equivalent to `self.kmer(i, k)?.canonical(k)` but avoids the redundant `revcomp` call
|
||||
/// when the super-kmer is already in canonical form (which is the normal case).
|
||||
/// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
@@ -367,6 +365,16 @@ impl SuperKmer {
|
||||
true
|
||||
}
|
||||
|
||||
/// Iterate over all kmers of length `k` in order, yielding each as a left-aligned [`Kmer`].
|
||||
pub fn iter_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
SKKmerIter::new(self, k)
|
||||
}
|
||||
|
||||
/// Iterate over all canonical kmers of length `k` in order.
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||
}
|
||||
|
||||
/// Returns the XXH3 hash of the super-kmer sequence.
|
||||
pub fn hash(&self) -> u64 {
|
||||
if self.is_canonical() {
|
||||
@@ -379,6 +387,53 @@ impl SuperKmer {
|
||||
}
|
||||
}
|
||||
|
||||
struct SKKmerIter<'a> {
|
||||
skmer: &'a SuperKmer,
|
||||
mask: u64,
|
||||
lshift: usize,
|
||||
current: u64,
|
||||
pos: usize,
|
||||
max_pos: usize,
|
||||
}
|
||||
|
||||
impl<'a> SKKmerIter<'a> {
|
||||
fn new(skmer: &'a SuperKmer, k: usize) -> Self {
|
||||
let seql = skmer.seql();
|
||||
let lshift = 64 - k * 2;
|
||||
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||
Self {
|
||||
skmer,
|
||||
mask,
|
||||
lshift,
|
||||
current: if seql >= k { skmer.kmer(0, k).unwrap().raw() } else { 0 },
|
||||
pos: k,
|
||||
max_pos: seql,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SKKmerIter<'a> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.pos > self.max_pos {
|
||||
return None;
|
||||
}
|
||||
// Emit current kmer first, then slide the window forward.
|
||||
let result = Kmer::from_raw(self.current);
|
||||
if self.pos < self.max_pos {
|
||||
let byte_pos = self.pos / 4;
|
||||
// Nucleotide at position r within its byte occupies bits 7-2r (MSB) and 6-2r (LSB).
|
||||
// Extract right-aligned, then place at lshift.
|
||||
let inner_shift = 6 - 2 * (self.pos & 3);
|
||||
let nuc = (((self.skmer.seq[byte_pos] >> inner_shift) & 3) as u64) << self.lshift;
|
||||
self.current = ((self.current << 2) & self.mask) | nuc;
|
||||
}
|
||||
self.pos += 1;
|
||||
Some(result)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn complement(base: u8) -> u8 {
|
||||
@@ -755,4 +810,121 @@ mod tests {
|
||||
sk.set_minimizer_pos(3);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn iter_kmers_count() {
    // A sequence of length n holds n - k + 1 kmers of length k.
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    for k in [1usize, 3, 4, 5, 8, 12] {
        let expected = ascii.len() - k + 1;
        assert_eq!(sk.iter_kmers(k).count(), expected, "count mismatch for k={k}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_first_is_kmer_0() {
    // For every k up to the sequence length, the first item equals kmer(0, k).
    let ascii = b"ACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    for k in 1..=ascii.len() {
        let head = sk.iter_kmers(k).next().unwrap();
        let expected = sk.kmer(0, k).unwrap();
        assert_eq!(head, expected, "k={k}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_matches_kmer_at_each_position() {
    // The i-th yielded kmer must equal the random-access kmer(i, k).
    let ascii = b"ACGTACGTACGT";
    let k = 4;
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    for (i, km) in kmers.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(i, k).unwrap(), "mismatch at pos {i}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_single_when_seql_eq_k() {
    // With k equal to the sequence length there is exactly one kmer.
    let ascii = b"ACGTACGT";
    let k = ascii.len();
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 1);
    assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
}
|
||||
|
||||
#[test]
fn iter_kmers_two_when_seql_eq_k_plus_one() {
    // With k one less than the sequence length there are exactly two kmers.
    let ascii = b"ACGTACGT";
    let k = ascii.len() - 1;
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 2);
    assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
    assert_eq!(kmers[1], sk.kmer(1, k).unwrap());
}
|
||||
|
||||
#[test]
fn iter_kmers_all_k_values() {
    // Exhaustive check: for each k in 1..=seql, the iterator yields the right
    // number of kmers and each one agrees with kmer(i, k).
    let ascii = b"ACGTACGTACGT";
    let seql = ascii.len();
    let sk = SuperKmer::from_ascii(ascii);
    for k in 1..=seql {
        let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
        assert_eq!(kmers.len(), seql - k + 1, "k={k}");
        for (i, km) in kmers.into_iter().enumerate() {
            assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
        }
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_crosses_byte_boundary() {
    // Nucleotides are packed four per byte, so windows around positions 3→4
    // and 7→8 read across a byte boundary.
    let ascii = b"ACGTACGTACGT";
    let k = 3;
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    for boundary in [3usize, 4, 7, 8] {
        // Skip any pair whose second element would fall outside the result.
        if boundary + 1 >= kmers.len() {
            continue;
        }
        assert_eq!(
            kmers[boundary],
            sk.kmer(boundary, k).unwrap(),
            "pos={boundary}"
        );
        assert_eq!(
            kmers[boundary + 1],
            sk.kmer(boundary + 1, k).unwrap(),
            "pos={}",
            boundary + 1
        );
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_k1_yields_all_nucleotides() {
    // With k = 1 every nucleotide is its own kmer.
    let ascii = b"ACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(1).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 4);
    for (i, km) in kmers.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(i, 1).unwrap(), "pos={i}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_long_sequence() {
    // Same position-by-position check on a sequence built by `make_seq(20)`.
    let ascii = make_seq(20);
    let k = 7;
    let sk = SuperKmer::from_ascii(&ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    for (i, km) in kmers.into_iter().enumerate() {
        assert_eq!(km, sk.kmer(i, k).unwrap(), "pos={i}");
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user