refactor: abstract k-mer types and fix bit alignment
Abstracts k-mer storage using a `RawKmer` alias and `KMER_BITS` constant to simplify bit manipulation and enable future extension to larger types. Updates bit-shifting and masking logic across `kmer.rs` and `packed_seq.rs` to prevent overflow and improve type safety. Adapts the MPHF layer to iterate over indexed canonical k-mers with explicit slot bounds validation and bit-level encoding. Fixes test suite compilation errors by correcting method names, adding tuple destructuring, and passing the required `IndexMode::Exact` parameter.
This commit is contained in:
+54
-42
@@ -1,7 +1,8 @@
|
||||
//! Compact 2-bit kmer stored as a left-aligned u64.
|
||||
//! Compact 2-bit kmer stored as a left-aligned [`RawKmer`].
|
||||
//!
|
||||
//! Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i.
|
||||
//! The low 64−2·len bits are always zero.
|
||||
//! Nucleotide 0 occupies bits KMER_BITS−1 and KMER_BITS−2, nucleotide i occupies
|
||||
//! bits KMER_BITS−1−2i and KMER_BITS−2−2i.
|
||||
//! The low `KMER_BITS − 2·len` bits are always zero.
|
||||
//!
|
||||
//! The length is not stored in the struct — it is supplied by the [`KmerLength`]
|
||||
//! type parameter. Two public marker types cover the common cases:
|
||||
@@ -15,6 +16,17 @@
|
||||
//! Tests that need a fixed length independent of the global singletons can use
|
||||
//! [`ConstLen<N>`].
|
||||
|
||||
/// Underlying storage type for a raw (2-bit-per-base, left-aligned) kmer.
///
/// All kmer bit-width constants (`KMER_BITS`) and shift calculations are derived
/// from this type. To extend the maximum supported k beyond 32, change this alias
/// to `u128`; `KMER_BITS` is computed from the alias and follows automatically,
/// as does any code that is written purely in terms of `RawKmer` and `KMER_BITS`.
///
/// NOTE(review): code that hard-codes 64-bit literals or `u64` signatures
/// (e.g. fixed-width bit-swap masks, hash functions returning `u64`) will not
/// widen on its own — audit those call sites before switching the alias.
pub type RawKmer = u64;

/// Bit width of [`RawKmer`]: the number of bits available for nucleotide encoding.
///
/// Derived from the alias rather than written as a literal so the two can never
/// disagree. At 2 bits per nucleotide the maximum supported k is `KMER_BITS / 2`.
pub const KMER_BITS: usize = RawKmer::BITS as usize;
|
||||
|
||||
use serde::Serialize;
|
||||
use std::io::{self, Write};
|
||||
use std::marker::PhantomData;
|
||||
@@ -109,35 +121,35 @@ impl Annotation for KmerAnnotation {}
|
||||
|
||||
// ── KmerOf ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// A DNA kmer of length `L::len()` encoded as a left-aligned u64 (2 bits/nucleotide, MSB-first).
|
||||
/// The low `64 − 2·L::len()` bits are always zero.
|
||||
/// A DNA kmer of length `L::len()` encoded as a left-aligned [`RawKmer`] (2 bits/nucleotide, MSB-first).
|
||||
/// The low `KMER_BITS − 2·L::len()` bits are always zero.
|
||||
#[repr(transparent)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct KmerOf<L: KmerLength>(u64, PhantomData<L>);
|
||||
pub struct KmerOf<L: KmerLength>(RawKmer, PhantomData<L>);
|
||||
|
||||
impl<L: KmerLength> KmerOf<L> {
|
||||
/// Wrap a raw left-aligned u64 value as a kmer.
|
||||
/// Wrap a raw left-aligned [`RawKmer`] value as a kmer.
|
||||
#[inline]
|
||||
pub const fn from_raw(raw: u64) -> Self {
|
||||
pub const fn from_raw(raw: RawKmer) -> Self {
|
||||
KmerOf(raw, PhantomData)
|
||||
}
|
||||
|
||||
/// Wrap a raw right-aligned u64 value, shifting it into left-aligned position.
|
||||
/// Wrap a raw right-aligned [`RawKmer`] value, shifting it into left-aligned position.
|
||||
#[inline]
|
||||
pub fn from_raw_right(raw: u64) -> Self {
|
||||
KmerOf(raw << (64 - 2 * L::len()), PhantomData)
|
||||
pub fn from_raw_right(raw: RawKmer) -> Self {
|
||||
KmerOf(raw << (KMER_BITS - 2 * L::len()), PhantomData)
|
||||
}
|
||||
|
||||
/// Return the raw left-aligned u64 value.
|
||||
/// Return the raw left-aligned [`RawKmer`] value.
|
||||
#[inline]
|
||||
pub fn raw(&self) -> u64 {
|
||||
pub fn raw(&self) -> RawKmer {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Return the raw right-aligned u64 value.
|
||||
/// Return the raw right-aligned [`RawKmer`] value.
|
||||
#[inline]
|
||||
pub fn raw_right(&self) -> u64 {
|
||||
self.0 >> (64 - 2 * L::len())
|
||||
pub fn raw_right(&self) -> RawKmer {
|
||||
self.0 >> (KMER_BITS - 2 * L::len())
|
||||
}
|
||||
|
||||
/// Encode the first `L::len()` nucleotides of an ASCII slice into a kmer.
|
||||
@@ -153,11 +165,11 @@ impl<L: KmerLength> KmerOf<L> {
|
||||
seql: ascii.len(),
|
||||
});
|
||||
}
|
||||
let mut val = 0u64;
|
||||
let mut val: RawKmer = 0;
|
||||
for i in 0..k {
|
||||
val = (val << 2) | encode_base(ascii[i]) as u64;
|
||||
val = (val << 2) | encode_base(ascii[i]) as RawKmer;
|
||||
}
|
||||
Ok(KmerOf(val << (64 - 2 * k), PhantomData))
|
||||
Ok(KmerOf(val << (KMER_BITS - 2 * k), PhantomData))
|
||||
}
|
||||
|
||||
/// Decode into a freshly allocated ASCII `Vec<u8>`.
|
||||
@@ -193,27 +205,27 @@ impl<L: KmerLength> KmerOf<L> {
|
||||
let x = x.swap_bytes();
|
||||
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4);
|
||||
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2);
|
||||
KmerOf(x << (64 - 2 * k), PhantomData)
|
||||
KmerOf(x << (KMER_BITS - 2 * k), PhantomData)
|
||||
}
|
||||
|
||||
/// Slide the window one base to the right: drop nucleotide 0, append `nuc` at position `L::len()-1`.
|
||||
pub fn push_right(self, nuc: u8) -> Self {
|
||||
let k = L::len();
|
||||
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
KmerOf(shifted | ((nuc as u64 & 3) << (64 - 2 * k)), PhantomData)
|
||||
let shifted = self.0 << 2 & (RawKmer::MAX << (KMER_BITS - 2 * (k - 1)));
|
||||
KmerOf(shifted | ((nuc as RawKmer & 3) << (KMER_BITS - 2 * k)), PhantomData)
|
||||
}
|
||||
|
||||
/// Slide the window one base to the left: drop nucleotide `L::len()-1`, prepend `nuc` at position 0.
|
||||
pub fn push_left(self, nuc: u8) -> Self {
|
||||
let k = L::len();
|
||||
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
||||
KmerOf(shifted | ((nuc as u64 & 3) << 62), PhantomData)
|
||||
let shifted = (self.0 >> 2) & (RawKmer::MAX << (KMER_BITS - 2 * k));
|
||||
KmerOf(shifted | ((nuc as RawKmer & 3) << (KMER_BITS - 2)), PhantomData)
|
||||
}
|
||||
|
||||
/// Returns `true` if the last `L::len()-1` nucleotides of `self` equal the first `L::len()-1` of `other`.
|
||||
pub fn is_overlapping(self, other: Self) -> bool {
|
||||
let k = L::len();
|
||||
let mask = !0u64 << (64 - 2 * (k - 1));
|
||||
let mask = RawKmer::MAX << (KMER_BITS - 2 * (k - 1));
|
||||
(self.0 << 2 & mask) == (other.0 & mask)
|
||||
}
|
||||
}
|
||||
@@ -231,7 +243,7 @@ impl<L: KmerLength> Sequence for KmerOf<L> {
|
||||
|
||||
#[inline]
|
||||
fn nucleotide(&self, i: usize) -> u8 {
|
||||
((self.0 >> (62 - 2 * i)) & 0b11) as u8
|
||||
((self.0 >> (KMER_BITS - 2 - 2 * i)) & 0b11) as u8
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -255,21 +267,21 @@ impl<L: KmerLength> Sequence for KmerOf<L> {
|
||||
/// [`CanonicalKmerOf::from_raw_unchecked`] (trusted paths such as deserialisation).
|
||||
#[repr(transparent)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct CanonicalKmerOf<L: KmerLength>(u64, PhantomData<L>);
|
||||
pub struct CanonicalKmerOf<L: KmerLength>(RawKmer, PhantomData<L>);
|
||||
|
||||
impl<L: KmerLength> CanonicalKmerOf<L> {
|
||||
/// Wrap a raw left-aligned u64 without verifying the canonical invariant.
|
||||
/// Wrap a raw left-aligned [`RawKmer`] without verifying the canonical invariant.
|
||||
///
|
||||
/// # Safety (logical)
|
||||
/// The caller must guarantee `raw == min(raw, revcomp(raw))`.
|
||||
#[inline]
|
||||
pub fn from_raw_unchecked(raw: u64) -> Self {
|
||||
pub fn from_raw_unchecked(raw: RawKmer) -> Self {
|
||||
CanonicalKmerOf(raw, PhantomData)
|
||||
}
|
||||
|
||||
/// Return the raw left-aligned u64 value.
|
||||
/// Return the raw left-aligned [`RawKmer`] value.
|
||||
#[inline]
|
||||
pub fn raw(&self) -> u64 {
|
||||
pub fn raw(&self) -> RawKmer {
|
||||
self.0
|
||||
}
|
||||
|
||||
@@ -307,25 +319,25 @@ impl<L: KmerLength> CanonicalKmerOf<L> {
|
||||
/// Return the four left canonical neighbours (each already canonical).
|
||||
pub fn left_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
|
||||
let k = L::len();
|
||||
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
||||
let shifted = (self.0 >> 2) & (RawKmer::MAX << (KMER_BITS - 2 * k));
|
||||
[
|
||||
KmerOf::<L>(shifted, PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | (1u64 << 62), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | (2u64 << 62), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | (3u64 << 62), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | ((1 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | ((2 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | ((3 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(),
|
||||
]
|
||||
}
|
||||
|
||||
/// Return the four right canonical neighbours (each already canonical).
|
||||
pub fn right_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
|
||||
let k = L::len();
|
||||
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
let shift = 64 - 2 * k;
|
||||
let shifted = self.0 << 2 & (RawKmer::MAX << (KMER_BITS - 2 * (k - 1)));
|
||||
let shift = KMER_BITS - 2 * k;
|
||||
[
|
||||
KmerOf::<L>(shifted, PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | (1u64 << shift), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | (2u64 << shift), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | (3u64 << shift), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | ((1 as RawKmer) << shift), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | ((2 as RawKmer) << shift), PhantomData).canonical(),
|
||||
KmerOf::<L>(shifted | ((3 as RawKmer) << shift), PhantomData).canonical(),
|
||||
]
|
||||
}
|
||||
|
||||
@@ -349,7 +361,7 @@ impl<L: KmerLength> Sequence for CanonicalKmerOf<L> {
|
||||
|
||||
#[inline]
|
||||
fn nucleotide(&self, i: usize) -> u8 {
|
||||
((self.0 >> (62 - 2 * i)) & 0b11) as u8
|
||||
((self.0 >> (KMER_BITS - 2 - 2 * i)) & 0b11) as u8
|
||||
}
|
||||
|
||||
fn canonical(&self) -> Self::Canonical {
|
||||
@@ -386,7 +398,7 @@ pub type Minimizer = CanonicalKmerOf<MLen>;
|
||||
/// 64-bit representation directly, which is useful when the canonical
|
||||
/// invariant is not required or has already been handled.
|
||||
#[inline]
|
||||
pub fn hash_kmer(raw: u64) -> u64 {
|
||||
pub fn hash_kmer(raw: RawKmer) -> u64 {
|
||||
mix64(raw ^ 0x9e3779b97f4a7c15)
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ pub mod superkmer;
|
||||
pub mod unitig;
|
||||
|
||||
pub use annotations::Annotation;
|
||||
pub use kmer::{CanonicalKmer, Kmer, Minimizer, hash_kmer};
|
||||
pub use kmer::{CanonicalKmer, Kmer, Minimizer, RawKmer, KMER_BITS, hash_kmer};
|
||||
pub use packed_seq::MAX_KMERS_PER_CHUNK;
|
||||
pub use params::{k, m, set_k, set_m};
|
||||
pub use routable::RoutableSuperKmer;
|
||||
|
||||
@@ -18,7 +18,7 @@ use bitvec::prelude::*;
|
||||
|
||||
use crate::Sequence;
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen, Minimizer};
|
||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen, Minimizer, RawKmer, KMER_BITS};
|
||||
use crate::params::k;
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
|
||||
@@ -198,8 +198,8 @@ impl PackedSeq {
|
||||
return Err(KmerError::OutOfBounds { position: i, k: len, seql });
|
||||
}
|
||||
let bits = self.seq.view_bits::<Msb0>();
|
||||
let raw: u64 = bits[i * 2..(i + len) * 2].load_be();
|
||||
Ok(KmerOf::from_raw(raw << (64 - 2 * len)))
|
||||
let raw: RawKmer = bits[i * 2..(i + len) * 2].load_be();
|
||||
Ok(KmerOf::from_raw(raw << (KMER_BITS - 2 * len)))
|
||||
}
|
||||
|
||||
/// Extract the kmer of length `params::k()` at nucleotide position `i`. Zero allocation.
|
||||
@@ -322,9 +322,9 @@ impl AsRef<PackedSeq> for PackedSeq {
|
||||
/// sequence into the iterator, so no reference escapes the closure.
|
||||
pub struct PackedSeqKmerIter<S> {
|
||||
seq: S,
|
||||
mask: u64,
|
||||
mask: RawKmer,
|
||||
lshift: usize,
|
||||
current: u64,
|
||||
current: RawKmer,
|
||||
pos: usize,
|
||||
max_pos: usize,
|
||||
}
|
||||
@@ -341,8 +341,8 @@ impl<S: AsRef<PackedSeq>> PackedSeqKmerIter<S> {
|
||||
let ps = seq.as_ref();
|
||||
let seql = ps.seql();
|
||||
let klen = k();
|
||||
let lshift = 64 - klen * 2;
|
||||
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||
let lshift = KMER_BITS - klen * 2;
|
||||
let mask = ((!0u128) << (lshift + 2)) as RawKmer;
|
||||
let current = if seql >= klen {
|
||||
ps.extract::<KLen>(0).map(|km| km.raw()).unwrap_or(0)
|
||||
} else {
|
||||
@@ -362,7 +362,7 @@ impl<S: AsRef<PackedSeq>> Iterator for PackedSeqKmerIter<S> {
|
||||
let result = Kmer::from_raw(self.current);
|
||||
if self.pos < self.max_pos {
|
||||
let inner_shift = 6 - 2 * (self.pos & 3);
|
||||
let nuc = ((self.seq.as_ref().seq[self.pos / 4] >> inner_shift) & 3) as u64;
|
||||
let nuc = ((self.seq.as_ref().seq[self.pos / 4] >> inner_shift) & 3) as RawKmer;
|
||||
self.current = ((self.current << 2) & self.mask) | (nuc << self.lshift);
|
||||
}
|
||||
self.pos += 1;
|
||||
|
||||
Reference in New Issue
Block a user