first implementation, but far from optimal
This commit is contained in:
@@ -0,0 +1,14 @@
|
||||
# Crate manifest for obikseq.
[package]
name = "obikseq"
version = "0.1.0"
edition = "2024"

# Runtime dependencies.
[dependencies]
bitvec = "1"

# Benchmark-only dependency; the bench sources import it as `criterion`.
[dev-dependencies.criterion2]
version = "3"
features = ["cargo_bench_support"]

# `harness = false` turns off the default libtest harness so that the entry
# point generated by `criterion_main!` in the bench file is used instead.
[[bench]]
name = "superkmer"
harness = false
|
||||
@@ -0,0 +1,95 @@
|
||||
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
|
||||
/// Sequence lengths (in nucleotides) exercised by every benchmark group.
const LENGTHS: &[usize] = &[1, 4, 8, 16, 40, 64, 128, 255, 256];

/// Build a deterministic ASCII sequence of `len` bases by repeating `ACGT`.
fn make_ascii(len: usize) -> Vec<u8> {
    b"ACGT".iter().copied().cycle().take(len).collect()
}
|
||||
|
||||
fn bench_from_ascii(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("from_ascii");
|
||||
for &len in LENGTHS {
|
||||
let ascii = make_ascii(len);
|
||||
group.throughput(Throughput::Bytes(len as u64));
|
||||
group.bench_with_input(BenchmarkId::from_parameter(len), &ascii, |b, ascii| {
|
||||
b.iter(|| SuperKmer::from_ascii(std::hint::black_box(ascii)));
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_to_ascii(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("to_ascii");
|
||||
for &len in LENGTHS {
|
||||
let sk = SuperKmer::from_ascii(&make_ascii(len));
|
||||
group.throughput(Throughput::Bytes(len as u64));
|
||||
group.bench_with_input(BenchmarkId::from_parameter(len), &sk, |b, sk| {
|
||||
b.iter(|| std::hint::black_box(sk).to_ascii());
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_write_ascii(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("write_ascii");
|
||||
for &len in LENGTHS {
|
||||
let sk = SuperKmer::from_ascii(&make_ascii(len));
|
||||
group.throughput(Throughput::Bytes(len as u64));
|
||||
group.bench_with_input(BenchmarkId::from_parameter(len), &sk, |b, sk| {
|
||||
let mut buf = Vec::with_capacity(len);
|
||||
b.iter(|| {
|
||||
buf.clear();
|
||||
std::hint::black_box(sk).write_ascii(&mut buf);
|
||||
});
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_revcomp(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("revcomp");
|
||||
for &len in LENGTHS {
|
||||
let sk = SuperKmer::from_ascii(&make_ascii(len));
|
||||
group.throughput(Throughput::Bytes(len as u64));
|
||||
group.bench_with_input(BenchmarkId::from_parameter(len), &sk, |b, sk| {
|
||||
b.iter_batched(
|
||||
|| sk.clone(),
|
||||
|mut s| { std::hint::black_box(&mut s).revcomp(); s },
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_canonical(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("canonical");
|
||||
for &len in LENGTHS {
|
||||
let sk_rc = SuperKmer::from_ascii(&vec![b'T'; len]);
|
||||
let sk_fwd = SuperKmer::from_ascii(&vec![b'A'; len]);
|
||||
group.throughput(Throughput::Bytes(len as u64));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("needs_revcomp", len), &sk_rc, |b, sk| {
|
||||
b.iter_batched(
|
||||
|| sk.clone(),
|
||||
|s| std::hint::black_box(s).canonical(),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
}
|
||||
);
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("already_canonical", len), &sk_fwd, |b, sk| {
|
||||
b.iter_batched(
|
||||
|| sk.clone(),
|
||||
|s| std::hint::black_box(s).canonical(),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
}
|
||||
);
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_from_ascii, bench_to_ascii, bench_write_ascii, bench_revcomp, bench_canonical);
|
||||
criterion_main!(benches);
|
||||
@@ -0,0 +1,31 @@
|
||||
// ENC: lookup table from ASCII byte to 2-bit nucleotide code
// (A = 0, C = 1, G = 2, T/U = 3).
//
// The table is indexed by `b & 0x1F`, i.e. by the low five bits of the byte,
// so upper- and lowercase letters share a slot. Every slot other than C, G, T
// and U holds 0: ambiguity codes and most unknown bytes decode silently as A.
// Because only five bits are inspected, a non-letter byte whose low five bits
// coincide with C, G, T or U (for example `#` or `4`) receives that base's
// code rather than 0.
pub(crate) static ENC: [u8; 32] = build_enc();

// DEC4: packed byte → 4 ASCII nucleotides encoded as a big-endian u32.
// DEC4[b].to_be_bytes() == [nuc0, nuc1, nuc2, nuc3], where nuc0 is the base
// stored in the two most significant bits of b.
pub(crate) static DEC4: [u32; 256] = build_dec4();

// Map one ASCII byte to its 2-bit code through ENC (see ENC for the rules).
pub(crate) fn encode_base(b: u8) -> u8 {
    let slot = usize::from(b & 0x1F);
    ENC[slot]
}

// Fill ENC at compile time: all zeros except the C, G, T and U slots.
const fn build_enc() -> [u8; 32] {
    let mut table = [0u8; 32];
    table[(b'C' & 0x1F) as usize] = 1;
    table[(b'G' & 0x1F) as usize] = 2;
    table[(b'T' & 0x1F) as usize] = 3;
    table[(b'U' & 0x1F) as usize] = 3;
    table
}

// Fill DEC4 at compile time: for every packed byte, emit its four bases from
// the most significant 2-bit group down to the least significant one.
const fn build_dec4() -> [u32; 256] {
    const BASES: [u8; 4] = *b"ACGT";
    let mut table = [0u32; 256];
    let mut packed = 0usize;
    while packed < 256 {
        let mut word = 0u32;
        let mut slot = 0usize;
        while slot < 4 {
            let code = (packed >> (6 - 2 * slot)) & 0b11;
            word = (word << 8) | BASES[code] as u32;
            slot += 1;
        }
        table[packed] = word;
        packed += 1;
    }
    table
}
|
||||
@@ -0,0 +1,263 @@
|
||||
//! Compact 2-bit kmer stored as a left-aligned u64.
|
||||
//!
|
||||
//! Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i.
|
||||
//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
|
||||
//! every operation that needs it, and will be owned by the collection-level indexer.
|
||||
|
||||
use crate::encoding::{encode_base, DEC4};
|
||||
|
||||
// ── KmerError ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Failure modes of the kmer constructors and extractors.
#[derive(Debug, Clone, PartialEq)]
pub enum KmerError {
    /// The requested window `position .. position + k` does not fit inside
    /// the sequence.
    OutOfBounds {
        /// 0-based start of the requested kmer.
        position: usize,
        /// Length of the requested kmer.
        k: usize,
        /// Length of the sequence the kmer was requested from.
        seql: usize,
    },
    /// `k` is outside `1..=32`, so the kmer cannot be packed into a `u64`.
    InvalidK {
        /// The rejected value of k.
        k: usize,
    },
}

impl std::fmt::Display for KmerError {
    /// Render a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::OutOfBounds { position, k, seql } => write!(
                f,
                "kmer of length {k} at position {position} exceeds sequence length {seql}"
            ),
            Self::InvalidK { k } => write!(f, "k={k} is invalid: must be in 1..=32"),
        }
    }
}

impl std::error::Error for KmerError {}
|
||||
|
||||
// ── Kmer ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// A DNA kmer of length k encoded as a left-aligned u64 (2 bits/nucleotide, MSB-first).
|
||||
/// k is not stored in the struct — it must be supplied by the caller.
|
||||
#[repr(transparent)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct Kmer(u64);
|
||||
|
||||
impl Kmer {
|
||||
/// Wrap a raw left-aligned u64 value as a Kmer.
|
||||
#[inline]
|
||||
pub fn from_raw(raw: u64) -> Self {
|
||||
Kmer(raw)
|
||||
}
|
||||
|
||||
/// Return the raw left-aligned u64 value.
|
||||
#[inline]
|
||||
pub fn raw(&self) -> u64 {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Encode the first k nucleotides of an ASCII slice into a Kmer.
|
||||
/// Zero allocation — result lives on the stack.
|
||||
#[inline]
|
||||
pub fn from_ascii(ascii: &[u8], k: usize) -> Result<Self, KmerError> {
|
||||
if k == 0 || k > 32 {
|
||||
return Err(KmerError::InvalidK { k });
|
||||
}
|
||||
if ascii.len() < k {
|
||||
return Err(KmerError::OutOfBounds { position: 0, k, seql: ascii.len() });
|
||||
}
|
||||
let mut val = 0u64;
|
||||
for i in 0..k {
|
||||
val = (val << 2) | encode_base(ascii[i]) as u64;
|
||||
}
|
||||
Ok(Kmer(val << (64 - 2 * k)))
|
||||
}
|
||||
|
||||
/// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
|
||||
#[inline]
|
||||
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||
((self.0 >> (62 - 2 * i)) & 0b11) as u8
|
||||
}
|
||||
|
||||
/// Decode this kmer into a freshly allocated ASCII `Vec<u8>`.
|
||||
#[inline]
|
||||
pub fn to_ascii(&self, k: usize) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(k);
|
||||
self.write_ascii(k, &mut buf);
|
||||
buf
|
||||
}
|
||||
|
||||
/// Decode this kmer into ASCII nucleotides, appending into `buf`.
|
||||
/// Zero allocation — caller owns the buffer.
|
||||
#[inline]
|
||||
pub fn write_ascii(&self, k: usize, buf: &mut Vec<u8>) {
|
||||
let bytes = self.0.to_be_bytes();
|
||||
let full = k / 4;
|
||||
let rem = k % 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[bytes[i] as usize].to_be_bytes());
|
||||
}
|
||||
if rem > 0 {
|
||||
let decoded = DEC4[bytes[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&decoded[..rem]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the reverse complement of this kmer.
|
||||
/// Zero allocation — result lives on the stack.
|
||||
#[inline]
|
||||
pub fn revcomp(&self, k: usize) -> Self {
|
||||
let x = !self.0; // complement
|
||||
let x = x.swap_bytes(); // reverse bytes
|
||||
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
|
||||
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
|
||||
Kmer(x << (64 - 2 * k))
|
||||
}
|
||||
|
||||
/// Return the canonical form: lexicographic minimum of forward and reverse complement.
|
||||
/// Zero allocation — result lives on the stack.
|
||||
#[inline]
|
||||
pub fn canonical(&self, k: usize) -> Self {
|
||||
let rc = self.revcomp(k);
|
||||
if self.0 <= rc.0 { *self } else { rc }
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Values of k covered by the parameterised tests.
    const K_VALUES: &[usize] = &[1, 2, 3, 4, 8, 11, 16, 31, 32];

    /// Reference reverse complement computed directly on ASCII bytes.
    fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
        let mut out = Vec::with_capacity(seq.len());
        for &base in seq.iter().rev() {
            out.push(match base {
                b'A' => b'T',
                b'C' => b'G',
                b'G' => b'C',
                b'T' => b'A',
                _ => b'A',
            });
        }
        out
    }

    /// Repeating `ACGT` pattern of length `k`.
    fn make_seq(k: usize) -> Vec<u8> {
        b"ACGT".iter().copied().cycle().take(k).collect()
    }

    // ── from_ascii / to_ascii ─────────────────────────────────────────────────

    #[test]
    fn ascii_roundtrip() {
        for &k in K_VALUES {
            let ascii = make_seq(k);
            let decoded = Kmer::from_ascii(&ascii, k).unwrap().to_ascii(k);
            assert_eq!(decoded, ascii, "roundtrip failed for k={k}");
        }
    }

    #[test]
    fn from_ascii_all_bases() {
        let pairs = [(b'A', b'A'), (b'C', b'C'), (b'G', b'G'), (b'T', b'T')];
        for (base, expected) in pairs {
            let kmer = Kmer::from_ascii(&[base], 1).unwrap();
            assert_eq!(kmer.to_ascii(1), vec![expected]);
        }
    }

    #[test]
    fn from_ascii_invalid_k() {
        assert!(Kmer::from_ascii(b"A", 0).is_err());
        assert!(Kmer::from_ascii(b"ACGT", 33).is_err());
    }

    #[test]
    fn from_ascii_too_short() {
        assert!(Kmer::from_ascii(b"ACG", 4).is_err());
    }

    // ── nucleotide ────────────────────────────────────────────────────────────

    #[test]
    fn nucleotide_extraction() {
        let kmer = Kmer::from_ascii(b"ACGT", 4).unwrap();
        // A, C, G, T in order.
        let expected_codes = [0b00u8, 0b01, 0b10, 0b11];
        for (pos, code) in expected_codes.into_iter().enumerate() {
            assert_eq!(kmer.nucleotide(pos), code);
        }
    }

    // ── revcomp ───────────────────────────────────────────────────────────────

    #[test]
    fn revcomp_known_values() {
        let cases: &[(&[u8], &[u8])] = &[
            (b"A", b"T"),
            (b"AC", b"GT"),
            (b"ACG", b"CGT"),
            (b"ACGT", b"ACGT"), // palindrome
            (b"AAAA", b"TTTT"),
            (b"TTTT", b"AAAA"),
        ];
        for &(seq, expected) in cases {
            let k = seq.len();
            let rc = Kmer::from_ascii(seq, k).unwrap().revcomp(k);
            assert_eq!(
                rc.to_ascii(k),
                expected,
                "revcomp wrong for \"{}\"",
                std::str::from_utf8(seq).unwrap()
            );
        }
    }

    #[test]
    fn revcomp_vs_reference() {
        for &k in K_VALUES {
            let ascii = make_seq(k);
            let reference = ascii_revcomp(&ascii);
            let rc = Kmer::from_ascii(&ascii, k).unwrap().revcomp(k);
            assert_eq!(rc.to_ascii(k), reference, "revcomp wrong for k={k}");
        }
    }

    #[test]
    fn revcomp_involution() {
        for &k in K_VALUES {
            let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap();
            let twice = kmer.revcomp(k).revcomp(k);
            assert_eq!(twice, kmer, "revcomp∘revcomp≠id for k={k}");
        }
    }

    // ── canonical ─────────────────────────────────────────────────────────────

    #[test]
    fn canonical_palindrome() {
        let kmer = Kmer::from_ascii(b"ACGT", 4).unwrap();
        assert_eq!(kmer.canonical(4), kmer);
    }

    #[test]
    fn canonical_chooses_lesser() {
        let all_t = Kmer::from_ascii(b"TTTT", 4).unwrap();
        let all_a = Kmer::from_ascii(b"AAAA", 4).unwrap();
        assert_eq!(all_t.canonical(4), all_a);
    }

    #[test]
    fn canonical_is_minimal() {
        for &k in K_VALUES {
            let canon = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
            let rc = canon.revcomp(k);
            assert!(canon.0 <= rc.0, "canonical not minimal for k={k}");
        }
    }

    #[test]
    fn canonical_idempotent() {
        for &k in K_VALUES {
            let canon = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
            assert_eq!(canon.canonical(k), canon, "canonical not idempotent for k={k}");
        }
    }
}
|
||||
@@ -0,0 +1,11 @@
|
||||
//! Sequence encoding and manipulation primitives for obikmer.
|
||||
//!
|
||||
//! Provides [`superkmer::SuperKmer`], a compact 2-bit–encoded DNA sequence
//! with in-place reverse complement and canonical-form support, and
//! [`kmer::Kmer`], a kmer of up to 32 nucleotides packed into a single `u64`.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod encoding;
|
||||
pub mod kmer;
|
||||
mod revcomp_lookup;
|
||||
pub mod superkmer;
|
||||
@@ -0,0 +1,20 @@
|
||||
/// Reverse-complement lookup table for one packed byte, i.e. four bases at
/// 2 bits per base with the first base in the most significant bits.
/// `REVCOMP4[x]` holds the four bases of `x` complemented and in reverse order.
pub(crate) static REVCOMP4: [u8; 256] = build_revcomp4();

/// Reverse-complement the four bases packed in `x`.
pub(crate) const fn revcomp4(x: u8) -> u8 {
    // Complementing a base inverts both of its bits (0<->3, 1<->2).
    let complemented = !x;
    // Mirroring all eight bits reverses the base order but also swaps the two
    // bits inside every base; exchanging neighbouring bits undoes the latter.
    let mirrored = complemented.reverse_bits();
    ((mirrored >> 1) & 0x55) | ((mirrored & 0x55) << 1)
}

/// Evaluate `revcomp4` for every possible byte at compile time.
const fn build_revcomp4() -> [u8; 256] {
    let mut table = [0u8; 256];
    let mut packed: u8 = 0;
    loop {
        table[packed as usize] = revcomp4(packed);
        if packed == u8::MAX {
            break;
        }
        packed += 1;
    }
    table
}
|
||||
@@ -0,0 +1,673 @@
|
||||
//! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
|
||||
|
||||
use bitvec::prelude::*;
|
||||
use crate::encoding::{encode_base, DEC4};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
|
||||
// ── SuperKmerHeader ───────────────────────────────────────────────────────────
|
||||
|
||||
/// 32-bit super-kmer header.
///
/// Bit layout (MSB → LSB):
///
/// ```text
/// [31 .......... 8] [7 ...... 0]
///   payload (24 b)   SEQL (8 b)
/// ```
///
/// SEQL encodes the sequence length: 1–255 map directly; 0 encodes 256.
///
/// # Temporal dual-use of the payload field
///
/// The 24-bit payload field serves two distinct roles that are **never active
/// at the same time**, separated by the routing step of the scatter pipeline:
///
/// | Phase | Bits [15:8] | Bits [31:16] |
/// |---|---|---|
/// | **Scatter** (before routing) | minimizer start position (0–255) | unused (zero) |
/// | **Count** (after routing) | low byte of occurrence count | high bytes of occurrence count |
///
/// During scatter, [`set_minimizer_pos`] stores the 0-based position of the
/// minimizer's first nucleotide within the super-kmer. At routing time,
/// [`init_count`] overwrites the entire payload with `1`, marking the
/// super-kmer as seen once and enabling the usual [`increment`] / [`add`] /
/// [`set_count`] operations during deduplication.
///
/// # Count range
///
/// The occurrence count lives in 24 bits. [`increment`], [`add`] and
/// [`set_count`] saturate at `2^24 - 1` instead of overflowing, so a count can
/// never wrap around or disturb the SEQL byte.
///
/// [`set_minimizer_pos`]: SuperKmerHeader::set_minimizer_pos
/// [`init_count`]: SuperKmerHeader::init_count
/// [`increment`]: SuperKmerHeader::increment
/// [`add`]: SuperKmerHeader::add
/// [`set_count`]: SuperKmerHeader::set_count
#[derive(Debug, Clone, Copy)]
pub(crate) struct SuperKmerHeader(u32);

impl SuperKmerHeader {
    /// Mask selecting the SEQL byte (bits [7:0]).
    const SEQL_MASK: u32 = 0xFF;
    /// Position of the payload's least significant bit.
    const PAYLOAD_SHIFT: u32 = 8;
    /// Largest value representable in the 24-bit payload.
    const MAX_COUNT: u32 = (1 << 24) - 1;

    /// Create a header with the given stored SEQL byte and an all-zero payload.
    pub(crate) fn new(seql: u8) -> Self {
        Self(seql as u32)
    }

    /// Return the raw SEQL byte (0 stands for a length of 256).
    fn seql(&self) -> u8 {
        self.0 as u8
    }

    // ── scatter phase ─────────────────────────────────────────────────────────

    /// Store the minimizer start position (bits [15:8]) and clear the rest of
    /// the payload. Only meaningful during the scatter phase, before
    /// [`init_count`].
    ///
    /// [`init_count`]: SuperKmerHeader::init_count
    fn set_minimizer_pos(&mut self, pos: u8) {
        self.0 = (self.0 & Self::SEQL_MASK) | ((pos as u32) << Self::PAYLOAD_SHIFT);
    }

    /// Return the minimizer start position stored during scatter.
    /// Only meaningful before [`init_count`] is called.
    ///
    /// [`init_count`]: SuperKmerHeader::init_count
    fn minimizer_pos(&self) -> u8 {
        (self.0 >> Self::PAYLOAD_SHIFT) as u8
    }

    // ── count phase ───────────────────────────────────────────────────────────

    /// Transition from scatter to count phase: set the occurrence count to 1.
    /// Overwrites the minimizer position stored in the payload.
    fn init_count(&mut self) {
        self.set_count(1);
    }

    /// Return the 24-bit payload interpreted as an occurrence count.
    fn count(&self) -> u32 {
        self.0 >> Self::PAYLOAD_SHIFT
    }

    /// Add one to the occurrence count, saturating at `2^24 - 1`.
    fn increment(&mut self) {
        self.add(1);
    }

    /// Add `n` to the occurrence count, saturating at `2^24 - 1`.
    fn add(&mut self, n: u32) {
        // The sum is formed on the unshifted count so that neither the
        // addition nor the shift back into place can lose bits.
        self.set_count(self.count().saturating_add(n));
    }

    /// Replace the occurrence count with `n`, clamped to `2^24 - 1`.
    /// The SEQL byte is left untouched.
    fn set_count(&mut self, n: u32) {
        let clamped = n.min(Self::MAX_COUNT);
        self.0 = (self.0 & Self::SEQL_MASK) | (clamped << Self::PAYLOAD_SHIFT);
    }
}
|
||||
|
||||
// ── SuperKmer ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Canonical super-kmer: 32-bit header followed by a byte-aligned 2-bit nucleotide sequence.
|
||||
/// Nucleotide 0 is at the MSB of `seq[0]`. Always stored in canonical form.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SuperKmer {
|
||||
header: SuperKmerHeader,
|
||||
seq: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl SuperKmer {
|
||||
/// `seql` is the raw stored byte: 1–255 for lengths 1–255, 0 for length 256.
|
||||
pub fn new(seql: u8, seq: Box<[u8]>) -> Self {
|
||||
let len = stored_to_len(seql);
|
||||
debug_assert_eq!(seq.len(), byte_len(len));
|
||||
Self {
|
||||
header: SuperKmerHeader::new(seql),
|
||||
seq,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the sequence length in nucleotides (1–256).
|
||||
pub fn seql(&self) -> usize {
|
||||
stored_to_len(self.header.seql())
|
||||
}
|
||||
|
||||
/// Returns the occurrence count of this super-kmer.
|
||||
pub fn count(&self) -> u32 {
|
||||
self.header.count()
|
||||
}
|
||||
|
||||
/// Increments the occurrence count by 1.
|
||||
pub fn increment(&mut self) {
|
||||
self.header.increment();
|
||||
}
|
||||
|
||||
/// Adds `n` to the occurrence count.
|
||||
pub fn add(&mut self, n: u32) {
|
||||
self.header.add(n);
|
||||
}
|
||||
|
||||
/// Sets the occurrence count to an absolute value.
|
||||
pub fn set_count(&mut self, n: u32) {
|
||||
self.header.set_count(n);
|
||||
}
|
||||
|
||||
// ── scatter / routing interface ───────────────────────────────────────────
|
||||
|
||||
/// Store the 0-based position of the minimizer's first nucleotide within
|
||||
/// this super-kmer.
|
||||
///
|
||||
/// **Scatter phase only.** Must be called before [`init_count`].
|
||||
/// The position is encoded in the payload field that later holds the
|
||||
/// occurrence count; the two uses are mutually exclusive by pipeline phase.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn set_minimizer_pos(&mut self, pos: u8) {
|
||||
self.header.set_minimizer_pos(pos);
|
||||
}
|
||||
|
||||
/// Return the stored minimizer start position.
|
||||
///
|
||||
/// **Scatter phase only.** Only meaningful before [`init_count`] is called.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn minimizer_pos(&self) -> u8 {
|
||||
self.header.minimizer_pos()
|
||||
}
|
||||
|
||||
/// Transition from scatter phase to count phase: set occurrence count to 1.
|
||||
///
|
||||
/// Call this once at routing time. After this call, [`minimizer_pos`] is
|
||||
/// no longer valid and the count methods ([`count`], [`increment`], [`add`],
|
||||
/// [`set_count`]) become meaningful.
|
||||
///
|
||||
/// [`minimizer_pos`]: SuperKmer::minimizer_pos
|
||||
/// [`count`]: SuperKmer::count
|
||||
/// [`increment`]: SuperKmer::increment
|
||||
/// [`add`]: SuperKmer::add
|
||||
/// [`set_count`]: SuperKmer::set_count
|
||||
pub fn init_count(&mut self) {
|
||||
self.header.init_count();
|
||||
}
|
||||
|
||||
/// Extract nucleotide i (0-based from 5' end) as a 2-bit value.
|
||||
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
}
|
||||
|
||||
/// Reverse-complement this super-kmer in place.
|
||||
pub fn revcomp(&mut self) {
|
||||
let seql = self.seql();
|
||||
let n = byte_len(seql);
|
||||
|
||||
// Step 1: swap bytes outside-in, applying revcomp4 to each.
|
||||
{
|
||||
let bytes = &mut self.seq[..n];
|
||||
let (mut lo, mut hi) = (0, n - 1);
|
||||
while lo < hi {
|
||||
(bytes[lo], bytes[hi]) = (REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
|
||||
lo += 1;
|
||||
hi -= 1;
|
||||
}
|
||||
if lo == hi {
|
||||
bytes[lo] = REVCOMP4[bytes[lo] as usize];
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2: left-shift to flush padding T's introduced by complementing padding A's.
|
||||
let shift = n * 8 - seql * 2;
|
||||
if shift > 0 {
|
||||
let bits = self.seq[..n].view_bits_mut::<Msb0>();
|
||||
bits.rotate_left(shift);
|
||||
let len = bits.len();
|
||||
bits[len - shift..].fill(false);
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a new SuperKmer.
|
||||
/// The result is not yet in canonical form; call `.canonical()` if needed.
|
||||
pub fn from_ascii(ascii: &[u8]) -> Self {
|
||||
let seql = ascii.len();
|
||||
debug_assert!(seql >= 1 && seql <= 256, "super-kmer length must be 1..=256");
|
||||
let n = byte_len(seql);
|
||||
let mut seq = vec![0u8; n];
|
||||
|
||||
let full = seql / 4;
|
||||
for i in 0..full {
|
||||
seq[i] = encode_base(ascii[i * 4]) << 6
|
||||
| encode_base(ascii[i * 4 + 1]) << 4
|
||||
| encode_base(ascii[i * 4 + 2]) << 2
|
||||
| encode_base(ascii[i * 4 + 3]);
|
||||
}
|
||||
let rem = seql % 4;
|
||||
if rem > 0 {
|
||||
let mut last = 0u8;
|
||||
for j in 0..rem {
|
||||
last |= encode_base(ascii[full * 4 + j]) << (6 - 2 * j);
|
||||
}
|
||||
seq[full] = last;
|
||||
}
|
||||
|
||||
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
let seql = self.seql();
|
||||
let full = seql / 4;
|
||||
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
}
|
||||
let rem = seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql());
|
||||
self.write_ascii(&mut buf);
|
||||
buf
|
||||
}
|
||||
|
||||
/// Extract the kmer of length k starting at nucleotide position i (0-based).
|
||||
///
|
||||
/// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
|
||||
pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
if k == 0 || k > 32 {
|
||||
return Err(KmerError::InvalidK { k });
|
||||
}
|
||||
let seql = self.seql();
|
||||
if i + k > seql {
|
||||
return Err(KmerError::OutOfBounds { position: i, k, seql });
|
||||
}
|
||||
let bits = self.seq.view_bits::<Msb0>();
|
||||
let raw: u64 = bits[i * 2..(i + k) * 2].load_be();
|
||||
Ok(Kmer::from_raw(raw << (64 - 2 * k)))
|
||||
}
|
||||
|
||||
/// Extract the canonical kmer of length k starting at nucleotide position i (0-based).
|
||||
///
|
||||
/// Equivalent to `self.kmer(i, k)?.canonical(k)` but avoids the redundant `revcomp` call
|
||||
/// when the super-kmer is already in canonical form (which is the normal case).
|
||||
/// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
}
|
||||
|
||||
/// Return this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
|
||||
pub fn canonical(mut self) -> Self {
|
||||
let seql = self.seql();
|
||||
for i in 0..seql {
|
||||
let fwd = self.nucleotide(i);
|
||||
let rev = complement(self.nucleotide(seql - 1 - i));
|
||||
if fwd < rev {
|
||||
return self;
|
||||
}
|
||||
if fwd > rev {
|
||||
self.revcomp();
|
||||
return self;
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Complement a 2-bit nucleotide code (0<->3, 1<->2).
/// Only the two low bits of the result can be set, whatever the input.
fn complement(base: u8) -> u8 {
    (base ^ 0b11) & 0b11
}
|
||||
|
||||
/// Number of bytes needed to store `seql` nucleotides at four per byte,
/// rounding up.
///
/// Uses `div_ceil` rather than `(seql + 3) / 4` so that the computation cannot
/// overflow for very large `seql`.
fn byte_len(seql: usize) -> usize {
    seql.div_ceil(4)
}
|
||||
|
||||
/// Convert the stored SEQL byte to the real sequence length:
/// 0 stands for 256, every other value is the length itself.
fn stored_to_len(s: u8) -> usize {
    match s {
        0 => 256,
        len => usize::from(len),
    }
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Repeating ACGT pattern of the given length.
|
||||
fn make_seq(len: usize) -> Vec<u8> {
|
||||
(0..len).map(|i| b"ACGT"[i % 4]).collect()
|
||||
}
|
||||
|
||||
/// Reference revcomp on ASCII bytes.
|
||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||
seq.iter().rev().map(|&b| match b {
|
||||
b'A' => b'T', b'T' => b'A',
|
||||
b'C' => b'G', b'G' => b'C',
|
||||
_ => b'A',
|
||||
}).collect()
|
||||
}
|
||||
|
||||
fn all_lengths() -> impl Iterator<Item = usize> {
|
||||
(1..=9).chain([255, 256])
|
||||
}
|
||||
|
||||
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn kmer_first_matches_from_ascii() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmer = sk.kmer(0, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[..k], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_last_position() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let seql = ascii.len();
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmer = sk.kmer(seql - k, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[seql - k..], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_all_positions() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for i in 0..=ascii.len() - k {
|
||||
let kmer = sk.kmer(i, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[i..i + k], k).unwrap();
|
||||
assert_eq!(kmer, expected, "mismatch at position {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_out_of_bounds() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(2, 4).is_err()); // 2 + 4 > 4
|
||||
assert!(sk.kmer(4, 1).is_err()); // 4 + 1 > 4
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_invalid_k() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(0, 0).is_err());
|
||||
assert!(sk.kmer(0, 33).is_err());
|
||||
}
|
||||
|
||||
// ── canonical_kmer ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_is_min_of_kmer_and_revcomp() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
let k = 4;
|
||||
for i in 0..=(sk.seql() - k) {
|
||||
let ck = sk.canonical_kmer(i, k).unwrap();
|
||||
let fwd = sk.kmer(i, k).unwrap();
|
||||
assert_eq!(ck, fwd.canonical(k));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_palindrome_unchanged() {
|
||||
// ACGT is its own reverse complement
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let fwd = sk.kmer(0, 4).unwrap();
|
||||
assert_eq!(ck, fwd);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_tttt_becomes_aaaa() {
|
||||
let sk = SuperKmer::from_ascii(b"TTTT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(ck, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_errors_propagate() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.canonical_kmer(2, 4).is_err()); // out of bounds
|
||||
assert!(sk.canonical_kmer(0, 0).is_err()); // invalid k
|
||||
}
|
||||
|
||||
// ── count ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn count_starts_at_zero() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert_eq!(sk.count(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_adds_one() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 1);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_increases_count() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(42);
|
||||
assert_eq!(sk.count(), 42);
|
||||
sk.add(8);
|
||||
assert_eq!(sk.count(), 50);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_overwrites() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(100);
|
||||
sk.set_count(7);
|
||||
assert_eq!(sk.count(), 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.increment();
|
||||
assert_eq!(sk.seql(), len, "increment altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.add(1000);
|
||||
assert_eq!(sk.seql(), len, "add altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.set_count(999);
|
||||
assert_eq!(sk.seql(), len, "set_count altered seql for len={len}");
|
||||
assert_eq!(sk.count(), 999);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn count_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_count(16_000_000);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── seql encoding ─────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn seql_roundtrip() {
    // The length handed to `from_ascii` must come back unchanged from `seql()`.
    for len in all_lengths() {
        let bases = make_seq(len);
        let reported = SuperKmer::from_ascii(&bases).seql();
        assert_eq!(reported, len, "seql() wrong for len={len}");
    }
}
|
||||
|
||||
#[test]
fn seql_256_stored_as_zero() {
    // For a 256-base sequence the header's `u8` length reads 0, while the
    // public `seql()` accessor still reports 256.
    let kmer = SuperKmer::from_ascii(&make_seq(256));

    let raw = kmer.header.seql();
    assert_eq!(raw, 0u8);

    let decoded = kmer.seql();
    assert_eq!(decoded, 256);
}
|
||||
|
||||
// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
|
||||
|
||||
#[test]
fn ascii_roundtrip_all_lengths() {
    // `to_ascii` must reproduce the exact bytes given to `from_ascii`.
    for len in all_lengths() {
        let ascii = make_seq(len);
        let decoded = SuperKmer::from_ascii(&ascii).to_ascii();
        assert_eq!(decoded, ascii, "roundtrip failed for len={len}");
    }
}
|
||||
|
||||
#[test]
fn ascii_roundtrip_all_bases() {
    // Every one of the four bases must decode back to itself; each is
    // exercised as a homopolymer of length 4.
    for base in [b'A', b'C', b'G', b'T'] {
        let ascii = vec![base; 4];
        let decoded = SuperKmer::from_ascii(&ascii).to_ascii();
        assert_eq!(decoded, vec![base; 4]);
    }
}
|
||||
|
||||
// ── revcomp correctness ───────────────────────────────────────────────────
|
||||
|
||||
/// Hand-written `(seq, expected_revcomp)` pairs: for each shift value
/// (6, 4, 2, 0) there is one short and one longer sequence.
#[test]
fn revcomp_known_values() {
    let cases = [
        // shift=6
        ("A", "T"),
        ("ACGTA", "TACGT"),
        // shift=4
        ("AC", "GT"),
        ("ACGTAC", "GTACGT"),
        // shift=2
        ("ACG", "CGT"),
        ("ACGTACG", "CGTACGT"),
        // shift=0
        ("ACGT", "ACGT"),
        ("ACGTACGT", "ACGTACGT"),
    ];

    for &(seq, expected) in &cases {
        let mut kmer = SuperKmer::from_ascii(seq.as_bytes());
        kmer.revcomp();
        let got = kmer.to_ascii();
        assert_eq!(got, expected.as_bytes(), "revcomp wrong for \"{seq}\"");
    }
}
|
||||
|
||||
#[test]
fn revcomp_vs_reference_all_lengths() {
    // Compare the in-place `revcomp()` against the `ascii_revcomp` helper
    // for every tested length.
    for len in all_lengths() {
        let ascii = make_seq(len);
        let expected = ascii_revcomp(&ascii);

        let mut kmer = SuperKmer::from_ascii(&ascii);
        kmer.revcomp();
        let got = kmer.to_ascii();

        assert_eq!(got, expected, "revcomp wrong for len={len}");
    }
}
|
||||
|
||||
#[test]
fn revcomp_involution_all_lengths() {
    // Applying `revcomp()` twice must give back the original sequence.
    for len in all_lengths() {
        let ascii = make_seq(len);
        let mut kmer = SuperKmer::from_ascii(&ascii);
        for _ in 0..2 {
            kmer.revcomp();
        }
        let decoded = kmer.to_ascii();
        assert_eq!(decoded, ascii, "revcomp∘revcomp≠id for len={len}");
    }
}
|
||||
|
||||
// ── canonical ─────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn canonical_palindrome_unchanged() {
    // ACGT equals its own reverse complement, so `canonical()` must
    // return the same bases.
    let input = SuperKmer::from_ascii(b"ACGT");
    let canon = input.canonical();
    let decoded = canon.to_ascii();
    assert_eq!(decoded, b"ACGT");
}
|
||||
|
||||
#[test]
fn canonical_chooses_forward() {
    // "AAAA" sorts before its reverse complement "TTTT", so the forward
    // strand is kept.
    let input = SuperKmer::from_ascii(b"AAAA");
    let canon = input.canonical();
    let decoded = canon.to_ascii();
    assert_eq!(decoded, b"AAAA");
}
|
||||
|
||||
#[test]
fn canonical_chooses_revcomp() {
    // "TTTT" sorts after its reverse complement "AAAA", so the result
    // must be the flipped strand.
    let input = SuperKmer::from_ascii(b"TTTT");
    let canon = input.canonical();
    let decoded = canon.to_ascii();
    assert_eq!(decoded, b"AAAA");
}
|
||||
|
||||
#[test]
fn canonical_is_minimal_all_lengths() {
    // For every tested length, `canonical()` must return
    //   1. one of the two strands of the input (the input itself or its
    //      reverse complement), and
    //   2. the strand that is not greater than its own reverse complement.
    // Check 2 alone would accept an implementation that returns an
    // unrelated but self-minimal sequence, so check 1 is asserted as well.
    for len in all_lengths() {
        let ascii = make_seq(len);
        let sk = SuperKmer::from_ascii(&ascii).canonical();
        let fwd = sk.to_ascii();

        assert!(
            fwd == ascii || fwd == ascii_revcomp(&ascii),
            "canonical is neither the input nor its revcomp for len={len}"
        );

        let rev = ascii_revcomp(&fwd);
        assert!(fwd <= rev, "canonical not minimal for len={len}");
    }
}
|
||||
|
||||
// ── scatter / routing lifecycle ───────────────────────────────────────────
|
||||
|
||||
#[test]
fn minimizer_pos_roundtrip() {
    // A stored minimizer position must read back unchanged and must not
    // disturb the reported sequence length.
    let mut kmer = SuperKmer::from_ascii(b"ACGTACGT");
    kmer.set_minimizer_pos(42);

    let pos = kmer.minimizer_pos();
    assert_eq!(pos, 42);

    let length = kmer.seql();
    assert_eq!(length, 8, "set_minimizer_pos altered seql");
}
|
||||
|
||||
#[test]
fn minimizer_pos_boundary_values() {
    // Exercise both tested extremes, 0 and 255, on the same instance.
    // Each pair is (value written, value expected on read-back).
    let mut kmer = SuperKmer::from_ascii(b"ACGTACGT");
    for (written, read_back) in [(0, 0), (255, 255)] {
        kmer.set_minimizer_pos(written);
        assert_eq!(kmer.minimizer_pos(), read_back);
    }
}
|
||||
|
||||
#[test]
fn init_count_resets_to_one_and_enables_counting() {
    // A minimizer position is set first, then `init_count()` is called:
    // the count must read 1, and `increment`/`add` must build on that value.
    let mut kmer = SuperKmer::from_ascii(b"ACGTACGT");
    kmer.set_minimizer_pos(7);
    kmer.init_count();

    let after_init = kmer.count();
    assert_eq!(after_init, 1);

    kmer.increment();
    let after_increment = kmer.count();
    assert_eq!(after_increment, 2);

    kmer.add(10);
    let after_add = kmer.count();
    assert_eq!(after_add, 12);
}
|
||||
|
||||
#[test]
fn init_count_preserves_seql() {
    // `set_minimizer_pos(0)` followed by `init_count()` must keep the
    // length intact and leave the count at 1, for every tested length.
    for len in all_lengths() {
        let bases = make_seq(len);
        let mut kmer = SuperKmer::from_ascii(&bases);
        kmer.set_minimizer_pos(0);
        kmer.init_count();

        let reported_len = kmer.seql();
        assert_eq!(reported_len, len, "init_count altered seql for len={len}");

        let reported_count = kmer.count();
        assert_eq!(reported_count, 1);
    }
}
|
||||
|
||||
#[test]
fn minimizer_pos_does_not_affect_sequence() {
    // Storing a minimizer position must leave the decoded bases unchanged.
    let original = b"ACGTACGT".to_vec();
    let mut kmer = SuperKmer::from_ascii(&original);
    kmer.set_minimizer_pos(3);
    let decoded = kmer.to_ascii();
    assert_eq!(decoded, original);
}
|
||||
}
|
||||
Reference in New Issue
Block a user