refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
@@ -5,6 +5,8 @@ edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
bitvec = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0.149"
|
||||
xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -40,7 +40,7 @@ fn bench_write_ascii(c: &mut Criterion) {
|
||||
let mut buf = Vec::with_capacity(len);
|
||||
b.iter(|| {
|
||||
buf.clear();
|
||||
std::hint::black_box(sk).write_ascii(&mut buf);
|
||||
std::hint::black_box(sk).write_ascii(&mut buf).unwrap();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
use serde::Serialize;
|
||||
use serde_json;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// Serialize `self` as a single-line JSON object into a writer.
|
||||
pub trait Annotation: Serialize {
|
||||
/// Write the annotation as compact JSON into `writer`.
|
||||
fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let s = serde_json::to_string(self).map_err(io::Error::other)?;
|
||||
writer.write_all(s.as_bytes())
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,8 @@
|
||||
//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
|
||||
//! every operation that needs it, and will be owned by the collection-level indexer.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
|
||||
// ── KmerError ─────────────────────────────────────────────────────────────────
|
||||
@@ -115,24 +117,24 @@ impl Kmer {
|
||||
#[inline]
|
||||
pub fn to_ascii(&self, k: usize) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(k);
|
||||
self.write_ascii(k, &mut buf);
|
||||
self.write_ascii(k, &mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
/// Decode this kmer into ASCII nucleotides, appending into `buf`.
|
||||
/// Zero allocation — caller owns the buffer.
|
||||
/// Decode this kmer into ASCII nucleotides, writing into `writer`.
|
||||
#[inline]
|
||||
pub fn write_ascii(&self, k: usize, buf: &mut Vec<u8>) {
|
||||
pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
|
||||
let bytes = self.0.to_be_bytes();
|
||||
let full = k / 4;
|
||||
let rem = k % 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[bytes[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[bytes[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
if rem > 0 {
|
||||
let decoded = DEC4[bytes[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&decoded[..rem]);
|
||||
writer.write_all(&decoded[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute the reverse complement of this kmer.
|
||||
|
||||
@@ -5,8 +5,17 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod annotations;
|
||||
|
||||
mod encoding;
|
||||
pub mod kmer;
|
||||
mod revcomp_lookup;
|
||||
/// Routable super-kmer: canonical sequence paired with its minimizer for scatter routing.
|
||||
pub mod routable;
|
||||
pub mod superkmer;
|
||||
|
||||
pub mod unitig;
|
||||
|
||||
pub use annotations::Annotation;
|
||||
pub use routable::RoutableSuperKmer;
|
||||
pub use superkmer::SuperKmer;
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
//! Super-kmer with routing metadata: canonical sequence + pre-computed minimizer.
|
||||
|
||||
use super::kmer::Kmer;
|
||||
use super::SuperKmer;
|
||||
|
||||
/// Owned wrapper that pairs a canonical [`SuperKmer`] with its minimizer [`Kmer`].
|
||||
///
|
||||
/// Created at the single point where raw sequence bytes are emitted from the
|
||||
/// scratch buffer. The minimizer position (given in original orientation) is
|
||||
/// adjusted for any flip applied during canonicalisation. After routing, call
|
||||
/// [`into_superkmer`] to discard the metadata and continue with the bare sequence.
|
||||
///
|
||||
/// [`into_superkmer`]: RoutableSuperKmer::into_superkmer
|
||||
pub struct RoutableSuperKmer {
|
||||
superkmer: SuperKmer,
|
||||
minimizer: Kmer,
|
||||
}
|
||||
|
||||
impl RoutableSuperKmer {
|
||||
/// Construct from raw packed bytes.
|
||||
///
|
||||
/// `min_pos` is the 0-based minimizer position in the **original** (pre-flip)
|
||||
/// orientation. `m` is the minimizer length. `seql` and `seq` are the
|
||||
/// raw length byte and 2-bit-packed nucleotides as produced by the scratch
|
||||
/// buffer.
|
||||
pub fn build(min_pos: usize, m: usize, seql: u8, seq: Box<[u8]>) -> Self {
|
||||
let (sk, already_canonical) = SuperKmer::build(seql, seq);
|
||||
let adjusted_pos = if already_canonical {
|
||||
min_pos
|
||||
} else {
|
||||
sk.len() - m - min_pos
|
||||
};
|
||||
let minimizer = sk.kmer(adjusted_pos, m).unwrap().canonical(m);
|
||||
Self {
|
||||
superkmer: sk,
|
||||
minimizer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Borrow the canonical super-kmer sequence.
|
||||
pub fn superkmer(&self) -> &SuperKmer {
|
||||
&self.superkmer
|
||||
}
|
||||
|
||||
/// Borrow the canonical minimizer kmer.
|
||||
pub fn minimizer(&self) -> &Kmer {
|
||||
&self.minimizer
|
||||
}
|
||||
|
||||
/// Consume this wrapper and return the inner [`SuperKmer`].
|
||||
pub fn into_superkmer(self) -> SuperKmer {
|
||||
self.superkmer
|
||||
}
|
||||
|
||||
/// Sequence length in nucleotides.
|
||||
pub fn len(&self) -> usize {
|
||||
self.superkmer.len()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
/// Minimal read-only view over a sequence.
///
/// NOTE(review): this declaration does not fix the unit of [`len`](Sequence::len)
/// or the encoding of the bytes returned by [`sequence`](Sequence::sequence) —
/// confirm both against the implementors.
pub trait Sequence {
    /// Length of the sequence.
    fn len(&self) -> usize;

    /// Returns `true` when [`len`](Sequence::len) is zero.
    ///
    /// Provided so that implementors get the conventional `len` / `is_empty`
    /// pair without extra code.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Borrow the underlying bytes of the sequence.
    fn sequence(&self) -> &[u8];

    /// Return a new value holding the reverse complement of `self`
    /// (per the crate's `revcomp` naming convention).
    ///
    /// Restricted to sized implementors so the rest of the trait stays usable
    /// through `dyn Sequence`.
    fn revcomp(&self) -> Self
    where
        Self: Sized;
}
|
||||
+54
-587
@@ -1,4 +1,7 @@
|
||||
//! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
|
||||
use std::io::{self, Write};
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
@@ -14,70 +17,24 @@ use xxhash_rust::xxh3::xxh3_64;
|
||||
///
|
||||
/// ```text
|
||||
/// [31 .......... 8] [7 ...... 0]
|
||||
/// payload (24 b) SEQL (8 b)
|
||||
/// count (24 b) SEQL (8 b)
|
||||
/// ```
|
||||
///
|
||||
/// SEQL encodes the sequence length: 1–255 map directly; 0 encodes 256.
|
||||
///
|
||||
/// # Temporal dual-use of the payload field
|
||||
///
|
||||
/// The 24-bit payload field serves two distinct roles that are **never active
|
||||
/// at the same time**, separated by the routing step of the scatter pipeline:
|
||||
///
|
||||
/// | Phase | Bits [15:8] | Bits [31:16] |
|
||||
/// |---|---|---|
|
||||
/// | **Scatter** (before routing) | minimizer start position (0–255) | unused (zero) |
|
||||
/// | **Count** (after routing) | low byte of occurrence count | high bytes of occurrence count |
|
||||
///
|
||||
/// During scatter, [`set_minimizer_pos`] stores the 0-based position of the
|
||||
/// minimizer's first nucleotide within the super-kmer. At routing time,
|
||||
/// [`init_count`] overwrites the entire payload with `1`, marking the
|
||||
/// super-kmer as seen once and enabling the usual [`increment`] / [`add`] /
|
||||
/// [`set_count`] operations during deduplication.
|
||||
///
|
||||
/// [`set_minimizer_pos`]: SuperKmerHeader::set_minimizer_pos
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
/// [`increment`]: SuperKmerHeader::increment
|
||||
/// [`add`]: SuperKmerHeader::add
|
||||
/// [`set_count`]: SuperKmerHeader::set_count
|
||||
/// The count field starts at 1 and accumulates occurrence counts during
|
||||
/// deduplication.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) struct SuperKmerHeader(u32);
|
||||
|
||||
impl SuperKmerHeader {
|
||||
pub(crate) fn new(seql: u8) -> Self {
|
||||
Self(seql as u32)
|
||||
Self((1 << 8) | seql as u32)
|
||||
}
|
||||
|
||||
fn seql(&self) -> u8 {
|
||||
self.0 as u8
|
||||
}
|
||||
|
||||
// ── scatter phase ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Store the minimizer start position (bits [15:8]).
|
||||
/// Only meaningful during the scatter phase, before [`init_count`].
|
||||
///
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
fn set_minimizer_pos(&mut self, pos: u8) {
|
||||
self.0 = (self.0 & 0xFF) | ((pos as u32) << 8);
|
||||
}
|
||||
|
||||
/// Return the minimizer start position stored during scatter.
|
||||
/// Only meaningful before [`init_count`] is called.
|
||||
///
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
fn minimizer_pos(&self) -> u8 {
|
||||
(self.0 >> 8) as u8
|
||||
}
|
||||
|
||||
// ── count phase ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Transition from scatter to count phase: set occurrence count to 1.
|
||||
/// Overwrites the minimizer position stored in the payload.
|
||||
fn init_count(&mut self) {
|
||||
self.0 = (self.0 & 0xFF) | (1 << 8);
|
||||
}
|
||||
|
||||
fn count(&self) -> u32 {
|
||||
self.0 >> 8
|
||||
}
|
||||
@@ -95,6 +52,15 @@ impl SuperKmerHeader {
|
||||
}
|
||||
}
|
||||
|
||||
// Private record whose `Serialize` impl is derived by serde; fields are
// emitted in declaration order.
// NOTE(review): the field meanings noted below are inferred from their names
// only — confirm against the code that constructs this struct.
#[derive(Serialize)]
|
||||
struct CountAnnotation {
|
||||
// presumably the super-kmer length in nucleotides — TODO confirm
seq_length: usize,
|
||||
// presumably the k used for k-mer extraction — TODO confirm
kmer_size: usize,
|
||||
// presumably the minimizer length m — TODO confirm
minimizer_size: usize,
|
||||
// presumably the index of the partition the record was routed to — TODO confirm
partition: u32,
|
||||
// presumably the occurrence count of the super-kmer — TODO confirm
count: u32,
|
||||
}
|
||||
|
||||
// ── SuperKmer ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Canonical super-kmer: 32-bit header followed by a byte-aligned 2-bit nucleotide sequence.
|
||||
@@ -127,12 +93,18 @@ impl std::hash::Hash for SuperKmer {
|
||||
impl SuperKmer {
|
||||
/// `seql` is the raw stored byte: 1–255 for lengths 1–255, 0 for length 256.
|
||||
pub fn new(seql: u8, seq: Box<[u8]>) -> Self {
|
||||
let len = stored_to_len(seql);
|
||||
debug_assert_eq!(seq.len(), byte_len(len));
|
||||
Self {
|
||||
Self::build(seql, seq).0
|
||||
}
|
||||
|
||||
/// Construct and canonicalise in place, returning `(sk, already_canonical)`.
|
||||
/// `already_canonical` is `true` when the sequence was not flipped.
|
||||
pub fn build(seql: u8, seq: Box<[u8]>) -> (Self, bool) {
|
||||
let mut sk = Self {
|
||||
header: SuperKmerHeader::new(seql),
|
||||
seq,
|
||||
}
|
||||
};
|
||||
let already_canonical = sk.canonical(); // true = not flipped
|
||||
(sk, already_canonical)
|
||||
}
|
||||
|
||||
/// Deserialise from a raw 32-bit header word and packed sequence bytes.
|
||||
@@ -141,14 +113,19 @@ impl SuperKmer {
|
||||
let seql = (bits & 0xFF) as u8;
|
||||
let len = stored_to_len(seql);
|
||||
debug_assert_eq!(seq.len(), byte_len(len));
|
||||
Self {
|
||||
let sk = Self {
|
||||
header: SuperKmerHeader(bits),
|
||||
seq,
|
||||
}
|
||||
};
|
||||
debug_assert!(
|
||||
sk.is_canonical(),
|
||||
"SuperKmer deserialised from disk is not canonical"
|
||||
);
|
||||
sk
|
||||
}
|
||||
|
||||
/// Returns the sequence length in nucleotides (1–256).
|
||||
pub fn seql(&self) -> usize {
|
||||
pub fn len(&self) -> usize {
|
||||
stored_to_len(self.header.seql())
|
||||
}
|
||||
|
||||
@@ -172,44 +149,6 @@ impl SuperKmer {
|
||||
self.header.set_count(n);
|
||||
}
|
||||
|
||||
// ── scatter / routing interface ───────────────────────────────────────────
|
||||
|
||||
/// Store the 0-based position of the minimizer's first nucleotide within
|
||||
/// this super-kmer.
|
||||
///
|
||||
/// **Scatter phase only.** Must be called before [`init_count`].
|
||||
/// The position is encoded in the payload field that later holds the
|
||||
/// occurrence count; the two uses are mutually exclusive by pipeline phase.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn set_minimizer_pos(&mut self, pos: u8) {
|
||||
self.header.set_minimizer_pos(pos);
|
||||
}
|
||||
|
||||
/// Return the stored minimizer start position.
|
||||
///
|
||||
/// **Scatter phase only.** Only meaningful before [`init_count`] is called.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn minimizer_pos(&self) -> u8 {
|
||||
self.header.minimizer_pos()
|
||||
}
|
||||
|
||||
/// Transition from scatter phase to count phase: set occurrence count to 1.
|
||||
///
|
||||
/// Call this once at routing time. After this call, [`minimizer_pos`] is
|
||||
/// no longer valid and the count methods ([`count`], [`increment`], [`add`],
|
||||
/// [`set_count`]) become meaningful.
|
||||
///
|
||||
/// [`minimizer_pos`]: SuperKmer::minimizer_pos
|
||||
/// [`count`]: SuperKmer::count
|
||||
/// [`increment`]: SuperKmer::increment
|
||||
/// [`add`]: SuperKmer::add
|
||||
/// [`set_count`]: SuperKmer::set_count
|
||||
pub fn init_count(&mut self) {
|
||||
self.header.init_count();
|
||||
}
|
||||
|
||||
/// Extract nucleotide i (0-based from 5' end) as a 2-bit value.
|
||||
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
@@ -217,7 +156,7 @@ impl SuperKmer {
|
||||
|
||||
/// Reverse-complement this super-kmer in place.
|
||||
pub fn revcomp(&mut self) {
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
let n = byte_len(seql);
|
||||
|
||||
// Step 1: swap bytes outside-in, applying revcomp4 to each.
|
||||
@@ -245,8 +184,7 @@ impl SuperKmer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a new SuperKmer.
|
||||
/// The result is not yet in canonical form; call `.canonical()` if needed.
|
||||
/// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a canonical SuperKmer.
|
||||
pub fn from_ascii(ascii: &[u8]) -> Self {
|
||||
let seql = ascii.len();
|
||||
debug_assert!(
|
||||
@@ -275,25 +213,26 @@ impl SuperKmer {
|
||||
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
let seql = self.seql();
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, writing into `writer`.
|
||||
pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let seql = self.len();
|
||||
let full = seql / 4;
|
||||
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
let rem = seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
writer.write_all(&bytes[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql());
|
||||
self.write_ascii(&mut buf);
|
||||
let mut buf = Vec::with_capacity(self.len());
|
||||
self.write_ascii(&mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
@@ -318,7 +257,7 @@ impl SuperKmer {
|
||||
if k == 0 || k > 32 {
|
||||
return Err(KmerError::InvalidK { k });
|
||||
}
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
if i + k > seql {
|
||||
return Err(KmerError::OutOfBounds {
|
||||
position: i,
|
||||
@@ -351,7 +290,7 @@ impl SuperKmer {
|
||||
|
||||
/// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp).
|
||||
pub fn is_canonical(&self) -> bool {
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
for i in 0..seql {
|
||||
let fwd = self.nucleotide(i);
|
||||
let rev = complement(self.nucleotide(seql - 1 - i));
|
||||
@@ -398,14 +337,18 @@ struct SKKmerIter<'a> {
|
||||
|
||||
impl<'a> SKKmerIter<'a> {
|
||||
fn new(skmer: &'a SuperKmer, k: usize) -> Self {
|
||||
let seql = skmer.seql();
|
||||
let seql = skmer.len();
|
||||
let lshift = 64 - k * 2;
|
||||
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||
Self {
|
||||
skmer,
|
||||
mask,
|
||||
lshift,
|
||||
current: if seql >= k { skmer.kmer(0, k).unwrap().raw() } else { 0 },
|
||||
current: if seql >= k {
|
||||
skmer.kmer(0, k).unwrap().raw()
|
||||
} else {
|
||||
0
|
||||
},
|
||||
pos: k,
|
||||
max_pos: seql,
|
||||
}
|
||||
@@ -449,482 +392,6 @@ fn stored_to_len(s: u8) -> usize {
|
||||
if s == 0 { 256 } else { s as usize }
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Repeating ACGT pattern of the given length.
|
||||
fn make_seq(len: usize) -> Vec<u8> {
|
||||
(0..len).map(|i| b"ACGT"[i % 4]).collect()
|
||||
}
|
||||
|
||||
/// Reference revcomp on ASCII bytes.
|
||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||
seq.iter()
|
||||
.rev()
|
||||
.map(|&b| match b {
|
||||
b'A' => b'T',
|
||||
b'T' => b'A',
|
||||
b'C' => b'G',
|
||||
b'G' => b'C',
|
||||
_ => b'A',
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn all_lengths() -> impl Iterator<Item = usize> {
|
||||
(1..=9).chain([255, 256])
|
||||
}
|
||||
|
||||
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn kmer_first_matches_from_ascii() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmer = sk.kmer(0, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[..k], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_last_position() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let seql = ascii.len();
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmer = sk.kmer(seql - k, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[seql - k..], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_all_positions() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for i in 0..=ascii.len() - k {
|
||||
let kmer = sk.kmer(i, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[i..i + k], k).unwrap();
|
||||
assert_eq!(kmer, expected, "mismatch at position {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_out_of_bounds() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(2, 4).is_err()); // 2 + 4 > 4
|
||||
assert!(sk.kmer(4, 1).is_err()); // 4 + 1 > 4
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_invalid_k() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(0, 0).is_err());
|
||||
assert!(sk.kmer(0, 33).is_err());
|
||||
}
|
||||
|
||||
// ── canonical_kmer ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_is_min_of_kmer_and_revcomp() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
let k = 4;
|
||||
for i in 0..=(sk.seql() - k) {
|
||||
let ck = sk.canonical_kmer(i, k).unwrap();
|
||||
let fwd = sk.kmer(i, k).unwrap();
|
||||
assert_eq!(ck, fwd.canonical(k));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_palindrome_unchanged() {
|
||||
// ACGT is its own reverse complement
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let fwd = sk.kmer(0, 4).unwrap();
|
||||
assert_eq!(ck, fwd);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_tttt_becomes_aaaa() {
|
||||
let sk = SuperKmer::from_ascii(b"TTTT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(ck, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_errors_propagate() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.canonical_kmer(2, 4).is_err()); // out of bounds
|
||||
assert!(sk.canonical_kmer(0, 0).is_err()); // invalid k
|
||||
}
|
||||
|
||||
// ── count ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn count_starts_at_zero() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert_eq!(sk.count(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_adds_one() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 1);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_increases_count() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(42);
|
||||
assert_eq!(sk.count(), 42);
|
||||
sk.add(8);
|
||||
assert_eq!(sk.count(), 50);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_overwrites() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(100);
|
||||
sk.set_count(7);
|
||||
assert_eq!(sk.count(), 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.increment();
|
||||
assert_eq!(sk.seql(), len, "increment altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.add(1000);
|
||||
assert_eq!(sk.seql(), len, "add altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.set_count(999);
|
||||
assert_eq!(sk.seql(), len, "set_count altered seql for len={len}");
|
||||
assert_eq!(sk.count(), 999);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn count_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_count(16_000_000);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── seql encoding ─────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn seql_roundtrip() {
|
||||
for len in all_lengths() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
assert_eq!(sk.seql(), len, "seql() wrong for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seql_256_stored_as_zero() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(256));
|
||||
assert_eq!(sk.header.seql(), 0u8);
|
||||
assert_eq!(sk.seql(), 256);
|
||||
}
|
||||
|
||||
// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn ascii_roundtrip_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
assert_eq!(sk.to_ascii(), ascii, "roundtrip failed for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_roundtrip_all_bases() {
|
||||
for (base, expected) in [(b'A', b'A'), (b'C', b'C'), (b'G', b'G'), (b'T', b'T')] {
|
||||
let ascii = vec![base; 4];
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
assert_eq!(sk.to_ascii(), vec![expected; 4]);
|
||||
}
|
||||
}
|
||||
|
||||
// ── revcomp correctness ───────────────────────────────────────────────────
|
||||
|
||||
/// Known (seq, expected_revcomp) pairs — one per shift value × two byte counts.
|
||||
#[test]
|
||||
fn revcomp_known_values() {
|
||||
let cases = [
|
||||
// shift=6
|
||||
("A", "T"),
|
||||
("ACGTA", "TACGT"),
|
||||
// shift=4
|
||||
("AC", "GT"),
|
||||
("ACGTAC", "GTACGT"),
|
||||
// shift=2
|
||||
("ACG", "CGT"),
|
||||
("ACGTACG", "CGTACGT"),
|
||||
// shift=0
|
||||
("ACGT", "ACGT"),
|
||||
("ACGTACGT", "ACGTACGT"),
|
||||
];
|
||||
for (seq, expected) in cases {
|
||||
let mut sk = SuperKmer::from_ascii(seq.as_bytes());
|
||||
sk.revcomp();
|
||||
assert_eq!(
|
||||
sk.to_ascii(),
|
||||
expected.as_bytes(),
|
||||
"revcomp wrong for \"{seq}\""
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn revcomp_vs_reference_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let expected = ascii_revcomp(&ascii);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.revcomp();
|
||||
assert_eq!(sk.to_ascii(), expected, "revcomp wrong for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn revcomp_involution_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.revcomp();
|
||||
sk.revcomp();
|
||||
assert_eq!(sk.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── canonical ─────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn canonical_palindrome_unchanged() {
|
||||
// ACGT is its own revcomp
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"ACGT");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_forward() {
|
||||
// "AAAA" < "TTTT" → stays as-is
|
||||
let mut sk = SuperKmer::from_ascii(b"AAAA");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_revcomp() {
|
||||
// "TTTT" > "AAAA" → flipped
|
||||
let mut sk = SuperKmer::from_ascii(b"TTTT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_is_minimal_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.canonical();
|
||||
let fwd = sk.to_ascii();
|
||||
let rev = ascii_revcomp(&fwd);
|
||||
assert!(fwd <= rev, "canonical not minimal for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── scatter / routing lifecycle ───────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_roundtrip() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(42);
|
||||
assert_eq!(sk.minimizer_pos(), 42);
|
||||
assert_eq!(sk.seql(), 8, "set_minimizer_pos altered seql");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_boundary_values() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
assert_eq!(sk.minimizer_pos(), 0);
|
||||
sk.set_minimizer_pos(255);
|
||||
assert_eq!(sk.minimizer_pos(), 255);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn init_count_resets_to_one_and_enables_counting() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(7);
|
||||
sk.init_count();
|
||||
assert_eq!(sk.count(), 1);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
sk.add(10);
|
||||
assert_eq!(sk.count(), 12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn init_count_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.set_minimizer_pos(0);
|
||||
sk.init_count();
|
||||
assert_eq!(sk.seql(), len, "init_count altered seql for len={len}");
|
||||
assert_eq!(sk.count(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_minimizer_pos(3);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_count() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for k in [1usize, 3, 4, 5, 8, 12] {
|
||||
let n = sk.iter_kmers(k).count();
|
||||
assert_eq!(n, ascii.len() - k + 1, "count mismatch for k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_first_is_kmer_0() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for k in 1..=ascii.len() {
|
||||
let first = sk.iter_kmers(k).next().unwrap();
|
||||
assert_eq!(first, sk.kmer(0, k).unwrap(), "k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_matches_kmer_at_each_position() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), ascii.len() - k + 1);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "mismatch at pos {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_single_when_seql_eq_k() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = ascii.len();
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), 1);
|
||||
assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_two_when_seql_eq_k_plus_one() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = ascii.len() - 1;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), 2);
|
||||
assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
|
||||
assert_eq!(kmers[1], sk.kmer(1, k).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_all_k_values() {
|
||||
// For every valid k, each yielded kmer must match kmer(i, k).
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let seql = ascii.len();
|
||||
for k in 1..=seql {
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), seql - k + 1, "k={k}");
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_crosses_byte_boundary() {
|
||||
// Positions 3→4 and 7→8 cross a 4-nucleotide byte boundary.
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 3;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
for boundary in [3usize, 4, 7, 8] {
|
||||
if boundary + 1 < kmers.len() {
|
||||
assert_eq!(
|
||||
kmers[boundary],
|
||||
sk.kmer(boundary, k).unwrap(),
|
||||
"pos={boundary}"
|
||||
);
|
||||
assert_eq!(
|
||||
kmers[boundary + 1],
|
||||
sk.kmer(boundary + 1, k).unwrap(),
|
||||
"pos={}",
|
||||
boundary + 1
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_k1_yields_all_nucleotides() {
|
||||
let ascii = b"ACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(1).collect();
|
||||
assert_eq!(kmers.len(), 4);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, 1).unwrap(), "pos={i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_long_sequence() {
|
||||
let ascii = make_seq(20);
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
let k = 7;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), ascii.len() - k + 1);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "pos={i}");
|
||||
}
|
||||
}
|
||||
}
|
||||
#[path = "tests/superkmer.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -0,0 +1,425 @@
|
||||
use super::*;
|
||||
|
||||
/// Build `len` bytes by cycling through `A`, `C`, `G`, `T` in that order.
fn make_seq(len: usize) -> Vec<u8> {
    b"ACGT".iter().copied().cycle().take(len).collect()
}
|
||||
|
||||
/// Straightforward ASCII reverse complement used as the test oracle.
///
/// Walks `seq` back to front and swaps A<->T and C<->G; any other byte is
/// emitted as `A`.
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(seq.len());
    for &base in seq.iter().rev() {
        let paired = match base {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' => b'A',
            _ => b'A',
        };
        out.push(paired);
    }
    out
}
|
||||
|
||||
/// Sequence lengths swept by the `*_all_lengths` / `*_preserves_seql` tests:
/// every length from 1 through 9, followed by 255 and 256.
fn all_lengths() -> impl Iterator<Item = usize> {
    let small = 1..=9;
    let large = [255, 256];
    small.chain(large)
}
|
||||
|
||||
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn kmer_first_matches_from_ascii() {
    // The k-mer at offset 0 must equal a k-mer built from the leading k bytes.
    let k = 4;
    let ascii = b"ACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let actual = sk.kmer(0, k).unwrap();
    let prefix = &ascii[..k];
    let expected = crate::kmer::Kmer::from_ascii(prefix, k).unwrap();
    assert_eq!(actual, expected);
}
|
||||
|
||||
#[test]
fn kmer_last_position() {
    // `len - k` is the last valid start offset; it must yield the trailing
    // k bytes of the sequence.
    let ascii = b"ACGTACGT";
    let k = 4;
    let last_start = ascii.len() - k;
    let sk = SuperKmer::from_ascii(ascii);
    let actual = sk.kmer(last_start, k).unwrap();
    let suffix = &ascii[last_start..];
    let expected = crate::kmer::Kmer::from_ascii(suffix, k).unwrap();
    assert_eq!(actual, expected);
}
|
||||
|
||||
#[test]
fn kmer_all_positions() {
    // Slide a width-k window over the ASCII input; `kmer(i, k)` must match a
    // k-mer built directly from the window starting at the same offset.
    let ascii = b"ACGTACGTACGT";
    let k = 4;
    let sk = SuperKmer::from_ascii(ascii);
    for (i, window) in ascii.windows(k).enumerate() {
        let kmer = sk.kmer(i, k).unwrap();
        let expected = crate::kmer::Kmer::from_ascii(window, k).unwrap();
        assert_eq!(kmer, expected, "mismatch at position {i}");
    }
}
|
||||
|
||||
#[test]
fn kmer_out_of_bounds() {
    // A 4-base super-k-mer: any (start, k) with start + k > 4 must be rejected.
    let sk = SuperKmer::from_ascii(b"ACGT");
    // Window starts inside the sequence but runs past its end (2 + 4 > 4).
    assert!(sk.kmer(2, 4).is_err());
    // Window starts exactly at the end (4 + 1 > 4).
    assert!(sk.kmer(4, 1).is_err());
}
|
||||
|
||||
#[test]
fn kmer_invalid_k() {
    let sk = SuperKmer::from_ascii(b"ACGT");
    // k = 0 is not a usable k-mer size.
    assert!(sk.kmer(0, 0).is_err());
    // k = 33 must be rejected as well — presumably because 33 bases at 2 bits
    // each exceed a 64-bit word; confirm against `Kmer`'s documented limit.
    assert!(sk.kmer(0, 33).is_err());
}
|
||||
|
||||
// ── canonical_kmer ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn canonical_kmer_is_min_of_kmer_and_revcomp() {
    // At every offset, `canonical_kmer(i, k)` must agree with canonicalising
    // the forward k-mer read at the same offset.
    let k = 4;
    let sk = SuperKmer::from_ascii(b"ACGTACGT");
    let last_start = sk.len() - k;
    for i in 0..=last_start {
        let ck = sk.canonical_kmer(i, k).unwrap();
        let fwd = sk.kmer(i, k).unwrap();
        assert_eq!(ck, fwd.canonical(k));
    }
}
|
||||
|
||||
#[test]
fn canonical_kmer_palindrome_unchanged() {
    // ACGT reverse-complements to itself, so the canonical k-mer must be the
    // forward k-mer.
    let sk = SuperKmer::from_ascii(b"ACGT");
    assert_eq!(sk.canonical_kmer(0, 4).unwrap(), sk.kmer(0, 4).unwrap());
}
|
||||
|
||||
#[test]
fn canonical_kmer_tttt_becomes_aaaa() {
    // The canonical 4-mer of a TTTT input must be AAAA.
    let sk = SuperKmer::from_ascii(b"TTTT");
    assert_eq!(
        sk.canonical_kmer(0, 4).unwrap(),
        Kmer::from_ascii(b"AAAA", 4).unwrap()
    );
}
|
||||
|
||||
#[test]
fn canonical_kmer_errors_propagate() {
    // `canonical_kmer` must reject the same bad arguments that the
    // `kmer_out_of_bounds` and `kmer_invalid_k` tests cover for `kmer`.
    let sk = SuperKmer::from_ascii(b"ACGT");
    // Window runs past the end of the 4-base sequence.
    assert!(sk.canonical_kmer(2, 4).is_err());
    // k = 0 is invalid.
    assert!(sk.canonical_kmer(0, 0).is_err());
}
|
||||
|
||||
// ── count ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn count_starts_at_one() {
    // A freshly built super-k-mer reports a count of 1.
    let fresh = SuperKmer::from_ascii(b"ACGT");
    let initial = fresh.count();
    assert_eq!(initial, 1);
}
|
||||
|
||||
#[test]
fn increment_adds_one() {
    // Each `increment` call bumps the count by exactly one: 1 → 2 → 3.
    let mut sk = SuperKmer::from_ascii(b"ACGT");
    for expected in [2, 3] {
        sk.increment();
        assert_eq!(sk.count(), expected);
    }
}
|
||||
|
||||
#[test]
fn add_increases_count() {
    // `add` accumulates onto the current count: 1 + 42 = 43, then 43 + 8 = 51.
    let mut sk = SuperKmer::from_ascii(b"ACGT");
    for (delta, running_total) in [(42, 43), (8, 51)] {
        sk.add(delta);
        assert_eq!(sk.count(), running_total);
    }
}
|
||||
|
||||
#[test]
fn set_count_overwrites() {
    let mut sk = SuperKmer::from_ascii(b"ACGT");
    // Change the count first so that an overwrite (rather than an
    // accumulation) is observable.
    sk.add(100);
    sk.set_count(7);
    let stored = sk.count();
    assert_eq!(stored, 7);
}
|
||||
|
||||
#[test]
fn increment_preserves_seql() {
    // Bumping the count must leave the reported sequence length untouched.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let mut sk = SuperKmer::from_ascii(&ascii);
        sk.increment();
        assert_eq!(sk.len(), len, "increment altered seql for len={len}");
    });
}
|
||||
|
||||
#[test]
fn add_preserves_seql() {
    // Adding to the count must leave the reported sequence length untouched.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let mut sk = SuperKmer::from_ascii(&ascii);
        sk.add(1000);
        assert_eq!(sk.len(), len, "add altered seql for len={len}");
    });
}
|
||||
|
||||
#[test]
fn set_count_preserves_seql() {
    // Overwriting the count must store the new value and leave the reported
    // sequence length untouched.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let mut sk = SuperKmer::from_ascii(&ascii);
        sk.set_count(999);
        assert_eq!(sk.len(), len, "set_count altered seql for len={len}");
        assert_eq!(sk.count(), 999);
    });
}
|
||||
|
||||
#[test]
fn count_does_not_affect_sequence() {
    // Writing a large count must not disturb the decoded nucleotides.
    // NOTE(review): 16_000_000 is presumably near the top of the counter's
    // range — confirm against the header layout.
    let ascii: Vec<u8> = b"ACGTACGT".to_vec();
    let mut sk = SuperKmer::from_ascii(&ascii);
    sk.set_count(16_000_000);
    let decoded = sk.to_ascii();
    assert_eq!(decoded, ascii);
}
|
||||
|
||||
// ── seql encoding ─────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn seql_roundtrip() {
    // `len()` must report the length the super-k-mer was built with.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let sk = SuperKmer::from_ascii(&ascii);
        assert_eq!(sk.len(), len, "seql() wrong for len={len}");
    });
}
|
||||
|
||||
#[test]
fn seql_256_stored_as_zero() {
    // A 256-base sequence: the header's `seql()` byte reads 0, yet `len()`
    // must still report the true length of 256.
    let ascii = make_seq(256);
    let sk = SuperKmer::from_ascii(&ascii);
    let raw = sk.header.seql();
    assert_eq!(raw, 0u8);
    assert_eq!(sk.len(), 256);
}
|
||||
|
||||
// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
|
||||
|
||||
#[test]
fn ascii_roundtrip_all_lengths() {
    // Encoding then decoding must return the input for every tested length.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let decoded = SuperKmer::from_ascii(&ascii).to_ascii();
        assert_eq!(decoded, ascii, "roundtrip failed for len={len}");
    });
}
|
||||
|
||||
#[test]
fn ascii_roundtrip_all_bases() {
    // The stored form is canonical — min(seq, revcomp) — so homopolymers of
    // G and T come back as C and A, while A and C come back unchanged.
    let cases = [(b'A', b'A'), (b'C', b'C'), (b'G', b'C'), (b'T', b'A')];
    for (base, expected) in cases {
        let input = vec![base; 4];
        let decoded = SuperKmer::from_ascii(&input).to_ascii();
        assert_eq!(decoded, vec![expected; 4]);
    }
}
|
||||
|
||||
// ── revcomp correctness ───────────────────────────────────────────────────
|
||||
|
||||
/// Hand-checked `(seq, expected_revcomp)` pairs: two sequences for each of
/// the four `len % 4` remainders, one fitting in a single packed byte and one
/// spilling into a second. The `shift=` labels presumably name the bit offset
/// of the partial last byte — confirm against `SuperKmer::revcomp`.
#[test]
fn revcomp_known_values() {
    const CASES: [(&str, &str); 8] = [
        // shift=6
        ("A", "T"),
        ("ACGTA", "TACGT"),
        // shift=4
        ("AC", "GT"),
        ("ACGTAC", "GTACGT"),
        // shift=2
        ("ACG", "CGT"),
        ("ACGTACG", "CGTACGT"),
        // shift=0
        ("ACGT", "ACGT"),
        ("ACGTACGT", "ACGTACGT"),
    ];
    for (seq, expected) in CASES {
        let mut sk = SuperKmer::from_ascii(seq.as_bytes());
        sk.revcomp();
        let decoded = sk.to_ascii();
        assert_eq!(decoded, expected.as_bytes(), "revcomp wrong for \"{seq}\"");
    }
}
|
||||
|
||||
#[test]
fn revcomp_vs_reference_all_lengths() {
    // The packed in-place reverse complement must match the ASCII reference
    // implementation for every tested length.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let expected = ascii_revcomp(&ascii);
        let mut sk = SuperKmer::from_ascii(&ascii);
        sk.revcomp();
        assert_eq!(sk.to_ascii(), expected, "revcomp wrong for len={len}");
    });
}
|
||||
|
||||
#[test]
fn revcomp_involution_all_lengths() {
    // Reverse-complementing twice must restore the original sequence.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let mut sk = SuperKmer::from_ascii(&ascii);
        for _ in 0..2 {
            sk.revcomp();
        }
        assert_eq!(sk.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
    });
}
|
||||
|
||||
// ── canonical ─────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn canonical_palindrome_unchanged() {
    // ACGT equals its own reverse complement, so `canonical` must keep it.
    let mut sk = SuperKmer::from_ascii(b"ACGT");
    sk.canonical();
    let decoded = sk.to_ascii();
    assert_eq!(decoded, b"ACGT");
}
|
||||
|
||||
#[test]
fn canonical_chooses_forward() {
    // "AAAA" sorts before its reverse complement "TTTT", so it is kept as-is.
    let mut sk = SuperKmer::from_ascii(b"AAAA");
    sk.canonical();
    let decoded = sk.to_ascii();
    assert_eq!(decoded, b"AAAA");
}
|
||||
|
||||
#[test]
fn canonical_chooses_revcomp() {
    // "TTTT" sorts after its reverse complement "AAAA", so the canonical
    // form is the flipped sequence.
    let mut sk = SuperKmer::from_ascii(b"TTTT");
    sk.canonical();
    let decoded = sk.to_ascii();
    assert_eq!(decoded, b"AAAA");
}
|
||||
|
||||
#[test]
fn canonical_is_minimal_all_lengths() {
    // After `canonical`, the decoded sequence must not sort after its own
    // reverse complement, for every tested length.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let mut sk = SuperKmer::from_ascii(&ascii);
        sk.canonical();
        let fwd = sk.to_ascii();
        let rev = ascii_revcomp(&fwd);
        assert!(fwd <= rev, "canonical not minimal for len={len}");
    });
}
|
||||
|
||||
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn iter_kmers_count() {
    // A sequence of length L holds L - k + 1 k-mers; check a spread of k
    // values up to k = L.
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    for k in [1usize, 3, 4, 5, 8, 12] {
        let expected = ascii.len() - k + 1;
        assert_eq!(sk.iter_kmers(k).count(), expected, "count mismatch for k={k}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_first_is_kmer_0() {
    // For every valid k, the iterator's first item is the k-mer at offset 0.
    let ascii = b"ACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    (1..=ascii.len()).for_each(|k| {
        let first = sk.iter_kmers(k).next().unwrap();
        let direct = sk.kmer(0, k).unwrap();
        assert_eq!(first, direct, "k={k}");
    });
}
|
||||
|
||||
#[test]
fn iter_kmers_matches_kmer_at_each_position() {
    // The i-th item of the iterator must equal the random-access `kmer(i, k)`.
    let k = 4;
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    for (i, km) in kmers.into_iter().enumerate() {
        assert_eq!(km, sk.kmer(i, k).unwrap(), "mismatch at pos {i}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_single_when_seql_eq_k() {
    // When k equals the sequence length there is exactly one k-mer: the
    // whole sequence.
    let ascii = b"ACGTACGT";
    let k = ascii.len();
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 1);
    let whole = sk.kmer(0, k).unwrap();
    assert_eq!(kmers[0], whole);
}
|
||||
|
||||
#[test]
fn iter_kmers_two_when_seql_eq_k_plus_one() {
    // With k one short of the sequence length there are exactly two k-mers,
    // at offsets 0 and 1.
    let ascii = b"ACGTACGT";
    let k = ascii.len() - 1;
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 2);
    let first = sk.kmer(0, k).unwrap();
    assert_eq!(kmers[0], first);
    let second = sk.kmer(1, k).unwrap();
    assert_eq!(kmers[1], second);
}
|
||||
|
||||
#[test]
fn iter_kmers_all_k_values() {
    // Sweep every k from 1 to the full sequence length: the iterator must
    // yield `seql - k + 1` k-mers, each equal to the direct `kmer(i, k)`.
    let ascii = b"ACGTACGTACGT";
    let seql = ascii.len();
    let sk = SuperKmer::from_ascii(ascii);
    for k in 1..=seql {
        let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
        assert_eq!(kmers.len(), seql - k + 1, "k={k}");
        for (i, km) in kmers.into_iter().enumerate() {
            assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
        }
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_crosses_byte_boundary() {
    // Nucleotides are packed four per byte, so stepping 3→4 and 7→8 moves the
    // window start into the following byte.
    let k = 3;
    let sk = SuperKmer::from_ascii(b"ACGTACGTACGT");
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    for boundary in [3usize, 4, 7, 8] {
        let next = boundary + 1;
        // Ignore a boundary whose successor is past the last k-mer.
        if next >= kmers.len() {
            continue;
        }
        assert_eq!(kmers[boundary], sk.kmer(boundary, k).unwrap(), "pos={boundary}");
        assert_eq!(kmers[next], sk.kmer(next, k).unwrap(), "pos={}", next);
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_k1_yields_all_nucleotides() {
    // With k = 1 the iterator must produce one k-mer per nucleotide.
    let sk = SuperKmer::from_ascii(b"ACGT");
    let kmers = sk.iter_kmers(1).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 4);
    for (i, km) in kmers.into_iter().enumerate() {
        assert_eq!(km, sk.kmer(i, 1).unwrap(), "pos={i}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_long_sequence() {
    // 20 bases with k = 7: the iterator must agree with `kmer(i, k)` at
    // every offset and yield exactly `len - k + 1` items.
    let k = 7;
    let ascii = make_seq(20);
    let sk = SuperKmer::from_ascii(&ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    for (i, km) in kmers.into_iter().enumerate() {
        assert_eq!(km, sk.kmer(i, k).unwrap(), "pos={i}");
    }
}
|
||||
@@ -4,6 +4,8 @@
|
||||
//! at the MSB of `seq[0]`, 4 bases per byte — but without the 256-nucleotide
|
||||
//! length cap and without the scatter/count header payload.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
@@ -101,23 +103,24 @@ impl Unitig {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
}
|
||||
|
||||
/// Decode into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
/// Decode into ASCII nucleotides, writing into `writer`.
|
||||
pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let full = self.seql / 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
let rem = self.seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
writer.write_all(&bytes[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Decode into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql);
|
||||
self.write_ascii(&mut buf);
|
||||
self.write_ascii(&mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user