refactor: centralize k-mer config and introduce packed sequences

Centralize k-mer and minimizer configuration using a thread-safe global module, and replace manual bit-packing with a memory-efficient `PackedSeq` type. Refactor core sequence and k-mer types to use compile-time length enforcement and centralized hashing. Introduce a new De Bruijn graph implementation with compact node encoding and traversal iterators. Update I/O, partitioning, and builder modules to align with the new architecture, and add the `xxhash-rust` dependency.
2026-05-05 18:08:19 +02:00
parent 602f414957
commit 8c17bf958b
37 changed files with 2641 additions and 2456 deletions
@@ -3,6 +3,11 @@ name = "obikseq"
 version = "0.1.0"
 edition = "2024"

+[features]
+# Replaces the OnceLock-based params with thread-local storage so that
+# tests in dependent crates can call set_k / set_m freely without conflicts.
+test-utils = []
+
 [dependencies]
 bitvec = "1"
 serde = { version = "1.0", features = ["derive"] }
@@ -2,6 +2,14 @@ use serde::Serialize;
 use serde_json;
 use std::io::{self, Write};

+/// Minimal annotation carrying only the sequence length.
+#[derive(Serialize)]
+pub struct BasicAnnotation {
+    pub seq_length: usize,
+}
+
+impl Annotation for BasicAnnotation {}
+
 /// Serialize `self` as a single-line JSON object into a writer.
 pub trait Annotation: Serialize {
    /// Write the annotation as compact JSON into `writer`.
@@ -1,12 +1,70 @@
 //! Compact 2-bit kmer stored as a left-aligned u64.
 //!
 //! Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i.
-//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
-//! every operation that needs it, and will be owned by the collection-level indexer.
+//! The low 64−2·len bits are always zero.
+//!
+//! The length is not stored in the struct — it is supplied by the [`KmerLength`]
+//! type parameter.  Two public marker types cover the common cases:
+//!
+//! | Alias           | Marker   | Length source  |
+//! |-----------------|----------|----------------|
+//! | [`Kmer`]        | [`KLen`] | `params::k()`  |
+//! | [`CanonicalKmer`]| [`KLen`]| `params::k()`  |
+//! | [`Minimizer`]   | [`MLen`] | `params::m()`  |
+//!
+//! Tests that need a fixed length independent of the global singletons can use
+//! [`ConstLen<N>`].

+use serde::Serialize;
 use std::io::{self, Write};
+use std::marker::PhantomData;

+use crate::Annotation;
+use crate::Sequence;
 use crate::encoding::{DEC4, encode_base};
+use crate::params::{k, m};
+use crate::sequence::mix64;
+
+// ── KmerLength ────────────────────────────────────────────────────────────────
+
+/// Marker trait that supplies a kmer length at runtime.
+pub trait KmerLength: Copy + std::fmt::Debug + 'static {
+    /// Returns the length this marker represents.
+    fn len() -> usize;
+}
+
+/// Marker for the k-mer length (`params::k()`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct KLen;
+
+/// Marker for the minimizer length (`params::m()`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct MLen;
+
+/// Marker for a compile-time-constant length — useful for tests.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct ConstLen<const N: usize>;
+
+impl KmerLength for KLen {
+    #[inline]
+    fn len() -> usize {
+        k()
+    }
+}
+
+impl KmerLength for MLen {
+    #[inline]
+    fn len() -> usize {
+        m()
+    }
+}
+
+impl<const N: usize> KmerLength for ConstLen<N> {
+    #[inline]
+    fn len() -> usize {
+        N
+    }
+}

 // ── KmerError ─────────────────────────────────────────────────────────────────

@@ -43,35 +101,31 @@ impl std::fmt::Display for KmerError {

 impl std::error::Error for KmerError {}

-// ── Kmer ──────────────────────────────────────────────────────────────────────
+#[derive(Serialize)]
+struct KmerAnnotation {
+    seq_length: usize,
+}
+impl Annotation for KmerAnnotation {}

-/// A DNA kmer of length k encoded as a left-aligned u64 (2 bits/nucleotide, MSB-first).
-/// k is not stored in the struct — it must be supplied by the caller.
+// ── KmerOf ────────────────────────────────────────────────────────────────────
+
+/// A DNA kmer of length `L::len()` encoded as a left-aligned u64 (2 bits/nucleotide, MSB-first).
+/// The low `64 − 2·L::len()` bits are always zero.
 #[repr(transparent)]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Kmer(u64);
+pub struct KmerOf<L: KmerLength>(u64, PhantomData<L>);

-#[inline]
-fn mix64(x: u64) -> u64 {
-    let x = x ^ (x >> 30);
-    let x = x.wrapping_mul(0xbf58476d1ce4e5b9);
-    let x = x ^ (x >> 27);
-    let x = x.wrapping_mul(0x94d049bb133111eb);
-    x ^ (x >> 31)
-}
-
-impl Kmer {
-    /// Wrap a raw left-aligned u64 value as a Kmer.
+impl<L: KmerLength> KmerOf<L> {
+    /// Wrap a raw left-aligned u64 value as a kmer.
    #[inline]
    pub fn from_raw(raw: u64) -> Self {
-        Kmer(raw)
+        KmerOf(raw, PhantomData)
    }

-    /// Wrap a raw right-aligned u64 value as a Kmer.
-    /// The raw value is shifted left by `2 * k` bits to align it with the leftmost position.
+    /// Wrap a raw right-aligned u64 value, shifting it into left-aligned position.
    #[inline]
-    pub fn from_raw_right(raw: u64, k: usize) -> Self {
-        Kmer(raw << (64 - 2 * k))
+    pub fn from_raw_right(raw: u64) -> Self {
+        KmerOf(raw << (64 - 2 * L::len()), PhantomData)
    }

    /// Return the raw left-aligned u64 value.
@@ -82,14 +136,13 @@ impl Kmer {

    /// Return the raw right-aligned u64 value.
    #[inline]
-    pub fn raw_right(&self, k: usize) -> u64 {
-        self.0 >> (64 - 2 * k)
+    pub fn raw_right(&self) -> u64 {
+        self.0 >> (64 - 2 * L::len())
    }

-    /// Encode the first k nucleotides of an ASCII slice into a Kmer.
-    /// Zero allocation — result lives on the stack.
-    #[inline]
-    pub fn from_ascii(ascii: &[u8], k: usize) -> Result<Self, KmerError> {
+    /// Encode the first `L::len()` nucleotides of an ASCII slice into a kmer.
+    pub fn from_ascii(ascii: &[u8]) -> Result<Self, KmerError> {
+        let k = L::len();
        if k == 0 || k > 32 {
            return Err(KmerError::InvalidK { k });
        }
@@ -104,26 +157,21 @@ impl Kmer {
        for i in 0..k {
            val = (val << 2) | encode_base(ascii[i]) as u64;
        }
-        Ok(Kmer(val << (64 - 2 * k)))
+        Ok(KmerOf(val << (64 - 2 * k), PhantomData))
    }

-    /// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
+    /// Decode into a freshly allocated ASCII `Vec<u8>`.
    #[inline]
-    pub fn nucleotide(&self, i: usize) -> u8 {
-        ((self.0 >> (62 - 2 * i)) & 0b11) as u8
-    }
-
-    /// Decode this kmer into a freshly allocated ASCII `Vec<u8>`.
-    #[inline]
-    pub fn to_ascii(&self, k: usize) -> Vec<u8> {
-        let mut buf = Vec::with_capacity(k);
-        self.write_ascii(k, &mut buf).unwrap();
+    pub fn to_ascii(&self) -> Vec<u8> {
+        let mut buf = Vec::with_capacity(L::len());
+        self.write_ascii(&mut buf).unwrap();
        buf
    }

-    /// Decode this kmer into ASCII nucleotides, writing into `writer`.
+    /// Decode into ASCII nucleotides, writing into `writer`.
    #[inline]
-    pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
+    pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        let k = L::len();
        let bytes = self.0.to_be_bytes();
        let full = k / 4;
        let rem = k % 4;
@@ -137,296 +185,212 @@ impl Kmer {
        Ok(())
    }

-    /// Compute the reverse complement of this kmer.
-    /// Zero allocation — result lives on the stack.
+    /// Compute the reverse complement.
    #[inline]
-    pub fn revcomp(&self, k: usize) -> Self {
-        let x = !self.0; // complement
-        let x = x.swap_bytes(); // reverse bytes
-        let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
-        let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
-        Kmer(x << (64 - 2 * k))
+    pub fn revcomp(&self) -> Self {
+        let k = L::len();
+        let x = !self.0;
+        let x = x.swap_bytes();
+        let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4);
+        let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2);
+        KmerOf(x << (64 - 2 * k), PhantomData)
    }

-    /// Return the canonical form: lexicographic minimum of forward and reverse complement.
-    /// Zero allocation — result lives on the stack.
-    #[inline]
-    pub fn canonical(&self, k: usize) -> CanonicalKmer {
-        let rc = self.revcomp(k);
-        CanonicalKmer(if self.0 <= rc.0 { *self } else { rc })
-    }
-
-    /// Slide the window one base to the right: drop the first nucleotide, append `nuc` at position k-1.
-    pub fn push_right(self, nuc: u8, k: usize) -> Self {
+    /// Slide the window one base to the right: drop nucleotide 0, append `nuc` at position `L::len()-1`.
+    pub fn push_right(self, nuc: u8) -> Self {
+        let k = L::len();
        let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
-        let shift = 64 - 2 * k;
-        Kmer(shifted | ((nuc as u64 & 3) << shift))
+        KmerOf(shifted | ((nuc as u64 & 3) << (64 - 2 * k)), PhantomData)
    }

-    /// Slide the window one base to the left: drop the last nucleotide, prepend `nuc` at position 0.
-    pub fn push_left(self, nuc: u8, k: usize) -> Self {
+    /// Slide the window one base to the left: drop nucleotide `L::len()-1`, prepend `nuc` at position 0.
+    pub fn push_left(self, nuc: u8) -> Self {
+        let k = L::len();
        let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
-        Kmer(shifted | ((nuc as u64 & 3) << 62))
+        KmerOf(shifted | ((nuc as u64 & 3) << 62), PhantomData)
    }

-    /// Returns `true` if `self` and `other` overlap by `k` - 1 bases.
-    ///
-    /// The last K-1 nucleotides of `self` and the first K-1 nucleotides
-    /// of `other` must be equal.
-    pub fn is_overlapping(self, other: Self, k: usize) -> bool {
-        let left = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
-        let right = other.0 & (!0u64 << (64 - 2 * (k - 1)));
-        left == right
+    /// Returns `true` if the last `L::len()-1` nucleotides of `self` equal the first `L::len()-1` of `other`.
+    pub fn is_overlapping(self, other: Self) -> bool {
+        let k = L::len();
+        let mask = !0u64 << (64 - 2 * (k - 1));
+        (self.0 << 2 & mask) == (other.0 & mask)
    }
 }

-// ── CanonicalKmer ─────────────────────────────────────────────────────────────
+impl<L: KmerLength> Sequence for KmerOf<L> {
+    type Canonical = CanonicalKmerOf<L>;

-/// A [`Kmer`] guaranteed to be in canonical form (lexicographic minimum of
+    fn seql(&self) -> usize {
+        L::len()
+    }
+
+    fn seq_hash(&self) -> u64 {
+        self.canonical().seq_hash()
+    }
+
+    #[inline]
+    fn nucleotide(&self, i: usize) -> u8 {
+        ((self.0 >> (62 - 2 * i)) & 0b11) as u8
+    }
+
+    #[inline]
+    fn canonical(&self) -> Self::Canonical {
+        let rc = self.revcomp();
+        CanonicalKmerOf(if self.0 <= rc.0 { self.0 } else { rc.0 }, PhantomData)
+    }
+
+    fn annotation(&self) -> impl Annotation {
+        KmerAnnotation {
+            seq_length: L::len(),
+        }
+    }
+}
+// ── CanonicalKmerOf ───────────────────────────────────────────────────────────
+
+/// A [`KmerOf<L>`] guaranteed to be in canonical form (lexicographic minimum of
 /// forward and reverse complement).
 ///
-/// The only public constructors are [`Kmer::canonical`] (checked) and
-/// [`CanonicalKmer::from_raw_unchecked`] (for trusted paths such as
-/// deserialisation or rolling-window minimizer extraction).
+/// The only public constructors are [`KmerOf::canonical`] (verified) and
+/// [`CanonicalKmerOf::from_raw_unchecked`] (trusted paths such as deserialisation).
 #[repr(transparent)]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct CanonicalKmer(Kmer);
+pub struct CanonicalKmerOf<L: KmerLength>(u64, PhantomData<L>);

-impl CanonicalKmer {
+impl<L: KmerLength> CanonicalKmerOf<L> {
    /// Wrap a raw left-aligned u64 without verifying the canonical invariant.
    ///
    /// # Safety (logical)
-    /// The caller must guarantee that `raw == min(raw, revcomp(raw, k))`.
-    /// Violations cause silently wrong results in MPHF lookup and graph traversal.
+    /// The caller must guarantee `raw == min(raw, revcomp(raw))`.
    #[inline]
    pub fn from_raw_unchecked(raw: u64) -> Self {
-        CanonicalKmer(Kmer(raw))
+        CanonicalKmerOf(raw, PhantomData)
    }

    /// Return the raw left-aligned u64 value.
    #[inline]
    pub fn raw(&self) -> u64 {
-        self.0.0
+        self.0
    }

    /// Decode into a freshly allocated ASCII `Vec<u8>`.
    #[inline]
-    pub fn to_ascii(&self, k: usize) -> Vec<u8> {
-        self.0.to_ascii(k)
+    pub fn to_ascii(&self) -> Vec<u8> {
+        self.into_kmer().to_ascii()
    }

    /// Decode into ASCII nucleotides, writing into `writer`.
    #[inline]
-    pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
-        self.0.write_ascii(k, writer)
+    pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        self.into_kmer().write_ascii(writer)
    }

-    /// Compute the reverse complement.  The result is a raw [`Kmer`] — the
-    /// revcomp of a canonical kmer is not necessarily canonical itself.
+    /// Compute the reverse complement. The result is a raw [`KmerOf<L>`] — not
+    /// necessarily canonical itself.
    #[inline]
-    pub fn revcomp(&self, k: usize) -> Kmer {
-        self.0.revcomp(k)
+    pub fn revcomp(&self) -> KmerOf<L> {
+        self.into_kmer().revcomp()
    }

-    /// Hash via `mix64`.  No re-canonicalisation needed.
+    /// Hash via `mix64`.
    #[inline]
-    pub fn seq_hash(&self, _k: usize) -> u64 {
-        mix64(self.0.0)
+    pub fn seq_hash(&self) -> u64 {
+        hash_kmer(self.0)
    }

    /// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
    #[inline]
    pub fn nucleotide(&self, i: usize) -> u8 {
-        self.0.nucleotide(i)
+        self.into_kmer().nucleotide(i)
    }

    /// Return the four left canonical neighbours (each already canonical).
-    /// Zero allocation — result lives on the stack.
-    pub fn left_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
-        let shifted = (self.raw() >> 2) & (!0u64 << (64 - 2 * k));
+    pub fn left_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
+        let k = L::len();
+        let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
        [
-            Kmer(shifted).canonical(k),
-            Kmer(shifted | (1u64 << 62)).canonical(k),
-            Kmer(shifted | (2u64 << 62)).canonical(k),
-            Kmer(shifted | (3u64 << 62)).canonical(k),
+            KmerOf::<L>(shifted, PhantomData).canonical(),
+            KmerOf::<L>(shifted | (1u64 << 62), PhantomData).canonical(),
+            KmerOf::<L>(shifted | (2u64 << 62), PhantomData).canonical(),
+            KmerOf::<L>(shifted | (3u64 << 62), PhantomData).canonical(),
        ]
    }

    /// Return the four right canonical neighbours (each already canonical).
-    /// Zero allocation — result lives on the stack.
-    pub fn right_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
-        let shifted = self.raw() << 2 & (!0u64 << (64 - 2 * (k - 1)));
+    pub fn right_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
+        let k = L::len();
+        let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
        let shift = 64 - 2 * k;
        [
-            Kmer(shifted).canonical(k),
-            Kmer(shifted | (1u64 << shift)).canonical(k),
-            Kmer(shifted | (2u64 << shift)).canonical(k),
-            Kmer(shifted | (3u64 << shift)).canonical(k),
+            KmerOf::<L>(shifted, PhantomData).canonical(),
+            KmerOf::<L>(shifted | (1u64 << shift), PhantomData).canonical(),
+            KmerOf::<L>(shifted | (2u64 << shift), PhantomData).canonical(),
+            KmerOf::<L>(shifted | (3u64 << shift), PhantomData).canonical(),
        ]
    }

-    /// Consume this wrapper and return the inner raw [`Kmer`].
+    /// Return the inner value as a raw [`KmerOf<L>`].
    #[inline]
-    pub fn into_kmer(self) -> Kmer {
-        self.0
+    pub fn into_kmer(self) -> KmerOf<L> {
+        KmerOf(self.0, PhantomData)
    }
 }

-impl From<CanonicalKmer> for Kmer {
-    #[inline]
-    fn from(ck: CanonicalKmer) -> Self {
-        ck.0
+impl<L: KmerLength> Sequence for CanonicalKmerOf<L> {
+    type Canonical = CanonicalKmerOf<L>;
+
+    fn seql(&self) -> usize {
+        L::len()
    }
+
+    fn seq_hash(&self) -> u64 {
+        hash_kmer(self.0)
+    }
+
+    #[inline]
+    fn nucleotide(&self, i: usize) -> u8 {
+        ((self.0 >> (62 - 2 * i)) & 0b11) as u8
+    }
+
+    fn canonical(&self) -> Self::Canonical {
+        *self
+    }
+
+    fn annotation(&self) -> impl Annotation {
+        KmerAnnotation {
+            seq_length: L::len(),
+        }
+    }
+}
+impl<L: KmerLength> From<CanonicalKmerOf<L>> for KmerOf<L> {
+    #[inline]
+    fn from(ck: CanonicalKmerOf<L>) -> Self {
+        ck.into_kmer()
+    }
+}
+
+// ── Public type aliases ───────────────────────────────────────────────────────
+
+/// A DNA k-mer using the global `params::k()` length.
+pub type Kmer = KmerOf<KLen>;
+
+/// A canonical k-mer using the global `params::k()` length.
+pub type CanonicalKmer = CanonicalKmerOf<KLen>;
+
+/// A minimizer: a canonical k-mer using the global `params::m()` length.
+pub type Minimizer = CanonicalKmerOf<MLen>;
+
+/// Compute a hash for a raw (left-aligned) kmer value.
+///
+/// This is a convenience wrapper around [`mix64`] that accepts the raw
+/// 64-bit representation directly, which is useful when the canonical
+/// invariant is not required or has already been handled.
+#[inline]
+pub fn hash_kmer(raw: u64) -> u64 {
+    mix64(raw ^ 0x9e3779b97f4a7c15)
 }

 // ── tests ─────────────────────────────────────────────────────────────────────
-
 #[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
-        seq.iter()
-            .rev()
-            .map(|&b| match b {
-                b'A' => b'T',
-                b'T' => b'A',
-                b'C' => b'G',
-                b'G' => b'C',
-                _ => b'A',
-            })
-            .collect()
-    }
-
-    const K_VALUES: &[usize] = &[1, 2, 3, 4, 8, 11, 16, 31, 32];
-
-    fn make_seq(k: usize) -> Vec<u8> {
-        (0..k).map(|i| b"ACGT"[i % 4]).collect()
-    }
-
-    // ── from_ascii / to_ascii ─────────────────────────────────────────────────
-
-    #[test]
-    fn ascii_roundtrip() {
-        for &k in K_VALUES {
-            let ascii = make_seq(k);
-            let kmer = Kmer::from_ascii(&ascii, k).unwrap();
-            assert_eq!(kmer.to_ascii(k), ascii, "roundtrip failed for k={k}");
-        }
-    }
-
-    #[test]
-    fn from_ascii_all_bases() {
-        for (base, expected) in [(b'A', b'A'), (b'C', b'C'), (b'G', b'G'), (b'T', b'T')] {
-            let kmer = Kmer::from_ascii(&[base], 1).unwrap();
-            assert_eq!(kmer.to_ascii(1), vec![expected]);
-        }
-    }
-
-    #[test]
-    fn from_ascii_invalid_k() {
-        assert!(Kmer::from_ascii(b"A", 0).is_err());
-        assert!(Kmer::from_ascii(b"ACGT", 33).is_err());
-    }
-
-    #[test]
-    fn from_ascii_too_short() {
-        assert!(Kmer::from_ascii(b"ACG", 4).is_err());
-    }
-
-    // ── nucleotide ────────────────────────────────────────────────────────────
-
-    #[test]
-    fn nucleotide_extraction() {
-        let kmer = Kmer::from_ascii(b"ACGT", 4).unwrap();
-        assert_eq!(kmer.nucleotide(0), 0b00); // A
-        assert_eq!(kmer.nucleotide(1), 0b01); // C
-        assert_eq!(kmer.nucleotide(2), 0b10); // G
-        assert_eq!(kmer.nucleotide(3), 0b11); // T
-    }
-
-    // ── revcomp ───────────────────────────────────────────────────────────────
-
-    #[test]
-    fn revcomp_known_values() {
-        let cases: &[(&[u8], &[u8])] = &[
-            (b"A", b"T"),
-            (b"AC", b"GT"),
-            (b"ACG", b"CGT"),
-            (b"ACGT", b"ACGT"), // palindrome
-            (b"AAAA", b"TTTT"),
-            (b"TTTT", b"AAAA"),
-        ];
-        for (seq, expected) in cases {
-            let k = seq.len();
-            let kmer = Kmer::from_ascii(seq, k).unwrap();
-            let rc = kmer.revcomp(k);
-            assert_eq!(
-                rc.to_ascii(k),
-                *expected,
-                "revcomp wrong for \"{}\"",
-                std::str::from_utf8(seq).unwrap()
-            );
-        }
-    }
-
-    #[test]
-    fn revcomp_vs_reference() {
-        for &k in K_VALUES {
-            let ascii = make_seq(k);
-            let expected = ascii_revcomp(&ascii);
-            let rc = Kmer::from_ascii(&ascii, k).unwrap().revcomp(k);
-            assert_eq!(rc.to_ascii(k), expected, "revcomp wrong for k={k}");
-        }
-    }
-
-    #[test]
-    fn revcomp_involution() {
-        for &k in K_VALUES {
-            let ascii = make_seq(k);
-            let kmer = Kmer::from_ascii(&ascii, k).unwrap();
-            assert_eq!(
-                kmer.revcomp(k).revcomp(k),
-                kmer,
-                "revcomp∘revcomp≠id for k={k}"
-            );
-        }
-    }
-
-    // ── canonical ─────────────────────────────────────────────────────────────
-
-    #[test]
-    fn canonical_palindrome() {
-        let kmer = Kmer::from_ascii(b"ACGT", 4).unwrap();
-        assert_eq!(kmer.canonical(4).into_kmer(), kmer);
-    }
-
-    #[test]
-    fn canonical_chooses_lesser() {
-        let kmer = Kmer::from_ascii(b"TTTT", 4).unwrap();
-        let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
-        assert_eq!(kmer.canonical(4).into_kmer(), expected);
-    }
-
-    #[test]
-    fn canonical_is_minimal() {
-        for &k in K_VALUES {
-            let ascii = make_seq(k);
-            let ck = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
-            let rc = ck.revcomp(k);
-            assert!(ck.raw() <= rc.raw(), "canonical not minimal for k={k}");
-        }
-    }
-
-    #[test]
-    fn canonical_idempotent() {
-        for &k in K_VALUES {
-            let ck = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
-            assert_eq!(
-                ck.into_kmer().canonical(k),
-                ck,
-                "canonical not idempotent for k={k}"
-            );
-        }
-    }
-}
+#[path = "tests/kmer.rs"]
+mod tests;
@@ -9,6 +9,8 @@ mod annotations;

 mod encoding;
 pub mod kmer;
+pub mod packed_seq;
+pub mod params;
 mod revcomp_lookup;
 /// Routable super-kmer: canonical sequence paired with its minimizer for scatter routing.
 pub mod routable;
@@ -18,7 +20,9 @@ pub mod superkmer;
 pub mod unitig;

 pub use annotations::Annotation;
-pub use kmer::CanonicalKmer;
+pub use kmer::{CanonicalKmer, Kmer, Minimizer, hash_kmer};
+pub use params::{k, m, set_k, set_m};
 pub use routable::RoutableSuperKmer;
 pub use sequence::Sequence;
 pub use superkmer::SuperKmer;
+pub use unitig::Unitig;
@@ -0,0 +1,361 @@
+//! Compact 2-bit DNA sequence — shared substrate for [`SuperKmer`] and [`Unitig`].
+//!
+//! Encoding: A=00, C=01, G=10, T=11. Nucleotide 0 occupies bits 7–6 of `seq[0]`,
+//! nucleotide i occupies bits `7−2*(i%4)` and `6−2*(i%4)` of `seq[i/4]`.
+//! Padding bits in the last byte are always 0.
+//!
+//! The exact nucleotide count is recovered without storing it explicitly:
+//!
+//! ```text
+//! seql = (seq.len() - 1) * 4 + tail_count(tail)
+//! ```
+//!
+//! where `tail` encodes the number of valid nucleotides in the last byte (0 → 4).
+
+use std::io::{self, Read, Write};
+
+use bitvec::prelude::*;
+
+use crate::Sequence;
+use crate::encoding::{DEC4, encode_base};
+use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen, Minimizer};
+use crate::params::k;
+use crate::revcomp_lookup::REVCOMP4;
+
+// ── PackedSeq ─────────────────────────────────────────────────────────────────
+
+/// 2-bit packed DNA sequence of arbitrary length ≥ 1.
+///
+/// `tail` encodes the number of valid nucleotides in the last byte: 0 stands for 4,
+/// so the range 0–3 covers all four cases. Padding bits are always 0.
+#[derive(Debug, Clone)]
+pub struct PackedSeq {
+    pub(crate) tail: u8,
+    pub(crate) seq: Box<[u8]>,
+}
+
+impl PartialEq for PackedSeq {
+    fn eq(&self, other: &Self) -> bool {
+        self.tail == other.tail && self.seq == other.seq
+    }
+}
+
+impl Eq for PackedSeq {}
+
+impl std::hash::Hash for PackedSeq {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.tail.hash(state);
+        self.seq.hash(state);
+    }
+}
+
+impl PackedSeq {
+    /// Construct from pre-packed bytes and a `tail` value (0–3, where 0 means 4).
+    /// Caller must guarantee padding bits in the last byte are zeroed.
+    #[inline]
+    pub fn new(tail: u8, seq: Box<[u8]>) -> Self {
+        debug_assert!(tail <= 3, "tail must be 0–3");
+        debug_assert!(!seq.is_empty(), "seq must be non-empty");
+        Self { tail, seq }
+    }
+
+    /// Sequence length in nucleotides.
+    #[inline]
+    pub fn seql(&self) -> usize {
+        (self.seq.len() - 1) * 4 + tail_count(self.tail)
+    }
+
+    /// Read-only view of the packed 2-bit bytes.
+    #[inline]
+    pub fn seq_bytes(&self) -> &[u8] {
+        &self.seq
+    }
+
+    /// Extract nucleotide i (0-based from 5′ end) as a 2-bit value. Zero copy.
+    #[inline]
+    pub fn nucleotide(&self, i: usize) -> u8 {
+        (self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
+    }
+
+    /// Encode an ASCII nucleotide slice (ACGT, length ≥ 1). Allocates once.
+    pub fn from_ascii(ascii: &[u8]) -> Self {
+        let seql = ascii.len();
+        debug_assert!(seql >= 1);
+        let n = byte_len(seql);
+        let mut seq = vec![0u8; n];
+        let full = seql / 4;
+        for i in 0..full {
+            seq[i] = encode_base(ascii[i * 4]) << 6
+                | encode_base(ascii[i * 4 + 1]) << 4
+                | encode_base(ascii[i * 4 + 2]) << 2
+                | encode_base(ascii[i * 4 + 3]);
+        }
+        let rem = seql % 4;
+        if rem > 0 {
+            let mut last = 0u8;
+            for j in 0..rem {
+                last |= encode_base(ascii[full * 4 + j]) << (6 - 2 * j);
+            }
+            seq[full] = last;
+        }
+        Self::new(count_to_tail(seql), seq.into_boxed_slice())
+    }
+
+    /// Encode a slice of 2-bit nucleotide values (0=A…3=T, length ≥ 1). Allocates once.
+    pub fn from_nucleotides(nucs: &[u8]) -> Self {
+        let seql = nucs.len();
+        debug_assert!(seql >= 1);
+        let n = byte_len(seql);
+        let mut seq = vec![0u8; n];
+        for (i, &nuc) in nucs.iter().enumerate() {
+            seq[i / 4] |= (nuc & 0b11) << (6 - 2 * (i % 4));
+        }
+        Self::new(count_to_tail(seql), seq.into_boxed_slice())
+    }
+
+    /// Write ASCII nucleotides into `writer`. Zero allocation.
+    pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        let seql = self.seql();
+        let full = seql / 4;
+        for i in 0..full {
+            writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
+        }
+        let rem = seql % 4;
+        if rem > 0 {
+            writer.write_all(&DEC4[self.seq[full] as usize].to_be_bytes()[..rem])?;
+        }
+        Ok(())
+    }
+
+    /// Decode into a fresh ASCII `Vec<u8>`. Allocates.
+    #[inline]
+    pub fn to_ascii(&self) -> Vec<u8> {
+        let mut buf = Vec::with_capacity(self.seql());
+        self.write_ascii(&mut buf).unwrap();
+        buf
+    }
+
+    /// Reverse-complement in place. Zero allocation.
+    pub fn revcomp_inplace(&mut self) {
+        let seql = self.seql();
+        let n = self.seq.len();
+        {
+            let bytes = &mut self.seq[..n];
+            let (mut lo, mut hi) = (0, n - 1);
+            while lo < hi {
+                (bytes[lo], bytes[hi]) =
+                    (REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
+                lo += 1;
+                hi -= 1;
+            }
+            if lo == hi {
+                bytes[lo] = REVCOMP4[bytes[lo] as usize];
+            }
+        }
+        let shift = n * 8 - seql * 2;
+        if shift > 0 {
+            let bits = self.seq[..n].view_bits_mut::<Msb0>();
+            bits.rotate_left(shift);
+            let len = bits.len();
+            bits[len - shift..].fill(false);
+        }
+        // tail is invariant: seql is unchanged by revcomp
+    }
+
+    /// Returns `true` if in canonical form (lexicographic minimum of forward and revcomp).
+    pub fn is_canonical(&self) -> bool {
+        let seql = self.seql();
+        for i in 0..seql {
+            let fwd = self.nucleotide(i);
+            let rev = complement(self.nucleotide(seql - 1 - i));
+            match fwd.cmp(&rev) {
+                std::cmp::Ordering::Less => return true,
+                std::cmp::Ordering::Greater => return false,
+                std::cmp::Ordering::Equal => {}
+            }
+        }
+        true
+    }
+
+    /// Put in canonical form in place. Returns `true` if already canonical. Zero allocation.
+    #[inline]
+    pub fn canonicalize(&mut self) -> bool {
+        if self.is_canonical() {
+            return true;
+        }
+        self.revcomp_inplace();
+        false
+    }
+
+    /// Extract a kmer of length `L::len()` at nucleotide position `i`. Zero allocation.
+    fn extract<L: KmerLength>(&self, i: usize) -> Result<KmerOf<L>, KmerError> {
+        let len = L::len();
+        let seql = self.seql();
+        if i + len > seql {
+            return Err(KmerError::OutOfBounds { position: i, k: len, seql });
+        }
+        let bits = self.seq.view_bits::<Msb0>();
+        let raw: u64 = bits[i * 2..(i + len) * 2].load_be();
+        Ok(KmerOf::from_raw(raw << (64 - 2 * len)))
+    }
+
+    /// Extract the kmer of length `params::k()` at nucleotide position `i`. Zero allocation.
+    #[inline]
+    pub fn kmer(&self, i: usize) -> Result<Kmer, KmerError> {
+        self.extract::<KLen>(i)
+    }
+
+    /// Extract the canonical m-mer (minimizer) of length `params::m()` at position `i`. Zero allocation.
+    #[inline]
+    pub fn mmer(&self, i: usize) -> Result<Minimizer, KmerError> {
+        Ok(self.extract::<MLen>(i)?.canonical())
+    }
+
+    /// Extract the canonical kmer of length `params::k()` at position `i`. Zero allocation.
+    #[inline]
+    pub fn canonical_kmer(&self, i: usize) -> Result<CanonicalKmer, KmerError> {
+        Ok(self.kmer(i)?.canonical())
+    }
+
+    /// Iterate over all kmers of length `params::k()` in order. Zero allocation.
+    #[inline]
+    pub fn iter_kmers(&self) -> PackedSeqKmerIter<'_> {
+        PackedSeqKmerIter::new(self)
+    }
+
+    /// Iterate over all canonical kmers of length `params::k()` in order. Zero allocation.
+    #[inline]
+    pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
+        self.iter_kmers().map(|km| km.canonical())
+    }
+
+    /// Serialise to a compact binary representation.
+    ///
+    /// Format: varint(seql) followed by raw packed bytes.
+    /// `tail` and `byte_len` are both derivable from `seql` and need not be stored.
+    pub fn write_to_binary<W: Write>(&self, w: &mut W) -> io::Result<()> {
+        write_varint(w, self.seql() as u64)?;
+        w.write_all(&self.seq)
+    }
+
+    /// Deserialise from the compact binary format produced by [`write_to_binary`].
+    /// Allocates exactly one `Box<[u8]>` for the packed bytes.
+    pub fn read_from_binary<R: Read>(r: &mut R) -> io::Result<Self> {
+        let seql = read_varint(r)? as usize;
+        if seql == 0 {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "empty sequence"));
+        }
+        let byte_len = (seql + 3) / 4;
+        let tail = (seql % 4) as u8;
+        let mut seq = vec![0u8; byte_len];
+        r.read_exact(&mut seq)?;
+        Ok(Self::new(tail, seq.into_boxed_slice()))
+    }
+}
+
+// ── PackedSeqKmerIter ─────────────────────────────────────────────────────────
+
+/// Sliding-window kmer iterator over a [`PackedSeq`]. Zero allocation.
+pub struct PackedSeqKmerIter<'a> {
+    seq: &'a PackedSeq,
+    mask: u64,
+    lshift: usize,
+    current: u64,
+    pos: usize,
+    max_pos: usize,
+}
+
+impl<'a> PackedSeqKmerIter<'a> {
+    fn new(seq: &'a PackedSeq) -> Self {
+        let seql = seq.seql();
+        let klen = k();
+        let lshift = 64 - klen * 2;
+        let mask = ((!0u128) << (lshift + 2)) as u64;
+        Self {
+            seq,
+            mask,
+            lshift,
+            current: if seql >= klen { seq.extract::<KLen>(0).map(|km| km.raw()).unwrap_or(0) } else { 0 },
+            pos: klen,
+            max_pos: seql,
+        }
+    }
+}
+
+impl Iterator for PackedSeqKmerIter<'_> {
+    type Item = Kmer;
+
+    fn next(&mut self) -> Option<Kmer> {
+        if self.pos > self.max_pos {
+            return None;
+        }
+        let result = Kmer::from_raw(self.current);
+        if self.pos < self.max_pos {
+            let inner_shift = 6 - 2 * (self.pos & 3);
+            let nuc = ((self.seq.seq[self.pos / 4] >> inner_shift) & 3) as u64;
+            self.current = ((self.current << 2) & self.mask) | (nuc << self.lshift);
+        }
+        self.pos += 1;
+        Some(result)
+    }
+}
+
+// ── varint (LEB128) ───────────────────────────────────────────────────────────
+
+pub(crate) fn write_varint<W: Write>(w: &mut W, mut val: u64) -> io::Result<()> {
+    loop {
+        let mut byte = (val & 0x7F) as u8;
+        val >>= 7;
+        if val != 0 {
+            byte |= 0x80;
+        }
+        w.write_all(&[byte])?;
+        if val == 0 {
+            break;
+        }
+    }
+    Ok(())
+}
+
+pub(crate) fn read_varint<R: Read>(r: &mut R) -> io::Result<u64> {
+    let mut val = 0u64;
+    let mut shift = 0u32;
+    let mut buf = [0u8; 1];
+    loop {
+        r.read_exact(&mut buf)?;
+        let byte = buf[0];
+        val |= ((byte & 0x7F) as u64) << shift;
+        if byte & 0x80 == 0 {
+            break;
+        }
+        shift += 7;
+        if shift >= 64 {
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "varint overflow"));
+        }
+    }
+    Ok(val)
+}
+
+// ── helpers ───────────────────────────────────────────────────────────────────
+
+#[inline]
+fn complement(base: u8) -> u8 {
+    !base & 0b11
+}
+
+#[inline]
+fn byte_len(seql: usize) -> usize {
+    (seql + 3) / 4
+}
+
+/// Nucleotide count → `tail` value: 0 encodes 4, 1–3 are identity.
+#[inline]
+pub(crate) fn count_to_tail(seql: usize) -> u8 {
+    (seql % 4) as u8
+}
+
+/// `tail` value → nucleotide count in last byte: 0 means 4.
+#[inline]
+pub(crate) fn tail_count(tail: u8) -> usize {
+    if tail == 0 { 4 } else { tail as usize }
+}
@@ -0,0 +1,102 @@
+//! Global k-mer and minimizer length parameters, set once at program startup.
+//!
+//! # Production vs. test behaviour
+//!
+//! In production (`#[cfg(not(test))]`) both `K` and `M` are stored in a
+//! [`OnceLock`]: they can be initialised exactly once; any attempt to set a
+//! different value panics.  This prevents silent divergence between the global
+//! parameter and the values used to build data structures.
+//!
+//! In test builds (`#[cfg(test)]`) the same public API is backed by
+//! `thread_local!` [`Cell`]s instead.  Each test thread gets its own
+//! independent copies of `K` and `M`, so tests can use arbitrary values
+//! without coordinating with one another and without any reset mechanism.
+//! The `OnceLock` constraint is deliberately absent: test isolation is
+//! provided by thread locality, not by write-once semantics.
+
+// ── Production implementation ─────────────────────────────────────────────────
+
+#[cfg(not(any(test, feature = "test-utils")))]
+mod state {
+    use std::sync::OnceLock;
+
+    static K: OnceLock<usize> = OnceLock::new();
+    static M: OnceLock<usize> = OnceLock::new();
+
+    pub fn set_k(k: usize) {
+        K.get_or_init(|| k);
+        assert_eq!(*K.get().unwrap(), k, "K already initialized to a different value");
+    }
+
+    pub fn k() -> usize {
+        *K.get().expect("K not initialized — call params::set_k or params::init first")
+    }
+
+    pub fn set_m(m: usize) {
+        M.get_or_init(|| m);
+        assert_eq!(*M.get().unwrap(), m, "M already initialized to a different value");
+    }
+
+    pub fn m() -> usize {
+        *M.get().expect("M not initialized — call params::set_m or params::init first")
+    }
+}
+
+// ── Test implementation ───────────────────────────────────────────────────────
+//
+// Each test thread owns its private K and M via thread_local!, so tests may
+// call set_k / set_m with any value without affecting other tests.
+
+#[cfg(any(test, feature = "test-utils"))]
+mod state {
+    use std::cell::Cell;
+
+    thread_local! {
+        static K: Cell<usize> = Cell::new(0);
+        static M: Cell<usize> = Cell::new(0);
+    }
+
+    pub fn set_k(k: usize) { K.with(|c| c.set(k)); }
+    pub fn k()    -> usize { K.with(|c| c.get()) }
+    pub fn set_m(m: usize) { M.with(|c| c.set(m)); }
+    pub fn m()    -> usize { M.with(|c| c.get()) }
+}
+
+// ── Public API (identical signature in both configurations) ───────────────────
+
+/// Initialise both K and M in one call.
+///
+/// In production, panics if either value has already been set to a different
+/// value.  In tests, simply overwrites the thread-local.
+pub fn init(k: usize, m: usize) {
+    state::set_k(k);
+    state::set_m(m);
+}
+
+/// Set the k-mer length.
+///
+/// In production: idempotent for the same value, panics on conflict.
+/// In tests: unconditionally updates the calling thread's value.
+pub fn set_k(k: usize) {
+    state::set_k(k);
+}
+
+/// Returns the k-mer length. Panics if not yet initialized.
+#[inline]
+pub fn k() -> usize {
+    state::k()
+}
+
+/// Set the minimizer length.
+///
+/// In production: idempotent for the same value, panics on conflict.
+/// In tests: unconditionally updates the calling thread's value.
+pub fn set_m(m: usize) {
+    state::set_m(m);
+}
+
+/// Returns the minimizer length. Panics if not yet initialized.
+#[inline]
+pub fn m() -> usize {
+    state::m()
+}
@@ -1,40 +1,53 @@
 //! Super-kmer with routing metadata: canonical sequence + pre-computed minimizer.

-use super::kmer::CanonicalKmer;
-use super::SuperKmer;
+use serde::Serialize;

-/// Owned wrapper that pairs a canonical [`SuperKmer`] with its minimizer [`Kmer`].
+use crate::Annotation;
+use crate::Sequence;
+use crate::SuperKmer;
+use crate::kmer::Minimizer;
+use crate::packed_seq::{PackedSeq, count_to_tail};
+use crate::params::m;
+
+/// Owned wrapper that pairs a canonical [`SuperKmer`] with its pre-computed minimizer.
 ///
 /// Created at the single point where raw sequence bytes are emitted from the
-/// scratch buffer.  The minimizer position (given in original orientation) is
-/// adjusted for any flip applied during canonicalisation.  After routing, call
+/// scratch buffer. The minimizer position (given in original orientation) is
+/// adjusted for any flip applied during canonicalisation. After routing, call
 /// [`into_superkmer`] to discard the metadata and continue with the bare sequence.
 ///
 /// [`into_superkmer`]: RoutableSuperKmer::into_superkmer
+#[derive(Clone)]
 pub struct RoutableSuperKmer {
    superkmer: SuperKmer,
-    minimizer: CanonicalKmer,
+    minimizer: Minimizer,
 }

+#[derive(Serialize)]
+struct SKRAnnotation {
+    seq_length: usize,
+    count: u32,
+}
+
+impl Annotation for SKRAnnotation {}
+
 impl RoutableSuperKmer {
    /// Construct from raw packed bytes.
    ///
    /// `min_pos` is the 0-based minimizer position in the **original** (pre-flip)
-    /// orientation.  `m` is the minimizer length.  `seql` and `seq` are the
-    /// raw length byte and 2-bit-packed nucleotides as produced by the scratch
-    /// buffer.
-    pub fn build(min_pos: usize, m: usize, seql: u8, seq: Box<[u8]>) -> Self {
-        let (sk, already_canonical) = SuperKmer::build(seql, seq);
+    /// orientation. `seql` is the nucleotide count. The sequence is canonicalised
+    /// in place; `min_pos` is adjusted accordingly.
+    pub fn build(min_pos: usize, seql: usize, seq: Box<[u8]>) -> Self {
+        let mut inner = PackedSeq::new(count_to_tail(seql), seq);
+        let already_canonical = inner.canonicalize();
        let adjusted_pos = if already_canonical {
            min_pos
        } else {
-            sk.len() - m - min_pos
+            seql - m() - min_pos
        };
-        let minimizer = sk.kmer(adjusted_pos, m).unwrap().canonical(m);
-        Self {
-            superkmer: sk,
-            minimizer,
-        }
+        let minimizer = inner.mmer(adjusted_pos).unwrap();
+        let superkmer = SuperKmer { count: 1, inner };
+        Self { superkmer, minimizer }
    }

    /// Borrow the canonical super-kmer sequence.
@@ -42,8 +55,8 @@ impl RoutableSuperKmer {
        &self.superkmer
    }

-    /// Borrow the canonical minimizer kmer.
-    pub fn minimizer(&self) -> &CanonicalKmer {
+    /// Borrow the canonical minimizer.
+    pub fn minimizer(&self) -> &Minimizer {
        &self.minimizer
    }

@@ -53,7 +66,34 @@ impl RoutableSuperKmer {
    }

    /// Sequence length in nucleotides.
-    pub fn len(&self) -> usize {
-        self.superkmer.len()
+    pub fn seql(&self) -> usize {
+        self.superkmer.seql()
+    }
+}
+
+impl Sequence for RoutableSuperKmer {
+    type Canonical = RoutableSuperKmer;
+
+    fn seql(&self) -> usize {
+        self.superkmer.seql()
+    }
+
+    fn nucleotide(&self, i: usize) -> u8 {
+        self.superkmer.nucleotide(i)
+    }
+
+    fn seq_hash(&self) -> u64 {
+        self.minimizer.seq_hash()
+    }
+
+    fn canonical(&self) -> Self::Canonical {
+        self.clone()
+    }
+
+    fn annotation(&self) -> impl Annotation {
+        SKRAnnotation {
+            seq_length: self.superkmer.seql(),
+            count: self.superkmer.count(),
+        }
    }
 }
@@ -1,8 +1,71 @@
-use crate::Annotation;
+use std::io::{self, Write};

+use crate::Annotation;
+use crate::annotations::BasicAnnotation;
+
+/// Common interface for all 2-bit packed DNA sequences in the pipeline.
+///
+/// Required methods: `Canonical`, `seql`, `nucleotide`, `seq_hash`, `canonical`.
+/// All other methods have default implementations derived from those five.
 pub trait Sequence {
-    fn sequence(&self) -> Box<[u8]>;
-    fn canonical(&self) -> &Self;
+    /// The canonical form of this sequence type.
+    ///
+    /// For types always stored canonical (`SuperKmer`, `CanonicalKmerOf`), set `Canonical = Self`.
+    /// For `KmerOf`, set `Canonical = CanonicalKmerOf`.
+    type Canonical: Sequence;
+
+    /// Sequence length in nucleotides.
+    fn seql(&self) -> usize;
+
+    /// Extract nucleotide `i` (0-based from 5′ end) as a 2-bit value (A=0, C=1, G=2, T=3).
+    fn nucleotide(&self, i: usize) -> u8;
+
+    /// Hash of the sequence, used for partitioning and routing.
    fn seq_hash(&self) -> u64;
-    fn annotation(&self) -> Annotation;
+
+    /// Return the canonical form.
+    ///
+    /// For `Copy` types this is free; for heap-backed types it clones (output/debug paths only).
+    fn canonical(&self) -> Self::Canonical;
+
+    /// Return an annotation describing this sequence's metadata.
+    ///
+    /// Default: `BasicAnnotation { seq_length }`. Override for richer metadata.
+    fn annotation(&self) -> impl Annotation {
+        BasicAnnotation { seq_length: self.seql() }
+    }
+
+    /// Decode into ASCII nucleotides, writing into `writer`.
+    ///
+    /// Default: one byte per nucleotide via `nucleotide()`.
+    /// Types with packed byte access should override with the faster DEC4 path.
+    fn write_ascii<W: Write>(&self, w: &mut W) -> io::Result<()> {
+        for i in 0..self.seql() {
+            w.write_all(&[b"ACGT"[self.nucleotide(i) as usize]])?;
+        }
+        Ok(())
+    }
+
+    /// Decode into a fresh ASCII `Vec<u8>`.
+    fn to_ascii(&self) -> Vec<u8> {
+        let mut buf = Vec::with_capacity(self.seql());
+        self.write_ascii(&mut buf).unwrap();
+        buf
+    }
+
+    /// Partition index derived from `seq_hash`.
+    ///
+    /// * `part_bits` — number of low bits to use (partition count = `1 << part_bits`).
+    fn partition(&self, part_bits: usize) -> usize {
+        (mix64(self.seq_hash()) & ((1 << part_bits) - 1)) as usize
+    }
+}
+
+#[inline]
+pub(crate) fn mix64(x: u64) -> u64 {
+    let x = x ^ (x >> 30);
+    let x = x.wrapping_mul(0xbf58476d1ce4e5b9);
+    let x = x ^ (x >> 27);
+    let x = x.wrapping_mul(0x94d049bb133111eb);
+    x ^ (x >> 31)
 }
@@ -1,84 +1,44 @@
-//! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
-use std::io::{self, Write};
+//! Canonical 2-bit DNA super-kmer with occurrence count.
+//!
+//! Delegates all sequence operations to [`PackedSeq`].
+//!
+//! On-disk header word (32 bits): `(count << 2) | tail` — 30-bit count, 2-bit tail.
+
+use std::io::{self, Read, Write};

-use bitvec::prelude::*;
 use serde::Serialize;
 use xxhash_rust::xxh3::xxh3_64;

+use crate::Annotation;
 use crate::Sequence;
-use crate::encoding::{DEC4, encode_base};
 use crate::kmer::{CanonicalKmer, Kmer, KmerError};
-use crate::revcomp_lookup::REVCOMP4;
+use crate::packed_seq::{PackedSeq, read_varint, write_varint};

-// ── SuperKmerHeader ───────────────────────────────────────────────────────────
-
-/// 32-bit super-kmer header.
-///
-/// Bit layout (MSB → LSB):
-///
-/// ```text
-/// [31 .......... 8] [7 ...... 0]
-///    count (24 b)    SEQL (8 b)
-/// ```
-///
-/// SEQL encodes the sequence length: 1–255 map directly; 0 encodes 256.
-/// The count field starts at 1 and accumulates occurrence counts during
-/// deduplication.
-#[derive(Debug, Clone, Copy)]
-pub(crate) struct SuperKmerHeader(u32);
-
-impl SuperKmerHeader {
-    pub(crate) fn new(seql: u8) -> Self {
-        Self((1 << 8) | seql as u32)
-    }
-
-    fn seql(&self) -> u8 {
-        self.0 as u8
-    }
-
-    fn count(&self) -> u32 {
-        self.0 >> 8
-    }
-
-    fn increment(&mut self) {
-        self.0 += 1 << 8;
-    }
-
-    fn add(&mut self, n: u32) {
-        self.0 += n << 8;
-    }
-
-    fn set_count(&mut self, n: u32) {
-        self.0 = (self.0 & 0xFF) | (n << 8);
-    }
-}
+// ── SKAnnotation ──────────────────────────────────────────────────────────────

 #[derive(Serialize)]
 struct SKAnnotation {
    seq_length: usize,
-    kmer_size: usize,
-    minimizer_size: usize,
-    partition: u32,
    count: u32,
 }

+impl Annotation for SKAnnotation {}
+
 // ── SuperKmer ─────────────────────────────────────────────────────────────────

-/// Canonical super-kmer: 32-bit header followed by a byte-aligned 2-bit nucleotide sequence.
-/// Nucleotide 0 is at the MSB of `seq[0]`. Always stored in canonical form.
+/// Canonical super-kmer: occurrence count + 2-bit packed DNA sequence.
 ///
-/// `PartialEq`, `Eq`, and `Hash` compare only sequence content (seql + seq bytes),
-/// ignoring the count / minimizer-pos payload — two records with identical sequences
-/// but different counts are considered equal.
+/// Always stored in canonical form (lex min of forward and revcomp).
+/// `PartialEq`/`Hash` compare only sequence content, ignoring count.
 #[derive(Debug, Clone)]
 pub struct SuperKmer {
-    header: SuperKmerHeader,
-    seq: Box<[u8]>,
+    pub(crate) count: u32,
+    pub(crate) inner: PackedSeq,
 }

 impl PartialEq for SuperKmer {
    fn eq(&self, other: &Self) -> bool {
-        self.header.seql() == other.header.seql() && self.seq == other.seq
+        self.inner == other.inner
    }
 }

@@ -86,320 +46,145 @@ impl Eq for SuperKmer {}

 impl std::hash::Hash for SuperKmer {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.header.seql().hash(state);
-        self.seq.hash(state);
+        self.inner.hash(state);
    }
 }

 impl Sequence for SuperKmer {
-    fn sequence(&self) -> Box<[u8]> {
-        self.seq.clone()
+    type Canonical = SuperKmer;
+
+    fn seql(&self) -> usize {
+        self.inner.seql()
    }

-    fn canonical(&self) -> &Self {
-        &self
+    fn nucleotide(&self, i: usize) -> u8 {
+        self.inner.nucleotide(i)
    }

-    /// Returns the XXH3-64 hash of the packed sequence bytes.
    fn seq_hash(&self) -> u64 {
-        xxh3_64(&self.seq)
+        xxh3_64(self.inner.seq_bytes())
    }

-    fn annotation(&self) -> Annotation {}
+    fn canonical(&self) -> Self::Canonical {
+        self.clone()
+    }
+
+    fn annotation(&self) -> impl Annotation {
+        SKAnnotation {
+            seq_length: self.inner.seql(),
+            count: self.count,
+        }
+    }
 }
+
 impl SuperKmer {
-    /// `seql` is the raw stored byte: 1–255 for lengths 1–255, 0 for length 256.
-    pub fn new(seql: u8, seq: Box<[u8]>) -> Self {
-        Self::build(seql, seq).0
-    }
-
-    /// Construct and canonicalise in place, returning `(sk, already_canonical)`.
-    /// `already_canonical` is `true` when the sequence was not flipped.
-    pub fn build(seql: u8, seq: Box<[u8]>) -> (Self, bool) {
-        let mut sk = Self {
-            header: SuperKmerHeader::new(seql),
-            seq,
-        };
-        let already_canonical = sk.canonical(); // true = pas retourné
-        (sk, already_canonical)
-    }
-
-    /// Deserialise from a raw 32-bit header word and packed sequence bytes.
-    /// Preserves the full header payload (count or minimizer_pos in bits [31:8]).
-    pub fn from_header_bits(bits: u32, seq: Box<[u8]>) -> Self {
-        let seql = (bits & 0xFF) as u8;
-        let len = stored_to_len(seql);
-        debug_assert_eq!(seq.len(), byte_len(len));
-        let sk = Self {
-            header: SuperKmerHeader(bits),
-            seq,
-        };
-        debug_assert!(
-            sk.is_canonical(),
-            "SuperKmer deserialised from disk is not canonical"
-        );
-        sk
-    }
-
-    /// Returns the sequence length in nucleotides (1–256).
-    pub fn len(&self) -> usize {
-        stored_to_len(self.header.seql())
-    }
-
-    /// Returns the occurrence count of this super-kmer.
-    pub fn count(&self) -> u32 {
-        self.header.count()
-    }
-
-    /// Increments the occurrence count by 1.
-    pub fn increment(&mut self) {
-        self.header.increment();
-    }
-
-    /// Adds `n` to the occurrence count.
-    pub fn add(&mut self, n: u32) {
-        self.header.add(n);
-    }
-
-    /// Sets the occurrence count to an absolute value.
-    pub fn set_count(&mut self, n: u32) {
-        self.header.set_count(n);
-    }
-
-    /// Extract nucleotide i (0-based from 5' end) as a 2-bit value.
-    pub fn nucleotide(&self, i: usize) -> u8 {
-        (self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
-    }
-
-    /// Reverse-complement this super-kmer in place.
-    ///
-    /// This method is only used internally by the build method.
-    fn revcomp(&mut self) {
-        let seql = self.len();
-        let n = byte_len(seql);
-
-        // Step 1: swap bytes outside-in, applying revcomp4 to each.
-        {
-            let bytes = &mut self.seq[..n];
-            let (mut lo, mut hi) = (0, n - 1);
-            while lo < hi {
-                (bytes[lo], bytes[hi]) =
-                    (REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
-                lo += 1;
-                hi -= 1;
-            }
-            if lo == hi {
-                bytes[lo] = REVCOMP4[bytes[lo] as usize];
-            }
-        }
-
-        // Step 2: left-shift to flush padding T's introduced by complementing padding A's.
-        let shift = n * 8 - seql * 2;
-        if shift > 0 {
-            let bits = self.seq[..n].view_bits_mut::<Msb0>();
-            bits.rotate_left(shift);
-            let len = bits.len();
-            bits[len - shift..].fill(false);
-        }
-    }
-
-    /// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a canonical SuperKmer.
+    /// Encode ASCII nucleotides (length ≥ 1) into a canonical SuperKmer.
    pub fn from_ascii(ascii: &[u8]) -> Self {
-        let seql = ascii.len();
-        debug_assert!(
-            seql >= 1 && seql <= 256,
-            "super-kmer length must be 1..=256"
-        );
-        let n = byte_len(seql);
-        let mut seq = vec![0u8; n];
-
-        let full = seql / 4;
-        for i in 0..full {
-            seq[i] = encode_base(ascii[i * 4]) << 6
-                | encode_base(ascii[i * 4 + 1]) << 4
-                | encode_base(ascii[i * 4 + 2]) << 2
-                | encode_base(ascii[i * 4 + 3]);
-        }
-        let rem = seql % 4;
-        if rem > 0 {
-            let mut last = 0u8;
-            for j in 0..rem {
-                last |= encode_base(ascii[full * 4 + j]) << (6 - 2 * j);
-            }
-            seq[full] = last;
-        }
-
-        Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
+        let mut inner = PackedSeq::from_ascii(ascii);
+        inner.canonicalize();
+        Self { count: 1, inner }
    }

-    /// Decode this super-kmer sequence into ASCII nucleotides, writing into `writer`.
-    pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        let seql = self.len();
-        let full = seql / 4;
-
-        for i in 0..full {
-            writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
-        }
-        let rem = seql % 4;
-        if rem > 0 {
-            let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
-            writer.write_all(&bytes[..rem])?;
-        }
-        Ok(())
+    /// Wrap a pre-built [`PackedSeq`], canonicalising in place.
+    pub fn build(mut inner: PackedSeq) -> Self {
+        inner.canonicalize();
+        Self { count: 1, inner }
    }

-    /// Decode this super-kmer sequence into a fresh ASCII `Vec<u8>`.
-    pub fn to_ascii(&self) -> Vec<u8> {
-        let mut buf = Vec::with_capacity(self.len());
-        self.write_ascii(&mut buf).unwrap();
-        buf
+    /// Serialise to compact binary. Format: varint(count) + varint((byte_len << 2) | tail) + bytes.
+    pub fn write_to_binary<W: Write>(&self, w: &mut W) -> io::Result<()> {
+        write_varint(w, self.count as u64)?;
+        self.inner.write_to_binary(w)
    }

-    /// Returns the raw 32-bit header word for binary serialisation.
-    /// Bits [7:0] = seql encoding (0→256, 1-255 direct). Bits [31:8] = payload.
+    /// Deserialise from the binary format produced by [`write_to_binary`].
+    /// Allocates exactly one `Box<[u8]>` for the packed bytes.
+    pub fn read_from_binary<R: Read>(r: &mut R) -> io::Result<Self> {
+        let count = read_varint(r)? as u32;
+        let inner = PackedSeq::read_from_binary(r)?;
+        debug_assert!(inner.is_canonical(), "SuperKmer from disk is not canonical");
+        Ok(Self { count, inner })
+    }
+
+    /// Sequence length in nucleotides.
    #[inline]
-    pub fn header_bits(&self) -> u32 {
-        self.header.0
+    pub fn seql(&self) -> usize {
+        self.inner.seql()
    }

-    /// Returns a read-only view of the packed 2-bit sequence bytes.
-    /// Length is always `(seql() + 3) / 4` bytes.
+    /// Occurrence count.
+    #[inline]
+    pub fn count(&self) -> u32 {
+        self.count
+    }
+
+    /// Increment occurrence count by 1.
+    #[inline]
+    pub fn increment(&mut self) {
+        self.count += 1;
+    }
+
+    /// Add `n` to the occurrence count.
+    #[inline]
+    pub fn add(&mut self, n: u32) {
+        self.count += n;
+    }
+
+    /// Set the occurrence count to `n`.
+    #[inline]
+    pub fn set_count(&mut self, n: u32) {
+        self.count = n;
+    }
+
+    /// Read-only view of packed 2-bit bytes.
    #[inline]
    pub fn seq_bytes(&self) -> &[u8] {
-        &self.seq
+        self.inner.seq_bytes()
    }

-    /// Extract the kmer of length k starting at nucleotide position i (0-based).
-    ///
-    /// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
-    pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
-        if k == 0 || k > 32 {
-            return Err(KmerError::InvalidK { k });
-        }
-        let seql = self.len();
-        if i + k > seql {
-            return Err(KmerError::OutOfBounds {
-                position: i,
-                k,
-                seql,
-            });
-        }
-        let bits = self.seq.view_bits::<Msb0>();
-        let raw: u64 = bits[i * 2..(i + k) * 2].load_be();
-        Ok(Kmer::from_raw(raw << (64 - 2 * k)))
+    /// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
+    #[inline]
+    pub fn nucleotide(&self, i: usize) -> u8 {
+        self.inner.nucleotide(i)
    }

-    /// Extract the canonical kmer of length k starting at nucleotide position i (0-based).
-    ///
-    /// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
-    pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
-        Ok(self.kmer(i, k)?.canonical(k))
+    /// Extract the k-mer at position `i` using `params::k()`.
+    #[inline]
+    pub fn kmer(&self, i: usize) -> Result<Kmer, KmerError> {
+        self.inner.kmer(i)
    }

-    /// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
-    ///
-    /// Returns `true` if already canonical (no change), `false` if revcomp was applied.
-    fn canonical(&mut self) -> bool {
-        if self.is_canonical() {
-            return true;
-        }
-        self.revcomp();
-        false
+    /// Extract the canonical k-mer at position `i`.
+    #[inline]
+    pub fn canonical_kmer(&self, i: usize) -> Result<CanonicalKmer, KmerError> {
+        self.inner.canonical_kmer(i)
    }

-    /// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp).
-    fn is_canonical(&self) -> bool {
-        let seql = self.len();
-        for i in 0..seql {
-            let fwd = self.nucleotide(i);
-            let rev = complement(self.nucleotide(seql - 1 - i));
-            if fwd < rev {
-                return true;
-            }
-            if fwd > rev {
-                return false;
-            }
-        }
-        true
+    /// Decode into ASCII, writing into `writer`.
+    #[inline]
+    pub fn write_ascii<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
+        self.inner.write_ascii(writer)
    }

-    /// Iterate over all kmers of length `k` in order, yielding each as a left-aligned [`Kmer`].
-    pub fn iter_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
-        SKKmerIter::new(self, k)
+    /// Decode into a fresh ASCII `Vec<u8>`.
+    #[inline]
+    pub fn to_ascii(&self) -> Vec<u8> {
+        self.inner.to_ascii()
    }

-    /// Iterate over all canonical kmers of length `k` in order.
-    pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
-        self.iter_kmers(k).map(move |km| km.canonical(k))
+    /// Iterate over all k-mers of length `params::k()` in order.
+    #[inline]
+    pub fn iter_kmers(&self) -> impl Iterator<Item = Kmer> + '_ {
+        self.inner.iter_kmers()
    }
-}

-struct SKKmerIter<'a> {
-    skmer: &'a SuperKmer,
-    mask: u64,
-    lshift: usize,
-    current: u64,
-    pos: usize,
-    max_pos: usize,
-}
-
-impl<'a> SKKmerIter<'a> {
-    fn new(skmer: &'a SuperKmer, k: usize) -> Self {
-        let seql = skmer.len();
-        let lshift = 64 - k * 2;
-        let mask = ((!0u128) << (lshift + 2)) as u64;
-        Self {
-            skmer,
-            mask,
-            lshift,
-            current: if seql >= k {
-                skmer.kmer(0, k).unwrap().raw()
-            } else {
-                0
-            },
-            pos: k,
-            max_pos: seql,
-        }
+    /// Iterate over all canonical k-mers in order.
+    #[inline]
+    pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
+        self.inner.iter_canonical_kmers()
    }
 }

-impl<'a> Iterator for SKKmerIter<'a> {
-    type Item = Kmer;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.pos > self.max_pos {
-            return None;
-        }
-        // Emit current kmer first, then slide the window forward.
-        let result = Kmer::from_raw(self.current);
-        if self.pos < self.max_pos {
-            let byte_pos = self.pos / 4;
-            // Nucleotide at position r within its byte occupies bits 7-2r (MSB) and 6-2r (LSB).
-            // Extract right-aligned, then place at lshift.
-            let inner_shift = 6 - 2 * (self.pos & 3);
-            let nuc = (((self.skmer.seq[byte_pos] >> inner_shift) & 3) as u64) << self.lshift;
-            self.current = ((self.current << 2) & self.mask) | nuc;
-        }
-        self.pos += 1;
-        Some(result)
-    }
-}
-
-// ── helpers ───────────────────────────────────────────────────────────────────
-
-fn complement(base: u8) -> u8 {
-    !base & 0b11
-}
-
-fn byte_len(seql: usize) -> usize {
-    (seql + 3) / 4
-}
-
-/// Stored u8 → actual length: 0 encodes 256, 1–255 are identity.
-fn stored_to_len(s: u8) -> usize {
-    if s == 0 { 256 } else { s as usize }
-}
-
 #[cfg(test)]
 #[path = "tests/superkmer.rs"]
 mod tests;
@@ -0,0 +1,213 @@
+use super::*;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Tests use ConstLen<N> — no dependency on global params singletons.
+    type K1 = KmerOf<ConstLen<1>>;
+    type K4 = KmerOf<ConstLen<4>>;
+
+    fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
+        seq.iter()
+            .rev()
+            .map(|&b| match b {
+                b'A' => b'T',
+                b'T' => b'A',
+                b'C' => b'G',
+                b'G' => b'C',
+                _ => b'A',
+            })
+            .collect()
+    }
+
+    fn make_seq<const N: usize>() -> Vec<u8> {
+        (0..N).map(|i| b"ACGT"[i % 4]).collect()
+    }
+
+    // ── from_ascii / to_ascii ─────────────────────────────────────────────────
+
+    #[test]
+    fn ascii_roundtrip() {
+        macro_rules! check {
+            ($n:expr) => {{
+                let ascii = make_seq::<$n>();
+                let kmer = KmerOf::<ConstLen<$n>>::from_ascii(&ascii).unwrap();
+                assert_eq!(kmer.to_ascii(), ascii, "roundtrip failed for k={}", $n);
+            }};
+        }
+        check!(1);
+        check!(2);
+        check!(3);
+        check!(4);
+        check!(8);
+        check!(11);
+        check!(16);
+        check!(31);
+        check!(32);
+    }
+
+    #[test]
+    fn from_ascii_all_bases() {
+        for (base, expected) in [(b'A', b'A'), (b'C', b'C'), (b'G', b'G'), (b'T', b'T')] {
+            let kmer = K1::from_ascii(&[base]).unwrap();
+            assert_eq!(kmer.to_ascii(), vec![expected]);
+        }
+    }
+
+    #[test]
+    fn from_ascii_invalid_k() {
+        assert!(KmerOf::<ConstLen<0>>::from_ascii(b"A").is_err());
+        assert!(KmerOf::<ConstLen<33>>::from_ascii(b"ACGT").is_err());
+    }
+
+    #[test]
+    fn from_ascii_too_short() {
+        assert!(KmerOf::<ConstLen<4>>::from_ascii(b"ACG").is_err());
+    }
+
+    // ── nucleotide ────────────────────────────────────────────────────────────
+
+    #[test]
+    fn nucleotide_extraction() {
+        let kmer = K4::from_ascii(b"ACGT").unwrap();
+        assert_eq!(kmer.nucleotide(0), 0b00); // A
+        assert_eq!(kmer.nucleotide(1), 0b01); // C
+        assert_eq!(kmer.nucleotide(2), 0b10); // G
+        assert_eq!(kmer.nucleotide(3), 0b11); // T
+    }
+
+    // ── revcomp ───────────────────────────────────────────────────────────────
+
+    #[test]
+    fn revcomp_known_values() {
+        let cases: &[(&[u8], &[u8])] = &[
+            (b"A", b"T"),
+            (b"AC", b"GT"),
+            (b"ACG", b"CGT"),
+            (b"ACGT", b"ACGT"),
+            (b"AAAA", b"TTTT"),
+            (b"TTTT", b"AAAA"),
+        ];
+        for (seq, expected) in cases {
+            macro_rules! check_len {
+                ($n:expr) => {
+                    if seq.len() == $n {
+                        let kmer = KmerOf::<ConstLen<$n>>::from_ascii(seq).unwrap();
+                        assert_eq!(
+                            kmer.revcomp().to_ascii(),
+                            *expected,
+                            "revcomp wrong for \"{}\"",
+                            std::str::from_utf8(seq).unwrap()
+                        );
+                    }
+                };
+            }
+            check_len!(1);
+            check_len!(2);
+            check_len!(3);
+            check_len!(4);
+        }
+    }
+
+    #[test]
+    fn revcomp_vs_reference() {
+        macro_rules! check {
+            ($n:expr) => {{
+                let ascii = make_seq::<$n>();
+                let expected = ascii_revcomp(&ascii);
+                let rc = KmerOf::<ConstLen<$n>>::from_ascii(&ascii)
+                    .unwrap()
+                    .revcomp();
+                assert_eq!(rc.to_ascii(), expected, "revcomp wrong for k={}", $n);
+            }};
+        }
+        check!(1);
+        check!(4);
+        check!(8);
+        check!(11);
+        check!(16);
+        check!(31);
+        check!(32);
+    }
+
+    #[test]
+    fn revcomp_involution() {
+        macro_rules! check {
+            ($n:expr) => {{
+                let ascii = make_seq::<$n>();
+                let kmer = KmerOf::<ConstLen<$n>>::from_ascii(&ascii).unwrap();
+                assert_eq!(
+                    kmer.revcomp().revcomp(),
+                    kmer,
+                    "revcomp∘revcomp≠id for k={}",
+                    $n
+                );
+            }};
+        }
+        check!(1);
+        check!(4);
+        check!(8);
+        check!(16);
+        check!(31);
+        check!(32);
+    }
+
+    // ── canonical ─────────────────────────────────────────────────────────────
+
+    #[test]
+    fn canonical_palindrome() {
+        let kmer = K4::from_ascii(b"ACGT").unwrap();
+        assert_eq!(kmer.canonical().into_kmer(), kmer);
+    }
+
+    #[test]
+    fn canonical_chooses_lesser() {
+        let kmer = K4::from_ascii(b"TTTT").unwrap();
+        let expected = K4::from_ascii(b"AAAA").unwrap();
+        assert_eq!(kmer.canonical().into_kmer(), expected);
+    }
+
+    #[test]
+    fn canonical_is_minimal() {
+        macro_rules! check {
+            ($n:expr) => {{
+                let ascii = make_seq::<$n>();
+                let ck = KmerOf::<ConstLen<$n>>::from_ascii(&ascii)
+                    .unwrap()
+                    .canonical();
+                let rc = ck.revcomp();
+                assert!(ck.raw() <= rc.raw(), "canonical not minimal for k={}", $n);
+            }};
+        }
+        check!(1);
+        check!(4);
+        check!(8);
+        check!(16);
+        check!(31);
+        check!(32);
+    }
+
+    #[test]
+    fn canonical_idempotent() {
+        macro_rules! check {
+            ($n:expr) => {{
+                let ck = KmerOf::<ConstLen<$n>>::from_ascii(&make_seq::<$n>())
+                    .unwrap()
+                    .canonical();
+                assert_eq!(
+                    ck.into_kmer().canonical(),
+                    ck,
+                    "canonical not idempotent for k={}",
+                    $n
+                );
+            }};
+        }
+        check!(1);
+        check!(4);
+        check!(8);
+        check!(16);
+        check!(31);
+        check!(32);
+    }
+}
@@ -1,11 +1,10 @@
 use super::*;
+use crate::set_k;

-/// Repeating ACGT pattern of the given length.
 fn make_seq(len: usize) -> Vec<u8> {
    (0..len).map(|i| b"ACGT"[i % 4]).collect()
 }

-/// Reference revcomp on ASCII bytes.
 fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
    seq.iter()
        .rev()
@@ -20,96 +19,93 @@ fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
 }

 fn all_lengths() -> impl Iterator<Item = usize> {
-    (1..=9).chain([255, 256])
+    (1..=9).chain([255, 256, 257, 1000])
 }

-// ── kmer extraction ───────────────────────────────────────────────────────
+// ── from_ascii / canonical form ───────────────────────────────────────────────

 #[test]
-fn kmer_first_matches_from_ascii() {
-    let ascii = b"ACGTACGT";
-    let sk = SuperKmer::from_ascii(ascii);
-    let k = 4;
-    let kmer = sk.kmer(0, k).unwrap();
-    let expected = crate::kmer::Kmer::from_ascii(&ascii[..k], k).unwrap();
-    assert_eq!(kmer, expected);
-}
-
-#[test]
-fn kmer_last_position() {
-    let ascii = b"ACGTACGT";
-    let seql = ascii.len();
-    let k = 4;
-    let sk = SuperKmer::from_ascii(ascii);
-    let kmer = sk.kmer(seql - k, k).unwrap();
-    let expected = crate::kmer::Kmer::from_ascii(&ascii[seql - k..], k).unwrap();
-    assert_eq!(kmer, expected);
-}
-
-#[test]
-fn kmer_all_positions() {
-    let ascii = b"ACGTACGTACGT";
-    let k = 4;
-    let sk = SuperKmer::from_ascii(ascii);
-    for i in 0..=ascii.len() - k {
-        let kmer = sk.kmer(i, k).unwrap();
-        let expected = crate::kmer::Kmer::from_ascii(&ascii[i..i + k], k).unwrap();
-        assert_eq!(kmer, expected, "mismatch at position {i}");
+fn ascii_roundtrip_all_lengths() {
+    for len in all_lengths() {
+        let ascii = make_seq(len);
+        let sk = SuperKmer::from_ascii(&ascii);
+        // SuperKmer stores in canonical form; ACGT pattern is already canonical.
+        assert_eq!(sk.to_ascii(), ascii, "roundtrip failed for len={len}");
    }
 }

 #[test]
-fn kmer_out_of_bounds() {
-    let sk = SuperKmer::from_ascii(b"ACGT");
-    assert!(sk.kmer(2, 4).is_err()); // 2 + 4 > 4
-    assert!(sk.kmer(4, 1).is_err()); // 4 + 1 > 4
-}
-
-#[test]
-fn kmer_invalid_k() {
-    let sk = SuperKmer::from_ascii(b"ACGT");
-    assert!(sk.kmer(0, 0).is_err());
-    assert!(sk.kmer(0, 33).is_err());
-}
-
-// ── canonical_kmer ────────────────────────────────────────────────────────
-
-#[test]
-fn canonical_kmer_is_min_of_kmer_and_revcomp() {
-    let sk = SuperKmer::from_ascii(b"ACGTACGT");
-    let k = 4;
-    for i in 0..=(sk.len() - k) {
-        let ck = sk.canonical_kmer(i, k).unwrap();
-        let fwd = sk.kmer(i, k).unwrap();
-        assert_eq!(ck, fwd.canonical(k));
+fn from_ascii_canonical_all_bases() {
+    // G×4 revcomp is C×4; T×4 revcomp is A×4.
+    for (base, expected) in [(b'A', b'A'), (b'C', b'C'), (b'G', b'C'), (b'T', b'A')] {
+        let ascii = vec![base; 4];
+        let sk = SuperKmer::from_ascii(&ascii);
+        assert_eq!(sk.to_ascii(), vec![expected; 4]);
    }
 }

 #[test]
-fn canonical_kmer_palindrome_unchanged() {
-    // ACGT is its own reverse complement
-    let sk = SuperKmer::from_ascii(b"ACGT");
-    let ck = sk.canonical_kmer(0, 4).unwrap();
-    let fwd = sk.kmer(0, 4).unwrap();
-    assert_eq!(ck.into_kmer(), fwd);
+fn from_ascii_is_canonical_all_lengths() {
+    for len in all_lengths() {
+        let ascii = make_seq(len);
+        let sk = SuperKmer::from_ascii(&ascii);
+        let fwd = sk.to_ascii();
+        let rev = ascii_revcomp(&fwd);
+        assert!(fwd <= rev, "not canonical for len={len}");
+    }
+}
+
+// ── seql ──────────────────────────────────────────────────────────────────────
+
+#[test]
+fn seql_roundtrip() {
+    for len in all_lengths() {
+        let sk = SuperKmer::from_ascii(&make_seq(len));
+        assert_eq!(sk.seql(), len, "seql() wrong for len={len}");
+    }
+}
+
+// ── binary serialisation ──────────────────────────────────────────────────────
+
+#[test]
+fn binary_roundtrip() {
+    for len in all_lengths() {
+        let mut sk = SuperKmer::from_ascii(&make_seq(len));
+        sk.set_count(42);
+        let mut buf = Vec::new();
+        sk.write_to_binary(&mut buf).unwrap();
+        let sk2 = SuperKmer::read_from_binary(&mut buf.as_slice()).unwrap();
+        assert_eq!(
+            sk.to_ascii(),
+            sk2.to_ascii(),
+            "sequence mismatch for len={len}"
+        );
+        assert_eq!(sk2.count(), 42, "count mismatch for len={len}");
+    }
 }

 #[test]
-fn canonical_kmer_tttt_becomes_aaaa() {
-    let sk = SuperKmer::from_ascii(b"TTTT");
-    let ck = sk.canonical_kmer(0, 4).unwrap();
-    let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
-    assert_eq!(ck.into_kmer(), expected);
+fn binary_packed_seq_roundtrip() {
+    use crate::packed_seq::PackedSeq;
+    for len in all_lengths() {
+        let ps = PackedSeq::from_ascii(&make_seq(len));
+        let mut buf = Vec::new();
+        ps.write_to_binary(&mut buf).unwrap();
+        let ps2 = PackedSeq::read_from_binary(&mut buf.as_slice()).unwrap();
+        assert_eq!(ps, ps2, "PackedSeq mismatch for len={len}");
+    }
 }

 #[test]
-fn canonical_kmer_errors_propagate() {
+fn binary_size_is_compact() {
+    // seql=4 (1 byte packed): varint(count=1, 1 byte) + varint((1<<2)|0=4, 1 byte) + 1 byte = 3 bytes
    let sk = SuperKmer::from_ascii(b"ACGT");
-    assert!(sk.canonical_kmer(2, 4).is_err()); // out of bounds
-    assert!(sk.canonical_kmer(0, 0).is_err()); // invalid k
+    let mut buf = Vec::new();
+    sk.write_to_binary(&mut buf).unwrap();
+    assert_eq!(buf.len(), 3, "expected 3 bytes for 4-nt superkmer");
 }

-// ── count ─────────────────────────────────────────────────────────────────
+// ── count ─────────────────────────────────────────────────────────────────────

 #[test]
 fn count_starts_at_one() {
@@ -144,30 +140,15 @@ fn set_count_overwrites() {
 }

 #[test]
-fn increment_preserves_seql() {
+fn count_operations_preserve_seql() {
    for len in all_lengths() {
        let mut sk = SuperKmer::from_ascii(&make_seq(len));
        sk.increment();
-        assert_eq!(sk.len(), len, "increment altered seql for len={len}");
-    }
-}
-
-#[test]
-fn add_preserves_seql() {
-    for len in all_lengths() {
-        let mut sk = SuperKmer::from_ascii(&make_seq(len));
+        assert_eq!(sk.seql(), len, "increment altered seql for len={len}");
        sk.add(1000);
-        assert_eq!(sk.len(), len, "add altered seql for len={len}");
-    }
-}
-
-#[test]
-fn set_count_preserves_seql() {
-    for len in all_lengths() {
-        let mut sk = SuperKmer::from_ascii(&make_seq(len));
+        assert_eq!(sk.seql(), len, "add altered seql for len={len}");
        sk.set_count(999);
-        assert_eq!(sk.len(), len, "set_count altered seql for len={len}");
-        assert_eq!(sk.count(), 999);
+        assert_eq!(sk.seql(), len, "set_count altered seql for len={len}");
    }
 }

@@ -179,247 +160,136 @@ fn count_does_not_affect_sequence() {
    assert_eq!(sk.to_ascii(), ascii);
 }

-// ── seql encoding ─────────────────────────────────────────────────────────
+// ── kmer extraction ───────────────────────────────────────────────────────────

 #[test]
-fn seql_roundtrip() {
-    for len in all_lengths() {
-        let sk = SuperKmer::from_ascii(&make_seq(len));
-        assert_eq!(sk.len(), len, "seql() wrong for len={len}");
+fn kmer_first_matches_from_ascii() {
+    set_k(4);
+    let k = crate::params::k();
+    let ascii = b"ACGTACGT";
+    let sk = SuperKmer::from_ascii(ascii);
+    let kmer = sk.kmer(0).unwrap();
+    let expected = crate::kmer::Kmer::from_ascii(&ascii[..k]).unwrap();
+    assert_eq!(kmer, expected);
+}
+
+#[test]
+fn kmer_all_positions() {
+    set_k(4);
+    let k = crate::params::k();
+    let ascii = b"ACGTACGTACGT";
+    let sk = SuperKmer::from_ascii(ascii);
+    for i in 0..=ascii.len() - k {
+        let kmer = sk.kmer(i).unwrap();
+        let expected = crate::kmer::Kmer::from_ascii(&ascii[i..i + k]).unwrap();
+        assert_eq!(kmer, expected, "mismatch at position {i}");
    }
 }

 #[test]
-fn seql_256_stored_as_zero() {
-    let sk = SuperKmer::from_ascii(&make_seq(256));
-    assert_eq!(sk.header.seql(), 0u8);
-    assert_eq!(sk.len(), 256);
+fn kmer_out_of_bounds() {
+    set_k(4);
+    let sk = SuperKmer::from_ascii(b"ACGT"); // seql=4, k=4
+    assert!(sk.kmer(1).is_err()); // 1 + 4 > 4
 }

-// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
+// ── canonical_kmer ────────────────────────────────────────────────────────────

 #[test]
-fn ascii_roundtrip_all_lengths() {
-    for len in all_lengths() {
-        let ascii = make_seq(len);
-        let sk = SuperKmer::from_ascii(&ascii);
-        assert_eq!(sk.to_ascii(), ascii, "roundtrip failed for len={len}");
+fn canonical_kmer_is_min_of_kmer_and_revcomp() {
+    set_k(4);
+    let k = crate::params::k();
+    let sk = SuperKmer::from_ascii(b"ACGTACGT");
+    for i in 0..=(sk.seql() - k) {
+        let ck = sk.canonical_kmer(i).unwrap();
+        let fwd = sk.kmer(i).unwrap();
+        assert_eq!(ck, fwd.canonical());
    }
 }

 #[test]
-fn ascii_roundtrip_all_bases() {
-    // Canonical form: min(seq, revcomp). G×4 flips to C×4, T×4 flips to A×4.
-    for (base, expected) in [(b'A', b'A'), (b'C', b'C'), (b'G', b'C'), (b'T', b'A')] {
-        let ascii = vec![base; 4];
-        let sk = SuperKmer::from_ascii(&ascii);
-        assert_eq!(sk.to_ascii(), vec![expected; 4]);
-    }
-}
-
-// ── revcomp correctness ───────────────────────────────────────────────────
-
-/// Known (seq, expected_revcomp) pairs — one per shift value × two byte counts.
-#[test]
-fn revcomp_known_values() {
-    let cases = [
-        // shift=6
-        ("A", "T"),
-        ("ACGTA", "TACGT"),
-        // shift=4
-        ("AC", "GT"),
-        ("ACGTAC", "GTACGT"),
-        // shift=2
-        ("ACG", "CGT"),
-        ("ACGTACG", "CGTACGT"),
-        // shift=0
-        ("ACGT", "ACGT"),
-        ("ACGTACGT", "ACGTACGT"),
-    ];
-    for (seq, expected) in cases {
-        let mut sk = SuperKmer::from_ascii(seq.as_bytes());
-        sk.revcomp();
-        assert_eq!(
-            sk.to_ascii(),
-            expected.as_bytes(),
-            "revcomp wrong for \"{seq}\""
-        );
-    }
+fn canonical_kmer_palindrome_unchanged() {
+    set_k(4);
+    let sk = SuperKmer::from_ascii(b"ACGT"); // ACGT is its own revcomp
+    let ck = sk.canonical_kmer(0).unwrap();
+    let fwd = sk.kmer(0).unwrap();
+    assert_eq!(ck.into_kmer(), fwd);
 }

 #[test]
-fn revcomp_vs_reference_all_lengths() {
-    for len in all_lengths() {
-        let ascii = make_seq(len);
-        let expected = ascii_revcomp(&ascii);
-        let mut sk = SuperKmer::from_ascii(&ascii);
-        sk.revcomp();
-        assert_eq!(sk.to_ascii(), expected, "revcomp wrong for len={len}");
-    }
+fn canonical_kmer_errors_propagate() {
+    set_k(4);
+    let sk = SuperKmer::from_ascii(b"ACGT");
+    assert!(sk.canonical_kmer(1).is_err()); // out of bounds: 1 + 4 > 4
 }

-#[test]
-fn revcomp_involution_all_lengths() {
-    for len in all_lengths() {
-        let ascii = make_seq(len);
-        let mut sk = SuperKmer::from_ascii(&ascii);
-        sk.revcomp();
-        sk.revcomp();
-        assert_eq!(sk.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
-    }
-}
-
-// ── canonical ─────────────────────────────────────────────────────────────
-
-#[test]
-fn canonical_palindrome_unchanged() {
-    // ACGT is its own revcomp
-    let mut sk = SuperKmer::from_ascii(b"ACGT");
-    sk.canonical();
-    assert_eq!(sk.to_ascii(), b"ACGT");
-}
-
-#[test]
-fn canonical_chooses_forward() {
-    // "AAAA" < "TTTT" → stays as-is
-    let mut sk = SuperKmer::from_ascii(b"AAAA");
-    sk.canonical();
-    assert_eq!(sk.to_ascii(), b"AAAA");
-}
-
-#[test]
-fn canonical_chooses_revcomp() {
-    // "TTTT" > "AAAA" → flipped
-    let mut sk = SuperKmer::from_ascii(b"TTTT");
-    sk.canonical();
-    assert_eq!(sk.to_ascii(), b"AAAA");
-}
-
-#[test]
-fn canonical_is_minimal_all_lengths() {
-    for len in all_lengths() {
-        let ascii = make_seq(len);
-        let mut sk = SuperKmer::from_ascii(&ascii);
-        sk.canonical();
-        let fwd = sk.to_ascii();
-        let rev = ascii_revcomp(&fwd);
-        assert!(fwd <= rev, "canonical not minimal for len={len}");
-    }
-}
-
-// ── iter_kmers ────────────────────────────────────────────────────────────
+// ── iter_kmers ────────────────────────────────────────────────────────────────

 #[test]
 fn iter_kmers_count() {
+    set_k(4);
+    let k = crate::params::k();
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
-    for k in [1usize, 3, 4, 5, 8, 12] {
-        let n = sk.iter_kmers(k).count();
-        assert_eq!(n, ascii.len() - k + 1, "count mismatch for k={k}");
-    }
+    assert_eq!(sk.iter_kmers().count(), ascii.len() - k + 1);
 }

 #[test]
 fn iter_kmers_first_is_kmer_0() {
+    set_k(4);
    let ascii = b"ACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
-    for k in 1..=ascii.len() {
-        let first = sk.iter_kmers(k).next().unwrap();
-        assert_eq!(first, sk.kmer(0, k).unwrap(), "k={k}");
-    }
+    let first = sk.iter_kmers().next().unwrap();
+    assert_eq!(first, sk.kmer(0).unwrap());
 }

 #[test]
 fn iter_kmers_matches_kmer_at_each_position() {
+    set_k(4);
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
-    let k = 4;
-    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
-    assert_eq!(kmers.len(), ascii.len() - k + 1);
+    let kmers: Vec<crate::kmer::Kmer> = sk.iter_kmers().collect();
    for (i, &km) in kmers.iter().enumerate() {
-        assert_eq!(km, sk.kmer(i, k).unwrap(), "mismatch at pos {i}");
+        assert_eq!(km, sk.kmer(i).unwrap(), "mismatch at pos {i}");
    }
 }

 #[test]
 fn iter_kmers_single_when_seql_eq_k() {
-    let ascii = b"ACGTACGT";
-    let sk = SuperKmer::from_ascii(ascii);
-    let k = ascii.len();
-    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
-    assert_eq!(kmers.len(), 1);
-    assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
-}
-
-#[test]
-fn iter_kmers_two_when_seql_eq_k_plus_one() {
-    let ascii = b"ACGTACGT";
-    let sk = SuperKmer::from_ascii(ascii);
-    let k = ascii.len() - 1;
-    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
-    assert_eq!(kmers.len(), 2);
-    assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
-    assert_eq!(kmers[1], sk.kmer(1, k).unwrap());
-}
-
-#[test]
-fn iter_kmers_all_k_values() {
-    // For every valid k, each yielded kmer must match kmer(i, k).
-    let ascii = b"ACGTACGTACGT";
-    let sk = SuperKmer::from_ascii(ascii);
-    let seql = ascii.len();
-    for k in 1..=seql {
-        let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
-        assert_eq!(kmers.len(), seql - k + 1, "k={k}");
-        for (i, &km) in kmers.iter().enumerate() {
-            assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
-        }
-    }
+    set_k(4);
+    let k = crate::params::k();
+    let ascii = make_seq(k);
+    let sk = SuperKmer::from_ascii(&ascii);
+    assert_eq!(sk.iter_kmers().count(), 1);
+    assert_eq!(sk.iter_kmers().next().unwrap(), sk.kmer(0).unwrap());
 }

 #[test]
 fn iter_kmers_crosses_byte_boundary() {
-    // Positions 3→4 and 7→8 cross a 4-nucleotide byte boundary.
+    set_k(4);
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
-    let k = 3;
-    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
+    let kmers: Vec<crate::kmer::Kmer> = sk.iter_kmers().collect();
    for boundary in [3usize, 4, 7, 8] {
        if boundary + 1 < kmers.len() {
            assert_eq!(
                kmers[boundary],
-                sk.kmer(boundary, k).unwrap(),
+                sk.kmer(boundary).unwrap(),
                "pos={boundary}"
            );
-            assert_eq!(
-                kmers[boundary + 1],
-                sk.kmer(boundary + 1, k).unwrap(),
-                "pos={}",
-                boundary + 1
-            );
        }
    }
 }

-#[test]
-fn iter_kmers_k1_yields_all_nucleotides() {
-    let ascii = b"ACGT";
-    let sk = SuperKmer::from_ascii(ascii);
-    let kmers: Vec<Kmer> = sk.iter_kmers(1).collect();
-    assert_eq!(kmers.len(), 4);
-    for (i, &km) in kmers.iter().enumerate() {
-        assert_eq!(km, sk.kmer(i, 1).unwrap(), "pos={i}");
-    }
-}
-
 #[test]
 fn iter_kmers_long_sequence() {
-    let ascii = make_seq(20);
+    set_k(4);
+    let k = crate::params::k();
+    let ascii = make_seq(200);
    let sk = SuperKmer::from_ascii(&ascii);
-    let k = 7;
-    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
-    assert_eq!(kmers.len(), ascii.len() - k + 1);
+    let kmers: Vec<crate::kmer::Kmer> = sk.iter_kmers().collect();
+    assert_eq!(kmers.len(), 200 - k + 1);
    for (i, &km) in kmers.iter().enumerate() {
-        assert_eq!(km, sk.kmer(i, k).unwrap(), "pos={i}");
+        assert_eq!(km, sk.kmer(i).unwrap(), "pos={i}");
    }
 }
@@ -0,0 +1,171 @@
+// ── tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use crate::packed_seq::PackedSeq as Unitig;
+    use crate::set_k;
+
+    fn make_seq(len: usize) -> Vec<u8> {
+        (0..len).map(|i| b"ACGT"[i % 4]).collect()
+    }
+
+    fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
+        seq.iter()
+            .rev()
+            .map(|&b| match b {
+                b'A' => b'T',
+                b'T' => b'A',
+                b'C' => b'G',
+                b'G' => b'C',
+                _ => b'A',
+            })
+            .collect()
+    }
+
+    fn test_lengths() -> impl Iterator<Item = usize> {
+        (1..=9).chain([255, 256, 257, 1000, 10_000])
+    }
+
+    // ── from_ascii / to_ascii ─────────────────────────────────────────────────
+
+    #[test]
+    fn ascii_roundtrip_all_lengths() {
+        for len in test_lengths() {
+            let ascii = make_seq(len);
+            let u = Unitig::from_ascii(&ascii);
+            assert_eq!(u.to_ascii(), ascii, "roundtrip failed for len={len}");
+        }
+    }
+
+    // ── seql ──────────────────────────────────────────────────────────────────
+
+    #[test]
+    fn seql_roundtrip() {
+        for len in test_lengths() {
+            let u = Unitig::from_ascii(&make_seq(len));
+            assert_eq!(u.seql(), len);
+        }
+    }
+
+    // ── revcomp ───────────────────────────────────────────────────────────────
+
+    #[test]
+    fn revcomp_known_values() {
+        let cases = [
+            ("A", "T"),
+            ("AC", "GT"),
+            ("ACG", "CGT"),
+            ("ACGT", "ACGT"),
+            ("ACGTA", "TACGT"),
+        ];
+        for (seq, expected) in cases {
+            let mut u = Unitig::from_ascii(seq.as_bytes());
+            u.revcomp_inplace();
+            assert_eq!(
+                u.to_ascii(),
+                expected.as_bytes(),
+                "revcomp wrong for \"{seq}\""
+            );
+        }
+    }
+
+    #[test]
+    fn revcomp_vs_reference_all_lengths() {
+        for len in test_lengths() {
+            let ascii = make_seq(len);
+            let expected = ascii_revcomp(&ascii);
+            let mut u = Unitig::from_ascii(&ascii);
+            u.revcomp_inplace();
+            assert_eq!(u.to_ascii(), expected, "revcomp wrong for len={len}");
+        }
+    }
+
+    #[test]
+    fn revcomp_involution_all_lengths() {
+        for len in test_lengths() {
+            let ascii = make_seq(len);
+            let mut u = Unitig::from_ascii(&ascii);
+            u.revcomp_inplace();
+            u.revcomp_inplace();
+            assert_eq!(u.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
+        }
+    }
+
+    // ── canonicalize ──────────────────────────────────────────────────────────
+
+    #[test]
+    fn canonical_palindrome_unchanged() {
+        let mut u = Unitig::from_ascii(b"ACGT");
+        u.canonicalize();
+        assert_eq!(u.to_ascii(), b"ACGT");
+    }
+
+    #[test]
+    fn canonical_chooses_revcomp() {
+        let mut u = Unitig::from_ascii(b"TTTT");
+        u.canonicalize();
+        assert_eq!(u.to_ascii(), b"AAAA");
+    }
+
+    #[test]
+    fn canonical_is_minimal_all_lengths() {
+        for len in test_lengths() {
+            let ascii = make_seq(len);
+            let mut u = Unitig::from_ascii(&ascii);
+            u.canonicalize();
+            let fwd = u.to_ascii();
+            let rev = ascii_revcomp(&fwd);
+            assert!(fwd <= rev, "canonical not minimal for len={len}");
+        }
+    }
+
+    // ── kmer extraction ───────────────────────────────────────────────────────
+
+    #[test]
+    fn kmer_all_positions() {
+        set_k(4);
+        let k = crate::params::k();
+        let ascii = b"ACGTACGTACGT";
+        let u = Unitig::from_ascii(ascii);
+        for i in 0..=ascii.len() - k {
+            let kmer = u.kmer(i).unwrap();
+            let expected = crate::kmer::Kmer::from_ascii(&ascii[i..i + k]).unwrap();
+            assert_eq!(kmer, expected, "mismatch at position {i}");
+        }
+    }
+
+    // ── iter_kmers ────────────────────────────────────────────────────────────
+
+    #[test]
+    fn iter_kmers_matches_kmer_at_each_position() {
+        set_k(4);
+        let ascii = make_seq(20);
+        let u = Unitig::from_ascii(&ascii);
+        let kmers: Vec<crate::kmer::Kmer> = u.iter_kmers().collect();
+        for (i, &km) in kmers.iter().enumerate() {
+            assert_eq!(km, u.kmer(i).unwrap(), "pos={i}");
+        }
+    }
+
+    #[test]
+    fn iter_kmers_long_unitig() {
+        set_k(4);
+        let k = crate::params::k();
+        let ascii = make_seq(10_000);
+        let u = Unitig::from_ascii(&ascii);
+        assert_eq!(u.iter_kmers().count(), 10_000 - k + 1);
+    }
+
+    // ── binary serialisation ──────────────────────────────────────────────────
+
+    #[test]
+    fn binary_roundtrip_all_lengths() {
+        for len in test_lengths() {
+            let u = Unitig::from_ascii(&make_seq(len));
+            let mut buf = Vec::new();
+            u.write_to_binary(&mut buf).unwrap();
+            let u2 = Unitig::read_from_binary(&mut buf.as_slice()).unwrap();
+            assert_eq!(u, u2, "binary roundtrip failed for len={len}");
+        }
+    }
+}
@@ -1,424 +1,10 @@
-//! Compact 2-bit DNA unitig with in-place reverse complement and canonical form.
+//! Unitig: a 2-bit packed DNA sequence without metadata.
 //!
-//! Same encoding as [`SuperKmer`](crate::superkmer::SuperKmer) — nucleotide 0
-//! at the MSB of `seq[0]`, 4 bases per byte — but without the 256-nucleotide
-//! length cap and without the scatter/count header payload.
+//! [`Unitig`] is a type alias for [`PackedSeq`] — all sequence operations,
+//! binary serialisation, and k-mer iteration are available directly.

-use std::io::{self, Write};
-
-use crate::encoding::{DEC4, encode_base};
-use crate::kmer::{CanonicalKmer, Kmer, KmerError};
-use crate::revcomp_lookup::REVCOMP4;
-use bitvec::prelude::*;
-
-// ── Unitig ────────────────────────────────────────────────────────────────────
-
-/// Compact unitig: sequence length (usize) + byte-aligned 2-bit nucleotide sequence.
-///
-/// Encoding: A=00, C=01, G=10, T=11. Nucleotide 0 occupies bits 7–6 of `seq[0]`,
-/// nucleotide i occupies bits `7 − 2*(i%4)` and `6 − 2*(i%4)` of `seq[i/4]`.
-/// Padding bits in the last byte are always 0.
-#[derive(Debug, Clone)]
-pub struct Unitig {
-    seql: usize,
-    seq: Box<[u8]>,
-}
-
-impl PartialEq for Unitig {
-    fn eq(&self, other: &Self) -> bool {
-        self.seql == other.seql && self.seq == other.seq
-    }
-}
-
-impl Eq for Unitig {}
-
-impl std::hash::Hash for Unitig {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.seql.hash(state);
-        self.seq.hash(state);
-    }
-}
-
-impl Unitig {
-    /// Create from a pre-packed 2-bit byte slice and explicit length.
-    /// `seq.len()` must equal `(seql + 3) / 4`.
-    pub fn new(seql: usize, seq: Box<[u8]>) -> Self {
-        debug_assert_eq!(seq.len(), byte_len(seql));
-        Self { seql, seq }
-    }
-
-    /// Encode a slice of 2-bit nucleotide values (0=A, 1=C, 2=G, 3=T, any length ≥ 1).
-    /// More efficient than `from_ascii` when nucleotides are already 2-bit encoded.
-    pub fn from_nucleotides(nucs: &[u8]) -> Self {
-        let seql = nucs.len();
-        debug_assert!(seql >= 1, "unitig length must be ≥ 1");
-        let n = byte_len(seql);
-        let mut seq = vec![0u8; n];
-        for (i, &nuc) in nucs.iter().enumerate() {
-            seq[i / 4] |= (nuc & 0b11) << (6 - 2 * (i % 4));
-        }
-        Self::new(seql, seq.into_boxed_slice())
-    }
-
-    /// Encode an ASCII nucleotide slice (ACGT, any length ≥ 1) into a new Unitig.
-    /// The result is not yet in canonical form; call `.canonical()` if needed.
-    pub fn from_ascii(ascii: &[u8]) -> Self {
-        let seql = ascii.len();
-        debug_assert!(seql >= 1, "unitig length must be ≥ 1");
-        let n = byte_len(seql);
-        let mut seq = vec![0u8; n];
-
-        let full = seql / 4;
-        for i in 0..full {
-            seq[i] = encode_base(ascii[i * 4]) << 6
-                | encode_base(ascii[i * 4 + 1]) << 4
-                | encode_base(ascii[i * 4 + 2]) << 2
-                | encode_base(ascii[i * 4 + 3]);
-        }
-        let rem = seql % 4;
-        if rem > 0 {
-            let mut last = 0u8;
-            for j in 0..rem {
-                last |= encode_base(ascii[full * 4 + j]) << (6 - 2 * j);
-            }
-            seq[full] = last;
-        }
-
-        Self::new(seql, seq.into_boxed_slice())
-    }
-
-    /// Returns the sequence length in nucleotides.
-    pub fn seql(&self) -> usize {
-        self.seql
-    }
-
-    /// Returns a read-only view of the packed 2-bit sequence bytes.
-    /// Length is always `(seql() + 3) / 4`.
-    pub fn seq_bytes(&self) -> &[u8] {
-        &self.seq
-    }
-
-    /// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
-    pub fn nucleotide(&self, i: usize) -> u8 {
-        (self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
-    }
-
-    /// Decode into ASCII nucleotides, writing into `writer`.
-    pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
-        let full = self.seql / 4;
-        for i in 0..full {
-            writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
-        }
-        let rem = self.seql % 4;
-        if rem > 0 {
-            let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
-            writer.write_all(&bytes[..rem])?;
-        }
-        Ok(())
-    }
-
-    /// Decode into a fresh ASCII `Vec<u8>`.
-    pub fn to_ascii(&self) -> Vec<u8> {
-        let mut buf = Vec::with_capacity(self.seql);
-        self.write_ascii(&mut buf).unwrap();
-        buf
-    }
-
-    /// Reverse-complement this unitig in place.
-    pub fn revcomp(&mut self) {
-        let n = byte_len(self.seql);
-
-        // Step 1: swap bytes outside-in, complementing each 4-base chunk via lookup.
-        {
-            let bytes = &mut self.seq[..n];
-            let (mut lo, mut hi) = (0, n - 1);
-            while lo < hi {
-                (bytes[lo], bytes[hi]) =
-                    (REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
-                lo += 1;
-                hi -= 1;
-            }
-            if lo == hi {
-                bytes[lo] = REVCOMP4[bytes[lo] as usize];
-            }
-        }
-
-        // Step 2: left-shift to flush the padding T's produced by complementing padding A's.
-        let shift = n * 8 - self.seql * 2;
-        if shift > 0 {
-            let bits = self.seq[..n].view_bits_mut::<Msb0>();
-            bits.rotate_left(shift);
-            let len = bits.len();
-            bits[len - shift..].fill(false);
-        }
-    }
-
-    /// Returns `true` if this unitig is in canonical form (lexicographic minimum
-    /// of forward and reverse complement).
-    pub fn is_canonical(&self) -> bool {
-        for i in 0..self.seql {
-            let fwd = self.nucleotide(i);
-            let rev = complement(self.nucleotide(self.seql - 1 - i));
-            if fwd < rev {
-                return true;
-            }
-            if fwd > rev {
-                return false;
-            }
-        }
-        true
-    }
-
-    /// Put this unitig in canonical form in place.
-    ///
-    /// Returns `true` if already canonical (no change), `false` if revcomp was applied.
-    pub fn canonical(&mut self) -> bool {
-        if self.is_canonical() {
-            return true;
-        }
-        self.revcomp();
-        false
-    }
-
-    /// Extract the kmer of length `k` starting at nucleotide position `i` (0-based).
-    pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
-        if k == 0 || k > 32 {
-            return Err(KmerError::InvalidK { k });
-        }
-        if i + k > self.seql {
-            return Err(KmerError::OutOfBounds {
-                position: i,
-                k,
-                seql: self.seql,
-            });
-        }
-        let bits = self.seq.view_bits::<Msb0>();
-        let raw: u64 = bits[i * 2..(i + k) * 2].load_be();
-        Ok(Kmer::from_raw(raw << (64 - 2 * k)))
-    }
-
-    /// Extract the canonical kmer of length `k` starting at position `i`.
-    pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
-        Ok(self.kmer(i, k)?.canonical(k))
-    }
-
-    /// Iterate over all kmers of length `k` in order, yielding each as a [`Kmer`].
-    pub fn iter_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
-        UnitigKmerIter::new(self, k)
-    }
-
-    /// Iterate over all canonical kmers of length `k` in order.
-    pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
-        self.iter_kmers(k).map(move |km| km.canonical(k))
-    }
-}
-
-// ── UnitigKmerIter ────────────────────────────────────────────────────────────
-
-struct UnitigKmerIter<'a> {
-    unitig: &'a Unitig,
-    mask: u64,
-    lshift: usize,
-    current: u64,
-    pos: usize,
-    max_pos: usize,
-}
-
-impl<'a> UnitigKmerIter<'a> {
-    fn new(unitig: &'a Unitig, k: usize) -> Self {
-        let seql = unitig.seql();
-        let lshift = 64 - k * 2;
-        let mask = ((!0u128) << (lshift + 2)) as u64;
-        Self {
-            unitig,
-            mask,
-            lshift,
-            current: if seql >= k { unitig.kmer(0, k).unwrap().raw() } else { 0 },
-            pos: k,
-            max_pos: seql,
-        }
-    }
-}
-
-impl<'a> Iterator for UnitigKmerIter<'a> {
-    type Item = Kmer;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.pos > self.max_pos {
-            return None;
-        }
-        let result = Kmer::from_raw(self.current);
-        if self.pos < self.max_pos {
-            let byte_pos = self.pos / 4;
-            // nucleotide at position p within its byte occupies bits 7−2*(p%4) and 6−2*(p%4)
-            let inner_shift = 6 - 2 * (self.pos & 3);
-            let nuc = (((self.unitig.seq[byte_pos] >> inner_shift) & 3) as u64) << self.lshift;
-            self.current = ((self.current << 2) & self.mask) | nuc;
-        }
-        self.pos += 1;
-        Some(result)
-    }
-}
-
-// ── helpers ───────────────────────────────────────────────────────────────────
-
-fn complement(base: u8) -> u8 {
-    !base & 0b11
-}
-
-fn byte_len(seql: usize) -> usize {
-    (seql + 3) / 4
-}
-
-// ── tests ─────────────────────────────────────────────────────────────────────
+pub use crate::packed_seq::PackedSeq as Unitig;

 #[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn make_seq(len: usize) -> Vec<u8> {
-        (0..len).map(|i| b"ACGT"[i % 4]).collect()
-    }
-
-    fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
-        seq.iter()
-            .rev()
-            .map(|&b| match b {
-                b'A' => b'T',
-                b'T' => b'A',
-                b'C' => b'G',
-                b'G' => b'C',
-                _ => b'A',
-            })
-            .collect()
-    }
-
-    fn test_lengths() -> impl Iterator<Item = usize> {
-        (1..=9).chain([255, 256, 257, 1000, 10_000])
-    }
-
-    // ── from_ascii / to_ascii ─────────────────────────────────────────────────
-
-    #[test]
-    fn ascii_roundtrip_all_lengths() {
-        for len in test_lengths() {
-            let ascii = make_seq(len);
-            let u = Unitig::from_ascii(&ascii);
-            assert_eq!(u.to_ascii(), ascii, "roundtrip failed for len={len}");
-        }
-    }
-
-    // ── seql ──────────────────────────────────────────────────────────────────
-
-    #[test]
-    fn seql_roundtrip() {
-        for len in test_lengths() {
-            let u = Unitig::from_ascii(&make_seq(len));
-            assert_eq!(u.seql(), len);
-        }
-    }
-
-    // ── revcomp ───────────────────────────────────────────────────────────────
-
-    #[test]
-    fn revcomp_known_values() {
-        let cases = [
-            ("A", "T"),
-            ("AC", "GT"),
-            ("ACG", "CGT"),
-            ("ACGT", "ACGT"),
-            ("ACGTA", "TACGT"),
-        ];
-        for (seq, expected) in cases {
-            let mut u = Unitig::from_ascii(seq.as_bytes());
-            u.revcomp();
-            assert_eq!(u.to_ascii(), expected.as_bytes(), "revcomp wrong for \"{seq}\"");
-        }
-    }
-
-    #[test]
-    fn revcomp_vs_reference_all_lengths() {
-        for len in test_lengths() {
-            let ascii = make_seq(len);
-            let expected = ascii_revcomp(&ascii);
-            let mut u = Unitig::from_ascii(&ascii);
-            u.revcomp();
-            assert_eq!(u.to_ascii(), expected, "revcomp wrong for len={len}");
-        }
-    }
-
-    #[test]
-    fn revcomp_involution_all_lengths() {
-        for len in test_lengths() {
-            let ascii = make_seq(len);
-            let mut u = Unitig::from_ascii(&ascii);
-            u.revcomp();
-            u.revcomp();
-            assert_eq!(u.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
-        }
-    }
-
-    // ── canonical ─────────────────────────────────────────────────────────────
-
-    #[test]
-    fn canonical_palindrome_unchanged() {
-        let mut u = Unitig::from_ascii(b"ACGT");
-        u.canonical();
-        assert_eq!(u.to_ascii(), b"ACGT");
-    }
-
-    #[test]
-    fn canonical_chooses_revcomp() {
-        let mut u = Unitig::from_ascii(b"TTTT");
-        u.canonical();
-        assert_eq!(u.to_ascii(), b"AAAA");
-    }
-
-    #[test]
-    fn canonical_is_minimal_all_lengths() {
-        for len in test_lengths() {
-            let ascii = make_seq(len);
-            let mut u = Unitig::from_ascii(&ascii);
-            u.canonical();
-            let fwd = u.to_ascii();
-            let rev = ascii_revcomp(&fwd);
-            assert!(fwd <= rev, "canonical not minimal for len={len}");
-        }
-    }
-
-    // ── kmer extraction ───────────────────────────────────────────────────────
-
-    #[test]
-    fn kmer_all_positions() {
-        let ascii = b"ACGTACGTACGT";
-        let k = 4;
-        let u = Unitig::from_ascii(ascii);
-        for i in 0..=ascii.len() - k {
-            let kmer = u.kmer(i, k).unwrap();
-            let expected = Kmer::from_ascii(&ascii[i..i + k], k).unwrap();
-            assert_eq!(kmer, expected, "mismatch at position {i}");
-        }
-    }
-
-    // ── iter_kmers ────────────────────────────────────────────────────────────
-
-    #[test]
-    fn iter_kmers_matches_kmer_at_each_position() {
-        let ascii = make_seq(20);
-        let k = 7;
-        let u = Unitig::from_ascii(&ascii);
-        let kmers: Vec<Kmer> = u.iter_kmers(k).collect();
-        assert_eq!(kmers.len(), ascii.len() - k + 1);
-        for (i, &km) in kmers.iter().enumerate() {
-            assert_eq!(km, u.kmer(i, k).unwrap(), "pos={i}");
-        }
-    }
-
-    #[test]
-    fn iter_kmers_long_unitig() {
-        let ascii = make_seq(10_000);
-        let k = 11;
-        let u = Unitig::from_ascii(&ascii);
-        assert_eq!(u.iter_kmers(k).count(), 10_000 - k + 1);
-    }
-}
+#[path = "tests/unitig.rs"]
+mod tests;