feat: add kmer iterators and optimize layered map performance

Replace `ph` with `ptr_hash` and introduce `epserde` and `rayon` dependencies. Refactor MPHF construction to leverage parallel iteration, eliminating intermediate `Vec<u64>` allocations and reducing memory footprint. Add an `n_kmers` field that tracks the total kmer count and is serialized in the unitig index header, alongside three iterators (`iter_kmers`, `iter_canonical_kmers`, `iter_indexed_canonical_kmers`) that traverse chunks without per-kmer allocation. Include comprehensive unit tests for the new iterators and update CLAUDE.md to enforce explicit dependency validation policies.
2026-05-12 22:28:01 +08:00
parent 9c41891cc8
commit ff75c9198d
7 changed files with 359 additions and 61 deletions
@@ -137,6 +137,108 @@ fn verify_second_unitig_second_position() {
    assert!(r.verify_canonical_kmer(1, 1, query));
 }

+// ── iter_kmers ────────────────────────────────────────────────────────────────
+
+#[test]
+fn iter_kmers_empty_file() {
+    set_k(4);
+    let dir = tempdir().unwrap();
+    let path = dir.path().join("unitigs.bin");
+    UnitigFileWriter::create(&path).unwrap().close().unwrap(); // closed without writing any unitig
+    let r = UnitigFileReader::open(&path).unwrap();
+    assert_eq!(r.iter_kmers().count(), 0); // no chunks → the iterator must be empty
+}
+
+#[test]
+fn iter_kmers_single_chunk_count_and_order() {
+    set_k(4);
+    // "AAAACG": 6 nucleotides → 6 − 4 + 1 = 3 kmers (k=4)
+    let (_dir, r) = write_read(&[b"AAAACG"]);
+    let kmers: Vec<Kmer> = r.iter_kmers().collect();
+    assert_eq!(kmers.len(), 3);
+    for (rank, kmer) in kmers.iter().enumerate() { // i-th yielded kmer must equal raw_kmer(0, i)
+        assert_eq!(kmer.raw(), r.raw_kmer(0, rank), "kmer mismatch at rank {rank}");
+    }
+}
+
+#[test]
+fn iter_kmers_two_chunks_order() {
+    set_k(4);
+    // Two 6-nucleotide chunks: "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers (k=4)
+    let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
+    let kmers: Vec<Kmer> = r.iter_kmers().collect();
+    assert_eq!(kmers.len(), 6);
+    // Items 0..3 must be chunk 0's kmers, in rank order
+    for rank in 0..3 {
+        assert_eq!(kmers[rank].raw(), r.raw_kmer(0, rank));
+    }
+    // Items 3..6 must be chunk 1's kmers, in rank order
+    for rank in 0..3 {
+        assert_eq!(kmers[3 + rank].raw(), r.raw_kmer(1, rank));
+    }
+}
+
+// ── iter_canonical_kmers ──────────────────────────────────────────────────────
+
+#[test]
+fn iter_canonical_kmers_all_canonical() {
+    set_k(4);
+    let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
+    for kmer in r.iter_canonical_kmers() { // NOTE(review): no count check here; an empty iterator would pass
+        // canonicalizing an already-canonical kmer must return the same raw value
+        assert_eq!(kmer.raw(), kmer.canonical().raw());
+    }
+}
+
+#[test]
+fn iter_canonical_kmers_matches_iter_kmers() {
+    set_k(4);
+    let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
+    let canonical: Vec<CanonicalKmer> = r.iter_canonical_kmers().collect();
+    let raw: Vec<Kmer> = r.iter_kmers().collect();
+    assert_eq!(canonical.len(), raw.len()); // both iterators must yield the same number of items
+    for (ck, rk) in canonical.iter().zip(raw.iter()) { // compared pairwise, in iteration order
+        assert_eq!(ck.raw(), rk.canonical().raw()); // canonical item == canonical form of the raw item
+    }
+}
+
+// ── iter_indexed_canonical_kmers ──────────────────────────────────────────────
+
+#[test]
+fn iter_indexed_chunk_id_and_rank_single_chunk() {
+    set_k(4);
+    let (_dir, r) = write_read(&[b"AAAACG"]); // one chunk, 3 kmers at k=4
+    let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect();
+    assert_eq!(items.len(), 3);
+    for (rank, (kmer, chunk_id, item_rank)) in items.iter().enumerate() {
+        assert_eq!(*chunk_id, 0, "chunk_id must be 0");
+        assert_eq!(*item_rank, rank, "rank mismatch"); // yielded rank == position within the chunk
+        assert!(r.verify_canonical_kmer(0, rank, *kmer)); // kmer agrees with the stored one at (0, rank)
+    }
+}
+
+#[test]
+fn iter_indexed_chunk_id_and_rank_two_chunks() {
+    set_k(4);
+    let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
+    let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect();
+    assert_eq!(items.len(), 6);
+    // Items 0..3: chunk_id = 0, ranks 0, 1, 2
+    for rank in 0..3 {
+        let (kmer, chunk_id, item_rank) = items[rank];
+        assert_eq!(chunk_id, 0);
+        assert_eq!(item_rank, rank);
+        assert!(r.verify_canonical_kmer(0, rank, kmer));
+    }
+    // Items 3..6: chunk_id = 1, and rank restarts at 0 for the new chunk
+    for rank in 0..3 {
+        let (kmer, chunk_id, item_rank) = items[3 + rank];
+        assert_eq!(chunk_id, 1);
+        assert_eq!(item_rank, rank);
+        assert!(r.verify_canonical_kmer(1, rank, kmer));
+    }
+}
+
 // ── Splitting ─────────────────────────────────────────────────────────────────

 #[test]
@@ -3,7 +3,7 @@ use std::io::{BufWriter, Write as _};
 use std::path::{Path, PathBuf};

 use memmap2::Mmap;
-use obikseq::{CanonicalKmer, Unitig};
+use obikseq::{CanonicalKmer, Kmer, Unitig};

 pub use obikseq::MAX_KMERS_PER_CHUNK;

@@ -13,6 +13,7 @@ use crate::error::{SKError, SKResult};
 //
 //   magic:           [u8; 4]  = b"UIDX"
 //   n_unitigs:       u32 LE
+//   n_kmers:         u64 LE   total kmer count across all chunks (file-wide sum, not per chunk)
 //   seqls:           [u8; n_unitigs]       max kmer index per chunk (= n_kmers − 1)
 //   packed_offsets:  [u32; n_unitigs + 1]  byte offsets to packed bytes in the
 //                                          sequence file; last entry is sentinel
@@ -44,6 +45,7 @@ pub struct UnitigFileWriter {
    seqls: Vec<u8>,
    packed_offsets: Vec<u32>,
    next_offset: u32,
+    n_kmers: usize,
    k: usize,
 }

@@ -56,6 +58,7 @@ impl UnitigFileWriter {
            seqls: Vec::new(),
            packed_offsets: Vec::new(),
            next_offset: 0,
+            n_kmers: 0,
            k: obikseq::params::k(),
        })
    }
@@ -98,6 +101,7 @@ impl UnitigFileWriter {
        debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
        self.packed_offsets.push(self.next_offset + 1);
        self.seqls.push((seql - self.k) as u8);
+        self.n_kmers += seql - self.k + 1;

        unitig
            .write_to_binary(&mut self.file)
@@ -122,7 +126,7 @@ impl UnitigFileWriter {
        };
        self.packed_offsets.push(sentinel);

-        write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets)
+        write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets, self.n_kmers)
    }

    pub fn len(&self) -> usize {
@@ -134,10 +138,11 @@ impl UnitigFileWriter {
    }
 }

-fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32]) -> SKResult<()> {
+fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32], n_kmers: usize) -> SKResult<()> {
    let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
    w.write_all(&MAGIC).map_err(SKError::Io)?;
    w.write_all(&(seqls.len() as u32).to_le_bytes()).map_err(SKError::Io)?;
+    w.write_all(&(n_kmers as u64).to_le_bytes()).map_err(SKError::Io)?;
    w.write_all(seqls).map_err(SKError::Io)?;
    for &off in packed_offsets {
        w.write_all(&off.to_le_bytes()).map_err(SKError::Io)?;
@@ -155,6 +160,7 @@ pub struct UnitigFileReader {
    mmap: Mmap,
    seqls: Vec<u8>,
    packed_offsets: Vec<u32>,
+    n_kmers: usize,
    k: usize,
 }

@@ -162,9 +168,9 @@ impl UnitigFileReader {
    pub fn open(path: &Path) -> SKResult<Self> {
        let file = File::open(path).map_err(SKError::Io)?;
        let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
-        let (seqls, packed_offsets) = read_idx(&idx_path(path))?;
+        let (seqls, packed_offsets, n_kmers) = read_idx(&idx_path(path))?;
        let k = obikseq::params::k();
-        Ok(Self { mmap, seqls, packed_offsets, k })
+        Ok(Self { mmap, seqls, packed_offsets, n_kmers, k })
    }

    pub fn len(&self) -> usize {
@@ -175,6 +181,11 @@ impl UnitigFileReader {
        self.seqls.is_empty()
    }

+    /// Total number of kmers across all chunks, as read from the index file when it was opened.
+    pub fn n_kmers(&self) -> usize {
+        self.n_kmers
+    }
+
    /// Return the nucleotide length of chunk `i`.
    #[inline]
    pub fn seql(&self, i: usize) -> usize {
@@ -206,9 +217,42 @@ impl UnitigFileReader {
    pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
        canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
    }
+
+    /// Lazily iterate over all kmers in file order (all positions of chunk 0, then chunk 1, …).
+    ///
+    /// Each chunk is copied from the mmap once; iteration within the chunk is
+    /// zero-allocation (sliding-window via [`OwnedPackedSeqKmerIter`]).
+    pub fn iter_kmers(&self) -> impl Iterator<Item = Kmer> + '_ { // iterator borrows `self`
+        (0..self.len()).flat_map(move |i| self.unitig(i).into_kmers()) // one `unitig(i)` call per chunk
+    }
+
+    /// Lazily iterate over all canonical kmers in file order (chunk 0 first, then chunk 1, …).
+    ///
+    /// Equivalent to `iter_kmers().map(|km| km.canonical())` but uses the
+    /// built-in canonical iterator on each chunk, which avoids a separate
+    /// canonicalization pass.
+    pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
+        (0..self.len()).flat_map(move |i| self.unitig(i).into_canonical_kmers()) // one `unitig(i)` call per chunk
+    }
+
+    /// Lazily iterate over `(kmer, chunk_id, rank)` for every canonical kmer, in file order.
+    ///
+    /// `chunk_id` is the index of the chunk within this file; `rank` is the 0-based
+    /// position of the kmer within that chunk and restarts at 0 for each chunk.
+    /// Used to build the evidence table in `obilayeredmap`.
+    pub fn iter_indexed_canonical_kmers(
+        &self,
+    ) -> impl Iterator<Item = (CanonicalKmer, usize, usize)> + '_ {
+        (0..self.len()).flat_map(move |chunk_id| {
+            self.unitig(chunk_id)
+                .into_canonical_kmers()
+                .enumerate() // counter is created per chunk, so it yields the in-chunk rank
+                .map(move |(rank, kmer)| (kmer, chunk_id, rank)) // reorder to (kmer, chunk_id, rank)
+        })
+    }
 }

-fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
+fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>, usize)> {
    let data = std::fs::read(path).map_err(SKError::Io)?;
    let mut pos = 0;

@@ -227,6 +271,11 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
    let n = u32::from_le_bytes(n_bytes.try_into().unwrap()) as usize;
    pos += 4;

+    let nk_bytes = data.get(pos..pos + 8)
+        .ok_or(SKError::Truncated { context: "unitig index: n_kmers" })?;
+    let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
+    pos += 8;
+
    let seqls = data.get(pos..pos + n)
        .ok_or(SKError::Truncated { context: "unitig index: seqls" })?
        .to_vec();
@@ -240,7 +289,7 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
        pos += 4;
    }

-    Ok((seqls, packed_offsets))
+    Ok((seqls, packed_offsets, n_kmers))
 }

 // ── Kmer utilities ────────────────────────────────────────────────────────────