feat: implement persistent layered index and chunked binary format

Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
2026-05-09 17:20:08 +08:00
parent 8c17bf958b
commit 5169f65dc9
24 changed files with 1342 additions and 382 deletions
@@ -0,0 +1,286 @@
+use std::fs::File;
+use std::io::{BufWriter, Write as _};
+use std::path::{Path, PathBuf};
+
+use memmap2::Mmap;
+use obikseq::{CanonicalKmer, Unitig};
+
+pub use obikseq::MAX_KMERS_PER_CHUNK;
+
+use crate::error::{SKError, SKResult};
+
+// ── Index file format ─────────────────────────────────────────────────────────
+//
+//   magic:           [u8; 4]  = b"UIDX"
+//   n_unitigs:       u32 LE                number of stored records (chunks); a
+//                                          unitig split by the writer counts once
+//                                          per chunk
+//   seqls:           [u8; n_unitigs]       max kmer index per chunk (= n_kmers − 1)
+//   packed_offsets:  [u32; n_unitigs + 1]  byte offsets to packed bytes in the
+//                                          sequence file; last entry is sentinel
+//
+// Each sequence record in the binary file: [u8: n_kmers−1][packed bytes].
+// Offsets point to the first packed byte of each record, past the leading u8.
+// Unitigs with more than MAX_KMERS_PER_CHUNK kmers are transparently split by the
+// writer into overlapping chunks (k-1 nucleotide overlap) so no kmer is lost.
+
+/// Magic bytes at the start of every unitig index (`.idx`) file; written by
+/// `write_idx` and checked by `read_idx`.
+const MAGIC: [u8; 4] = *b"UIDX";
+
+/// Derive the companion index path by appending `.idx` to the full file name.
+///
+/// The suffix is appended, not substituted: `foo.bin` becomes `foo.bin.idx`.
+fn idx_path(path: &Path) -> PathBuf {
+    let mut with_suffix = path.to_path_buf().into_os_string();
+    with_suffix.push(".idx");
+    with_suffix.into()
+}
+
+// Extract the nucleotide sub-sequence [start, end) of `unitig` as a new `Unitig`.
+// Forwards to `unitig.sub(start, end)`; the writer uses it to cut oversized
+// unitigs into chunks.
+// NOTE(review): range validation is whatever `sub` does — not visible from here.
+fn sub_unitig(unitig: &Unitig, start: usize, end: usize) -> Unitig {
+    unitig.sub(start, end)
+}
+
+// ── Writer ────────────────────────────────────────────────────────────────────
+
+/// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds
+/// an offset index at close time.
+///
+/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] kmers are transparently split
+/// into overlapping chunks (k-1 nucleotide overlap) so no kmer is lost.
+///
+/// The companion index file (`path.idx`) is written by [`close`](Self::close)
+/// and only there.
+/// The binary format per record is `[u8: n_kmers−1][packed 2-bit bytes]`.
+///
+/// Index offsets are `u32`, so the sequence file cannot grow past `u32::MAX`
+/// bytes; a write that would cross that limit returns an error instead of
+/// wrapping the offset.
+pub struct UnitigFileWriter {
+    /// Path of the sequence file; the index path is derived from it.
+    path: PathBuf,
+    /// Buffered handle on the sequence file.
+    file: BufWriter<File>,
+    /// Per-chunk `n_kmers − 1`.
+    seqls: Vec<u8>,
+    /// Per-chunk offset of the first packed byte (just past the header byte).
+    packed_offsets: Vec<u32>,
+    /// Offset of the next record's header byte, i.e. bytes accounted for so far.
+    next_offset: u32,
+    /// K-mer length, read from `obikseq::params::k()` when the writer is created.
+    k: usize,
+}
+
+impl UnitigFileWriter {
+    /// Create (or truncate) the sequence file at `path` and start an empty index.
+    ///
+    /// # Errors
+    /// Returns [`SKError::Io`] if the file cannot be created.
+    pub fn create(path: &Path) -> SKResult<Self> {
+        let file = File::create(path).map_err(SKError::Io)?;
+        Ok(Self {
+            path: path.to_owned(),
+            file: BufWriter::new(file),
+            seqls: Vec::new(),
+            packed_offsets: Vec::new(),
+            next_offset: 0,
+            k: obikseq::params::k(),
+        })
+    }
+
+    /// Wrap a format-limit violation in the crate error type.
+    fn limit_error(msg: &'static str) -> SKError {
+        SKError::Io(std::io::Error::new(std::io::ErrorKind::InvalidInput, msg))
+    }
+
+    /// Write a unitig, splitting it into chunks if it exceeds [`MAX_KMERS_PER_CHUNK`].
+    ///
+    /// Unitigs shorter than `k` nucleotides contain no kmer and are skipped.
+    ///
+    /// # Errors
+    /// Returns [`SKError::Io`] if a record cannot be written or would push the
+    /// sequence file past the `u32` offset range.
+    pub fn write(&mut self, unitig: &Unitig) -> SKResult<()> {
+        let seql = unitig.seql();
+        let k = self.k;
+
+        // Too short to hold a single kmer: nothing to store or index.
+        if seql < k {
+            return Ok(());
+        }
+
+        let n_kmers = seql - k + 1;
+        if n_kmers <= MAX_KMERS_PER_CHUNK {
+            return self.write_chunk(unitig);
+        }
+
+        // Split into overlapping chunks of MAX_KMERS_PER_CHUNK kmers.
+        // Consecutive chunks start MAX_KMERS_PER_CHUNK nucleotides apart and
+        // share k-1 nucleotides, so every kmer lands in exactly one chunk.
+        let chunk_nucl = MAX_KMERS_PER_CHUNK + k - 1;
+        let stride = MAX_KMERS_PER_CHUNK;
+        let mut start = 0;
+        while start < seql {
+            let end = (start + chunk_nucl).min(seql);
+            self.write_chunk(&sub_unitig(unitig, start, end))?;
+            if end == seql {
+                break;
+            }
+            start += stride;
+        }
+        Ok(())
+    }
+
+    /// Append one record and, once it has been handed to the file buffer,
+    /// register it in the in-memory index.
+    fn write_chunk(&mut self, unitig: &Unitig) -> SKResult<()> {
+        let seql = unitig.seql();
+
+        // Header value is n_kmers − 1 = seql − k; it must fit the u8 `seqls` slot.
+        let last_kmer = seql
+            .checked_sub(self.k)
+            .and_then(|v| u8::try_from(v).ok())
+            .ok_or_else(|| Self::limit_error("unitig chunk: kmer count does not fit in a u8"))?;
+
+        // Record layout: 1 header byte, then ceil(seql / 4) packed bytes.
+        let byte_len = (seql + 3) / 4;
+        let bounds = u32::try_from(byte_len).ok().and_then(|len| {
+            let packed_start = self.next_offset.checked_add(1)?;
+            Some((packed_start, packed_start.checked_add(len)?))
+        });
+        let (packed_start, record_end) = bounds.ok_or_else(|| {
+            Self::limit_error("unitig file: sequence data exceeds the u32 offset range")
+        })?;
+
+        unitig
+            .write_to_binary(&mut self.file)
+            .map_err(SKError::Io)?;
+
+        // Commit bookkeeping only after the write succeeded, so a failed write
+        // never leaves an index entry without a record.
+        self.packed_offsets.push(packed_start);
+        self.seqls.push(last_kmer);
+        self.next_offset = record_end;
+        Ok(())
+    }
+
+    /// Flush the sequence file and write the companion `.idx`.
+    ///
+    /// # Errors
+    /// Returns [`SKError::Io`] if flushing the sequence file or writing the
+    /// index fails.
+    pub fn close(mut self) -> SKResult<()> {
+        self.file.flush().map_err(SKError::Io)?;
+        drop(self.file);
+
+        // Sentinel: byte offset past the last record's packed bytes. That is
+        // exactly `next_offset` (0 when nothing was written).
+        self.packed_offsets.push(self.next_offset);
+
+        write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets)
+    }
+
+    /// Number of chunks written so far.
+    pub fn len(&self) -> usize {
+        self.seqls.len()
+    }
+
+    /// `true` if no chunk has been written yet.
+    pub fn is_empty(&self) -> bool {
+        self.seqls.is_empty()
+    }
+}
+
+/// Serialize the index to `path`: magic, record count (`u32` LE), the `seqls`
+/// bytes, then every entry of `packed_offsets` as `u32` LE.
+///
+/// `packed_offsets` is written as given; the caller supplies the trailing
+/// sentinel entry.
+///
+/// # Errors
+/// Returns [`SKError::Io`] if the record count does not fit in a `u32` or if
+/// creating or writing the file fails.
+fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32]) -> SKResult<()> {
+    // Validate before touching the filesystem so a failure leaves no stub file.
+    let n_records = u32::try_from(seqls.len()).map_err(|_| {
+        SKError::Io(std::io::Error::new(
+            std::io::ErrorKind::InvalidInput,
+            "unitig index: record count does not fit in a u32",
+        ))
+    })?;
+
+    let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
+    w.write_all(&MAGIC).map_err(SKError::Io)?;
+    w.write_all(&n_records.to_le_bytes()).map_err(SKError::Io)?;
+    w.write_all(seqls).map_err(SKError::Io)?;
+    for &off in packed_offsets {
+        w.write_all(&off.to_le_bytes()).map_err(SKError::Io)?;
+    }
+    // Flush explicitly: errors during BufWriter's drop would be lost.
+    w.flush().map_err(SKError::Io)
+}
+
+// ── Reader ────────────────────────────────────────────────────────────────────
+
+/// Read-only random-access view of a unitig file.
+///
+/// The sequence file is memory-mapped; the index is loaded into RAM on open.
+/// All per-kmer operations are O(1) and allocation-free.
+pub struct UnitigFileReader {
+    /// Read-only mapping of the sequence file.
+    mmap: Mmap,
+    /// Per-chunk `n_kmers − 1`, as stored in the index.
+    seqls: Vec<u8>,
+    /// Per-chunk offset of the first packed byte, plus one trailing sentinel.
+    packed_offsets: Vec<u32>,
+    /// K-mer length, read from `obikseq::params::k()` when the reader is opened.
+    k: usize,
+}
+
+impl UnitigFileReader {
+    /// Map the sequence file at `path` and load its companion `.idx`.
+    ///
+    /// # Errors
+    /// Returns [`SKError::Io`] if either file cannot be opened or read, or if
+    /// an index entry points past the end of the sequence file.
+    pub fn open(path: &Path) -> SKResult<Self> {
+        let file = File::open(path).map_err(SKError::Io)?;
+        // SAFETY: the mapping is only ever read through `&[u8]`. Soundness
+        // relies on the file not being truncated or rewritten while mapped,
+        // which cannot be enforced from here.
+        let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
+        let (seqls, packed_offsets) = read_idx(&idx_path(path))?;
+        let k = obikseq::params::k();
+
+        // Reject a stale or corrupt index now instead of panicking on a later
+        // slice: every record's packed bytes must lie inside the mapping.
+        let in_bounds = seqls.iter().zip(&packed_offsets).all(|(&last_kmer, &offset)| {
+            let byte_len = (usize::from(last_kmer) + k + 3) / 4;
+            (offset as usize)
+                .checked_add(byte_len)
+                .map_or(false, |end| end <= mmap.len())
+        });
+        if !in_bounds {
+            return Err(SKError::Io(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                "unitig index: record extends past the end of the sequence file",
+            )));
+        }
+
+        Ok(Self { mmap, seqls, packed_offsets, k })
+    }
+
+    /// Number of chunks in the file.
+    pub fn len(&self) -> usize {
+        self.seqls.len()
+    }
+
+    /// `true` if the file holds no chunk.
+    pub fn is_empty(&self) -> bool {
+        self.seqls.is_empty()
+    }
+
+    /// Return the nucleotide length of chunk `i`.
+    ///
+    /// # Panics
+    /// Panics if `i >= self.len()`.
+    #[inline]
+    pub fn seql(&self, i: usize) -> usize {
+        self.seqls[i] as usize + self.k
+    }
+
+    /// Reconstruct chunk `i` as a [`Unitig`]. Allocates a copy of the packed bytes.
+    ///
+    /// # Panics
+    /// Panics if `i >= self.len()`.
+    pub fn unitig(&self, i: usize) -> Unitig {
+        let seql = self.seql(i);
+        let start = self.packed_offsets[i] as usize;
+        let byte_len = (seql + 3) / 4;
+        // Value passed as the first argument of `Unitig::new`: nucleotides
+        // used in the last packed byte, 0 when that byte is full.
+        let tail = (seql % 4) as u8;
+        let bytes = self.mmap[start..start + byte_len].to_vec().into_boxed_slice();
+        Unitig::new(tail, bytes)
+    }
+
+    /// Extract the raw left-aligned u64 of the kmer at position `j` within chunk `i`.
+    ///
+    /// `j` is not checked against the chunk's kmer count: a position past the
+    /// last kmer reads whatever bytes follow in the file (zeros past its end).
+    ///
+    /// # Panics
+    /// Panics if `i` is not a valid index into the offset table.
+    #[inline]
+    pub fn raw_kmer(&self, i: usize, j: usize) -> u64 {
+        let start = self.packed_offsets[i] as usize;
+        extract_kmer_raw(&self.mmap[start..], j, self.k)
+    }
+
+    /// Return `true` iff the kmer at position `j` of chunk `i` equals `query`.
+    ///
+    /// O(1), zero allocation. The chunk may store either orientation of the kmer;
+    /// canonicalization is applied before comparison.
+    #[inline]
+    pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
+        canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
+    }
+}
+
+/// Load a unitig index file and return `(seqls, packed_offsets)`.
+///
+/// Expected layout: `MAGIC`, record count `n` (`u32` LE), `n` bytes of
+/// `seqls`, then `n + 1` offsets (`u32` LE, the last one a sentinel). Bytes
+/// after the offset table are ignored.
+///
+/// # Errors
+/// Returns [`SKError::Io`] if the file cannot be read, and an `InvalidData`
+/// error wrapped in [`SKError::Io`] if the magic is wrong or the file is too
+/// short for the record count it announces.
+fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
+    fn invalid(msg: &'static str) -> SKError {
+        SKError::Io(std::io::Error::new(std::io::ErrorKind::InvalidData, msg))
+    }
+
+    let data = std::fs::read(path).map_err(SKError::Io)?;
+
+    // A file shorter than the magic fails here too, instead of panicking.
+    if data.get(..4) != Some(&MAGIC[..]) {
+        return Err(invalid("unitig index: bad magic"));
+    }
+
+    let n = match data.get(4..8) {
+        Some(raw) => u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]) as usize,
+        None => return Err(invalid("unitig index: truncated header")),
+    };
+    let payload = &data[8..];
+
+    // n seql bytes followed by (n + 1) four-byte offsets; checked arithmetic
+    // keeps a corrupt count from overflowing or driving a huge allocation.
+    let table_end = n
+        .checked_add(1)
+        .and_then(|entries| entries.checked_mul(4))
+        .and_then(|offset_bytes| offset_bytes.checked_add(n));
+    let (seql_bytes, offset_bytes) = match table_end {
+        Some(end) if end <= payload.len() => (&payload[..n], &payload[n..end]),
+        _ => return Err(invalid("unitig index: file shorter than its record count requires")),
+    };
+
+    let seqls = seql_bytes.to_vec();
+    let packed_offsets = offset_bytes
+        .chunks_exact(4)
+        .map(|w| u32::from_le_bytes([w[0], w[1], w[2], w[3]]))
+        .collect();
+
+    Ok((seqls, packed_offsets))
+}
+
+// ── Kmer utilities ────────────────────────────────────────────────────────────
+
+/// Reverse complement of a left-aligned 2-bit kmer (same algorithm as [`KmerOf::revcomp`]).
+///
+/// Complements every bit, reverses the order of the 32 two-bit groups of the
+/// word, then shifts the result back so the `k` nucleotides are left-aligned.
+/// Requires `1 <= k <= 32`; other values overflow the final shift.
+#[inline]
+fn revcomp_raw(raw: u64, k: usize) -> u64 {
+    const NIBBLES: u64 = 0x0F0F0F0F0F0F0F0F;
+    const PAIRS: u64 = 0x3333333333333333;
+
+    // Byte order first, then nibbles within bytes, then 2-bit pairs within nibbles.
+    let mut bits = (!raw).swap_bytes();
+    bits = ((bits >> 4) & NIBBLES) | ((bits & NIBBLES) << 4);
+    bits = ((bits >> 2) & PAIRS) | ((bits & PAIRS) << 2);
+
+    // The kmer now sits in the low 2k bits; the complemented padding above it
+    // is discarded by the shift.
+    bits << (64 - 2 * k)
+}
+
+/// Canonical form of a left-aligned 2-bit kmer: the numerically smaller of
+/// the kmer and its reverse complement.
+#[inline]
+fn canonical_raw(raw: u64, k: usize) -> u64 {
+    let reverse = revcomp_raw(raw, k);
+    if reverse < raw {
+        reverse
+    } else {
+        raw
+    }
+}
+
+// ── Bit extraction ────────────────────────────────────────────────────────────
+
+/// Extract the kmer at nucleotide position `j` from MSB-first 2-bit packed `bytes`.
+/// Returns a left-aligned u64 matching [`KmerOf`]'s internal representation.
+///
+/// Bytes past the end of `bytes` are read as zero. Requires `1 <= k <= 32`.
+#[inline]
+fn extract_kmer_raw(bytes: &[u8], j: usize, k: usize) -> u64 {
+    let kmer_bits = 2 * k;
+    let first_bit = j * 2;
+    let first_byte = first_bit / 8;
+    let lead_bits = first_bit % 8; // always 0, 2, 4, or 6
+    let span = (lead_bits + kmer_bits + 7) / 8; // ≤ 9 for k ≤ 32
+
+    // Gather the covering bytes big-endian into one wide accumulator.
+    let window = (first_byte..first_byte + span)
+        .map(|at| bytes.get(at).copied().unwrap_or(0))
+        .fold(0u128, |acc, byte| (acc << 8) | byte as u128);
+
+    // Drop the bits after the kmer, mask off the bits before it, left-align.
+    let trailing = span * 8 - lead_bits - kmer_bits;
+    let keep = !0u64 >> (64 - kmer_bits);
+    let kmer = (window >> trailing) as u64 & keep;
+    kmer << (64 - kmer_bits)
+}
+
+#[cfg(test)]
+#[path = "tests/unitig_index.rs"]
+mod tests;