Push kztouvrzoqym #8

Merged
coissac merged 10 commits from push-kztouvrzoqym into main 2026-05-23 12:04:51 +00:00
2 changed files with 159 additions and 42 deletions
Showing only changes of commit 8478072b78 - Show all commits
+91
View File
@@ -269,3 +269,94 @@ fn long_unitig_split_no_kmer_lost() {
// First kmer of chunk 1 = original nucl 256..260 — a different, adjacent kmer.
assert!(r.verify_canonical_kmer(1, 0, canonical_of(&seq[256..260])));
}
// ── block_bits parametrisation ────────────────────────────────────────────────
fn write_read_bb(seqs: &[&[u8]], block_bits: u8) -> (tempfile::TempDir, UnitigFileReader) {
let dir = tempdir().unwrap();
let path = dir.path().join("unitigs.bin");
let mut w = UnitigFileWriter::create_with_block_bits(&path, block_bits).unwrap();
for s in seqs {
w.write(&make_unitig(s)).unwrap();
}
w.close().unwrap();
let r = UnitigFileReader::open(&path).unwrap();
(dir, r)
}
#[test]
fn block_bits_stored_and_read_back() {
set_k(4);
for bb in [0u8, 1, 2, 3, 6] {
let dir = tempdir().unwrap();
let path = dir.path().join("unitigs.bin");
let w = UnitigFileWriter::create_with_block_bits(&path, bb).unwrap();
assert_eq!(w.block_bits(), bb);
w.close().unwrap();
let r = UnitigFileReader::open(&path).unwrap();
assert_eq!(r.block_bits(), bb, "block_bits={bb} not preserved");
}
}
#[test]
fn block_bits_zero_exact_offsets() {
// block_bits=0 → one offset per chunk, no sequential scan in chunk_start
set_k(4);
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG"];
let (_dir, r) = write_read_bb(seqs, 0);
assert_eq!(r.len(), seqs.len());
for (i, s) in seqs.iter().enumerate() {
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=0: unitig {i} mismatch");
}
}
#[test]
fn block_bits_one_block_size_two() {
// block_bits=1 → BLOCK_SIZE=2: every other chunk gets an offset
set_k(4);
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG", b"CCCCGT"];
let (_dir, r) = write_read_bb(seqs, 1);
assert_eq!(r.len(), seqs.len());
for (i, s) in seqs.iter().enumerate() {
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=1: unitig {i} mismatch");
}
}
#[test]
fn block_bits_larger_than_chunk_count() {
// block_bits=6 (BLOCK_SIZE=64) with only 3 chunks: all in block 0
set_k(4);
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA"];
let (_dir, r) = write_read_bb(seqs, 6);
assert_eq!(r.len(), seqs.len());
for (i, s) in seqs.iter().enumerate() {
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=6: unitig {i} mismatch");
}
}
#[test]
fn block_bits_random_access_matches_sequential() {
// Write many chunks with block_bits=2 (BLOCK_SIZE=4), verify random access
// against sequential iteration for every chunk.
set_k(4);
let seqs: Vec<Vec<u8>> = (0..20_usize)
.map(|i| (0..8_usize).map(|j| b"ACGT"[(i + j) % 4]).collect())
.collect();
let seq_refs: Vec<&[u8]> = seqs.iter().map(|s| s.as_slice()).collect();
let (_dir, r) = write_read_bb(&seq_refs, 2);
assert_eq!(r.len(), seqs.len());
let sequential: Vec<Unitig> = r.iter_chunks_sequential().map(|(_, u)| u).collect();
for i in 0..seqs.len() {
assert_eq!(r.unitig(i), sequential[i], "random vs sequential mismatch at chunk {i}");
}
}
#[test]
fn block_bits_kmer_count_preserved() {
set_k(4);
// "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers; total = 6
for bb in [0u8, 1, 2, 6] {
let (_dir, r) = write_read_bb(&[b"AAAACG", b"CCCCAG"], bb);
assert_eq!(r.n_kmers(), 6, "block_bits={bb}: n_kmers mismatch");
}
}
+68 -42
View File
@@ -11,18 +11,17 @@ use crate::error::{SKError, SKResult};
// ── Block index parameters ────────────────────────────────────────────────────
//
// One offset entry per BLOCK_SIZE chunks. BLOCK_SIZE must be a power of two
// so that block = i >> LOG2_BLOCK_SIZE and rem = i & (BLOCK_SIZE 1) are
// branchless shifts/masks rather than divisions.
// BLOCK_SIZE = 1 << block_bits chunks share one offset entry in the index.
// block_bits=0 → one entry per chunk (exact offsets, no scan).
// block_bits=6 → one entry per 64 chunks (default; O(64) scan per lookup).
//
// With BLOCK_SIZE = 64 and an average chunk size of ~10 bytes, a random lookup
// scans at most 63 × 10 = 630 bytes sequentially — negligible next to the MPHF
// lookup that precedes it. The index file shrinks from ~5 bytes/chunk to
// ~1/64 bytes/chunk (≈ 300× for typical workloads).
// block_bits is stored in the index file so the reader derives all parameters
// at runtime — no compile-time constant constrains the format.
const MAGIC: [u8; 4] = *b"UIX2";
const BLOCK_SIZE: usize = 64;
const LOG2_BLOCK_SIZE: u32 = 6; // 2^6 = BLOCK_SIZE
const MAGIC: [u8; 4] = *b"UIX3";
/// Default block granularity used by [`UnitigFileWriter::create`].
pub const DEFAULT_BLOCK_BITS: u8 = 6;
fn idx_path(path: &Path) -> PathBuf {
crate::append_path_suffix(path, ".idx")
@@ -33,23 +32,36 @@ fn idx_path(path: &Path) -> PathBuf {
/// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds
/// a block-sampled offset index at close time.
///
/// One offset is stored every [`BLOCK_SIZE`] chunks; random access to chunk `i`
/// costs at most `BLOCK_SIZE 1` sequential chunk scans after the block lookup.
/// One offset is stored every `1 << block_bits` chunks; random access to chunk
/// `i` costs at most `(1 << block_bits) 1` sequential chunk scans after the
/// block lookup.
///
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split
/// into overlapping chunks (k1 nucleotide overlap) so no k-mer is lost.
pub struct UnitigFileWriter {
path: PathBuf,
file: BufWriter<File>,
block_offsets: Vec<u32>, // byte offset of first record in each block
block_offsets: Vec<u32>,
chunk_count: usize,
next_offset: u32, // byte offset of the START of the next record
next_offset: u32,
n_kmers: usize,
k: usize,
block_bits: u8,
mask: usize, // (1 << block_bits) - 1
}
impl UnitigFileWriter {
/// Create a writer with the default block size (`DEFAULT_BLOCK_BITS = 6`).
pub fn create(path: &Path) -> SKResult<Self> {
Self::create_with_block_bits(path, DEFAULT_BLOCK_BITS)
}
/// Create a writer with a custom block size.
///
/// `block_bits` must be in 0..=31. `block_bits=0` stores one offset per
/// chunk (exact, no scan); larger values trade index size for scan length.
pub fn create_with_block_bits(path: &Path, block_bits: u8) -> SKResult<Self> {
assert!(block_bits <= 31, "block_bits must be ≤ 31");
let file = File::create(path).map_err(SKError::Io)?;
Ok(Self {
path: path.to_owned(),
@@ -59,6 +71,8 @@ impl UnitigFileWriter {
next_offset: 0,
n_kmers: 0,
k: obikseq::params::k(),
block_bits,
mask: (1usize << block_bits) - 1,
})
}
@@ -95,8 +109,7 @@ impl UnitigFileWriter {
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
// Record a block offset at the start of every BLOCK_SIZE-th chunk.
if self.chunk_count & (BLOCK_SIZE - 1) == 0 {
if self.chunk_count & self.mask == 0 {
self.block_offsets.push(self.next_offset);
}
@@ -113,25 +126,32 @@ impl UnitigFileWriter {
self.file.flush().map_err(SKError::Io)?;
drop(self.file);
// Sentinel: byte offset past the last record (needed for end-of-file detection).
self.block_offsets.push(self.next_offset);
write_idx(
&idx_path(&self.path),
self.chunk_count as u32,
self.n_kmers as u64,
self.block_bits,
&self.block_offsets,
)
}
pub fn len(&self) -> usize { self.chunk_count }
pub fn is_empty(&self) -> bool { self.chunk_count == 0 }
pub fn block_bits(&self) -> u8 { self.block_bits }
}
fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -> SKResult<()> {
fn write_idx(
path: &Path,
n_unitigs: u32,
n_kmers: u64,
block_bits: u8,
block_offsets: &[u32],
) -> SKResult<()> {
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
w.write_all(&MAGIC).map_err(SKError::Io)?;
w.write_all(&(BLOCK_SIZE as u32).to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&(block_bits as u32).to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&n_unitigs.to_le_bytes()).map_err(SKError::Io)?;
w.write_all(&n_kmers.to_le_bytes()).map_err(SKError::Io)?;
for &off in block_offsets {
@@ -145,39 +165,45 @@ fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -
/// Read-only random-access view of a unitig file.
///
/// The sequence file is memory-mapped; the block offset table is loaded into RAM
/// on open (≈ n_chunks / BLOCK_SIZE entries, negligible memory).
///
/// Random access to chunk `i`: O(BLOCK_SIZE) sequential mmap reads — branchless
/// shift/mask arithmetic, cache-friendly, negligible versus the MPHF lookup.
///
/// Sequential iteration: O(n) via a running-offset cursor (no per-chunk overhead).
/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap
/// reads. Sequential iteration: O(n) via a running-offset cursor.
pub struct UnitigFileReader {
mmap: Mmap,
block_offsets: Vec<u32>,
n_unitigs: usize,
n_kmers: usize,
k: usize,
block_bits: u8,
mask: usize, // (1 << block_bits) - 1
}
impl UnitigFileReader {
pub fn open(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let (n_unitigs, n_kmers, block_offsets) = read_idx(&idx_path(path))?;
let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?;
let k = obikseq::params::k();
Ok(Self { mmap, block_offsets, n_unitigs, n_kmers, k })
Ok(Self {
mmap,
block_offsets,
n_unitigs,
n_kmers,
k,
block_bits,
mask: (1usize << block_bits) - 1,
})
}
pub fn len(&self) -> usize { self.n_unitigs }
pub fn is_empty(&self) -> bool { self.n_unitigs == 0 }
pub fn n_kmers(&self) -> usize { self.n_kmers }
pub fn block_bits(&self) -> u8 { self.block_bits }
/// Byte offset of the START of record `i` (the seql byte) in the mmap.
/// O(BLOCK_SIZE) sequential scan within the block.
#[inline]
fn chunk_start(&self, i: usize) -> usize {
let block = i >> LOG2_BLOCK_SIZE;
let rem = i & (BLOCK_SIZE - 1);
let block = i >> self.block_bits;
let rem = i & self.mask;
let mut offset = self.block_offsets[block] as usize;
for _ in 0..rem {
let seql_minus_k = self.mmap[offset] as usize;
@@ -216,7 +242,6 @@ impl UnitigFileReader {
// ── Sequential iterators (O(n) running-offset cursor) ─────────────────────
/// Iterate all chunks in file order with a running byte offset — O(n) total.
fn iter_chunks_sequential(&self) -> impl Iterator<Item = (usize, Unitig)> + '_ {
let k = self.k;
let mmap = &*self.mmap;
@@ -253,7 +278,7 @@ impl UnitigFileReader {
}
}
fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
fn read_idx(path: &Path) -> SKResult<(usize, usize, u8, Vec<u32>)> {
let data = std::fs::read(path).map_err(SKError::Io)?;
let mut pos = 0;
@@ -261,22 +286,22 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
.ok_or(SKError::Truncated { context: "unitig index: magic" })?;
if magic_bytes != &MAGIC {
return Err(SKError::BadMagic {
expected: "UIX2",
expected: "UIX3",
got: magic_bytes.try_into().unwrap(),
});
}
pos += 4;
// block_size stored for forward-compatibility verification
let bs_bytes = data.get(pos..pos + 4)
.ok_or(SKError::Truncated { context: "unitig index: block_size" })?;
let stored_bs = u32::from_le_bytes(bs_bytes.try_into().unwrap()) as usize;
if stored_bs != BLOCK_SIZE {
let bb_bytes = data.get(pos..pos + 4)
.ok_or(SKError::Truncated { context: "unitig index: block_bits" })?;
let block_bits_u32 = u32::from_le_bytes(bb_bytes.try_into().unwrap());
if block_bits_u32 > 31 {
return Err(SKError::InvalidData {
context: "unitig index",
detail: format!("block_size mismatch: file={stored_bs} code={BLOCK_SIZE}"),
detail: format!("block_bits out of range: {block_bits_u32}"),
});
}
let block_bits = block_bits_u32 as u8;
pos += 4;
let n_bytes = data.get(pos..pos + 4)
@@ -289,8 +314,9 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
pos += 8;
let n_blocks = (n_unitigs + BLOCK_SIZE - 1) >> LOG2_BLOCK_SIZE;
let n_offsets = n_blocks + 1; // +1 for sentinel
let block_size = 1usize << block_bits;
let n_blocks = (n_unitigs + block_size - 1) >> block_bits;
let n_offsets = n_blocks + 1;
let mut block_offsets = Vec::with_capacity(n_offsets);
for _ in 0..n_offsets {
let off_bytes = data.get(pos..pos + 4)
@@ -299,7 +325,7 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
pos += 4;
}
Ok((n_unitigs, n_kmers, block_offsets))
Ok((n_unitigs, n_kmers, block_bits, block_offsets))
}
// ── Kmer utilities ────────────────────────────────────────────────────────────