Push kztouvrzoqym #8
@@ -269,3 +269,94 @@ fn long_unitig_split_no_kmer_lost() {
|
||||
// First kmer of chunk 1 = original nucl 256..260 — a different, adjacent kmer.
|
||||
assert!(r.verify_canonical_kmer(1, 0, canonical_of(&seq[256..260])));
|
||||
}
|
||||
|
||||
// ── block_bits parametrisation ────────────────────────────────────────────────
|
||||
|
||||
fn write_read_bb(seqs: &[&[u8]], block_bits: u8) -> (tempfile::TempDir, UnitigFileReader) {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("unitigs.bin");
|
||||
let mut w = UnitigFileWriter::create_with_block_bits(&path, block_bits).unwrap();
|
||||
for s in seqs {
|
||||
w.write(&make_unitig(s)).unwrap();
|
||||
}
|
||||
w.close().unwrap();
|
||||
let r = UnitigFileReader::open(&path).unwrap();
|
||||
(dir, r)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn block_bits_stored_and_read_back() {
|
||||
set_k(4);
|
||||
for bb in [0u8, 1, 2, 3, 6] {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("unitigs.bin");
|
||||
let w = UnitigFileWriter::create_with_block_bits(&path, bb).unwrap();
|
||||
assert_eq!(w.block_bits(), bb);
|
||||
w.close().unwrap();
|
||||
let r = UnitigFileReader::open(&path).unwrap();
|
||||
assert_eq!(r.block_bits(), bb, "block_bits={bb} not preserved");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn block_bits_zero_exact_offsets() {
|
||||
// block_bits=0 → one offset per chunk, no sequential scan in chunk_start
|
||||
set_k(4);
|
||||
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG"];
|
||||
let (_dir, r) = write_read_bb(seqs, 0);
|
||||
assert_eq!(r.len(), seqs.len());
|
||||
for (i, s) in seqs.iter().enumerate() {
|
||||
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=0: unitig {i} mismatch");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn block_bits_one_block_size_two() {
|
||||
// block_bits=1 → BLOCK_SIZE=2: every other chunk gets an offset
|
||||
set_k(4);
|
||||
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG", b"CCCCGT"];
|
||||
let (_dir, r) = write_read_bb(seqs, 1);
|
||||
assert_eq!(r.len(), seqs.len());
|
||||
for (i, s) in seqs.iter().enumerate() {
|
||||
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=1: unitig {i} mismatch");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn block_bits_larger_than_chunk_count() {
|
||||
// block_bits=6 (BLOCK_SIZE=64) with only 3 chunks: all in block 0
|
||||
set_k(4);
|
||||
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA"];
|
||||
let (_dir, r) = write_read_bb(seqs, 6);
|
||||
assert_eq!(r.len(), seqs.len());
|
||||
for (i, s) in seqs.iter().enumerate() {
|
||||
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=6: unitig {i} mismatch");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn block_bits_random_access_matches_sequential() {
|
||||
// Write many chunks with block_bits=2 (BLOCK_SIZE=4), verify random access
|
||||
// against sequential iteration for every chunk.
|
||||
set_k(4);
|
||||
let seqs: Vec<Vec<u8>> = (0..20_usize)
|
||||
.map(|i| (0..8_usize).map(|j| b"ACGT"[(i + j) % 4]).collect())
|
||||
.collect();
|
||||
let seq_refs: Vec<&[u8]> = seqs.iter().map(|s| s.as_slice()).collect();
|
||||
let (_dir, r) = write_read_bb(&seq_refs, 2);
|
||||
assert_eq!(r.len(), seqs.len());
|
||||
let sequential: Vec<Unitig> = r.iter_chunks_sequential().map(|(_, u)| u).collect();
|
||||
for i in 0..seqs.len() {
|
||||
assert_eq!(r.unitig(i), sequential[i], "random vs sequential mismatch at chunk {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn block_bits_kmer_count_preserved() {
|
||||
set_k(4);
|
||||
// "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers; total = 6
|
||||
for bb in [0u8, 1, 2, 6] {
|
||||
let (_dir, r) = write_read_bb(&[b"AAAACG", b"CCCCAG"], bb);
|
||||
assert_eq!(r.n_kmers(), 6, "block_bits={bb}: n_kmers mismatch");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,18 +11,17 @@ use crate::error::{SKError, SKResult};
|
||||
|
||||
// ── Block index parameters ────────────────────────────────────────────────────
|
||||
//
|
||||
// One offset entry per BLOCK_SIZE chunks. BLOCK_SIZE must be a power of two
|
||||
// so that block = i >> LOG2_BLOCK_SIZE and rem = i & (BLOCK_SIZE − 1) are
|
||||
// branchless shifts/masks rather than divisions.
|
||||
// BLOCK_SIZE = 1 << block_bits chunks share one offset entry in the index.
|
||||
// block_bits=0 → one entry per chunk (exact offsets, no scan).
|
||||
// block_bits=6 → one entry per 64 chunks (default; O(64) scan per lookup).
|
||||
//
|
||||
// With BLOCK_SIZE = 64 and an average chunk size of ~10 bytes, a random lookup
|
||||
// scans at most 63 × 10 = 630 bytes sequentially — negligible next to the MPHF
|
||||
// lookup that precedes it. The index file shrinks from ~5 bytes/chunk to
|
||||
// ~1/64 bytes/chunk (≈ 300× for typical workloads).
|
||||
// block_bits is stored in the index file so the reader derives all parameters
|
||||
// at runtime — no compile-time constant constrains the format.
|
||||
|
||||
const MAGIC: [u8; 4] = *b"UIX2";
|
||||
const BLOCK_SIZE: usize = 64;
|
||||
const LOG2_BLOCK_SIZE: u32 = 6; // 2^6 = BLOCK_SIZE
|
||||
const MAGIC: [u8; 4] = *b"UIX3";
|
||||
|
||||
/// Default block granularity used by [`UnitigFileWriter::create`].
|
||||
pub const DEFAULT_BLOCK_BITS: u8 = 6;
|
||||
|
||||
fn idx_path(path: &Path) -> PathBuf {
|
||||
crate::append_path_suffix(path, ".idx")
|
||||
@@ -33,23 +32,36 @@ fn idx_path(path: &Path) -> PathBuf {
|
||||
/// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds
|
||||
/// a block-sampled offset index at close time.
|
||||
///
|
||||
/// One offset is stored every [`BLOCK_SIZE`] chunks; random access to chunk `i`
|
||||
/// costs at most `BLOCK_SIZE − 1` sequential chunk scans after the block lookup.
|
||||
/// One offset is stored every `1 << block_bits` chunks; random access to chunk
|
||||
/// `i` costs at most `(1 << block_bits) − 1` sequential chunk scans after the
|
||||
/// block lookup.
|
||||
///
|
||||
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split
|
||||
/// into overlapping chunks (k−1 nucleotide overlap) so no k-mer is lost.
|
||||
pub struct UnitigFileWriter {
|
||||
path: PathBuf,
|
||||
file: BufWriter<File>,
|
||||
block_offsets: Vec<u32>, // byte offset of first record in each block
|
||||
block_offsets: Vec<u32>,
|
||||
chunk_count: usize,
|
||||
next_offset: u32, // byte offset of the START of the next record
|
||||
next_offset: u32,
|
||||
n_kmers: usize,
|
||||
k: usize,
|
||||
block_bits: u8,
|
||||
mask: usize, // (1 << block_bits) - 1
|
||||
}
|
||||
|
||||
impl UnitigFileWriter {
|
||||
/// Create a writer with the default block size (`DEFAULT_BLOCK_BITS = 6`).
|
||||
pub fn create(path: &Path) -> SKResult<Self> {
|
||||
Self::create_with_block_bits(path, DEFAULT_BLOCK_BITS)
|
||||
}
|
||||
|
||||
/// Create a writer with a custom block size.
|
||||
///
|
||||
/// `block_bits` must be in 0..=31. `block_bits=0` stores one offset per
|
||||
/// chunk (exact, no scan); larger values trade index size for scan length.
|
||||
pub fn create_with_block_bits(path: &Path, block_bits: u8) -> SKResult<Self> {
|
||||
assert!(block_bits <= 31, "block_bits must be ≤ 31");
|
||||
let file = File::create(path).map_err(SKError::Io)?;
|
||||
Ok(Self {
|
||||
path: path.to_owned(),
|
||||
@@ -59,6 +71,8 @@ impl UnitigFileWriter {
|
||||
next_offset: 0,
|
||||
n_kmers: 0,
|
||||
k: obikseq::params::k(),
|
||||
block_bits,
|
||||
mask: (1usize << block_bits) - 1,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -95,8 +109,7 @@ impl UnitigFileWriter {
|
||||
|
||||
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
|
||||
|
||||
// Record a block offset at the start of every BLOCK_SIZE-th chunk.
|
||||
if self.chunk_count & (BLOCK_SIZE - 1) == 0 {
|
||||
if self.chunk_count & self.mask == 0 {
|
||||
self.block_offsets.push(self.next_offset);
|
||||
}
|
||||
|
||||
@@ -113,25 +126,32 @@ impl UnitigFileWriter {
|
||||
self.file.flush().map_err(SKError::Io)?;
|
||||
drop(self.file);
|
||||
|
||||
// Sentinel: byte offset past the last record (needed for end-of-file detection).
|
||||
self.block_offsets.push(self.next_offset);
|
||||
|
||||
write_idx(
|
||||
&idx_path(&self.path),
|
||||
self.chunk_count as u32,
|
||||
self.n_kmers as u64,
|
||||
self.block_bits,
|
||||
&self.block_offsets,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.chunk_count }
|
||||
pub fn is_empty(&self) -> bool { self.chunk_count == 0 }
|
||||
pub fn block_bits(&self) -> u8 { self.block_bits }
|
||||
}
|
||||
|
||||
fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -> SKResult<()> {
|
||||
fn write_idx(
|
||||
path: &Path,
|
||||
n_unitigs: u32,
|
||||
n_kmers: u64,
|
||||
block_bits: u8,
|
||||
block_offsets: &[u32],
|
||||
) -> SKResult<()> {
|
||||
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
|
||||
w.write_all(&MAGIC).map_err(SKError::Io)?;
|
||||
w.write_all(&(BLOCK_SIZE as u32).to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(&(block_bits as u32).to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(&n_unitigs.to_le_bytes()).map_err(SKError::Io)?;
|
||||
w.write_all(&n_kmers.to_le_bytes()).map_err(SKError::Io)?;
|
||||
for &off in block_offsets {
|
||||
@@ -145,39 +165,45 @@ fn write_idx(path: &Path, n_unitigs: u32, n_kmers: u64, block_offsets: &[u32]) -
|
||||
/// Read-only random-access view of a unitig file.
|
||||
///
|
||||
/// The sequence file is memory-mapped; the block offset table is loaded into RAM
|
||||
/// on open (≈ n_chunks / BLOCK_SIZE entries, negligible memory).
|
||||
///
|
||||
/// Random access to chunk `i`: O(BLOCK_SIZE) sequential mmap reads — branchless
|
||||
/// shift/mask arithmetic, cache-friendly, negligible versus the MPHF lookup.
|
||||
///
|
||||
/// Sequential iteration: O(n) via a running-offset cursor (no per-chunk overhead).
|
||||
/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap
|
||||
/// reads. Sequential iteration: O(n) via a running-offset cursor.
|
||||
pub struct UnitigFileReader {
|
||||
mmap: Mmap,
|
||||
block_offsets: Vec<u32>,
|
||||
n_unitigs: usize,
|
||||
n_kmers: usize,
|
||||
k: usize,
|
||||
block_bits: u8,
|
||||
mask: usize, // (1 << block_bits) - 1
|
||||
}
|
||||
|
||||
impl UnitigFileReader {
|
||||
pub fn open(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
let (n_unitigs, n_kmers, block_offsets) = read_idx(&idx_path(path))?;
|
||||
let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?;
|
||||
let k = obikseq::params::k();
|
||||
Ok(Self { mmap, block_offsets, n_unitigs, n_kmers, k })
|
||||
Ok(Self {
|
||||
mmap,
|
||||
block_offsets,
|
||||
n_unitigs,
|
||||
n_kmers,
|
||||
k,
|
||||
block_bits,
|
||||
mask: (1usize << block_bits) - 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.n_unitigs }
|
||||
pub fn is_empty(&self) -> bool { self.n_unitigs == 0 }
|
||||
pub fn n_kmers(&self) -> usize { self.n_kmers }
|
||||
pub fn block_bits(&self) -> u8 { self.block_bits }
|
||||
|
||||
/// Byte offset of the START of record `i` (the seql byte) in the mmap.
|
||||
/// O(BLOCK_SIZE) sequential scan within the block.
|
||||
#[inline]
|
||||
fn chunk_start(&self, i: usize) -> usize {
|
||||
let block = i >> LOG2_BLOCK_SIZE;
|
||||
let rem = i & (BLOCK_SIZE - 1);
|
||||
let block = i >> self.block_bits;
|
||||
let rem = i & self.mask;
|
||||
let mut offset = self.block_offsets[block] as usize;
|
||||
for _ in 0..rem {
|
||||
let seql_minus_k = self.mmap[offset] as usize;
|
||||
@@ -216,7 +242,6 @@ impl UnitigFileReader {
|
||||
|
||||
// ── Sequential iterators (O(n) running-offset cursor) ─────────────────────
|
||||
|
||||
/// Iterate all chunks in file order with a running byte offset — O(n) total.
|
||||
fn iter_chunks_sequential(&self) -> impl Iterator<Item = (usize, Unitig)> + '_ {
|
||||
let k = self.k;
|
||||
let mmap = &*self.mmap;
|
||||
@@ -253,7 +278,7 @@ impl UnitigFileReader {
|
||||
}
|
||||
}
|
||||
|
||||
fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
fn read_idx(path: &Path) -> SKResult<(usize, usize, u8, Vec<u32>)> {
|
||||
let data = std::fs::read(path).map_err(SKError::Io)?;
|
||||
let mut pos = 0;
|
||||
|
||||
@@ -261,22 +286,22 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
.ok_or(SKError::Truncated { context: "unitig index: magic" })?;
|
||||
if magic_bytes != &MAGIC {
|
||||
return Err(SKError::BadMagic {
|
||||
expected: "UIX2",
|
||||
expected: "UIX3",
|
||||
got: magic_bytes.try_into().unwrap(),
|
||||
});
|
||||
}
|
||||
pos += 4;
|
||||
|
||||
// block_size stored for forward-compatibility verification
|
||||
let bs_bytes = data.get(pos..pos + 4)
|
||||
.ok_or(SKError::Truncated { context: "unitig index: block_size" })?;
|
||||
let stored_bs = u32::from_le_bytes(bs_bytes.try_into().unwrap()) as usize;
|
||||
if stored_bs != BLOCK_SIZE {
|
||||
let bb_bytes = data.get(pos..pos + 4)
|
||||
.ok_or(SKError::Truncated { context: "unitig index: block_bits" })?;
|
||||
let block_bits_u32 = u32::from_le_bytes(bb_bytes.try_into().unwrap());
|
||||
if block_bits_u32 > 31 {
|
||||
return Err(SKError::InvalidData {
|
||||
context: "unitig index",
|
||||
detail: format!("block_size mismatch: file={stored_bs} code={BLOCK_SIZE}"),
|
||||
detail: format!("block_bits out of range: {block_bits_u32}"),
|
||||
});
|
||||
}
|
||||
let block_bits = block_bits_u32 as u8;
|
||||
pos += 4;
|
||||
|
||||
let n_bytes = data.get(pos..pos + 4)
|
||||
@@ -289,8 +314,9 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
|
||||
pos += 8;
|
||||
|
||||
let n_blocks = (n_unitigs + BLOCK_SIZE - 1) >> LOG2_BLOCK_SIZE;
|
||||
let n_offsets = n_blocks + 1; // +1 for sentinel
|
||||
let block_size = 1usize << block_bits;
|
||||
let n_blocks = (n_unitigs + block_size - 1) >> block_bits;
|
||||
let n_offsets = n_blocks + 1;
|
||||
let mut block_offsets = Vec::with_capacity(n_offsets);
|
||||
for _ in 0..n_offsets {
|
||||
let off_bytes = data.get(pos..pos + 4)
|
||||
@@ -299,7 +325,7 @@ fn read_idx(path: &Path) -> SKResult<(usize, usize, Vec<u32>)> {
|
||||
pos += 4;
|
||||
}
|
||||
|
||||
Ok((n_unitigs, n_kmers, block_offsets))
|
||||
Ok((n_unitigs, n_kmers, block_bits, block_offsets))
|
||||
}
|
||||
|
||||
// ── Kmer utilities ────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user