feat: implement persistent layered index and chunked binary format

Introduce the `obilayeredmap` specification and persistent MPHF-based index architecture for incremental multi-dataset indexing. Implement chunked binary serialization with a fixed `u8` k-mer count limit (256) and overlapping super-kmer segments. Add memory-mapped I/O and a companion `.idx` index file for allocation-free, O(1) unitig access. Update MkDocs navigation, enhance the k-mer comparison script, and add comprehensive tests for serialization, partitioning, and file I/O pipelines.
This commit is contained in:
Eric Coissac
2026-05-09 17:20:08 +08:00
parent 8c17bf958b
commit 5169f65dc9
24 changed files with 1342 additions and 382 deletions
+1
View File
@@ -10,6 +10,7 @@ lru = "0.12"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
memmap2 = "0.9"
obikseq = { path = "../obikseq" }
[dev-dependencies]
+2 -60
View File
@@ -17,63 +17,5 @@ pub(crate) fn read_superkmer<R: Read>(r: &mut R) -> io::Result<Option<SuperKmer>
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
fn make_sk(ascii: &[u8]) -> SuperKmer {
SuperKmer::from_ascii(ascii)
}
#[test]
fn roundtrip_single() {
let sk = make_sk(b"ACGTACGT");
let mut buf = Vec::new();
write_superkmer(&mut buf, &sk).unwrap();
let mut cur = Cursor::new(&buf);
let got = read_superkmer(&mut cur).unwrap().unwrap();
assert_eq!(sk.to_ascii(), got.to_ascii());
assert_eq!(sk.seql(), got.seql());
}
#[test]
fn roundtrip_all_lengths() {
let bases: Vec<u8> = (0..300).map(|i| b"ACGT"[i % 4]).collect();
let k = 11;
for len in (k..=k + 8).chain([255, 256, 257]) {
let sk = make_sk(&bases[..len]);
let mut buf = Vec::new();
write_superkmer(&mut buf, &sk).unwrap();
let mut cur = Cursor::new(&buf);
let got = read_superkmer(&mut cur).unwrap().unwrap();
assert_eq!(sk.to_ascii(), got.to_ascii(), "len={len}");
assert_eq!(sk.seql(), got.seql(), "len={len}");
}
}
#[test]
fn eof_returns_none() {
let buf: Vec<u8> = vec![];
let mut cur = Cursor::new(&buf);
assert!(read_superkmer(&mut cur).unwrap().is_none());
}
#[test]
fn multiple_records() {
let seqs: &[&[u8]] = &[b"AAAA", b"CCCC", b"GGGG", b"TTTT"];
let mut buf = Vec::new();
for s in seqs {
write_superkmer(&mut buf, &make_sk(s)).unwrap();
}
let mut cur = Cursor::new(&buf);
for s in seqs {
let got = read_superkmer(&mut cur).unwrap().unwrap();
let expected = make_sk(s);
assert_eq!(expected.to_ascii(), got.to_ascii());
}
assert!(read_superkmer(&mut cur).unwrap().is_none());
}
}
#[path = "tests/codec.rs"]
mod tests;
+2
View File
@@ -4,8 +4,10 @@ pub mod limits;
pub mod meta;
pub mod pool;
pub mod reader;
pub mod unitig_index;
pub use error::{SKError, SKResult};
pub use meta::SKFileMeta;
pub use pool::{create_token, create_token_with, SKFilePool, SharedPool, SKFileWriter};
pub use reader::{SKFileIter, SKFileReader};
pub use unitig_index::{UnitigFileReader, UnitigFileWriter};
+2 -226
View File
@@ -428,229 +428,5 @@ impl Drop for SKFileWriter {
// ── tests ──────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
use crate::reader::SKFileReader;
use obikseq::{SuperKmer, set_k};
use tempfile::{NamedTempFile, TempDir};
const TEST_K: usize = 4;
fn make_sk(seed: usize) -> SuperKmer {
let bases: Vec<u8> = (0..8).map(|j| b"ACGT"[(seed + j) % 4]).collect();
SuperKmer::from_ascii(&bases)
}
fn pool(max_open: usize) -> SharedPool {
Arc::new(Mutex::new(SKFilePool::new(max_open)))
}
fn open_token(t: &mut SKFileWriter, sk: &SuperKmer) {
t.set_flush_threshold(1);
t.write(sk).unwrap(); // pending ≥ 1 → drain → fd opened
}
#[test]
fn creation_holds_no_fd() {
set_k(TEST_K);
let dir = TempDir::new().unwrap();
let p = pool(3);
for i in 0..10 {
create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap();
}
assert_eq!(p.lock().unwrap().open_count(), 0);
}
#[test]
fn pool_limits_open_fds() {
set_k(TEST_K);
let dir = TempDir::new().unwrap();
let p = pool(3);
let sk = make_sk(0);
let mut tokens: Vec<SKFileWriter> = (0..6)
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap())
.collect();
for t in tokens.iter_mut() {
open_token(t, &sk);
}
assert!(
p.lock().unwrap().open_count() <= 3,
"open={}",
p.lock().unwrap().open_count()
);
}
#[test]
fn evicted_token_stays_logically_open() {
set_k(TEST_K);
let dir = TempDir::new().unwrap();
let p = pool(1);
let sk = make_sk(0);
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
open_token(&mut t0, &sk); // t0 fd open, pool full
open_token(&mut t1, &sk); // evicts t0, t1 fd open
assert!(t0.is_open(), "t0 must remain logically open after eviction");
assert_eq!(p.lock().unwrap().open_count(), 1);
}
#[test]
fn evicted_data_readable_after_close_all() {
set_k(TEST_K);
let dir = TempDir::new().unwrap();
let p = pool(1);
let sk = make_sk(0);
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
t0.set_flush_threshold(1);
t0.write(&sk).unwrap(); // t0 fd open, pool full
t1.set_flush_threshold(1);
t1.write(&sk).unwrap(); // evicts t0, t1 fd open
// t0 still has the record in pending (eviction just closed fd, pending stays in token)
// Actually: t0's pending was drained before drain() returned (drain clears pending).
// So t0 wrote its record, then was evicted (fd closed).
drop(t0);
drop(t1);
p.lock().unwrap().close_all().unwrap();
for name in &["a.zst", "b.zst"] {
let mut r = SKFileReader::open(dir.path().join(name)).unwrap();
let got = r.read_batch(10).unwrap();
assert_eq!(got.len(), 1, "{name}: expected 1 record");
}
}
#[test]
fn touch_moves_to_mru_so_lru_is_evicted() {
set_k(TEST_K);
let dir = TempDir::new().unwrap();
let p = pool(2);
let sk = make_sk(0);
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
let mut t2 = create_token(&p, dir.path().join("c.zst")).unwrap();
open_token(&mut t0, &sk); // t0 open
open_token(&mut t1, &sk); // t1 open, t0 LRU
// Write to t0 again → t0 becomes MRU, t1 becomes LRU
t0.set_flush_threshold(1);
t0.write(&sk).unwrap();
// Writing to t2 fills pool (cap=2) → evicts LRU = t1
open_token(&mut t2, &sk);
let open_count = p.lock().unwrap().open_count();
assert!(open_count <= 2, "open_count={open_count}");
}
#[test]
fn close_all_produces_readable_files() {
set_k(TEST_K);
let dir = TempDir::new().unwrap();
let p = pool(8);
let paths: Vec<_> = (0..4)
.map(|i| dir.path().join(format!("{i}.zst")))
.collect();
let mut tokens: Vec<SKFileWriter> = paths
.iter()
.map(|path| create_token(&p, path.clone()).unwrap())
.collect();
for (i, t) in tokens.iter_mut().enumerate() {
t.write(&make_sk(i)).unwrap();
}
// close tokens first so pending bytes are flushed and Zstd frames finalized
for t in tokens.iter_mut() {
t.close().unwrap();
}
p.lock().unwrap().close_all().unwrap();
for path in &paths {
let mut r = SKFileReader::open(path).unwrap();
let got = r.read_batch(10).unwrap();
assert_eq!(got.len(), 1);
}
}
#[test]
fn write_batch_roundtrip() {
set_k(TEST_K);
let dir = TempDir::new().unwrap();
let p = pool(4);
let sks: Vec<_> = (0..50).map(make_sk).collect();
let path = dir.path().join("batch.zst");
let mut t = create_token(&p, path.clone()).unwrap();
t.write_batch(&sks).unwrap();
t.close().unwrap();
let mut r = SKFileReader::open(&path).unwrap();
let got = r.read_batch(100).unwrap();
assert_eq!(got.len(), 50);
for (a, b) in sks.iter().zip(got.iter()) {
assert_eq!(a.to_ascii(), b.to_ascii());
}
}
#[test]
fn from_system_limits_bounded() {
set_k(TEST_K);
let pool = SKFilePool::from_system_limits();
assert!(pool.max_open() >= 16);
assert!(pool.max_open() <= MAX_POOL_SIZE);
}
#[test]
fn standalone_roundtrip_zstd() {
set_k(TEST_K);
let tmp = NamedTempFile::new().unwrap();
let sks: Vec<_> = (0..100).map(make_sk).collect();
{
let mut w = SKFileWriter::create(tmp.path()).unwrap();
w.write_batch(&sks).unwrap();
w.close().unwrap();
}
let mut r = SKFileReader::open(tmp.path()).unwrap();
let got = r.read_batch(200).unwrap();
assert_eq!(got.len(), 100);
for (a, b) in sks.iter().zip(got.iter()) {
assert_eq!(a.to_ascii(), b.to_ascii());
}
}
#[test]
fn standalone_close_prevents_write() {
set_k(TEST_K);
let tmp = NamedTempFile::new().unwrap();
let mut w = SKFileWriter::create(tmp.path()).unwrap();
w.close().unwrap();
assert!(!w.is_open());
assert!(w.write(&make_sk(0)).is_err());
}
#[test]
fn standalone_is_physically_open() {
set_k(TEST_K);
let tmp = NamedTempFile::new().unwrap();
let mut w = SKFileWriter::create(tmp.path()).unwrap();
assert!(!w.is_physically_open()); // fd deferred until first drain
w.set_flush_threshold(1);
w.write(&make_sk(0)).unwrap(); // triggers drain → fd opened
assert!(w.is_physically_open());
w.close().unwrap();
assert!(!w.is_physically_open());
}
}
#[path = "tests/pool.rs"]
mod tests;
+2 -67
View File
@@ -143,70 +143,5 @@ impl Iterator for SKFileIter<'_> {
}
#[cfg(test)]
mod tests {
use super::*;
use crate::pool::SKFileWriter;
use tempfile::NamedTempFile;
const TEST_K: usize = 4; // test sequences are 8 bases; k=4 gives n_kmers=5
fn setup() {
obikseq::params::set_k(TEST_K);
}
fn make_sks(n: usize) -> Vec<SuperKmer> {
(0..n)
.map(|i| {
let bases: Vec<u8> = (0..8).map(|j| b"ACGT"[(i + j) % 4]).collect();
SuperKmer::from_ascii(&bases)
})
.collect()
}
#[test]
fn iter_all() {
setup();
let tmp = NamedTempFile::new().unwrap();
let sks = make_sks(50);
{
let mut w = SKFileWriter::create(tmp.path()).unwrap();
w.write_batch(&sks).unwrap();
}
let mut r = SKFileReader::open(tmp.path()).unwrap();
let got: Vec<_> = r.iter().collect();
assert_eq!(got.len(), 50);
for (a, b) in sks.iter().zip(got.iter()) {
assert_eq!(a.to_ascii(), b.to_ascii());
}
}
#[test]
fn reopen_and_seek() {
setup();
let tmp = NamedTempFile::new().unwrap();
let sks = make_sks(20);
{
let mut w = SKFileWriter::create(tmp.path()).unwrap();
w.write_batch(&sks).unwrap();
}
let mut r = SKFileReader::open(tmp.path()).unwrap();
// Read 10, then simulate pool eviction + re-access
let first = r.read_batch(10).unwrap();
r.close();
r.reopen_and_seek().unwrap();
// Continue from position 10
let rest = r.read_batch(20).unwrap();
assert_eq!(first.len(), 10);
assert_eq!(rest.len(), 10);
for (a, b) in sks[..10].iter().zip(first.iter()) {
assert_eq!(a.to_ascii(), b.to_ascii());
}
for (a, b) in sks[10..].iter().zip(rest.iter()) {
assert_eq!(a.to_ascii(), b.to_ascii());
}
}
}
#[path = "tests/reader.rs"]
mod tests;
+64
View File
@@ -0,0 +1,64 @@
use super::*;
use obikseq::set_k;
use std::io::Cursor;
/// Test helper: build a [`SuperKmer`] from an ASCII nucleotide string by
/// delegating to `SuperKmer::from_ascii`.
fn make_sk(ascii: &[u8]) -> SuperKmer {
    SuperKmer::from_ascii(ascii)
}
/// One super-kmer written with `write_superkmer` is read back with the same
/// ASCII sequence and the same `seql`.
#[test]
fn roundtrip_single() {
    set_k(4);
    let original = make_sk(b"ACGTACGT");

    let mut encoded = Vec::new();
    write_superkmer(&mut encoded, &original).unwrap();

    let mut reader = Cursor::new(&encoded);
    let decoded = read_superkmer(&mut reader).unwrap().unwrap();

    assert_eq!(original.to_ascii(), decoded.to_ascii());
    assert_eq!(original.seql(), decoded.seql());
}
/// Round-trips super-kmers of several lengths, including the values around
/// the 255/256/257 nucleotide boundary.
///
/// NOTE(review): this test uses k=11 while the other tests in this file use
/// k=4. `set_k` looks like process-wide state; if so, parallel test threads
/// could observe each other's k — confirm against `obikseq::set_k`.
#[test]
fn roundtrip_all_lengths() {
    set_k(11);
    let k: usize = 11;
    let bases: Vec<u8> = b"ACGT".iter().copied().cycle().take(300).collect();
    // With k=11, seql=257 → n_kmers=247 ≤ 256: single chunk, no split.
    let lengths = (k..=k + 8).chain([255, 256, 257]);
    for len in lengths {
        let original = make_sk(&bases[..len]);

        let mut encoded = Vec::new();
        write_superkmer(&mut encoded, &original).unwrap();

        let mut reader = Cursor::new(&encoded);
        let decoded = read_superkmer(&mut reader).unwrap().unwrap();

        assert_eq!(original.to_ascii(), decoded.to_ascii(), "len={len}");
        assert_eq!(original.seql(), decoded.seql(), "len={len}");
    }
}
/// Reading from an empty stream yields `Ok(None)` rather than an error.
#[test]
fn eof_returns_none() {
    set_k(4);
    let empty: Vec<u8> = Vec::new();
    let mut reader = Cursor::new(&empty);
    let outcome = read_superkmer(&mut reader).unwrap();
    assert!(outcome.is_none());
}
/// Several records written back to back are read in the same order, and the
/// stream then reports end-of-file with `None`.
#[test]
fn multiple_records() {
    set_k(4);
    let seqs: &[&[u8]] = &[b"AAAA", b"CCCC", b"GGGG", b"TTTT"];

    let mut encoded = Vec::new();
    seqs.iter()
        .for_each(|ascii| write_superkmer(&mut encoded, &make_sk(ascii)).unwrap());

    let mut reader = Cursor::new(&encoded);
    for ascii in seqs {
        let decoded = read_superkmer(&mut reader).unwrap().unwrap();
        let reference = make_sk(ascii);
        assert_eq!(reference.to_ascii(), decoded.to_ascii());
    }
    let trailing = read_superkmer(&mut reader).unwrap();
    assert!(trailing.is_none());
}
+217
View File
@@ -0,0 +1,217 @@
use super::*;
use crate::reader::SKFileReader;
use obikseq::{SuperKmer, set_k};
use tempfile::{NamedTempFile, TempDir};
const TEST_K: usize = 4;
/// Test helper: an 8-base super-kmer whose bases cycle through `ACGT`,
/// starting at phase `seed % 4`.
fn make_sk(seed: usize) -> SuperKmer {
    let mut bases: Vec<u8> = Vec::with_capacity(8);
    for offset in 0..8 {
        let phase = (seed + offset) % 4;
        bases.push(b"ACGT"[phase]);
    }
    SuperKmer::from_ascii(&bases)
}
/// Test helper: a shared pool (`Arc<Mutex<_>>`) built with `SKFilePool::new(max_open)`.
fn pool(max_open: usize) -> SharedPool {
    let inner = SKFilePool::new(max_open);
    let guarded = Mutex::new(inner);
    Arc::new(guarded)
}
/// Test helper: set the writer's flush threshold to 1 and write one record.
/// The tests use this to make the writer physically open its file.
fn open_token(t: &mut SKFileWriter, sk: &SuperKmer) {
    t.set_flush_threshold(1);
    let written = t.write(sk);
    written.unwrap();
}
/// Creating tokens leaves the pool's open count at zero.
#[test]
fn creation_holds_no_fd() {
    set_k(TEST_K);
    let dir = TempDir::new().unwrap();
    let shared = pool(3);
    (0..10).for_each(|i| {
        let target = dir.path().join(format!("p{i}.zst"));
        create_token(&shared, target).unwrap();
    });
    let open_now = shared.lock().unwrap().open_count();
    assert_eq!(open_now, 0);
}
/// With a pool capacity of 3, forcing six tokens to write never leaves more
/// than three files open.
#[test]
fn pool_limits_open_fds() {
    set_k(TEST_K);
    let dir = TempDir::new().unwrap();
    let shared = pool(3);
    let sk = make_sk(0);

    let mut tokens: Vec<SKFileWriter> = Vec::with_capacity(6);
    for i in 0..6 {
        let target = dir.path().join(format!("p{i}.zst"));
        tokens.push(create_token(&shared, target).unwrap());
    }
    tokens.iter_mut().for_each(|token| open_token(token, &sk));

    let open = shared.lock().unwrap().open_count();
    assert!(open <= 3, "open={}", open);
}
/// With a pool capacity of 1, writing through a second token must not make
/// the first token report itself as closed.
#[test]
fn evicted_token_stays_logically_open() {
    set_k(TEST_K);
    let dir = TempDir::new().unwrap();
    let shared = pool(1);
    let sk = make_sk(0);

    let mut first = create_token(&shared, dir.path().join("a.zst")).unwrap();
    let mut second = create_token(&shared, dir.path().join("b.zst")).unwrap();
    open_token(&mut first, &sk);
    open_token(&mut second, &sk);

    assert!(first.is_open(), "t0 must remain logically open after eviction");
    let open_now = shared.lock().unwrap().open_count();
    assert_eq!(open_now, 1);
}
/// Records written through two tokens sharing a capacity-1 pool are both
/// readable once the tokens are dropped and the pool is closed.
#[test]
fn evicted_data_readable_after_close_all() {
    set_k(TEST_K);
    let dir = TempDir::new().unwrap();
    let shared = pool(1);
    let sk = make_sk(0);

    let mut first = create_token(&shared, dir.path().join("a.zst")).unwrap();
    let mut second = create_token(&shared, dir.path().join("b.zst")).unwrap();
    // Same two steps as the `open_token` helper, in the same order.
    open_token(&mut first, &sk);
    open_token(&mut second, &sk);

    drop(first);
    drop(second);
    shared.lock().unwrap().close_all().unwrap();

    for name in &["a.zst", "b.zst"] {
        let mut reader = SKFileReader::open(dir.path().join(name)).unwrap();
        let records = reader.read_batch(10).unwrap();
        assert_eq!(records.len(), 1, "{name}: expected 1 record");
    }
}
/// Opens two tokens in a capacity-2 pool, writes to the first again, then
/// opens a third; the pool must still hold at most two open files.
#[test]
fn touch_moves_to_mru_so_lru_is_evicted() {
    set_k(TEST_K);
    let dir = TempDir::new().unwrap();
    let shared = pool(2);
    let sk = make_sk(0);

    let mut first = create_token(&shared, dir.path().join("a.zst")).unwrap();
    let mut second = create_token(&shared, dir.path().join("b.zst")).unwrap();
    let mut third = create_token(&shared, dir.path().join("c.zst")).unwrap();

    open_token(&mut first, &sk);
    open_token(&mut second, &sk);
    // Touch the first token again (threshold 1 + one write, as in `open_token`).
    open_token(&mut first, &sk);
    open_token(&mut third, &sk);

    let open_count = shared.lock().unwrap().open_count();
    assert!(open_count <= 2, "open_count={open_count}");
}
/// Four tokens each write one record; after closing the tokens and then the
/// pool, every file contains exactly one readable record.
#[test]
fn close_all_produces_readable_files() {
    set_k(TEST_K);
    let dir = TempDir::new().unwrap();
    let shared = pool(8);

    let mut paths = Vec::with_capacity(4);
    for i in 0..4 {
        paths.push(dir.path().join(format!("{i}.zst")));
    }
    let mut tokens: Vec<SKFileWriter> = Vec::with_capacity(paths.len());
    for path in &paths {
        tokens.push(create_token(&shared, path.clone()).unwrap());
    }

    tokens
        .iter_mut()
        .enumerate()
        .for_each(|(i, token)| token.write(&make_sk(i)).unwrap());
    // Tokens are closed before the pool.
    tokens.iter_mut().for_each(|token| token.close().unwrap());
    shared.lock().unwrap().close_all().unwrap();

    for path in &paths {
        let mut reader = SKFileReader::open(path).unwrap();
        let records = reader.read_batch(10).unwrap();
        assert_eq!(records.len(), 1);
    }
}
/// Fifty super-kmers written with `write_batch` through a pooled token are
/// read back in order with identical sequences.
#[test]
fn write_batch_roundtrip() {
    set_k(TEST_K);
    let dir = TempDir::new().unwrap();
    let shared = pool(4);
    let written: Vec<_> = (0..50).map(make_sk).collect();
    let path = dir.path().join("batch.zst");

    let mut token = create_token(&shared, path.clone()).unwrap();
    token.write_batch(&written).unwrap();
    token.close().unwrap();

    let mut reader = SKFileReader::open(&path).unwrap();
    let read_back = reader.read_batch(100).unwrap();
    assert_eq!(read_back.len(), 50);
    written
        .iter()
        .zip(read_back.iter())
        .for_each(|(expected, actual)| assert_eq!(expected.to_ascii(), actual.to_ascii()));
}
/// The pool size derived from system limits lies within `[16, MAX_POOL_SIZE]`.
#[test]
fn from_system_limits_bounded() {
    set_k(TEST_K);
    let sized = SKFilePool::from_system_limits();
    let capacity = sized.max_open();
    assert!(capacity >= 16);
    assert!(sized.max_open() <= MAX_POOL_SIZE);
}
/// A standalone writer (no pool) round-trips 100 super-kmers through a file.
#[test]
fn standalone_roundtrip_zstd() {
    set_k(TEST_K);
    let tmp = NamedTempFile::new().unwrap();
    let written: Vec<_> = (0..100).map(make_sk).collect();

    let mut writer = SKFileWriter::create(tmp.path()).unwrap();
    writer.write_batch(&written).unwrap();
    writer.close().unwrap();
    // The writer is dropped before the file is reopened for reading.
    drop(writer);

    let mut reader = SKFileReader::open(tmp.path()).unwrap();
    let read_back = reader.read_batch(200).unwrap();
    assert_eq!(read_back.len(), 100);
    for (expected, actual) in written.iter().zip(read_back.iter()) {
        assert_eq!(expected.to_ascii(), actual.to_ascii());
    }
}
/// After `close`, the writer reports itself closed and further writes fail.
#[test]
fn standalone_close_prevents_write() {
    set_k(TEST_K);
    let tmp = NamedTempFile::new().unwrap();
    let mut writer = SKFileWriter::create(tmp.path()).unwrap();
    writer.close().unwrap();

    let still_open = writer.is_open();
    assert!(!still_open);
    let late_write = writer.write(&make_sk(0));
    assert!(late_write.is_err());
}
/// `is_physically_open` is false right after creation, true after a write
/// with a flush threshold of 1, and false again after `close`.
#[test]
fn standalone_is_physically_open() {
    set_k(TEST_K);
    let tmp = NamedTempFile::new().unwrap();
    let mut writer = SKFileWriter::create(tmp.path()).unwrap();

    let before_write = writer.is_physically_open();
    assert!(!before_write);

    writer.set_flush_threshold(1);
    writer.write(&make_sk(0)).unwrap();
    let after_write = writer.is_physically_open();
    assert!(after_write);

    writer.close().unwrap();
    let after_close = writer.is_physically_open();
    assert!(!after_close);
}
+63
View File
@@ -0,0 +1,63 @@
use super::*;
use crate::pool::SKFileWriter;
use tempfile::NamedTempFile;
const TEST_K: usize = 4;
/// Sets the k-mer size to `TEST_K` for the tests in this file.
fn setup() {
    let k = TEST_K;
    obikseq::params::set_k(k);
}
/// Test helper: `n` super-kmers of 8 bases each; the i-th cycles through
/// `ACGT` starting at phase `i % 4`.
fn make_sks(n: usize) -> Vec<SuperKmer> {
    let mut out = Vec::with_capacity(n);
    for i in 0..n {
        let bases: Vec<u8> = (0..8).map(|j| b"ACGT"[(i + j) % 4]).collect();
        out.push(SuperKmer::from_ascii(&bases));
    }
    out
}
/// Iterating a reader yields every record that was written, in order.
#[test]
fn iter_all() {
    setup();
    let tmp = NamedTempFile::new().unwrap();
    let written = make_sks(50);

    let mut writer = SKFileWriter::create(tmp.path()).unwrap();
    writer.write_batch(&written).unwrap();
    // The writer is dropped without an explicit `close`, then the file is read.
    drop(writer);

    let mut reader = SKFileReader::open(tmp.path()).unwrap();
    let read_back: Vec<_> = reader.iter().collect();
    assert_eq!(read_back.len(), 50);
    written
        .iter()
        .zip(read_back.iter())
        .for_each(|(expected, actual)| assert_eq!(expected.to_ascii(), actual.to_ascii()));
}
/// After reading 10 of 20 records, `close` followed by `reopen_and_seek`
/// resumes at record 10.
#[test]
fn reopen_and_seek() {
    setup();
    let tmp = NamedTempFile::new().unwrap();
    let written = make_sks(20);

    let mut writer = SKFileWriter::create(tmp.path()).unwrap();
    writer.write_batch(&written).unwrap();
    drop(writer);

    let mut reader = SKFileReader::open(tmp.path()).unwrap();
    let head = reader.read_batch(10).unwrap();
    reader.close();
    reader.reopen_and_seek().unwrap();
    let tail = reader.read_batch(20).unwrap();

    assert_eq!(head.len(), 10);
    assert_eq!(tail.len(), 10);
    let (expected_head, expected_tail) = (&written[..10], &written[10..]);
    for (expected, actual) in expected_head.iter().zip(head.iter()) {
        assert_eq!(expected.to_ascii(), actual.to_ascii());
    }
    for (expected, actual) in expected_tail.iter().zip(tail.iter()) {
        assert_eq!(expected.to_ascii(), actual.to_ascii());
    }
}
+169
View File
@@ -0,0 +1,169 @@
use super::*;
use obikseq::{Kmer, Sequence as _, Unitig, set_k};
use tempfile::tempdir;
/// Test helper: build a [`Unitig`] from an ASCII nucleotide string by
/// delegating to `Unitig::from_ascii`.
fn make_unitig(ascii: &[u8]) -> Unitig {
    Unitig::from_ascii(ascii)
}
/// Test helper: parse `ascii` as a k-mer and return its canonical form.
/// Panics if `Kmer::from_ascii` rejects the input.
fn canonical_of(ascii: &[u8]) -> CanonicalKmer {
    let kmer = Kmer::from_ascii(ascii).unwrap();
    kmer.canonical()
}
/// Test helper: write `seqs` as unitigs to `unitigs.bin` in a fresh temporary
/// directory, close the writer, and reopen the file with a reader.
///
/// The `TempDir` is returned so the files outlive the call.
fn write_read(seqs: &[&[u8]]) -> (tempfile::TempDir, UnitigFileReader) {
    let dir = tempdir().unwrap();
    let path = dir.path().join("unitigs.bin");

    let mut writer = UnitigFileWriter::create(&path).unwrap();
    seqs.iter()
        .for_each(|ascii| writer.write(&make_unitig(ascii)).unwrap());
    writer.close().unwrap();

    let reader = UnitigFileReader::open(&path).unwrap();
    (dir, reader)
}
// ── I/O round-trip ────────────────────────────────────────────────────────────
/// A writer closed without any `write` produces a file the reader opens with
/// zero chunks.
#[test]
fn roundtrip_empty_index() {
    set_k(4);
    let dir = tempdir().unwrap();
    let path = dir.path().join("unitigs.bin");

    UnitigFileWriter::create(&path).unwrap().close().unwrap();

    let reader = UnitigFileReader::open(&path).unwrap();
    assert_eq!(reader.len(), 0);
}
/// Three short unitigs are stored as three chunks and reconstructed unchanged.
#[test]
fn roundtrip_unitigs() {
    set_k(4);
    let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA"];
    let (_dir, reader) = write_read(seqs);

    assert_eq!(reader.len(), seqs.len());
    for i in 0..seqs.len() {
        let expected = make_unitig(seqs[i]);
        assert_eq!(reader.unitig(i), expected, "unitig {i} mismatch");
    }
}
// ── Bit extraction ────────────────────────────────────────────────────────────
/// A byte-aligned 4-mer is returned left-aligned in the u64.
#[test]
fn extract_kmer_raw_basic() {
    // ACGT = 00 01 10 11 = 0x1B; with k=4 and j=0 the result is 0x1B << 56.
    let packed = [0x1Bu8];
    let expected = 0x1Bu64 << 56;
    assert_eq!(extract_kmer_raw(&packed, 0, 4), expected);
}
/// A k-mer that starts inside a byte (j=1) is extracted correctly.
#[test]
fn extract_kmer_raw_intra_byte_offset() {
    // From ACGT at j=1 with k=3: CGT = 01 10 11 = 0x1B over 6 bits → 0x1B << 58.
    let packed = [0x1Bu8];
    let expected = 0x1Bu64 << 58;
    assert_eq!(extract_kmer_raw(&packed, 1, 3), expected);
}
/// A k-mer that straddles a byte boundary is assembled from both bytes.
#[test]
fn extract_kmer_raw_cross_byte() {
    // Packed ACGT|ACGT = [0x1B, 0x1B].
    // j=3, k=4 covers nucleotides 3..7 = T A C G = 11 00 01 10 = 0xC6.
    let packed = [0x1Bu8, 0x1Bu8];
    let expected = 0xC6u64 << 56;
    assert_eq!(extract_kmer_raw(&packed, 3, 4), expected);
}
// ── revcomp / canonical ───────────────────────────────────────────────────────
/// ACGT is its own reverse complement, so `revcomp_raw` must return it unchanged.
#[test]
fn revcomp_palindrome() {
    let acgt = 0x1Bu64 << 56; // ACGT, k=4, left-aligned
    let flipped = revcomp_raw(acgt, 4);
    assert_eq!(flipped, acgt);
}
/// TTTG and CAAA are reverse complements of each other, in both directions.
#[test]
fn revcomp_asymmetric() {
    // TTTG = 11 11 11 10 = 0xFE; CAAA = 01 00 00 00 = 0x40 (both left-aligned).
    let pair = [(0xFEu64 << 56, 0x40u64 << 56), (0x40u64 << 56, 0xFEu64 << 56)];
    // Order matches the original assertions: TTTG → CAAA first, then CAAA → TTTG.
    for (input, expected) in pair {
        assert_eq!(revcomp_raw(input, 4), expected);
    }
}
/// The canonical form of both TTTG and CAAA is CAAA (the numerically smaller one).
#[test]
fn canonical_raw_selects_minimum() {
    let tttg = 0xFEu64 << 56;
    let caaa = 0x40u64 << 56;
    let from_tttg = canonical_raw(tttg, 4); // TTTG → CAAA
    let from_caaa = canonical_raw(caaa, 4); // CAAA is already canonical
    assert_eq!(from_tttg, caaa);
    assert_eq!(from_caaa, caaa);
}
// ── verify_canonical_kmer ─────────────────────────────────────────────────────
/// CAAA is canonical (smaller than TTTG) and is stored forward in the unitig,
/// so verification is a direct match.
#[test]
fn verify_forward_canonical() {
    set_k(4);
    let (_dir, reader) = write_read(&[b"CAAAACGT"]);
    let matched = reader.verify_canonical_kmer(0, 0, canonical_of(b"CAAA"));
    assert!(matched);
}
/// The unitig stores TTTG, whose canonical form is CAAA; verification must
/// still match even though the stored orientation is non-canonical.
#[test]
fn verify_reverse_complement_stored() {
    set_k(4);
    let (_dir, reader) = write_read(&[b"TTTGACGT"]);
    // canonical_of(b"CAAA") == canonical_of(b"TTTG")
    let matched = reader.verify_canonical_kmer(0, 0, canonical_of(b"CAAA"));
    assert!(matched);
}
/// A query that is not the k-mer stored at the given position is rejected.
#[test]
fn verify_wrong_kmer_returns_false() {
    set_k(4);
    let (_dir, reader) = write_read(&[b"TTTGACGT"]);
    let matched = reader.verify_canonical_kmer(0, 0, canonical_of(b"AAAC"));
    assert!(!matched);
}
/// Checks addressing by (chunk, position): chunk 1 is "TTTGACGT" and j=1
/// covers nucleotides 1..5 = TTGA.
#[test]
fn verify_second_unitig_second_position() {
    set_k(4);
    let (_dir, reader) = write_read(&[b"ACGTACGT", b"TTTGACGT"]);
    let matched = reader.verify_canonical_kmer(1, 1, canonical_of(b"TTGA"));
    assert!(matched);
}
// ── Splitting ─────────────────────────────────────────────────────────────────
/// seql=259 with k=4 gives n_kmers=256 = MAX_KMERS_PER_CHUNK, so the unitig
/// is stored as a single chunk.
#[test]
fn short_unitig_not_split() {
    set_k(4);
    let seq: Vec<u8> = b"ACGT".iter().copied().cycle().take(259).collect();
    let (_dir, reader) = write_read(&[&seq]);
    assert_eq!(reader.len(), 1);
    assert_eq!(reader.seql(0), 259);
}
/// seql=260 with k=4 gives n_kmers=257 > MAX_KMERS_PER_CHUNK(256), so the
/// writer emits two chunks (chunk_nucl=259, stride=256):
///   chunk 0: nucleotides 0..259 (259 nucleotides, 256 kmers)
///   chunk 1: nucleotides 256..260 (4 nucleotides, 1 kmer)
#[test]
fn long_unitig_split_no_kmer_lost() {
    set_k(4);
    let seq: Vec<u8> = b"ACGT".iter().copied().cycle().take(260).collect();
    let (_dir, reader) = write_read(&[&seq]);

    assert_eq!(reader.len(), 2);
    assert_eq!(reader.seql(0), 259);
    assert_eq!(reader.seql(1), 4);

    // The k-1=3 nucleotide overlap means no kmer is duplicated and none is lost.
    // Last kmer of chunk 0 is original nucleotides 255..259.
    let last_of_first = canonical_of(&seq[255..259]);
    assert!(reader.verify_canonical_kmer(0, 255, last_of_first));
    // First kmer of chunk 1 is original nucleotides 256..260: the adjacent kmer.
    let first_of_second = canonical_of(&seq[256..260]);
    assert!(reader.verify_canonical_kmer(1, 0, first_of_second));
}
+286
View File
@@ -0,0 +1,286 @@
use std::fs::File;
use std::io::{BufWriter, Write as _};
use std::path::{Path, PathBuf};
use memmap2::Mmap;
use obikseq::{CanonicalKmer, Unitig};
pub use obikseq::MAX_KMERS_PER_CHUNK;
use crate::error::{SKError, SKResult};
// ── Index file format ─────────────────────────────────────────────────────────
//
// magic: [u8; 4] = b"UIDX"
// n_unitigs: u32 LE
// seqls: [u8; n_unitigs] max kmer index per chunk (= n_kmers - 1 = seql - k)
// packed_offsets: [u32; n_unitigs + 1] byte offsets to packed bytes in the
// sequence file; last entry is sentinel
//
// Each sequence record in the binary file: [u8: n_kmers - 1][packed bytes].
// Offsets point to the first packed byte of each record, past the leading u8.
// Unitigs with more than MAX_KMERS_PER_CHUNK kmers are transparently split by the
// writer into overlapping chunks (k-1 nucleotide overlap) so no kmer is lost.
const MAGIC: [u8; 4] = *b"UIDX";
/// Path of the companion index file: `path` with `.idx` appended to the whole
/// name. Any existing extension is kept, so `unitigs.bin` → `unitigs.bin.idx`.
fn idx_path(path: &Path) -> PathBuf {
    let mut full = path.to_path_buf().into_os_string();
    full.push(".idx");
    full.into()
}
// Extract the nucleotides in the half-open range [start, end) of `unitig` as
// a new unitig. Delegates to `unitig.sub`; how out-of-range bounds are handled
// is decided there and is not visible from this file.
fn sub_unitig(unitig: &Unitig, start: usize, end: usize) -> Unitig {
    unitig.sub(start, end)
}
// ── Writer ────────────────────────────────────────────────────────────────────
/// Writes a sequence of [`Unitig`] to an uncompressed binary file and builds
/// an offset index at close time.
///
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] kmers are transparently split
/// into overlapping chunks (k-1 nucleotide overlap) so no kmer is lost.
///
/// The companion index file (`path.idx`) is written on [`close`](Self::close).
/// The binary format per record is `[u8: n_kmers - 1][packed 2-bit bytes]`.
pub struct UnitigFileWriter {
    /// Path of the sequence file; the index path is derived from it on close.
    path: PathBuf,
    /// Buffered handle on the sequence file.
    file: BufWriter<File>,
    /// Per chunk: `seql - k`, i.e. the largest valid kmer index in the chunk.
    seqls: Vec<u8>,
    /// Per chunk: byte offset of its first packed byte (just past the 1-byte header).
    packed_offsets: Vec<u32>,
    /// Byte offset at which the next record (header included) will start.
    next_offset: u32,
    /// K-mer size read from `obikseq::params::k()` when the writer was created.
    k: usize,
}
impl UnitigFileWriter {
pub fn create(path: &Path) -> SKResult<Self> {
let file = File::create(path).map_err(SKError::Io)?;
Ok(Self {
path: path.to_owned(),
file: BufWriter::new(file),
seqls: Vec::new(),
packed_offsets: Vec::new(),
next_offset: 0,
k: obikseq::params::k(),
})
}
/// Write a unitig, splitting it into chunks if it exceeds [`MAX_KMERS_PER_CHUNK`].
pub fn write(&mut self, unitig: &Unitig) -> SKResult<()> {
let seql = unitig.seql();
let k = self.k;
if seql < k {
return Ok(());
}
let n_kmers = seql - k + 1;
if n_kmers <= MAX_KMERS_PER_CHUNK {
return self.write_chunk(unitig);
}
// Split into overlapping chunks of MAX_KMERS_PER_CHUNK kmers.
// Overlap of k-1 nucleotides ensures no kmer is lost at boundaries.
let chunk_nucl = MAX_KMERS_PER_CHUNK + k - 1;
let stride = MAX_KMERS_PER_CHUNK;
let mut start = 0;
while start < seql {
let end = (start + chunk_nucl).min(seql);
self.write_chunk(&sub_unitig(unitig, start, end))?;
if end == seql {
break;
}
start += stride;
}
Ok(())
}
fn write_chunk(&mut self, unitig: &Unitig) -> SKResult<()> {
let seql = unitig.seql();
let byte_len = (seql + 3) / 4;
// Header is 1 byte (u8: n_kmers 1 = seql k); packed bytes follow.
self.packed_offsets.push(self.next_offset + 1);
self.seqls.push((seql - self.k) as u8);
unitig
.write_to_binary(&mut self.file)
.map_err(SKError::Io)?;
self.next_offset += 1 + byte_len as u32;
Ok(())
}
/// Flush the sequence file and write the companion `.idx`.
pub fn close(mut self) -> SKResult<()> {
self.file.flush().map_err(SKError::Io)?;
drop(self.file);
// Sentinel: byte offset past the last record's packed bytes.
let sentinel = match (self.packed_offsets.last(), self.seqls.last()) {
(Some(&last_off), Some(&last_seql)) => {
let seql = last_seql as u32 + self.k as u32;
last_off + (seql + 3) / 4
}
_ => 0,
};
self.packed_offsets.push(sentinel);
write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets)
}
pub fn len(&self) -> usize {
self.seqls.len()
}
pub fn is_empty(&self) -> bool {
self.seqls.is_empty()
}
}
/// Write the index file: magic, chunk count (u32 LE), the per-chunk `seqls`
/// bytes, then every packed offset as u32 LE (sentinel included by the caller).
fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32]) -> SKResult<()> {
    let file = File::create(path).map_err(SKError::Io)?;
    let mut out = BufWriter::new(file);
    let count = (seqls.len() as u32).to_le_bytes();

    out.write_all(&MAGIC).map_err(SKError::Io)?;
    out.write_all(&count).map_err(SKError::Io)?;
    out.write_all(seqls).map_err(SKError::Io)?;
    packed_offsets
        .iter()
        .try_for_each(|off| out.write_all(&off.to_le_bytes()))
        .map_err(SKError::Io)?;
    out.flush().map_err(SKError::Io)
}
// ── Reader ────────────────────────────────────────────────────────────────────
/// Read-only random-access view of a unitig file.
///
/// The sequence file is memory-mapped; the index is loaded into RAM on open.
/// All per-kmer operations are O(1) and allocation-free.
pub struct UnitigFileReader {
    /// Memory map of the whole sequence file.
    mmap: Mmap,
    /// Per chunk: `seql - k` as stored in the index file.
    seqls: Vec<u8>,
    /// Per chunk: byte offset of its first packed byte in `mmap`, plus a
    /// trailing sentinel entry.
    packed_offsets: Vec<u32>,
    /// K-mer size read from `obikseq::params::k()` when the reader was opened.
    /// The index file does not store k, so it must match the writer's value.
    k: usize,
}
impl UnitigFileReader {
pub fn open(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let (seqls, packed_offsets) = read_idx(&idx_path(path))?;
let k = obikseq::params::k();
Ok(Self { mmap, seqls, packed_offsets, k })
}
pub fn len(&self) -> usize {
self.seqls.len()
}
pub fn is_empty(&self) -> bool {
self.seqls.is_empty()
}
/// Return the nucleotide length of chunk `i`.
#[inline]
pub fn seql(&self, i: usize) -> usize {
self.seqls[i] as usize + self.k
}
/// Reconstruct chunk `i` as a [`Unitig`]. Allocates a copy of the packed bytes.
pub fn unitig(&self, i: usize) -> Unitig {
let seql = self.seqls[i] as usize + self.k;
let start = self.packed_offsets[i] as usize;
let byte_len = (seql + 3) / 4;
let tail = (seql % 4) as u8;
let bytes = self.mmap[start..start + byte_len].to_vec().into_boxed_slice();
Unitig::new(tail, bytes)
}
/// Extract the raw left-aligned u64 of the kmer at position `j` within chunk `i`.
#[inline]
pub fn raw_kmer(&self, i: usize, j: usize) -> u64 {
let start = self.packed_offsets[i] as usize;
extract_kmer_raw(&self.mmap[start..], j, self.k)
}
/// Return `true` iff the kmer at position `j` of chunk `i` equals `query`.
///
/// O(1), zero allocation. The chunk may store either orientation of the kmer;
/// canonicalization is applied before comparison.
#[inline]
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
}
}
fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
let data = std::fs::read(path).map_err(SKError::Io)?;
let mut pos = 0;
if &data[pos..pos + 4] != &MAGIC {
return Err(SKError::Io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
"unitig index: bad magic",
)));
}
pos += 4;
let n = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize;
pos += 4;
let seqls = data[pos..pos + n].to_vec();
pos += n;
let mut packed_offsets = Vec::with_capacity(n + 1);
for _ in 0..=n {
packed_offsets.push(u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()));
pos += 4;
}
Ok((seqls, packed_offsets))
}
// ── Kmer utilities ────────────────────────────────────────────────────────────
/// Reverse complement of a left-aligned 2-bit kmer of length `k`.
///
/// With the encoding A=00, C=01, G=10, T=11 the complement is a bitwise NOT.
/// The base order is then reversed by reversing all 64 bits and swapping the
/// two bits of every pair back, and the result is shifted to be left-aligned
/// again. The shift discards the bits that came from the unused low padding.
/// Per the original author's note this is meant to match `KmerOf::revcomp`.
///
/// `k` must be in `1..=32`.
#[inline]
fn revcomp_raw(raw: u64, k: usize) -> u64 {
    const ODD_BITS: u64 = 0x5555_5555_5555_5555;
    let mirrored = (!raw).reverse_bits();
    // `reverse_bits` also flipped the two bits inside each base; undo that.
    let bases_reversed = ((mirrored >> 1) & ODD_BITS) | ((mirrored & ODD_BITS) << 1);
    bases_reversed << (64 - 2 * k)
}
/// Canonical form of a left-aligned 2-bit kmer: the numerically smaller of
/// the kmer and its reverse complement.
#[inline]
fn canonical_raw(raw: u64, k: usize) -> u64 {
    let flipped = revcomp_raw(raw, k);
    if flipped < raw {
        flipped
    } else {
        raw
    }
}
// ── Bit extraction ────────────────────────────────────────────────────────────
/// Extract the kmer starting at nucleotide `j` from MSB-first 2-bit packed
/// `bytes`, returned left-aligned in a u64 (unused low bits are zero).
///
/// Bytes past the end of `bytes` are read as zero. `k` must be in `1..=32`.
/// Per the original author's note the layout matches `KmerOf`'s internal
/// representation.
#[inline]
fn extract_kmer_raw(bytes: &[u8], j: usize, k: usize) -> u64 {
    let width = 2 * k; // bits occupied by the kmer
    let first_bit = 2 * j;
    let first_byte = first_bit >> 3;
    let skip = first_bit & 7; // bits to skip in the first byte: 0, 2, 4 or 6
    let span = (skip + width + 7) / 8; // bytes touched, at most 9 for k <= 32

    // Gather the touched bytes big-endian into a window wide enough for 9 bytes.
    let window = (0..span).fold(0u128, |acc, i| {
        let byte = bytes.get(first_byte + i).copied().unwrap_or(0);
        (acc << 8) | u128::from(byte)
    });

    // Drop the bits that follow the kmer, keep exactly `width` bits, left-align.
    let trailing = span * 8 - skip - width;
    let field = (window >> trailing) as u64 & (u64::MAX >> (64 - width));
    field << (64 - width)
}
#[cfg(test)]
#[path = "tests/unitig_index.rs"]
mod tests;