Files
obikmer/src/obiskio/src/tests/unitig_index.rs
T
Eric Coissac 24afd74e2f refactor: decouple unitig index generation and add exact evidence
Decouple index generation by introducing `build_unitig_idx()` for retroactive `.idx` creation and optional immediate writing on close. Add `open_sequential()` for index-less iteration while enforcing index requirements for random access. Refactor the MPHF layer to pre-generate the unitig index for parallel random access, integrate `rayon` for k-mer processing, and enforce mapping integrity via duplicate slot validation. Additionally, implement `build_exact_evidence()` to reconstruct evidence from existing artifacts, and update tests to leverage the new index generation and simplified k-mer iteration helpers.
2026-05-23 13:02:25 +02:00

368 lines
13 KiB
Rust

use super::*;
use obikseq::{Kmer, Sequence as _, Unitig, set_k};
use tempfile::tempdir;
fn make_unitig(ascii: &[u8]) -> Unitig {
Unitig::from_ascii(ascii)
}
fn canonical_of(ascii: &[u8]) -> CanonicalKmer {
Kmer::from_ascii(ascii).unwrap().canonical()
}
fn write_read(seqs: &[&[u8]]) -> (tempfile::TempDir, UnitigFileReader) {
let dir = tempdir().unwrap();
let path = dir.path().join("unitigs.bin");
let mut w = UnitigFileWriter::create(&path).unwrap();
for s in seqs {
w.write(&make_unitig(s)).unwrap();
}
w.close().unwrap();
super::build_unitig_idx(&path, super::DEFAULT_BLOCK_BITS).unwrap();
let r = UnitigFileReader::open(&path).unwrap();
(dir, r)
}
// ── I/O round-trip ────────────────────────────────────────────────────────────
#[test]
fn roundtrip_empty_index() {
set_k(4);
let dir = tempdir().unwrap();
let path = dir.path().join("unitigs.bin");
let w = UnitigFileWriter::create(&path).unwrap();
w.close().unwrap();
super::build_unitig_idx(&path, super::DEFAULT_BLOCK_BITS).unwrap();
let r = UnitigFileReader::open(&path).unwrap();
assert_eq!(r.len(), 0);
}
#[test]
fn roundtrip_unitigs() {
set_k(4);
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA"];
let (_dir, r) = write_read(seqs);
assert_eq!(r.len(), seqs.len());
for (i, s) in seqs.iter().enumerate() {
assert_eq!(r.unitig(i), make_unitig(s), "unitig {i} mismatch");
}
}
// ── Bit extraction ────────────────────────────────────────────────────────────
#[test]
fn extract_kmer_raw_basic() {
// ACGT = 00 01 10 11 = 0x1B; k=4, j=0 → 0x1B << 56
let bytes = [0x1Bu8];
assert_eq!(extract_kmer_raw(&bytes, 0, 4), 0x1Bu64 << 56);
}
#[test]
fn extract_kmer_raw_intra_byte_offset() {
// ACGT, j=1, k=3 → CGT = 01 10 11 = 0x1B (6 bits) → 0x1B << 58
let bytes = [0x1Bu8];
assert_eq!(extract_kmer_raw(&bytes, 1, 3), 0x1Bu64 << 58);
}
#[test]
fn extract_kmer_raw_cross_byte() {
// Two bytes: ACGT | ACGT = [0x1B, 0x1B]
// j=3, k=4: nucleotides 3..7 = T A C G = 11 00 01 10 = 0b11000110 = 0xC6
let bytes = [0x1Bu8, 0x1Bu8];
assert_eq!(extract_kmer_raw(&bytes, 3, 4), 0xC6u64 << 56);
}
// ── revcomp / canonical ───────────────────────────────────────────────────────
#[test]
fn revcomp_palindrome() {
// ACGT is its own reverse complement
let raw = 0x1Bu64 << 56; // ACGT, k=4
assert_eq!(revcomp_raw(raw, 4), raw);
}
#[test]
fn revcomp_asymmetric() {
// revcomp(TTTG) = CAAA
// TTTG = 11 11 11 10 = 0xFE → 0xFE << 56
// CAAA = 01 00 00 00 = 0x40 → 0x40 << 56
let tttg = 0xFEu64 << 56;
let caaa = 0x40u64 << 56;
assert_eq!(revcomp_raw(tttg, 4), caaa);
assert_eq!(revcomp_raw(caaa, 4), tttg);
}
#[test]
fn canonical_raw_selects_minimum() {
let tttg = 0xFEu64 << 56;
let caaa = 0x40u64 << 56;
assert_eq!(canonical_raw(tttg, 4), caaa); // TTTG → canonical is CAAA
assert_eq!(canonical_raw(caaa, 4), caaa); // CAAA already canonical
}
// ── verify_canonical_kmer ─────────────────────────────────────────────────────
#[test]
fn verify_forward_canonical() {
// CAAA is canonical (< TTTG); stored forward in the unitig → direct match
set_k(4);
let (_dir, r) = write_read(&[b"CAAAACGT"]);
let query = canonical_of(b"CAAA");
assert!(r.verify_canonical_kmer(0, 0, query));
}
#[test]
fn verify_reverse_complement_stored() {
// TTTG stored in the unitig; canonical form is CAAA
// verify must recognise the match despite the stored orientation being non-canonical
set_k(4);
let (_dir, r) = write_read(&[b"TTTGACGT"]);
let query = canonical_of(b"CAAA"); // == canonical_of(b"TTTG")
assert!(r.verify_canonical_kmer(0, 0, query));
}
#[test]
fn verify_wrong_kmer_returns_false() {
set_k(4);
let (_dir, r) = write_read(&[b"TTTGACGT"]);
let wrong = canonical_of(b"AAAC");
assert!(!r.verify_canonical_kmer(0, 0, wrong));
}
#[test]
fn verify_second_unitig_second_position() {
// Two unitigs; check kmer at j=1 of unitig 1 ("TTTGACGT")
// j=1 → nucleotides 1..5 = TTGA
set_k(4);
let (_dir, r) = write_read(&[b"ACGTACGT", b"TTTGACGT"]);
let query = canonical_of(b"TTGA");
assert!(r.verify_canonical_kmer(1, 1, query));
}
// ── iter_kmers ────────────────────────────────────────────────────────────────
#[test]
fn iter_kmers_empty_file() {
set_k(4);
let dir = tempdir().unwrap();
let path = dir.path().join("unitigs.bin");
UnitigFileWriter::create(&path).unwrap().close().unwrap();
super::build_unitig_idx(&path, super::DEFAULT_BLOCK_BITS).unwrap();
let r = UnitigFileReader::open(&path).unwrap();
assert_eq!(r.iter_kmers().count(), 0);
}
#[test]
fn iter_kmers_single_chunk_count_and_order() {
set_k(4);
// "AAAACG": 6 nucl → 3 kmers (k=4)
let (_dir, r) = write_read(&[b"AAAACG"]);
let kmers: Vec<Kmer> = r.iter_kmers().collect();
assert_eq!(kmers.len(), 3);
for (rank, kmer) in kmers.iter().enumerate() {
assert_eq!(kmer.raw(), r.raw_kmer(0, rank), "kmer mismatch at rank {rank}");
}
}
#[test]
fn iter_kmers_two_chunks_order() {
set_k(4);
// "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
let kmers: Vec<Kmer> = r.iter_kmers().collect();
assert_eq!(kmers.len(), 6);
// Chunk 0 first
for rank in 0..3 {
assert_eq!(kmers[rank].raw(), r.raw_kmer(0, rank));
}
// Chunk 1 after
for rank in 0..3 {
assert_eq!(kmers[3 + rank].raw(), r.raw_kmer(1, rank));
}
}
// ── iter_canonical_kmers ──────────────────────────────────────────────────────
#[test]
fn iter_canonical_kmers_all_canonical() {
set_k(4);
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
for kmer in r.iter_canonical_kmers() {
// canonical of a canonical kmer is itself
assert_eq!(kmer.raw(), kmer.canonical().raw());
}
}
#[test]
fn iter_canonical_kmers_matches_iter_kmers() {
set_k(4);
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
let canonical: Vec<CanonicalKmer> = r.iter_canonical_kmers().collect();
let raw: Vec<Kmer> = r.iter_kmers().collect();
assert_eq!(canonical.len(), raw.len());
for (ck, rk) in canonical.iter().zip(raw.iter()) {
assert_eq!(ck.raw(), rk.canonical().raw());
}
}
// ── iter_indexed_canonical_kmers ──────────────────────────────────────────────
#[test]
fn iter_indexed_chunk_id_and_rank_single_chunk() {
set_k(4);
let (_dir, r) = write_read(&[b"AAAACG"]);
let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect();
assert_eq!(items.len(), 3);
for (rank, (kmer, chunk_id, item_rank)) in items.iter().enumerate() {
assert_eq!(*chunk_id, 0, "chunk_id must be 0");
assert_eq!(*item_rank, rank, "rank mismatch");
assert!(r.verify_canonical_kmer(0, rank, *kmer));
}
}
#[test]
fn iter_indexed_chunk_id_and_rank_two_chunks() {
set_k(4);
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect();
assert_eq!(items.len(), 6);
// First 3 items: chunk_id=0, rank 0..2
for rank in 0..3 {
let (kmer, chunk_id, item_rank) = items[rank];
assert_eq!(chunk_id, 0);
assert_eq!(item_rank, rank);
assert!(r.verify_canonical_kmer(0, rank, kmer));
}
// Next 3 items: chunk_id=1, rank resets to 0
for rank in 0..3 {
let (kmer, chunk_id, item_rank) = items[3 + rank];
assert_eq!(chunk_id, 1);
assert_eq!(item_rank, rank);
assert!(r.verify_canonical_kmer(1, rank, kmer));
}
}
// ── Splitting ─────────────────────────────────────────────────────────────────
#[test]
fn short_unitig_not_split() {
// seql=259 → n_kmers=256 = MAX_KMERS_PER_CHUNK → no split
set_k(4);
let seq: Vec<u8> = (0..259_usize).map(|i| b"ACGT"[i % 4]).collect();
let (_dir, r) = write_read(&[&seq]);
assert_eq!(r.len(), 1);
assert_eq!(r.seql(0), 259);
}
#[test]
fn long_unitig_split_no_kmer_lost() {
// seql=260 → n_kmers=257 > MAX_KMERS_PER_CHUNK(256) → 2 chunks
// chunk_nucl=259, stride=256
// Chunk 0: nucl 0..259 (259 nucl, 256 kmers)
// Chunk 1: nucl 256..260 (4 nucl, 1 kmer)
set_k(4);
let seq: Vec<u8> = (0..260_usize).map(|i| b"ACGT"[i % 4]).collect();
let (_dir, r) = write_read(&[&seq]);
assert_eq!(r.len(), 2);
assert_eq!(r.seql(0), 259);
assert_eq!(r.seql(1), 4);
// k-1=3 nucleotide overlap → 0 kmers duplicated, 0 kmers lost.
// Last kmer of chunk 0 = original nucl 255..259.
assert!(r.verify_canonical_kmer(0, 255, canonical_of(&seq[255..259])));
// First kmer of chunk 1 = original nucl 256..260 — a different, adjacent kmer.
assert!(r.verify_canonical_kmer(1, 0, canonical_of(&seq[256..260])));
}
// ── block_bits parametrisation ────────────────────────────────────────────────
fn write_read_bb(seqs: &[&[u8]], block_bits: u8) -> (tempfile::TempDir, UnitigFileReader) {
let dir = tempdir().unwrap();
let path = dir.path().join("unitigs.bin");
let mut w = UnitigFileWriter::create_with_block_bits(&path, block_bits).unwrap();
for s in seqs {
w.write(&make_unitig(s)).unwrap();
}
w.close().unwrap();
super::build_unitig_idx(&path, block_bits).unwrap();
let r = UnitigFileReader::open(&path).unwrap();
(dir, r)
}
#[test]
fn block_bits_stored_and_read_back() {
set_k(4);
for bb in [0u8, 1, 2, 3, 6] {
let dir = tempdir().unwrap();
let path = dir.path().join("unitigs.bin");
let w = UnitigFileWriter::create_with_block_bits(&path, bb).unwrap();
assert_eq!(w.block_bits(), bb);
w.close().unwrap();
super::build_unitig_idx(&path, bb).unwrap();
let r = UnitigFileReader::open(&path).unwrap();
assert_eq!(r.block_bits(), bb, "block_bits={bb} not preserved");
}
}
#[test]
fn block_bits_zero_exact_offsets() {
// block_bits=0 → one offset per chunk, no sequential scan in chunk_start
set_k(4);
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG"];
let (_dir, r) = write_read_bb(seqs, 0);
assert_eq!(r.len(), seqs.len());
for (i, s) in seqs.iter().enumerate() {
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=0: unitig {i} mismatch");
}
}
#[test]
fn block_bits_one_block_size_two() {
// block_bits=1 → BLOCK_SIZE=2: every other chunk gets an offset
set_k(4);
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA", b"AAAACG", b"CCCCGT"];
let (_dir, r) = write_read_bb(seqs, 1);
assert_eq!(r.len(), seqs.len());
for (i, s) in seqs.iter().enumerate() {
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=1: unitig {i} mismatch");
}
}
#[test]
fn block_bits_larger_than_chunk_count() {
// block_bits=6 (BLOCK_SIZE=64) with only 3 chunks: all in block 0
set_k(4);
let seqs: &[&[u8]] = &[b"ACGTACGT", b"TTTTCCCC", b"GGGAAA"];
let (_dir, r) = write_read_bb(seqs, 6);
assert_eq!(r.len(), seqs.len());
for (i, s) in seqs.iter().enumerate() {
assert_eq!(r.unitig(i), make_unitig(s), "block_bits=6: unitig {i} mismatch");
}
}
#[test]
fn block_bits_random_access_matches_sequential() {
// Write many chunks with block_bits=2 (BLOCK_SIZE=4), verify random access
// against sequential iteration for every chunk.
set_k(4);
let seqs: Vec<Vec<u8>> = (0..20_usize)
.map(|i| (0..8_usize).map(|j| b"ACGT"[(i + j) % 4]).collect())
.collect();
let seq_refs: Vec<&[u8]> = seqs.iter().map(|s| s.as_slice()).collect();
let (_dir, r) = write_read_bb(&seq_refs, 2);
assert_eq!(r.len(), seqs.len());
let sequential: Vec<Unitig> = r.iter_chunks_sequential().map(|(_, u)| u).collect();
for i in 0..seqs.len() {
assert_eq!(r.unitig(i), sequential[i], "random vs sequential mismatch at chunk {i}");
}
}
#[test]
fn block_bits_kmer_count_preserved() {
set_k(4);
// "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers; total = 6
for bb in [0u8, 1, 2, 6] {
let (_dir, r) = write_read_bb(&[b"AAAACG", b"CCCCAG"], bb);
assert_eq!(r.n_kmers(), 6, "block_bits={bb}: n_kmers mismatch");
}
}