refactor: abstract k-mer types and fix bit alignment

Abstracts k-mer storage using a `RawKmer` alias and `KMER_BITS` constant to simplify bit manipulation and enable future extension to larger types. Updates bit-shifting and masking logic across `kmer.rs` and `packed_seq.rs` to prevent overflow and improve type safety. Adapts the MPHF layer to iterate over indexed canonical k-mers via `iter_indexed_canonical_kmers()`, destructuring each yielded tuple; the existing explicit slot bounds validation and bit-level encoding are kept unchanged. Fixes test suite compilation errors by correcting method names, adding tuple destructuring, replacing `EvidenceKind` with `IndexMode`, and passing the required `IndexMode::Exact` parameter.
This commit is contained in:
Eric Coissac
2026-05-31 20:40:13 +02:00
parent 8b57c91ab7
commit 657f964dda
6 changed files with 84 additions and 70 deletions
+3 -3
View File
@@ -118,7 +118,7 @@ impl MphfLayer {
}
LayerEvidence::Approx { unitigs_path, .. } => {
let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?;
for stored in reader.iter_canonical_kmers() {
for (stored, _, _) in reader.iter_indexed_canonical_kmers() {
if self.mphf.index(&stored.raw()) == slot {
return if stored == kmer { Some(slot) } else { None };
}
@@ -196,7 +196,7 @@ impl MphfLayer {
let mut fw = FingerprintVecWriter::new(n, b);
for kmer in unitigs.iter_canonical_kmers() {
for (kmer, _, _) in unitigs.iter_indexed_canonical_kmers() {
let slot = mphf.index(&kmer.raw());
if slot >= n {
return Err(OLMError::Mphf("slot out of bounds".into()));
@@ -281,7 +281,7 @@ impl MphfLayer {
IndexMode::Approx { b, .. } => {
let mut fw = FingerprintVecWriter::new(n, *b);
for kmer in unitigs2.iter_canonical_kmers() {
for (kmer, _, _) in unitigs2.iter_indexed_canonical_kmers() {
let slot = mphf.index(&kmer.raw());
if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
let byte = slot / 8; let bit = 1u8 << (slot % 8);
+11 -10
View File
@@ -2,7 +2,7 @@ use super::*;
use obicompactvec::PersistentCompactIntMatrix;
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
use obiskio::DEFAULT_BLOCK_BITS;
use crate::meta::EvidenceKind;
use crate::meta::IndexMode;
use tempfile::tempdir;
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
@@ -15,7 +15,8 @@ fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
fn all_canonical_kmers(dir: &Path) -> Vec<CanonicalKmer> {
UnitigFileReader::open_sequential(&dir.join(UNITIGS_FILE)).unwrap()
.iter_canonical_kmers()
.iter_indexed_canonical_kmers()
.map(|(kmer, _, _)| kmer)
.collect()
}
@@ -25,8 +26,8 @@ fn build_and_query_all_kmers_found() {
let dir = tempdir().unwrap();
write_unitigs(dir.path(), &[b"AAAACGT"]);
let kmers = all_canonical_kmers(dir.path());
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact).unwrap();
let layer = Layer::<()>::open(dir.path()).unwrap();
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact).unwrap();
let layer = Layer::<()>::open(dir.path(), &IndexMode::Exact).unwrap();
for kmer in kmers {
assert!(layer.query(kmer).is_some(), "kmer should be present");
}
@@ -43,10 +44,10 @@ fn counts_are_stored_and_retrieved() {
Layer::<PersistentCompactIntMatrix>::build(
dir.path(),
DEFAULT_BLOCK_BITS,
&EvidenceKind::Exact,
&IndexMode::Exact,
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
).unwrap();
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path(), &IndexMode::Exact).unwrap();
for kmer in &kmers {
let hit = layer.query(*kmer).expect("kmer must be present");
assert_eq!(hit.data[0], count_map[kmer]);
@@ -58,8 +59,8 @@ fn query_absent_returns_none() {
set_k(4);
let dir = tempdir().unwrap();
write_unitigs(dir.path(), &[b"AAAACGT"]);
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact).unwrap();
let layer = Layer::<()>::open(dir.path()).unwrap();
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact).unwrap();
let layer = Layer::<()>::open(dir.path(), &IndexMode::Exact).unwrap();
let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
assert!(layer.query(absent).is_none());
}
@@ -69,9 +70,9 @@ fn open_after_build_is_consistent() {
set_k(4);
let dir = tempdir().unwrap();
write_unitigs(dir.path(), &[b"AAAACGT"]);
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact, |_| 7).unwrap();
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact, |_| 7).unwrap();
assert_eq!(n, 4);
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path(), &IndexMode::Exact).unwrap();
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
let hit = layer.query(kmer).expect("AAAA must be present");
assert_eq!(hit.data[0], 7);
+7 -6
View File
@@ -1,6 +1,7 @@
use super::*;
use obicompactvec::PersistentCompactIntMatrix;
use obikseq::{set_k, Sequence as _, Unitig};
use crate::meta::IndexMode;
use tempfile::tempdir;
fn push_unitigs_and_layer(
@@ -24,7 +25,7 @@ fn canonical(ascii: &[u8]) -> CanonicalKmer {
fn create_empty_map() {
set_k(4);
let dir = tempdir().unwrap();
let map = LayeredMap::<()>::create(dir.path()).unwrap();
let map = LayeredMap::<()>::create(dir.path(), IndexMode::Exact).unwrap();
assert_eq!(map.n_layers(), 0);
}
@@ -33,7 +34,7 @@ fn open_reloads_layer_count() {
set_k(4);
let dir = tempdir().unwrap();
{
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
}
let map = LayeredMap::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
@@ -44,7 +45,7 @@ fn open_reloads_layer_count() {
fn query_finds_kmer_in_layer_zero() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);
let kmer = canonical(b"AAAC");
let (layer_idx, hit) = map.query(kmer).expect("kmer must be found");
@@ -56,7 +57,7 @@ fn query_finds_kmer_in_layer_zero() {
fn query_finds_kmer_in_correct_layer() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2);
assert_eq!(map.n_layers(), 2);
@@ -74,7 +75,7 @@ fn query_finds_kmer_in_correct_layer() {
fn query_absent_returns_none() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
let absent = canonical(b"CCCC");
assert!(map.query(absent).is_none());
@@ -84,7 +85,7 @@ fn query_absent_returns_none() {
fn push_layer_from_map_convenience() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
let mut w = map.next_layer_writer().unwrap();
w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
w.close().unwrap();