feat: introduce column-major matrix storage and migrate layered map
Introduces `PersistentBitMatrix` and `PersistentCompactIntMatrix` to replace single-file vector storage with a column-major, directory-based layout. Each column is persisted as an individual file alongside a lightweight `meta.json` for dimension tracking. Migrates `obilayeredmap` to use these multi-column structures, updating Rust APIs, query return types, and build signatures. Adds comprehensive documentation plus unit and integration tests for persistence and accessors, and refactors the distance calculation helpers.
This commit is contained in:
@@ -4,7 +4,10 @@ use std::path::Path;
|
||||
|
||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
use obicompactvec::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||
use obicompactvec::{
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||
};
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||
@@ -15,7 +18,8 @@ use crate::evidence::{Evidence, EvidenceWriter};
|
||||
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
||||
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
||||
const EVIDENCE_FILE: &str = "evidence.bin";
|
||||
const COUNTS_FILE: &str = "counts.pciv";
|
||||
const COUNTS_DIR: &str = "counts";
|
||||
const PRESENCE_DIR: &str = "presence";
|
||||
|
||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
@@ -33,12 +37,20 @@ impl LayerData for () {
|
||||
fn read(&self, _slot: usize) {}
|
||||
}
|
||||
|
||||
impl LayerData for PersistentCompactIntVec {
|
||||
type Item = u32;
|
||||
impl LayerData for PersistentCompactIntMatrix {
|
||||
type Item = Box<[u32]>;
|
||||
fn open(layer_dir: &Path) -> OLMResult<Self> {
|
||||
PersistentCompactIntVec::open(&layer_dir.join(COUNTS_FILE)).map_err(OLMError::Io)
|
||||
PersistentCompactIntMatrix::open(&layer_dir.join(COUNTS_DIR)).map_err(OLMError::Io)
|
||||
}
|
||||
fn read(&self, slot: usize) -> u32 { self.get(slot) }
|
||||
fn read(&self, slot: usize) -> Box<[u32]> { self.row(slot) }
|
||||
}
|
||||
|
||||
impl LayerData for PersistentBitMatrix {
|
||||
type Item = Box<[bool]>;
|
||||
fn open(layer_dir: &Path) -> OLMResult<Self> {
|
||||
PersistentBitMatrix::open(&layer_dir.join(PRESENCE_DIR)).map_err(OLMError::Io)
|
||||
}
|
||||
fn read(&self, slot: usize) -> Box<[bool]> { self.row(slot) }
|
||||
}
|
||||
|
||||
// ── Structures ────────────────────────────────────────────────────────────────
|
||||
@@ -151,27 +163,31 @@ impl Layer<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 2 — counts (PersistentCompactIntVec) ─────────────────────────────────
|
||||
// ── Mode 2 — count matrix (1 column per layer) ────────────────────────────────
|
||||
|
||||
impl Layer<PersistentCompactIntVec> {
|
||||
impl Layer<PersistentCompactIntMatrix> {
|
||||
pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||
let n = unitigs.n_kmers();
|
||||
let counts_dir = out_dir.join(COUNTS_DIR);
|
||||
if n == 0 {
|
||||
empty_layer(out_dir)?;
|
||||
PersistentCompactIntVecBuilder::new(0, &out_dir.join(COUNTS_FILE))
|
||||
.and_then(|b| b.close())
|
||||
let mut mb = PersistentCompactIntMatrixBuilder::new(0, &counts_dir)
|
||||
.map_err(OLMError::Io)?;
|
||||
mb.add_col().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?;
|
||||
mb.close().map_err(OLMError::Io)?;
|
||||
return Ok(0);
|
||||
}
|
||||
let mphf = build_mphf(out_dir, n)?;
|
||||
let mut cnt = PersistentCompactIntVecBuilder::new(n, &out_dir.join(COUNTS_FILE))
|
||||
let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir)
|
||||
.map_err(OLMError::Io)?;
|
||||
let mut col = mb.add_col().map_err(OLMError::Io)?;
|
||||
build_second_pass(out_dir, n, &mphf, &mut |slot, kmer| {
|
||||
cnt.set(slot, count_of(kmer));
|
||||
col.set(slot, count_of(kmer));
|
||||
Ok(())
|
||||
})?;
|
||||
cnt.close().map_err(OLMError::Io)?;
|
||||
col.close().map_err(OLMError::Io)?;
|
||||
mb.close().map_err(OLMError::Io)?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
@@ -183,6 +199,49 @@ impl Layer<PersistentCompactIntVec> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 3 — presence/absence matrix (1 column per genome) ───────────────────
|
||||
|
||||
impl Layer<PersistentBitMatrix> {
|
||||
pub fn build_presence(
|
||||
out_dir: &Path,
|
||||
n_genomes: usize,
|
||||
present_in: impl Fn(CanonicalKmer, usize) -> bool,
|
||||
) -> OLMResult<usize> {
|
||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||
let n = unitigs.n_kmers();
|
||||
let presence_dir = out_dir.join(PRESENCE_DIR);
|
||||
if n == 0 {
|
||||
empty_layer(out_dir)?;
|
||||
let mut mb = PersistentBitMatrixBuilder::new(0, &presence_dir)
|
||||
.map_err(OLMError::Io)?;
|
||||
for _ in 0..n_genomes {
|
||||
mb.add_col().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?;
|
||||
}
|
||||
mb.close().map_err(OLMError::Io)?;
|
||||
return Ok(0);
|
||||
}
|
||||
let mphf = build_mphf(out_dir, n)?;
|
||||
|
||||
let mut mb = PersistentBitMatrixBuilder::new(n, &presence_dir).map_err(OLMError::Io)?;
|
||||
let mut cols: Vec<_> = (0..n_genomes)
|
||||
.map(|_| mb.add_col().map_err(OLMError::Io))
|
||||
.collect::<OLMResult<_>>()?;
|
||||
|
||||
build_second_pass(out_dir, n, &mphf, &mut |slot, kmer| {
|
||||
for (g, col) in cols.iter_mut().enumerate() {
|
||||
col.set(slot, present_in(kmer, g));
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
for col in cols {
|
||||
col.close().map_err(OLMError::Io)?;
|
||||
}
|
||||
mb.close().map_err(OLMError::Io)?;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/layer.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use obicompactvec::PersistentCompactIntVec;
|
||||
use obicompactvec::PersistentCompactIntMatrix;
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::UnitigFileWriter;
|
||||
|
||||
@@ -96,13 +96,13 @@ impl LayeredMap<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 2 — counts ───────────────────────────────────────────────────────────
|
||||
// ── Mode 2 — count matrix ─────────────────────────────────────────────────────
|
||||
|
||||
impl LayeredMap<PersistentCompactIntVec> {
|
||||
impl LayeredMap<PersistentCompactIntMatrix> {
|
||||
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::<PersistentCompactIntVec>::build(&dir, count_of)?;
|
||||
Layer::<PersistentCompactIntMatrix>::build(&dir, count_of)?;
|
||||
self.append_layer()?;
|
||||
Ok(i)
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::*;
|
||||
use obicompactvec::PersistentCompactIntVec;
|
||||
use obicompactvec::PersistentCompactIntMatrix;
|
||||
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
||||
use tempfile::tempdir;
|
||||
|
||||
@@ -44,14 +44,14 @@ fn counts_are_stored_and_retrieved() {
|
||||
let kmers = all_canonical_kmers(dir.path(), 4);
|
||||
let count_map: HashMap<CanonicalKmer, u32> =
|
||||
kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect();
|
||||
Layer::<PersistentCompactIntVec>::build(
|
||||
Layer::<PersistentCompactIntMatrix>::build(
|
||||
dir.path(),
|
||||
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
|
||||
).unwrap();
|
||||
let layer = Layer::<PersistentCompactIntVec>::open(dir.path()).unwrap();
|
||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||
for kmer in &kmers {
|
||||
let hit = layer.query(*kmer).expect("kmer must be present");
|
||||
assert_eq!(hit.data, count_map[kmer]);
|
||||
assert_eq!(hit.data[0], count_map[kmer]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,10 +71,10 @@ fn open_after_build_is_consistent() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||
let n = Layer::<PersistentCompactIntVec>::build(dir.path(), |_| 7).unwrap();
|
||||
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), |_| 7).unwrap();
|
||||
assert_eq!(n, 4);
|
||||
let layer = Layer::<PersistentCompactIntVec>::open(dir.path()).unwrap();
|
||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
|
||||
let hit = layer.query(kmer).expect("AAAA must be present");
|
||||
assert_eq!(hit.data, 7);
|
||||
assert_eq!(hit.data[0], 7);
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use super::*;
|
||||
use obicompactvec::PersistentCompactIntVec;
|
||||
use obicompactvec::PersistentCompactIntMatrix;
|
||||
use obikseq::{set_k, Sequence as _, Unitig};
|
||||
use tempfile::tempdir;
|
||||
|
||||
fn push_unitigs_and_layer(
|
||||
map: &mut LayeredMap<PersistentCompactIntVec>,
|
||||
map: &mut LayeredMap<PersistentCompactIntMatrix>,
|
||||
seqs: &[&[u8]],
|
||||
count: u32,
|
||||
) {
|
||||
@@ -33,10 +33,10 @@ fn open_reloads_layer_count() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
{
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||
}
|
||||
let map = LayeredMap::<PersistentCompactIntVec>::open(dir.path()).unwrap();
|
||||
let map = LayeredMap::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||
assert_eq!(map.n_layers(), 1);
|
||||
}
|
||||
|
||||
@@ -44,37 +44,37 @@ fn open_reloads_layer_count() {
|
||||
fn query_finds_kmer_in_layer_zero() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);
|
||||
let kmer = canonical(b"AAAC");
|
||||
let (layer_idx, hit) = map.query(kmer).expect("kmer must be found");
|
||||
assert_eq!(layer_idx, 0);
|
||||
assert_eq!(hit.data, 3);
|
||||
assert_eq!(hit.data[0], 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn query_finds_kmer_in_correct_layer() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||
push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2);
|
||||
assert_eq!(map.n_layers(), 2);
|
||||
|
||||
let (li, hit) = map.query(canonical(b"AAAA")).expect("AAAA must be found");
|
||||
assert_eq!(li, 0);
|
||||
assert_eq!(hit.data, 1);
|
||||
assert_eq!(hit.data[0], 1);
|
||||
|
||||
let (li, hit) = map.query(canonical(b"GGGA")).expect("GGGA must be found");
|
||||
assert_eq!(li, 1);
|
||||
assert_eq!(hit.data, 2);
|
||||
assert_eq!(hit.data[0], 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn query_absent_returns_none() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||
let absent = canonical(b"CCCC");
|
||||
assert!(map.query(absent).is_none());
|
||||
@@ -84,7 +84,7 @@ fn query_absent_returns_none() {
|
||||
fn push_layer_from_map_convenience() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
|
||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
||||
let mut w = map.next_layer_writer().unwrap();
|
||||
w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
|
||||
w.close().unwrap();
|
||||
@@ -93,5 +93,5 @@ fn push_layer_from_map_convenience() {
|
||||
].into_iter().collect();
|
||||
map.push_layer_from_map(&counts).unwrap();
|
||||
let (_, hit) = map.query(canonical(b"AAAA")).unwrap();
|
||||
assert_eq!(hit.data, 10);
|
||||
assert_eq!(hit.data[0], 10);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user