feat: introduce column-major matrix storage and migrate layered map

Introduces `PersistentBitMatrix` and `PersistentCompactIntMatrix` to replace single-file vector storage with a column-major, directory-based layout. Each column is persisted as an individual file alongside a lightweight `meta.json` for dimension tracking. Migrates `obilayeredmap` to use these multi-column structures, updating Rust APIs, query return types, and build signatures. Includes comprehensive documentation, unit and integration tests for persistence and accessors, and refactors distance calculation helpers.
This commit is contained in:
Eric Coissac
2026-05-14 09:31:11 +08:00
parent f48f7500cd
commit b218bf012b
15 changed files with 843 additions and 201 deletions
+57
View File
@@ -0,0 +1,57 @@
use std::{fs, io, path::{Path, PathBuf}};
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
use crate::meta::MatrixMeta;
/// Builds the path of the file holding bit column `col` inside matrix directory `dir`.
///
/// The file name is `col_` followed by the column index zero-padded to at least six
/// digits and the `.pbiv` extension.
fn col_path(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{col:06}.pbiv");
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
/// Read-only, column-major bit matrix: one [`PersistentBitVec`] per column, all living in
/// the same directory next to a `meta.json` describing the dimensions.
pub struct PersistentBitMatrix {
    /// Opened columns, in column-index order.
    cols: Vec<PersistentBitVec>,
    /// Row count as recorded in the directory metadata.
    n: usize,
}

impl PersistentBitMatrix {
    /// Opens the matrix stored in `dir`.
    ///
    /// The metadata is loaded first, then columns `0..n_cols` are opened in order.
    ///
    /// # Errors
    /// Returns the first I/O error raised while loading the metadata or opening a column.
    ///
    /// NOTE(review): the length of each column is not compared with the row count from the
    /// metadata here — confirm whether `PersistentBitVec` offers a way to check it.
    pub fn open(dir: &Path) -> io::Result<Self> {
        let meta = MatrixMeta::load(dir)?;
        let mut cols = Vec::new();
        for c in 0..meta.n_cols {
            cols.push(PersistentBitVec::open(&col_path(dir, c))?);
        }
        Ok(Self { cols, n: meta.n })
    }

    /// Number of rows, as recorded in the metadata.
    pub fn n(&self) -> usize {
        self.n
    }

    /// Number of columns that were opened.
    pub fn n_cols(&self) -> usize {
        self.cols.len()
    }

    /// Borrows column `c`.
    ///
    /// # Panics
    /// Panics if `c` is not a valid column index.
    pub fn col(&self, c: usize) -> &PersistentBitVec {
        &self.cols[c]
    }

    /// Gathers the bit stored at `slot` in every column, in column order.
    pub fn row(&self, slot: usize) -> Box<[bool]> {
        let bits: Vec<bool> = self.cols.iter().map(|column| column.get(slot)).collect();
        bits.into_boxed_slice()
    }
}
/// Builder for a column-major bit matrix stored as a directory.
///
/// Columns are created one at a time with [`add_col`](Self::add_col); each returned
/// [`PersistentBitVecBuilder`] is an independent value that this type does not close.
/// [`close`](Self::close) only records the dimensions in `meta.json`.
pub struct PersistentBitMatrixBuilder {
    /// Directory receiving the column files and the metadata.
    dir: PathBuf,
    /// Number of rows given to every column builder.
    n: usize,
    /// Number of columns successfully created so far.
    n_cols: usize,
}

impl PersistentBitMatrixBuilder {
    /// Starts a matrix of `n` rows in `dir`, creating the directory (and any missing
    /// parents) if needed.
    ///
    /// # Errors
    /// Returns the error raised by the directory creation.
    pub fn new(n: usize, dir: &Path) -> io::Result<Self> {
        fs::create_dir_all(dir)?;
        Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
    }

    /// Number of rows of the matrix being built.
    pub fn n(&self) -> usize {
        self.n
    }

    /// Number of columns created so far.
    pub fn n_cols(&self) -> usize {
        self.n_cols
    }

    /// Creates the builder for the next column and returns it.
    ///
    /// # Errors
    /// Returns the error raised while creating the column builder. In that case the column
    /// count is left untouched, so the same column index is reused by the next call.
    pub fn add_col(&mut self) -> io::Result<PersistentBitVecBuilder> {
        let path = col_path(&self.dir, self.n_cols);
        let col = PersistentBitVecBuilder::new(self.n, &path)?;
        // Count the column only after its builder was created: otherwise a failed creation
        // would make `close` record a column that readers can never open.
        self.n_cols += 1;
        Ok(col)
    }

    /// Writes `meta.json` with the row count and the number of columns created.
    ///
    /// # Errors
    /// Returns the error raised while writing the metadata file.
    pub fn close(self) -> io::Result<()> {
        MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
    }
}
+58
View File
@@ -0,0 +1,58 @@
use std::{fs, io, path::{Path, PathBuf}};
use crate::builder::PersistentCompactIntVecBuilder;
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
/// Builds the path of the file holding integer column `col` inside matrix directory `dir`.
///
/// The file name is `col_` followed by the column index zero-padded to at least six
/// digits and the `.pciv` extension.
fn col_path(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{col:06}.pciv");
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
/// Read-only, column-major matrix of `u32` values: one [`PersistentCompactIntVec`] per
/// column, all living in the same directory next to a `meta.json` describing the dimensions.
pub struct PersistentCompactIntMatrix {
    /// Opened columns, in column-index order.
    cols: Vec<PersistentCompactIntVec>,
    /// Row count as recorded in the directory metadata.
    n: usize,
}

impl PersistentCompactIntMatrix {
    /// Opens the matrix stored in `dir`.
    ///
    /// The metadata is loaded first, then columns `0..n_cols` are opened in order and each
    /// one is checked to hold exactly the number of rows announced by the metadata.
    ///
    /// # Errors
    /// Returns the first I/O error raised while loading the metadata or opening a column,
    /// or an [`io::ErrorKind::InvalidData`] error when a column length disagrees with the
    /// metadata.
    pub fn open(dir: &Path) -> io::Result<Self> {
        let meta = MatrixMeta::load(dir)?;
        let mut cols = Vec::new();
        for c in 0..meta.n_cols {
            let col = PersistentCompactIntVec::open(&col_path(dir, c))?;
            // A column shorter than the declared row count would let `row` read outside
            // that column's data instead of failing, so reject the mismatch up front.
            if col.len() != meta.n {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!(
                        "column {c} holds {} slots but meta.json declares {}",
                        col.len(),
                        meta.n
                    ),
                ));
            }
            cols.push(col);
        }
        Ok(Self { cols, n: meta.n })
    }

    /// Number of rows, as recorded in the metadata.
    pub fn n(&self) -> usize {
        self.n
    }

    /// Number of columns that were opened.
    pub fn n_cols(&self) -> usize {
        self.cols.len()
    }

    /// Borrows column `c`.
    ///
    /// # Panics
    /// Panics if `c` is not a valid column index.
    pub fn col(&self, c: usize) -> &PersistentCompactIntVec {
        &self.cols[c]
    }

    /// Gathers the value stored at `slot` in every column, in column order.
    pub fn row(&self, slot: usize) -> Box<[u32]> {
        self.cols.iter().map(|c| c.get(slot)).collect()
    }
}
/// Builder for a column-major integer matrix stored as a directory.
///
/// Columns are created one at a time with [`add_col`](Self::add_col); each returned
/// [`PersistentCompactIntVecBuilder`] is an independent value that this type does not
/// close. [`close`](Self::close) only records the dimensions in `meta.json`.
pub struct PersistentCompactIntMatrixBuilder {
    /// Directory receiving the column files and the metadata.
    dir: PathBuf,
    /// Number of rows given to every column builder.
    n: usize,
    /// Number of columns successfully created so far.
    n_cols: usize,
}

impl PersistentCompactIntMatrixBuilder {
    /// Starts a matrix of `n` rows in `dir`, creating the directory (and any missing
    /// parents) if needed.
    ///
    /// # Errors
    /// Returns the error raised by the directory creation.
    pub fn new(n: usize, dir: &Path) -> io::Result<Self> {
        fs::create_dir_all(dir)?;
        Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
    }

    /// Number of rows of the matrix being built.
    pub fn n(&self) -> usize {
        self.n
    }

    /// Number of columns created so far.
    pub fn n_cols(&self) -> usize {
        self.n_cols
    }

    /// Creates the builder for the next column and returns it.
    ///
    /// # Errors
    /// Returns the error raised while creating the column builder. In that case the column
    /// count is left untouched, so the same column index is reused by the next call.
    pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
        let path = col_path(&self.dir, self.n_cols);
        let col = PersistentCompactIntVecBuilder::new(self.n, &path)?;
        // Count the column only after its builder was created: otherwise a failed creation
        // would make `close` record a column that readers can never open.
        self.n_cols += 1;
        Ok(col)
    }

    /// Writes `meta.json` with the row count and the number of columns created.
    ///
    /// # Errors
    /// Returns the error raised while writing the metadata file.
    pub fn close(self) -> io::Result<()> {
        MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
    }
}
+5
View File
@@ -1,10 +1,15 @@
mod bitvec;
mod bitmatrix;
mod builder;
mod format;
mod intmatrix;
mod meta;
mod reader;
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder};
pub use builder::PersistentCompactIntVecBuilder;
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
pub use reader::PersistentCompactIntVec;
#[cfg(test)]
+32
View File
@@ -0,0 +1,32 @@
use std::{fs, io, path::Path};
/// Dimensions of a matrix directory, persisted as the `meta.json` file inside it.
pub struct MatrixMeta {
    /// Number of rows (slots per column).
    pub n: usize,
    /// Number of column files.
    pub n_cols: usize,
}

impl MatrixMeta {
    /// Reads and parses `meta.json` from `dir`.
    ///
    /// # Errors
    /// Returns the read error, or an [`io::ErrorKind::InvalidData`] error when the content
    /// cannot be parsed.
    pub fn load(dir: &Path) -> io::Result<Self> {
        let text = fs::read_to_string(dir.join("meta.json"))?;
        match parse(&text) {
            Some(meta) => Ok(meta),
            None => Err(io::Error::new(io::ErrorKind::InvalidData, "bad meta.json")),
        }
    }

    /// Writes the dimensions to `meta.json` in `dir` as a single-line JSON object.
    ///
    /// # Errors
    /// Returns the error raised while writing the file.
    pub fn save(&self, dir: &Path) -> io::Result<()> {
        let target = dir.join("meta.json");
        let body = format!("{{\"n\":{},\"n_cols\":{}}}\n", self.n, self.n_cols);
        fs::write(target, body)
    }
}
/// Extracts both dimensions from the text of a `meta.json` file.
///
/// Returns `None` as soon as one of the `n` / `n_cols` fields cannot be read.
fn parse(s: &str) -> Option<MatrixMeta> {
    let n = field(s, "n")?;
    let n_cols = field(s, "n_cols")?;
    Some(MatrixMeta { n, n_cols })
}
/// Reads the unsigned integer stored under key `name` in the JSON-like text `s`.
///
/// The key is searched as `"name":` (first occurrence); whitespace after the colon is
/// skipped and the following run of ASCII digits is parsed as a `usize`.
///
/// Returns `None` when the key is missing, when no digits follow it, when the number
/// does not fit in a `usize`, or when the digits are not terminated by `,`, `}` or
/// whitespace. The last rule rejects values such as `12abc` or `1.5`, and numbers cut
/// short by a truncated file, instead of silently using their leading digits.
fn field(s: &str, name: &str) -> Option<usize> {
    let key = format!("\"{}\":", name);
    let (_, after_key) = s.split_once(key.as_str())?;
    let rest = after_key.trim_start();
    let digits_len = rest
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(rest.len());
    let (digits, tail) = rest.split_at(digits_len);
    match tail.chars().next() {
        // An empty `digits` fails to parse, so a missing value still yields `None`.
        Some(c) if c == ',' || c == '}' || c.is_whitespace() => digits.parse().ok(),
        _ => None,
    }
}
+201 -57
View File
@@ -7,41 +7,45 @@ use memmap2::Mmap;
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE};
pub struct PersistentCompactIntVec {
mmap: Mmap,
n: usize,
n_overflow: usize,
pub step: usize,
index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index
primary_offset: usize, // = HEADER_SIZE
data_offset: usize, // = HEADER_SIZE + n
path: PathBuf,
mmap: Mmap,
n: usize,
n_overflow: usize,
pub step: usize,
index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index
primary_offset: usize, // = HEADER_SIZE
data_offset: usize, // = HEADER_SIZE + n
path: PathBuf,
}
impl PersistentCompactIntVec {
/// Opens a persistent compact int vector from the given path.
pub fn open(path: &Path) -> io::Result<Self> {
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
if mmap.len() < HEADER_SIZE {
return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short"));
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"PCIV file too short",
));
}
if &mmap[0..4] != &MAGIC {
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
}
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
let primary_offset = HEADER_SIZE;
let data_offset = primary_offset + n;
let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
let data_offset = primary_offset + n;
let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
let mut index = Vec::with_capacity(n_index);
for i in 0..n_index {
let off = index_offset + i * INDEX_ENTRY_SIZE;
let off = index_offset + i * INDEX_ENTRY_SIZE;
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
index.push((slot, pos));
}
@@ -57,36 +61,44 @@ impl PersistentCompactIntVec {
})
}
/// Returns the path of the compact int vector file.
pub fn path(&self) -> &Path {
&self.path
}
/// Returns the length of the compact int vector.
pub fn len(&self) -> usize {
self.n
}
/// Returns whether the compact int vector is empty.
pub fn is_empty(&self) -> bool {
self.n == 0
}
/// Returns the value at the given slot.
pub fn get(&self, slot: usize) -> u32 {
match self.mmap[self.primary_offset + slot] {
255 => self.overflow_get(slot),
v => v as u32,
v => v as u32,
}
}
/// Returns the value at the given slot from the overflow region.
fn overflow_get(&self, slot: usize) -> u32 {
let pos_start;
let pos_end;
if self.step == 0 {
pos_start = 0;
pos_end = self.n_overflow;
pos_end = self.n_overflow;
} else {
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
let i = self
.index
.partition_point(|&(s, _)| s <= slot)
.saturating_sub(1);
pos_start = self.index[i].1;
pos_end = if i + 1 < self.index.len() {
pos_end = if i + 1 < self.index.len() {
self.index[i + 1].1
} else {
self.n_overflow
@@ -98,8 +110,8 @@ impl PersistentCompactIntVec {
while lo < hi {
let mid = lo + (hi - lo) / 2;
match self.data_slot(mid).cmp(&slot) {
std::cmp::Ordering::Equal => return self.data_value(mid),
std::cmp::Ordering::Less => lo = mid + 1,
std::cmp::Ordering::Equal => return self.data_value(mid),
std::cmp::Ordering::Less => lo = mid + 1,
std::cmp::Ordering::Greater => hi = mid,
}
}
@@ -107,85 +119,203 @@ impl PersistentCompactIntVec {
}
#[inline]
/// Returns the slot at the given index in the overflow region.
fn data_slot(&self, i: usize) -> usize {
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
}
#[inline]
/// Returns the value at the given index in the overflow region.
fn data_value(&self, i: usize) -> u32 {
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
}
#[inline]
/// Returns the sum of all values in the compact int vector.
pub fn sum(&self) -> u64 {
self.iter().map(|v| v as u64).sum()
}
#[inline]
/// Returns the Bray-Curtis distance between two compact int vectors.
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
let (sum_min, sum_a, sum_b) = self.iter().zip(other.iter()).fold(
(0u64, 0u64, 0u64),
|(sm, sa, sb), (a, b)| (sm + a.min(b) as u64, sa + a as u64, sb + b as u64),
);
let denom = sum_a + sum_b;
if denom == 0 { return 0.0; }
let (sum_min, denom) = self.partial_bray_dist(other);
if denom == 0 {
return 0.0;
}
1.0 - 2.0 * sum_min as f64 / denom as f64
}
/// Accumulates the integer terms from which a Bray-Curtis distance is derived.
///
/// Returns `(sum_min, denom)`, where `sum_min` is the sum over all slots of the smaller of
/// the two values and `denom` is the sum of all values of `self` plus the sum of all
/// values of `other`.
///
/// [`Self::bray_dist`] turns this pair into a distance; returning the raw terms lets a
/// caller add them up over several vector pairs before doing so.
///
/// # Panics
/// Panics with "length mismatch" when the two vectors do not have the same length.
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> (u64, u64) {
    assert_eq!(self.n, other.len(), "length mismatch");
    let mut sum_min = 0u64;
    let mut sum_a = 0u64;
    let mut sum_b = 0u64;
    for (a, b) in self.iter().zip(other.iter()) {
        sum_min += a.min(b) as u64;
        sum_a += a as u64;
        sum_b += b as u64;
    }
    (sum_min, sum_a + sum_b)
}
/// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
///
/// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts.
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
let sum_a = self.sum() as f64;
let sum_b = other.sum() as f64;
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
let sum_min: f64 = self.iter().zip(other.iter())
if sum_a == 0.0 && sum_b == 0.0 {
return 0.0;
}
let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b);
1.0 - sum_min
}
/// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors.
///
/// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency
/// Bray-Curtis distance over a set of vector pairs.
///
/// Arguments:
/// - `other`: the other compact int vector to compare with
/// - `sum_a`: the sum of the first vector's counts
/// - `sum_b`: the sum of the second vector's counts
///
/// Returns the sum of the minimum relative frequencies at each index.
pub fn partial_relfreq_bray_dist(
&self,
other: &PersistentCompactIntVec,
sum_a: f64,
sum_b: f64,
) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
let sum_min: f64 = self
.iter()
.zip(other.iter())
.map(|(a, b)| {
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
pa.min(pb)
})
.sum();
1.0 - sum_min
sum_min
}
/// Returns the euclidean distance between two compact int vectors.
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
let sq: f64 = self.iter().zip(other.iter())
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
.sum();
sq.sqrt()
self.partial_euclidean_dist(other).sqrt()
}
/// Returns the sum of squared differences between corresponding values of the two vectors.
///
/// [`Self::euclidean_dist`] takes the square root of this value; exposing the raw sum lets
/// a caller accumulate it over several vector pairs first.
///
/// # Panics
/// Panics with "length mismatch" when the two vectors do not have the same length.
pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
    assert_eq!(self.n, other.len(), "length mismatch");
    let pairs = self.iter().zip(other.iter());
    pairs
        .map(|(a, b)| a as f64 - b as f64)
        .map(|diff| diff * diff)
        .sum()
}
/// Returns the relative frequency euclidean distance between two compact int vectors.
///
/// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts.
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
let sum_a = self.sum() as f64;
let sum_b = other.sum() as f64;
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
let sq: f64 = self.iter().zip(other.iter())
if sum_a == 0.0 && sum_b == 0.0 {
return 0.0;
}
self.partial_relfreq_euclidean_dist(other, sum_a, sum_b)
.sqrt()
}
/// Returns the partial relative frequency euclidean distance between two compact int vectors.
///
/// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency
/// euclidean distance over a set of vector pairs.
pub fn partial_relfreq_euclidean_dist(
&self,
other: &PersistentCompactIntVec,
sum_a: f64,
sum_b: f64,
) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
self.iter()
.zip(other.iter())
.map(|(a, b)| {
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
let d = pa - pb;
d * d
})
.sum();
sq.sqrt()
.sum()
}
/// Returns the Euclidean distance between two compact int vectors using the Hellinger transform.
///
/// The Hellinger transform is applied to the raw counts of each vector, and the result is
/// the Euclidean distance between the transformed vectors. The Hellinger transform is defined
/// as the square root of the relative frequencies.
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
let sum_a = self.sum() as f64;
let sum_b = other.sum() as f64;
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
let sq: f64 = self.iter().zip(other.iter())
if sum_a == 0.0 && sum_b == 0.0 {
return 0.0;
}
self.partial_hellinger_euclidean_dist(other, sum_a, sum_b)
.sqrt()
}
/// Returns the partial Hellinger Euclidean distance between two compact int vectors.
///
/// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger
/// Euclidean distance over a set of vector pairs.
pub fn partial_hellinger_euclidean_dist(
&self,
other: &PersistentCompactIntVec,
sum_a: f64,
sum_b: f64,
) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
self.iter()
.zip(other.iter())
.map(|(a, b)| {
let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
let pa = if sum_a > 0.0 {
(a as f64 / sum_a).sqrt()
} else {
0.0
};
let pb = if sum_b > 0.0 {
(b as f64 / sum_b).sqrt()
} else {
0.0
};
let d = pa - pb;
d * d
})
.sum();
sq.sqrt()
.sum()
}
pub fn hellinger_dist(&self, other: &PersistentCompactIntVec) -> f64 {
@@ -194,16 +324,26 @@ impl PersistentCompactIntVec {
pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
assert_eq!(self.n, other.len(), "length mismatch");
let (intersection, union) = self.iter().zip(other.iter()).fold(
(0u64, 0u64),
|(inter, uni), (a, b)| {
let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold);
if union == 0 {
return 0.0;
}
1.0 - intersection as f64 / union as f64
}
pub fn partial_threshold_jaccard_dist(
&self,
other: &PersistentCompactIntVec,
threshold: u32,
) -> (u64, u64) {
assert_eq!(self.n, other.len(), "length mismatch");
self.iter()
.zip(other.iter())
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
let ap = a >= threshold;
let bp = b >= threshold;
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
},
);
if union == 0 { return 0.0; }
1.0 - intersection as f64 / union as f64
})
}
pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
@@ -211,7 +351,11 @@ impl PersistentCompactIntVec {
}
pub fn iter(&self) -> Iter<'_> {
Iter { pciv: self, slot: 0, overflow_pos: 0 }
Iter {
pciv: self,
slot: 0,
overflow_pos: 0,
}
}
}
@@ -225,8 +369,8 @@ impl<'a> IntoIterator for &'a PersistentCompactIntVec {
}
pub struct Iter<'a> {
pciv: &'a PersistentCompactIntVec,
slot: usize,
pciv: &'a PersistentCompactIntVec,
slot: usize,
overflow_pos: usize,
}
+69
View File
@@ -0,0 +1,69 @@
use tempfile::tempdir;
use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
#[test]
fn single_col_roundtrip() {
    // A single column of four bits must be read back unchanged, one row at a time.
    let expected = [true, false, true, true];
    let dir = tempdir().unwrap();

    let mut builder = PersistentBitMatrixBuilder::new(4, dir.path()).unwrap();
    let mut col = builder.add_col().unwrap();
    for (slot, &bit) in expected.iter().enumerate() {
        col.set(slot, bit);
    }
    col.close().unwrap();
    builder.close().unwrap();

    let m = PersistentBitMatrix::open(dir.path()).unwrap();
    assert_eq!(m.n_cols(), 1);
    assert_eq!(m.n(), 4);
    for (slot, &bit) in expected.iter().enumerate() {
        assert_eq!(&*m.row(slot), &[bit]);
    }
}
#[test]
fn two_cols_roundtrip() {
    // Two columns written one after the other; each row must expose both bits in column order.
    let columns = [[true, false, true], [false, true, false]];
    let dir = tempdir().unwrap();

    let mut builder = PersistentBitMatrixBuilder::new(3, dir.path()).unwrap();
    for bits in &columns {
        let mut col = builder.add_col().unwrap();
        for (slot, &bit) in bits.iter().enumerate() {
            col.set(slot, bit);
        }
        col.close().unwrap();
    }
    builder.close().unwrap();

    let m = PersistentBitMatrix::open(dir.path()).unwrap();
    assert_eq!(m.n_cols(), 2);
    for slot in 0..3 {
        assert_eq!(&*m.row(slot), &[columns[0][slot], columns[1][slot]]);
    }
}
#[test]
fn col_accessor() {
    // `col` must hand back the stored column so that its bits can be read directly.
    let pattern = [true, false, true];
    let dir = tempdir().unwrap();

    let mut builder = PersistentBitMatrixBuilder::new(3, dir.path()).unwrap();
    let mut col = builder.add_col().unwrap();
    for (slot, &bit) in pattern.iter().enumerate() {
        col.set(slot, bit);
    }
    col.close().unwrap();
    builder.close().unwrap();

    let m = PersistentBitMatrix::open(dir.path()).unwrap();
    for (slot, &bit) in pattern.iter().enumerate() {
        assert_eq!(m.col(0).get(slot), bit);
    }
}
#[test]
fn zero_cols_roundtrip() {
    // A matrix closed without any column still records its row count.
    let dir = tempdir().unwrap();
    PersistentBitMatrixBuilder::new(8, dir.path())
        .unwrap()
        .close()
        .unwrap();

    let m = PersistentBitMatrix::open(dir.path()).unwrap();
    assert_eq!(m.n_cols(), 0);
    assert_eq!(m.n(), 8);
}
+68
View File
@@ -0,0 +1,68 @@
use tempfile::tempdir;
use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
#[test]
fn single_col_roundtrip() {
    // A single column of four values must be read back unchanged, one row at a time.
    let expected = [10u32, 200, 300, 1000];
    let dir = tempdir().unwrap();

    let mut builder = PersistentCompactIntMatrixBuilder::new(4, dir.path()).unwrap();
    let mut col = builder.add_col().unwrap();
    for (slot, &value) in expected.iter().enumerate() {
        col.set(slot, value);
    }
    col.close().unwrap();
    builder.close().unwrap();

    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
    assert_eq!(m.n_cols(), 1);
    assert_eq!(m.n(), 4);
    for (slot, &value) in expected.iter().enumerate() {
        assert_eq!(&*m.row(slot), &[value]);
    }
}
#[test]
fn two_cols_roundtrip() {
    // Two columns written one after the other; each row must expose both values in column order.
    let columns = [[1u32, 2, 3], [10, 20, 30]];
    let dir = tempdir().unwrap();

    let mut builder = PersistentCompactIntMatrixBuilder::new(3, dir.path()).unwrap();
    for values in &columns {
        let mut col = builder.add_col().unwrap();
        for (slot, &value) in values.iter().enumerate() {
            col.set(slot, value);
        }
        col.close().unwrap();
    }
    builder.close().unwrap();

    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
    assert_eq!(m.n_cols(), 2);
    for slot in 0..3 {
        assert_eq!(&*m.row(slot), &[columns[0][slot], columns[1][slot]]);
    }
}
#[test]
fn col_accessor() {
    // `col` must hand back the stored column so that its values can be read directly.
    let values = [5u32, 7];
    let dir = tempdir().unwrap();

    let mut builder = PersistentCompactIntMatrixBuilder::new(2, dir.path()).unwrap();
    let mut col0 = builder.add_col().unwrap();
    for (slot, &value) in values.iter().enumerate() {
        col0.set(slot, value);
    }
    col0.close().unwrap();
    builder.close().unwrap();

    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
    for (slot, &value) in values.iter().enumerate() {
        assert_eq!(m.col(0).get(slot), value);
    }
}
#[test]
fn zero_cols_roundtrip() {
    // A matrix closed without any column still records its row count.
    let dir = tempdir().unwrap();
    PersistentCompactIntMatrixBuilder::new(10, dir.path())
        .unwrap()
        .close()
        .unwrap();

    let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
    assert_eq!(m.n_cols(), 0);
    assert_eq!(m.n(), 10);
}
+2
View File
@@ -1,4 +1,6 @@
mod bitmatrix;
mod bitvec;
mod intmatrix;
use tempfile::tempdir;
+72 -13
View File
@@ -4,7 +4,10 @@ use std::path::Path;
use cacheline_ef::{CachelineEf, CachelineEfVec};
use epserde::prelude::*;
use obicompactvec::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
use obicompactvec::{
PersistentBitMatrix, PersistentBitMatrixBuilder,
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
};
use obikseq::CanonicalKmer;
use obiskio::{UnitigFileReader, UnitigFileWriter};
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
@@ -15,7 +18,8 @@ use crate::evidence::{Evidence, EvidenceWriter};
pub(crate) const MPHF_FILE: &str = "mphf.bin";
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
const EVIDENCE_FILE: &str = "evidence.bin";
const COUNTS_FILE: &str = "counts.pciv";
const COUNTS_DIR: &str = "counts";
const PRESENCE_DIR: &str = "presence";
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
@@ -33,12 +37,20 @@ impl LayerData for () {
fn read(&self, _slot: usize) {}
}
impl LayerData for PersistentCompactIntVec {
type Item = u32;
impl LayerData for PersistentCompactIntMatrix {
type Item = Box<[u32]>;
fn open(layer_dir: &Path) -> OLMResult<Self> {
PersistentCompactIntVec::open(&layer_dir.join(COUNTS_FILE)).map_err(OLMError::Io)
PersistentCompactIntMatrix::open(&layer_dir.join(COUNTS_DIR)).map_err(OLMError::Io)
}
fn read(&self, slot: usize) -> u32 { self.get(slot) }
fn read(&self, slot: usize) -> Box<[u32]> { self.row(slot) }
}
/// Presence/absence payload of a layer: each slot maps to one bit per matrix column.
impl LayerData for PersistentBitMatrix {
    type Item = Box<[bool]>;

    /// Opens the bit matrix stored in the `presence` sub-directory of `layer_dir`.
    fn open(layer_dir: &Path) -> OLMResult<Self> {
        let presence_dir = layer_dir.join(PRESENCE_DIR);
        PersistentBitMatrix::open(&presence_dir).map_err(OLMError::Io)
    }

    /// Returns the bits stored at `slot`, one per column.
    fn read(&self, slot: usize) -> Box<[bool]> {
        self.row(slot)
    }
}
// ── Structures ────────────────────────────────────────────────────────────────
@@ -151,27 +163,31 @@ impl Layer<()> {
}
}
// ── Mode 2 — counts (PersistentCompactIntVec) ────────────────────────────────
// ── Mode 2 — count matrix (1 column per layer) ────────────────────────────────
impl Layer<PersistentCompactIntVec> {
impl Layer<PersistentCompactIntMatrix> {
pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
let n = unitigs.n_kmers();
let counts_dir = out_dir.join(COUNTS_DIR);
if n == 0 {
empty_layer(out_dir)?;
PersistentCompactIntVecBuilder::new(0, &out_dir.join(COUNTS_FILE))
.and_then(|b| b.close())
let mut mb = PersistentCompactIntMatrixBuilder::new(0, &counts_dir)
.map_err(OLMError::Io)?;
mb.add_col().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?;
mb.close().map_err(OLMError::Io)?;
return Ok(0);
}
let mphf = build_mphf(out_dir, n)?;
let mut cnt = PersistentCompactIntVecBuilder::new(n, &out_dir.join(COUNTS_FILE))
let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir)
.map_err(OLMError::Io)?;
let mut col = mb.add_col().map_err(OLMError::Io)?;
build_second_pass(out_dir, n, &mphf, &mut |slot, kmer| {
cnt.set(slot, count_of(kmer));
col.set(slot, count_of(kmer));
Ok(())
})?;
cnt.close().map_err(OLMError::Io)?;
col.close().map_err(OLMError::Io)?;
mb.close().map_err(OLMError::Io)?;
Ok(n)
}
@@ -183,6 +199,49 @@ impl Layer<PersistentCompactIntVec> {
}
}
// ── Mode 3 — presence/absence matrix (1 column per genome) ───────────────────
impl Layer<PersistentBitMatrix> {
    /// Builds the presence/absence matrix of the layer stored in `out_dir`.
    ///
    /// One bit column is created per genome (`n_genomes` columns) in the `presence`
    /// sub-directory. For every k-mer visited by the second pass, `present_in(kmer, g)`
    /// is stored at the k-mer's slot in column `g`.
    ///
    /// When the unitig file holds no k-mer, an empty layer is written together with
    /// `n_genomes` zero-length columns.
    ///
    /// Returns the number of k-mers reported by the unitig file.
    ///
    /// # Errors
    /// Propagates any error from reading the unitigs, building the hash function, the
    /// second pass, or writing the matrix (I/O errors are wrapped in `OLMError::Io`).
    pub fn build_presence(
        out_dir: &Path,
        n_genomes: usize,
        present_in: impl Fn(CanonicalKmer, usize) -> bool,
    ) -> OLMResult<usize> {
        let reader = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
        let n = reader.n_kmers();
        let presence_dir = out_dir.join(PRESENCE_DIR);

        if n > 0 {
            let mphf = build_mphf(out_dir, n)?;
            let mut matrix =
                PersistentBitMatrixBuilder::new(n, &presence_dir).map_err(OLMError::Io)?;

            // All column builders stay open during the pass so each k-mer is visited once.
            let mut columns = Vec::new();
            for _ in 0..n_genomes {
                columns.push(matrix.add_col().map_err(OLMError::Io)?);
            }

            build_second_pass(out_dir, n, &mphf, &mut |slot, kmer| {
                for (genome, column) in columns.iter_mut().enumerate() {
                    column.set(slot, present_in(kmer, genome));
                }
                Ok(())
            })?;

            for column in columns {
                column.close().map_err(OLMError::Io)?;
            }
            matrix.close().map_err(OLMError::Io)?;
            return Ok(n);
        }

        // No k-mer at all: still emit a well-formed layer with empty columns.
        empty_layer(out_dir)?;
        let mut matrix =
            PersistentBitMatrixBuilder::new(0, &presence_dir).map_err(OLMError::Io)?;
        for _ in 0..n_genomes {
            let column = matrix.add_col().map_err(OLMError::Io)?;
            column.close().map_err(OLMError::Io)?;
        }
        matrix.close().map_err(OLMError::Io)?;
        Ok(0)
    }
}
#[cfg(test)]
#[path = "tests/layer.rs"]
mod tests;
+4 -4
View File
@@ -2,7 +2,7 @@ use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use obicompactvec::PersistentCompactIntVec;
use obicompactvec::PersistentCompactIntMatrix;
use obikseq::CanonicalKmer;
use obiskio::UnitigFileWriter;
@@ -96,13 +96,13 @@ impl LayeredMap<()> {
}
}
// ── Mode 2 — counts ───────────────────────────────────────────────────────────
// ── Mode 2 — count matrix ─────────────────────────────────────────────────────
impl LayeredMap<PersistentCompactIntVec> {
impl LayeredMap<PersistentCompactIntMatrix> {
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
let i = self.layers.len();
let dir = layer_dir(&self.root, i);
Layer::<PersistentCompactIntVec>::build(&dir, count_of)?;
Layer::<PersistentCompactIntMatrix>::build(&dir, count_of)?;
self.append_layer()?;
Ok(i)
}
+7 -7
View File
@@ -1,5 +1,5 @@
use super::*;
use obicompactvec::PersistentCompactIntVec;
use obicompactvec::PersistentCompactIntMatrix;
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
use tempfile::tempdir;
@@ -44,14 +44,14 @@ fn counts_are_stored_and_retrieved() {
let kmers = all_canonical_kmers(dir.path(), 4);
let count_map: HashMap<CanonicalKmer, u32> =
kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect();
Layer::<PersistentCompactIntVec>::build(
Layer::<PersistentCompactIntMatrix>::build(
dir.path(),
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
).unwrap();
let layer = Layer::<PersistentCompactIntVec>::open(dir.path()).unwrap();
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
for kmer in &kmers {
let hit = layer.query(*kmer).expect("kmer must be present");
assert_eq!(hit.data, count_map[kmer]);
assert_eq!(hit.data[0], count_map[kmer]);
}
}
@@ -71,10 +71,10 @@ fn open_after_build_is_consistent() {
set_k(4);
let dir = tempdir().unwrap();
write_unitigs(dir.path(), &[b"AAAACGT"]);
let n = Layer::<PersistentCompactIntVec>::build(dir.path(), |_| 7).unwrap();
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), |_| 7).unwrap();
assert_eq!(n, 4);
let layer = Layer::<PersistentCompactIntVec>::open(dir.path()).unwrap();
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
let hit = layer.query(kmer).expect("AAAA must be present");
assert_eq!(hit.data, 7);
assert_eq!(hit.data[0], 7);
}
+12 -12
View File
@@ -1,10 +1,10 @@
use super::*;
use obicompactvec::PersistentCompactIntVec;
use obicompactvec::PersistentCompactIntMatrix;
use obikseq::{set_k, Sequence as _, Unitig};
use tempfile::tempdir;
fn push_unitigs_and_layer(
map: &mut LayeredMap<PersistentCompactIntVec>,
map: &mut LayeredMap<PersistentCompactIntMatrix>,
seqs: &[&[u8]],
count: u32,
) {
@@ -33,10 +33,10 @@ fn open_reloads_layer_count() {
set_k(4);
let dir = tempdir().unwrap();
{
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
}
let map = LayeredMap::<PersistentCompactIntVec>::open(dir.path()).unwrap();
let map = LayeredMap::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
assert_eq!(map.n_layers(), 1);
}
@@ -44,37 +44,37 @@ fn open_reloads_layer_count() {
fn query_finds_kmer_in_layer_zero() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);
let kmer = canonical(b"AAAC");
let (layer_idx, hit) = map.query(kmer).expect("kmer must be found");
assert_eq!(layer_idx, 0);
assert_eq!(hit.data, 3);
assert_eq!(hit.data[0], 3);
}
#[test]
fn query_finds_kmer_in_correct_layer() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2);
assert_eq!(map.n_layers(), 2);
let (li, hit) = map.query(canonical(b"AAAA")).expect("AAAA must be found");
assert_eq!(li, 0);
assert_eq!(hit.data, 1);
assert_eq!(hit.data[0], 1);
let (li, hit) = map.query(canonical(b"GGGA")).expect("GGGA must be found");
assert_eq!(li, 1);
assert_eq!(hit.data, 2);
assert_eq!(hit.data[0], 2);
}
#[test]
fn query_absent_returns_none() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
let absent = canonical(b"CCCC");
assert!(map.query(absent).is_none());
@@ -84,7 +84,7 @@ fn query_absent_returns_none() {
fn push_layer_from_map_convenience() {
set_k(4);
let dir = tempdir().unwrap();
let mut map = LayeredMap::<PersistentCompactIntVec>::create(dir.path()).unwrap();
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
let mut w = map.next_layer_writer().unwrap();
w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
w.close().unwrap();
@@ -93,5 +93,5 @@ fn push_layer_from_map_convenience() {
].into_iter().collect();
map.push_layer_from_map(&counts).unwrap();
let (_, hit) = map.query(canonical(b"AAAA")).unwrap();
assert_eq!(hit.data, 10);
assert_eq!(hit.data[0], 10);
}