feat: introduce column-major matrix storage and migrate layered map
Introduces `PersistentBitMatrix` and `PersistentCompactIntMatrix` to replace single-file vector storage with a column-major, directory-based layout. Each column is persisted as an individual file alongside a lightweight `meta.json` for dimension tracking. Migrates `obilayeredmap` to use these multi-column structures, updating Rust APIs, query return types, and build signatures. Includes comprehensive documentation, unit and integration tests for persistence and accessors, and refactors distance calculation helpers.
This commit is contained in:
@@ -0,0 +1,57 @@
|
||||
use std::{fs, io, path::{Path, PathBuf}};
|
||||
|
||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||
use crate::meta::MatrixMeta;
|
||||
|
||||
/// Builds the on-disk path of bit column `col` inside the matrix directory `dir`.
///
/// The file name is `col_` followed by the column index zero-padded to at
/// least six digits, with the `.pbiv` extension.
fn col_path(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{col:06}.pbiv");
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
|
||||
|
||||
pub struct PersistentBitMatrix {
|
||||
cols: Vec<PersistentBitVec>,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl PersistentBitMatrix {
|
||||
pub fn open(dir: &Path) -> io::Result<Self> {
|
||||
let meta = MatrixMeta::load(dir)?;
|
||||
let cols = (0..meta.n_cols)
|
||||
.map(|c| PersistentBitVec::open(&col_path(dir, c)))
|
||||
.collect::<io::Result<Vec<_>>>()?;
|
||||
Ok(Self { cols, n: meta.n })
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
pub fn n_cols(&self) -> usize { self.cols.len() }
|
||||
pub fn col(&self, c: usize) -> &PersistentBitVec { &self.cols[c] }
|
||||
|
||||
pub fn row(&self, slot: usize) -> Box<[bool]> {
|
||||
self.cols.iter().map(|c| c.get(slot)).collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Incrementally creates the directory layout read by `PersistentBitMatrix`.
pub struct PersistentBitMatrixBuilder {
    /// Directory that receives the column files and the metadata file.
    dir: PathBuf,
    /// The `n` value handed to every column builder and written to the metadata.
    n: usize,
    /// Number of columns handed out so far; also the index of the next column.
    n_cols: usize,
}
|
||||
|
||||
impl PersistentBitMatrixBuilder {
|
||||
pub fn new(n: usize, dir: &Path) -> io::Result<Self> {
|
||||
fs::create_dir_all(dir)?;
|
||||
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
pub fn n_cols(&self) -> usize { self.n_cols }
|
||||
|
||||
pub fn add_col(&mut self) -> io::Result<PersistentBitVecBuilder> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
PersistentBitVecBuilder::new(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
use std::{fs, io, path::{Path, PathBuf}};
|
||||
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
|
||||
/// Builds the on-disk path of integer column `col` inside the matrix directory `dir`.
///
/// The file name is `col_` followed by the column index zero-padded to at
/// least six digits, with the `.pciv` extension.
fn col_path(dir: &Path, col: usize) -> PathBuf {
    let mut path = PathBuf::from(dir);
    path.push(format!("col_{col:06}.pciv"));
    path
}
|
||||
|
||||
pub struct PersistentCompactIntMatrix {
|
||||
cols: Vec<PersistentCompactIntVec>,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl PersistentCompactIntMatrix {
|
||||
pub fn open(dir: &Path) -> io::Result<Self> {
|
||||
let meta = MatrixMeta::load(dir)?;
|
||||
let cols = (0..meta.n_cols)
|
||||
.map(|c| PersistentCompactIntVec::open(&col_path(dir, c)))
|
||||
.collect::<io::Result<Vec<_>>>()?;
|
||||
Ok(Self { cols, n: meta.n })
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
pub fn n_cols(&self) -> usize { self.cols.len() }
|
||||
pub fn col(&self, c: usize) -> &PersistentCompactIntVec { &self.cols[c] }
|
||||
|
||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
self.cols.iter().map(|c| c.get(slot)).collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Incrementally creates the directory layout read by `PersistentCompactIntMatrix`.
pub struct PersistentCompactIntMatrixBuilder {
    /// Directory that receives the column files and the metadata file.
    dir: PathBuf,
    /// The `n` value handed to every column builder and written to the metadata.
    n: usize,
    /// Number of columns handed out so far; also the index of the next column.
    n_cols: usize,
}
|
||||
|
||||
impl PersistentCompactIntMatrixBuilder {
|
||||
pub fn new(n: usize, dir: &Path) -> io::Result<Self> {
|
||||
fs::create_dir_all(dir)?;
|
||||
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
pub fn n_cols(&self) -> usize { self.n_cols }
|
||||
|
||||
pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
PersistentCompactIntVecBuilder::new(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,15 @@
|
||||
mod bitvec;
|
||||
mod bitmatrix;
|
||||
mod builder;
|
||||
mod format;
|
||||
mod intmatrix;
|
||||
mod meta;
|
||||
mod reader;
|
||||
|
||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
use std::{fs, io, path::Path};
|
||||
|
||||
/// Dimensions of a column-major matrix directory, as stored in `meta.json`.
///
/// Derives the usual value-type traits so the metadata can be printed,
/// copied and compared (for example in tests and error reports).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MatrixMeta {
    /// The `n` value shared by every column of the matrix.
    pub n: usize,
    /// Number of column files in the directory.
    pub n_cols: usize,
}
|
||||
|
||||
impl MatrixMeta {
|
||||
pub fn load(dir: &Path) -> io::Result<Self> {
|
||||
let s = fs::read_to_string(dir.join("meta.json"))?;
|
||||
parse(&s).ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "bad meta.json"))
|
||||
}
|
||||
|
||||
pub fn save(&self, dir: &Path) -> io::Result<()> {
|
||||
fs::write(
|
||||
dir.join("meta.json"),
|
||||
format!("{{\"n\":{},\"n_cols\":{}}}\n", self.n, self.n_cols),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse(s: &str) -> Option<MatrixMeta> {
|
||||
Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? })
|
||||
}
|
||||
|
||||
/// Reads the unsigned integer that follows the first `"<name>":` in `s`.
///
/// Whitespace between the colon and the digits is skipped; the number ends at
/// the first non-digit character or at the end of the input.
///
/// Returns `None` when the key is absent, when no digit follows it, or when
/// the digits do not fit in a `usize`.
fn field(s: &str, name: &str) -> Option<usize> {
    let needle = format!("\"{}\":", name);
    let (_, after_key) = s.split_once(needle.as_str())?;
    let tail = after_key.trim_start();
    // ASCII digits are one byte each, so the byte count is also the slice end.
    let digit_len = tail.bytes().take_while(u8::is_ascii_digit).count();
    tail[..digit_len].parse().ok()
}
|
||||
+201
-57
@@ -7,41 +7,45 @@ use memmap2::Mmap;
|
||||
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE};
|
||||
|
||||
pub struct PersistentCompactIntVec {
|
||||
mmap: Mmap,
|
||||
n: usize,
|
||||
n_overflow: usize,
|
||||
pub step: usize,
|
||||
index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index
|
||||
primary_offset: usize, // = HEADER_SIZE
|
||||
data_offset: usize, // = HEADER_SIZE + n
|
||||
path: PathBuf,
|
||||
mmap: Mmap,
|
||||
n: usize,
|
||||
n_overflow: usize,
|
||||
pub step: usize,
|
||||
index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index
|
||||
primary_offset: usize, // = HEADER_SIZE
|
||||
data_offset: usize, // = HEADER_SIZE + n
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl PersistentCompactIntVec {
|
||||
/// Opens a persistent compact int vector from the given path.
|
||||
pub fn open(path: &Path) -> io::Result<Self> {
|
||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||
|
||||
if mmap.len() < HEADER_SIZE {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short"));
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"PCIV file too short",
|
||||
));
|
||||
}
|
||||
if &mmap[0..4] != &MAGIC {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
||||
}
|
||||
|
||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||
let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
|
||||
let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
|
||||
let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
|
||||
let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
|
||||
|
||||
let primary_offset = HEADER_SIZE;
|
||||
let data_offset = primary_offset + n;
|
||||
let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
|
||||
let data_offset = primary_offset + n;
|
||||
let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
|
||||
|
||||
let mut index = Vec::with_capacity(n_index);
|
||||
for i in 0..n_index {
|
||||
let off = index_offset + i * INDEX_ENTRY_SIZE;
|
||||
let off = index_offset + i * INDEX_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
|
||||
let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
|
||||
let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
|
||||
index.push((slot, pos));
|
||||
}
|
||||
|
||||
@@ -57,36 +61,44 @@ impl PersistentCompactIntVec {
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the path of the compact int vector file.
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
|
||||
/// Returns the length of the compact int vector.
|
||||
pub fn len(&self) -> usize {
|
||||
self.n
|
||||
}
|
||||
|
||||
/// Returns whether the compact int vector is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.n == 0
|
||||
}
|
||||
|
||||
/// Returns the value at the given slot.
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
match self.mmap[self.primary_offset + slot] {
|
||||
255 => self.overflow_get(slot),
|
||||
v => v as u32,
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the value at the given slot from the overflow region.
|
||||
fn overflow_get(&self, slot: usize) -> u32 {
|
||||
let pos_start;
|
||||
let pos_end;
|
||||
|
||||
if self.step == 0 {
|
||||
pos_start = 0;
|
||||
pos_end = self.n_overflow;
|
||||
pos_end = self.n_overflow;
|
||||
} else {
|
||||
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||
let i = self
|
||||
.index
|
||||
.partition_point(|&(s, _)| s <= slot)
|
||||
.saturating_sub(1);
|
||||
pos_start = self.index[i].1;
|
||||
pos_end = if i + 1 < self.index.len() {
|
||||
pos_end = if i + 1 < self.index.len() {
|
||||
self.index[i + 1].1
|
||||
} else {
|
||||
self.n_overflow
|
||||
@@ -98,8 +110,8 @@ impl PersistentCompactIntVec {
|
||||
while lo < hi {
|
||||
let mid = lo + (hi - lo) / 2;
|
||||
match self.data_slot(mid).cmp(&slot) {
|
||||
std::cmp::Ordering::Equal => return self.data_value(mid),
|
||||
std::cmp::Ordering::Less => lo = mid + 1,
|
||||
std::cmp::Ordering::Equal => return self.data_value(mid),
|
||||
std::cmp::Ordering::Less => lo = mid + 1,
|
||||
std::cmp::Ordering::Greater => hi = mid,
|
||||
}
|
||||
}
|
||||
@@ -107,85 +119,203 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the slot at the given index in the overflow region.
|
||||
fn data_slot(&self, i: usize) -> usize {
|
||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
|
||||
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the value at the given index in the overflow region.
|
||||
fn data_value(&self, i: usize) -> u32 {
|
||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
|
||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the sum of all values in the compact int vector.
|
||||
pub fn sum(&self) -> u64 {
|
||||
self.iter().map(|v| v as u64).sum()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Returns the Bray-Curtis distance between two compact int vectors.
|
||||
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let (sum_min, sum_a, sum_b) = self.iter().zip(other.iter()).fold(
|
||||
(0u64, 0u64, 0u64),
|
||||
|(sm, sa, sb), (a, b)| (sm + a.min(b) as u64, sa + a as u64, sb + b as u64),
|
||||
);
|
||||
let denom = sum_a + sum_b;
|
||||
if denom == 0 { return 0.0; }
|
||||
let (sum_min, denom) = self.partial_bray_dist(other);
|
||||
if denom == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
1.0 - 2.0 * sum_min as f64 / denom as f64
|
||||
}
|
||||
|
||||
/// Returns the partial Bray-Curtis distance between two compact int vectors.
|
||||
///
|
||||
/// Returns a tuple `(sum_min, denom)` where `sum_min` is the sum of the minimum values
|
||||
/// at each index, and `denom` is the sum of the values in both vectors.
|
||||
/// This is used internally by [`bray_dist`] and to easily compute the Bray-Curtis distance
|
||||
/// over a set of vector pairs.
|
||||
///
|
||||
/// Returns the tuple `(sum_min, sum_a + sum_b)` where `sum_min` is the sum of the minimum
|
||||
/// values at each index, `sum_a` is the sum of the first vector's counts, and `sum_b` is
|
||||
/// the sum of the second vector's counts.
|
||||
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> (u64, u64) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let (sum_min, sum_a, sum_b) = self
|
||||
.iter()
|
||||
.zip(other.iter())
|
||||
.fold((0u64, 0u64, 0u64), |(sm, sa, sb), (a, b)| {
|
||||
(sm + a.min(b) as u64, sa + a as u64, sb + b as u64)
|
||||
});
|
||||
(sum_min, sum_a + sum_b)
|
||||
}
|
||||
|
||||
/// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
|
||||
///
|
||||
/// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts.
|
||||
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
|
||||
let sum_min: f64 = self.iter().zip(other.iter())
|
||||
if sum_a == 0.0 && sum_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b);
|
||||
1.0 - sum_min
|
||||
}
|
||||
|
||||
/// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency
|
||||
/// Bray-Curtis distance over a set of vector pairs.
|
||||
///
|
||||
/// Arguments:
|
||||
/// - `other`: the other compact int vector to compare with
|
||||
/// - `sum_a`: the sum of the first vector's counts
|
||||
/// - `sum_b`: the sum of the second vector's counts
|
||||
///
|
||||
/// Returns the sum of the minimum relative frequencies at each index.
|
||||
pub fn partial_relfreq_bray_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
sum_a: f64,
|
||||
sum_b: f64,
|
||||
) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_min: f64 = self
|
||||
.iter()
|
||||
.zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||
pa.min(pb)
|
||||
})
|
||||
.sum();
|
||||
1.0 - sum_min
|
||||
sum_min
|
||||
}
|
||||
|
||||
/// Returns the euclidean distance between two compact int vectors.
|
||||
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sq: f64 = self.iter().zip(other.iter())
|
||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||
.sum();
|
||||
sq.sqrt()
|
||||
self.partial_euclidean_dist(other).sqrt()
|
||||
}
|
||||
|
||||
/// Returns the partial euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance
|
||||
/// over a set of vector pairs.
|
||||
///
|
||||
/// The result is the sum of the squared differences between corresponding elements of the two
|
||||
/// vectors.
|
||||
pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let d = a as f64 - b as f64;
|
||||
d * d
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Returns the relative frequency euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts.
|
||||
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
|
||||
let sq: f64 = self.iter().zip(other.iter())
|
||||
if sum_a == 0.0 && sum_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
self.partial_relfreq_euclidean_dist(other, sum_a, sum_b)
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
/// Returns the partial relative frequency euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency
|
||||
/// euclidean distance over a set of vector pairs.
|
||||
pub fn partial_relfreq_euclidean_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
sum_a: f64,
|
||||
sum_b: f64,
|
||||
) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||
let d = pa - pb;
|
||||
d * d
|
||||
})
|
||||
.sum();
|
||||
sq.sqrt()
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Returns the Euclidean distance between two compact int vectors using the Hellinger transform.
|
||||
///
|
||||
/// The Hellinger transform is applied to the raw counts of each vector, and the result is
|
||||
/// the Euclidean distance between the transformed vectors. The Hellinger transform is defined
|
||||
/// as the square root of the relative frequencies.
|
||||
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
|
||||
let sq: f64 = self.iter().zip(other.iter())
|
||||
if sum_a == 0.0 && sum_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
self.partial_hellinger_euclidean_dist(other, sum_a, sum_b)
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
/// Returns the partial Hellinger Euclidean distance between two compact int vectors.
|
||||
///
|
||||
/// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger
|
||||
/// Euclidean distance over a set of vector pairs.
|
||||
pub fn partial_hellinger_euclidean_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
sum_a: f64,
|
||||
sum_b: f64,
|
||||
) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
|
||||
let pa = if sum_a > 0.0 {
|
||||
(a as f64 / sum_a).sqrt()
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let pb = if sum_b > 0.0 {
|
||||
(b as f64 / sum_b).sqrt()
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let d = pa - pb;
|
||||
d * d
|
||||
})
|
||||
.sum();
|
||||
sq.sqrt()
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn hellinger_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
@@ -194,16 +324,26 @@ impl PersistentCompactIntVec {
|
||||
|
||||
pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let (intersection, union) = self.iter().zip(other.iter()).fold(
|
||||
(0u64, 0u64),
|
||||
|(inter, uni), (a, b)| {
|
||||
let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold);
|
||||
if union == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
1.0 - intersection as f64 / union as f64
|
||||
}
|
||||
|
||||
pub fn partial_threshold_jaccard_dist(
|
||||
&self,
|
||||
other: &PersistentCompactIntVec,
|
||||
threshold: u32,
|
||||
) -> (u64, u64) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
self.iter()
|
||||
.zip(other.iter())
|
||||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||
let ap = a >= threshold;
|
||||
let bp = b >= threshold;
|
||||
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||
},
|
||||
);
|
||||
if union == 0 { return 0.0; }
|
||||
1.0 - intersection as f64 / union as f64
|
||||
})
|
||||
}
|
||||
|
||||
pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
@@ -211,7 +351,11 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter<'_> {
|
||||
Iter { pciv: self, slot: 0, overflow_pos: 0 }
|
||||
Iter {
|
||||
pciv: self,
|
||||
slot: 0,
|
||||
overflow_pos: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -225,8 +369,8 @@ impl<'a> IntoIterator for &'a PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
pub struct Iter<'a> {
|
||||
pciv: &'a PersistentCompactIntVec,
|
||||
slot: usize,
|
||||
pciv: &'a PersistentCompactIntVec,
|
||||
slot: usize,
|
||||
overflow_pos: usize,
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
|
||||
/// A single four-bit column written through the builder is read back
/// unchanged, one row at a time.
#[test]
fn single_col_roundtrip() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    let mut builder = PersistentBitMatrixBuilder::new(4, root).unwrap();
    let mut column = builder.add_col().unwrap();
    column.set(0, true);
    column.set(1, false);
    column.set(2, true);
    column.set(3, true);
    column.close().unwrap();
    builder.close().unwrap();

    let matrix = PersistentBitMatrix::open(root).unwrap();
    assert_eq!((matrix.n_cols(), matrix.n()), (1, 4));

    let rows: Vec<Vec<bool>> = (0..4).map(|slot| matrix.row(slot).into_vec()).collect();
    assert_eq!(rows, vec![vec![true], vec![false], vec![true], vec![true]]);
}
|
||||
|
||||
/// Two columns written one after the other come back interleaved per row.
#[test]
fn two_cols_roundtrip() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    let mut builder = PersistentBitMatrixBuilder::new(3, root).unwrap();

    let mut first = builder.add_col().unwrap();
    first.set(0, true);
    first.set(1, false);
    first.set(2, true);
    first.close().unwrap();

    let mut second = builder.add_col().unwrap();
    second.set(0, false);
    second.set(1, true);
    second.set(2, false);
    second.close().unwrap();

    builder.close().unwrap();

    let matrix = PersistentBitMatrix::open(root).unwrap();
    assert_eq!(matrix.n_cols(), 2);

    let rows: Vec<Vec<bool>> = (0..3).map(|slot| matrix.row(slot).into_vec()).collect();
    assert_eq!(
        rows,
        vec![vec![true, false], vec![false, true], vec![true, false]]
    );
}
|
||||
|
||||
/// `col` gives direct access to a stored column.
#[test]
fn col_accessor() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    let mut builder = PersistentBitMatrixBuilder::new(3, root).unwrap();
    let mut column = builder.add_col().unwrap();
    column.set(0, true);
    column.set(1, false);
    column.set(2, true);
    column.close().unwrap();
    builder.close().unwrap();

    let matrix = PersistentBitMatrix::open(root).unwrap();
    let stored = matrix.col(0);
    assert_eq!(
        [stored.get(0), stored.get(1), stored.get(2)],
        [true, false, true]
    );
}
|
||||
|
||||
/// A builder closed without any column yields a matrix with zero columns
/// that still reports its `n`.
#[test]
fn zero_cols_roundtrip() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    PersistentBitMatrixBuilder::new(8, root)
        .unwrap()
        .close()
        .unwrap();

    let matrix = PersistentBitMatrix::open(root).unwrap();
    assert_eq!((matrix.n_cols(), matrix.n()), (0, 8));
}
|
||||
@@ -0,0 +1,68 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
|
||||
/// A single four-value column written through the builder is read back
/// unchanged, one row at a time.
#[test]
fn single_col_roundtrip() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    let mut builder = PersistentCompactIntMatrixBuilder::new(4, root).unwrap();
    let mut column = builder.add_col().unwrap();
    column.set(0, 10);
    column.set(1, 200);
    column.set(2, 300);
    column.set(3, 1000);
    column.close().unwrap();
    builder.close().unwrap();

    let matrix = PersistentCompactIntMatrix::open(root).unwrap();
    assert_eq!((matrix.n_cols(), matrix.n()), (1, 4));

    let rows: Vec<Vec<u32>> = (0..4).map(|slot| matrix.row(slot).into_vec()).collect();
    assert_eq!(rows, vec![vec![10u32], vec![200], vec![300], vec![1000]]);
}
|
||||
|
||||
/// Two columns written one after the other come back interleaved per row.
#[test]
fn two_cols_roundtrip() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    let mut builder = PersistentCompactIntMatrixBuilder::new(3, root).unwrap();

    let mut first = builder.add_col().unwrap();
    first.set(0, 1);
    first.set(1, 2);
    first.set(2, 3);
    first.close().unwrap();

    let mut second = builder.add_col().unwrap();
    second.set(0, 10);
    second.set(1, 20);
    second.set(2, 30);
    second.close().unwrap();

    builder.close().unwrap();

    let matrix = PersistentCompactIntMatrix::open(root).unwrap();
    assert_eq!(matrix.n_cols(), 2);

    let rows: Vec<Vec<u32>> = (0..3).map(|slot| matrix.row(slot).into_vec()).collect();
    assert_eq!(rows, vec![vec![1u32, 10], vec![2, 20], vec![3, 30]]);
}
|
||||
|
||||
/// `col` gives direct access to a stored column.
#[test]
fn col_accessor() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    let mut builder = PersistentCompactIntMatrixBuilder::new(2, root).unwrap();
    let mut column = builder.add_col().unwrap();
    column.set(0, 5);
    column.set(1, 7);
    column.close().unwrap();
    builder.close().unwrap();

    let matrix = PersistentCompactIntMatrix::open(root).unwrap();
    let stored = matrix.col(0);
    assert_eq!((stored.get(0), stored.get(1)), (5, 7));
}
|
||||
|
||||
/// A builder closed without any column yields a matrix with zero columns
/// that still reports its `n`.
#[test]
fn zero_cols_roundtrip() {
    let tmp = tempdir().unwrap();
    let root = tmp.path();

    PersistentCompactIntMatrixBuilder::new(10, root)
        .unwrap()
        .close()
        .unwrap();

    let matrix = PersistentCompactIntMatrix::open(root).unwrap();
    assert_eq!((matrix.n_cols(), matrix.n()), (0, 10));
}
|
||||
@@ -1,4 +1,6 @@
|
||||
mod bitmatrix;
|
||||
mod bitvec;
|
||||
mod intmatrix;
|
||||
|
||||
use tempfile::tempdir;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user