feat: introduce packed matrix storage and layer metadata
Unifies bit and integer matrix storage into the `PersistentBitMatrix` and `PersistentCompactIntMatrix` enums, supporting both columnar and memory-mapped single-file layouts. Introduces `LayerMeta` to persist layer dimensions as `layer_meta.json`, enabling correct initialization of implicit presence matrices. Adds CLI commands (`pack` and `--upgrade-index`) to convert existing columnar indices to the compact format and backfill missing metadata. Updates the partitioner and layered map logic to use the new persistent builders, optimized memory allocation, and auto-detected storage backends.
This commit is contained in:
+306
-139
@@ -1,9 +1,14 @@
|
||||
use std::{fs, io, path::{Path, PathBuf}};
|
||||
use std::cmp::Ordering;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, Write as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::Mmap;
|
||||
use ndarray::{Array1, Array2};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
|
||||
@@ -11,13 +16,15 @@ fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pciv"))
|
||||
}
|
||||
|
||||
pub struct PersistentCompactIntMatrix {
|
||||
// ── ColumnarCompactIntMatrix ──────────────────────────────────────────────────
|
||||
|
||||
pub struct ColumnarCompactIntMatrix {
|
||||
cols: Vec<PersistentCompactIntVec>,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl PersistentCompactIntMatrix {
|
||||
pub fn open(dir: &Path) -> io::Result<Self> {
|
||||
impl ColumnarCompactIntMatrix {
|
||||
pub(crate) fn open(dir: &Path) -> io::Result<Self> {
|
||||
let meta = MatrixMeta::load(dir)?;
|
||||
let cols = (0..meta.n_cols)
|
||||
.map(|c| PersistentCompactIntVec::open(&col_path(dir, c)))
|
||||
@@ -25,25 +32,29 @@ impl PersistentCompactIntMatrix {
|
||||
Ok(Self { cols, n: meta.n })
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
pub fn n_cols(&self) -> usize { self.cols.len() }
|
||||
pub fn col(&self, c: usize) -> &PersistentCompactIntVec { &self.cols[c] }
|
||||
pub(crate) fn n(&self) -> usize { self.n }
|
||||
pub(crate) fn n_cols(&self) -> usize { self.cols.len() }
|
||||
pub(crate) fn col(&self, c: usize) -> &PersistentCompactIntVec { &self.cols[c] }
|
||||
|
||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
pub(crate) fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
self.cols.iter().map(|c| c.get(slot)).collect()
|
||||
}
|
||||
|
||||
/// Fill `buf[i]` with `col_i[slot]`, without allocating.
|
||||
/// `buf` must have length ≥ `self.n_cols()`.
|
||||
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
for (c, col) in self.cols.iter().enumerate() {
|
||||
buf[c] = col.get(slot);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Distance matrices ─────────────────────────────────────────────────────
|
||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||
let sums: Vec<u64> = (0..self.n_cols())
|
||||
.into_par_iter()
|
||||
.map(|c| self.col(c).sum())
|
||||
.collect();
|
||||
Array1::from_vec(sums)
|
||||
}
|
||||
|
||||
pub fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
pub(crate) fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let sum_min = self.partial_bray_dist_matrix();
|
||||
let col_sums = self.sum();
|
||||
let n = self.n_cols();
|
||||
@@ -60,63 +71,19 @@ impl PersistentCompactIntMatrix {
|
||||
m
|
||||
}
|
||||
|
||||
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).relfreq_bray_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).relfreq_euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).hellinger_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).jaccard_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold))
|
||||
}
|
||||
|
||||
/// Returns the sum of each column as `Array1<u64>`.
|
||||
pub fn sum(&self) -> Array1<u64> {
|
||||
let sums: Vec<u64> = (0..self.n_cols())
|
||||
.into_par_iter()
|
||||
.map(|c| self.col(c).sum())
|
||||
.collect();
|
||||
Array1::from_vec(sums)
|
||||
}
|
||||
|
||||
// ── Partial matrices (additively decomposable across layers) ──────────────
|
||||
|
||||
/// Returns `sum_min[n×n]` where `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]).
|
||||
/// The denominator `col_sums[i] + col_sums[j]` is obtained from `self.sum()`.
|
||||
/// Additive across layers by element-wise addition.
|
||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||
}
|
||||
|
||||
/// Returns sum of squared differences `[n×n]`.
|
||||
/// Reduce across layers by element-wise addition, then take `sqrt` for the final distance.
|
||||
pub fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
/// Returns `(inter[n×n], union[n×n])` for threshold-Jaccard.
|
||||
/// Reduce across layers by element-wise addition before computing `1 - inter/union`.
|
||||
pub fn partial_threshold_jaccard_dist_matrix(
|
||||
&self,
|
||||
threshold: u32,
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(
|
||||
&self, threshold: u32,
|
||||
) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols();
|
||||
let pairs = upper_pairs(n);
|
||||
|
||||
let results: Vec<(usize, usize, u64, u64)> = pairs
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| {
|
||||
@@ -125,7 +92,6 @@ impl PersistentCompactIntMatrix {
|
||||
(i, j, inter, union)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut inter_m = Array2::zeros((n, n));
|
||||
let mut union_m = Array2::zeros((n, n));
|
||||
for (i, j, inter, union) in results {
|
||||
@@ -135,99 +101,299 @@ impl PersistentCompactIntMatrix {
|
||||
(inter_m, union_m)
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot min(col_i[slot]/sum_i, col_j[slot]/sum_j)` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `1 - value`.
|
||||
pub fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_relfreq_bray_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot (col_i[slot]/sum_i - col_j[slot]/sum_j)²` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `sqrt(value)`.
|
||||
pub fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_relfreq_euclidean_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot (√(col_i/sum_i) - √(col_j/sum_j))²` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `sqrt(value) / √2`.
|
||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_hellinger_euclidean_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
|
||||
// ── Private helpers ───────────────────────────────────────────────────────
|
||||
/// Applies `relfreq_bray_dist` to every pair of columns through `pairwise`.
pub(crate) fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|i, j| {
        let (left, right) = (self.col(i), self.col(j));
        left.relfreq_bray_dist(right)
    })
}
|
||||
|
||||
/// Applies `euclidean_dist` to every pair of columns through `pairwise`.
pub(crate) fn euclidean_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|i, j| {
        let (left, right) = (self.col(i), self.col(j));
        left.euclidean_dist(right)
    })
}
|
||||
|
||||
/// Applies `relfreq_euclidean_dist` to every pair of columns through `pairwise`.
pub(crate) fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|i, j| {
        let (left, right) = (self.col(i), self.col(j));
        left.relfreq_euclidean_dist(right)
    })
}
|
||||
|
||||
/// Applies `hellinger_dist` to every pair of columns through `pairwise`.
pub(crate) fn hellinger_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|i, j| {
        let (left, right) = (self.col(i), self.col(j));
        left.hellinger_dist(right)
    })
}
|
||||
|
||||
/// Applies `jaccard_dist` to every pair of columns through `pairwise`.
pub(crate) fn jaccard_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|i, j| {
        let (left, right) = (self.col(i), self.col(j));
        left.jaccard_dist(right)
    })
}
|
||||
|
||||
/// Applies `threshold_jaccard_dist` with the given `threshold` to every pair
/// of columns through `pairwise`.
pub(crate) fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
    self.pairwise(|i, j| {
        let (left, right) = (self.col(i), self.col(j));
        left.threshold_jaccard_dist(right, threshold)
    })
}
|
||||
|
||||
pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||
let mut meta = MatrixMeta::load(dir)?;
|
||||
let mut b = PersistentCompactIntVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?;
|
||||
for slot in 0..meta.n { b.set(slot, value_of(slot)); }
|
||||
b.close()?;
|
||||
meta.n_cols += 1;
|
||||
meta.save(dir)
|
||||
}
|
||||
|
||||
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, f(i, j)))
|
||||
.collect();
|
||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
|
||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, f(i, j)))
|
||||
.collect();
|
||||
.into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
}
|
||||
|
||||
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
|
||||
(0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
|
||||
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
|
||||
|
||||
/// Four-byte signature at the start of a packed matrix file.
const PCMX_MAGIC: [u8; 4] = [b'P', b'C', b'M', b'X'];
/// Fixed header length in bytes: magic(4) + pad(4) + n_rows(8) + n_cols(8).
const PCMX_HEADER: usize = 4 + 4 + 8 + 8;
|
||||
|
||||
/// Per-column metadata pre-parsed from the embedded PCIV header.
struct ColInfo {
    /// Absolute mmap offset to the primary array.
    primary_start: usize,
    /// Absolute mmap offset to the overflow array.
    data_offset: usize,
    /// Number of entries in the overflow array.
    n_overflow: usize,
    /// Index step read from the PCIV header; `0` makes lookups search the
    /// whole overflow array instead of consulting `index`.
    step: usize,
    /// `(slot, overflow position)` pairs read from the column's index section.
    index: Vec<(usize, usize)>,
}
|
||||
|
||||
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
|
||||
where
|
||||
T: Clone + Default,
|
||||
{
|
||||
let mut m = Array2::from_elem((n, n), T::default());
|
||||
for (i, j, vij, vji) in vals {
|
||||
m[[i, j]] = vij;
|
||||
m[[j, i]] = vji;
|
||||
pub struct PackedCompactIntMatrix {
|
||||
mmap: Mmap,
|
||||
n_rows: usize,
|
||||
n_cols: usize,
|
||||
columns: Vec<ColInfo>,
|
||||
}
|
||||
|
||||
impl PackedCompactIntMatrix {
|
||||
pub(crate) fn open(path: &Path) -> io::Result<Self> {
|
||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||
if mmap.len() < PCMX_HEADER {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "PCMX file too short"));
|
||||
}
|
||||
if &mmap[0..4] != &PCMX_MAGIC {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCMX magic"));
|
||||
}
|
||||
let n_rows = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||
let n_cols = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||
|
||||
let mut columns = Vec::with_capacity(n_cols);
|
||||
for c in 0..n_cols {
|
||||
let off_pos = PCMX_HEADER + c * 8;
|
||||
let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
|
||||
// Parse embedded PCIV header at col_base
|
||||
let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
|
||||
let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
|
||||
let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
|
||||
let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize;
|
||||
|
||||
let primary_start = col_base + HEADER_SIZE;
|
||||
let data_offset = primary_start + n_pciv;
|
||||
let index_offset = data_offset + n_ov * OVERFLOW_ENTRY_SIZE;
|
||||
|
||||
let mut index = Vec::with_capacity(n_idx);
|
||||
for i in 0..n_idx {
|
||||
let ioff = index_offset + i * INDEX_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap()) as usize;
|
||||
let pos = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize;
|
||||
index.push((slot, pos));
|
||||
}
|
||||
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
|
||||
}
|
||||
|
||||
Ok(Self { mmap, n_rows, n_cols, columns })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
||||
let ci = &self.columns[col];
|
||||
let v = self.mmap[ci.primary_start + slot];
|
||||
if v < 255 { return v as u32; }
|
||||
self.overflow_get(ci, slot)
|
||||
}
|
||||
|
||||
fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 {
|
||||
let (pos_start, pos_end) = if ci.step == 0 {
|
||||
(0, ci.n_overflow)
|
||||
} else {
|
||||
let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||
let start = ci.index[i].1;
|
||||
let end = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow };
|
||||
(start, end)
|
||||
};
|
||||
let mut lo = pos_start;
|
||||
let mut hi = pos_end;
|
||||
while lo < hi {
|
||||
let mid = lo + (hi - lo) / 2;
|
||||
let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE;
|
||||
let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
|
||||
match stored.cmp(&slot) {
|
||||
Ordering::Equal => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()),
|
||||
Ordering::Less => lo = mid + 1,
|
||||
Ordering::Greater => hi = mid,
|
||||
}
|
||||
}
|
||||
panic!("slot {slot} marked overflow but not found")
|
||||
}
|
||||
|
||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
|
||||
}
|
||||
|
||||
pub(crate) fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
(0..self.n_cols).map(|c| self.get(c, slot)).collect()
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
// ── Column append ─────────────────────────────────────────────────────────────
|
||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||
let meta = MatrixMeta::load(dir)?;
|
||||
let n_cols = meta.n_cols;
|
||||
|
||||
let col_files: Vec<Vec<u8>> = (0..n_cols)
|
||||
.map(|c| fs::read(col_path(dir, c)))
|
||||
.collect::<io::Result<_>>()?;
|
||||
|
||||
let header_size = PCMX_HEADER + n_cols * 8;
|
||||
let mut col_offset = header_size;
|
||||
let mut offsets = Vec::with_capacity(n_cols);
|
||||
for data in &col_files {
|
||||
offsets.push(col_offset as u64);
|
||||
col_offset += data.len();
|
||||
}
|
||||
|
||||
let packed_path = dir.join("matrix.pcmx");
|
||||
let mut file = File::create(&packed_path)?;
|
||||
file.write_all(&PCMX_MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?;
|
||||
file.write_all(&(meta.n as u64).to_le_bytes())?;
|
||||
file.write_all(&(n_cols as u64).to_le_bytes())?;
|
||||
for &off in &offsets { file.write_all(&off.to_le_bytes())?; }
|
||||
for data in &col_files { file.write_all(data)?; }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ── PersistentCompactIntMatrix — public enum ──────────────────────────────────
|
||||
|
||||
pub enum PersistentCompactIntMatrix {
|
||||
Columnar(ColumnarCompactIntMatrix),
|
||||
Packed(PackedCompactIntMatrix),
|
||||
}
|
||||
|
||||
impl PersistentCompactIntMatrix {
|
||||
/// Append a new column to an existing matrix on disk.
|
||||
///
|
||||
/// Reads `meta.json` to obtain `n` and the current column count, writes
|
||||
/// `col_{n_cols:06}.pciv` filled by `value_of(slot)`, then increments
|
||||
/// `n_cols` in `meta.json`.
|
||||
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||
let mut meta = MatrixMeta::load(dir)?;
|
||||
let mut b = PersistentCompactIntVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?;
|
||||
for slot in 0..meta.n {
|
||||
b.set(slot, value_of(slot));
|
||||
/// Open from `layer_dir`, auto-detecting Packed or Columnar.
|
||||
pub fn open(layer_dir: &Path) -> io::Result<Self> {
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
|
||||
if counts_dir.join("matrix.pcmx").exists() {
|
||||
return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
|
||||
}
|
||||
b.close()?;
|
||||
meta.n_cols += 1;
|
||||
meta.save(dir)
|
||||
|
||||
if MatrixMeta::load(&counts_dir).is_ok() {
|
||||
return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
|
||||
}
|
||||
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::NotFound,
|
||||
format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize {
|
||||
match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
|
||||
}
|
||||
|
||||
pub fn n_cols(&self) -> usize {
|
||||
match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
|
||||
}
|
||||
|
||||
pub fn col(&self, c: usize) -> &PersistentCompactIntVec {
|
||||
match self {
|
||||
Self::Columnar(m) => m.col(c),
|
||||
_ => panic!("col() only available on Columnar PersistentCompactIntMatrix"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
|
||||
}
|
||||
|
||||
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
|
||||
}
|
||||
|
||||
pub fn sum(&self) -> Array1<u64> {
|
||||
match self {
|
||||
Self::Columnar(m) => m.sum(),
|
||||
_ => panic!("sum() only available on Columnar PersistentCompactIntMatrix"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.bray_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.relfreq_bray_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.euclidean_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.relfreq_euclidean_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.hellinger_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.jaccard_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.threshold_jaccard_dist_matrix(threshold), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.partial_euclidean_dist_matrix(), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
match self { Self::Columnar(m) => m.partial_threshold_jaccard_dist_matrix(threshold), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.partial_relfreq_bray_dist_matrix(col_sums), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.partial_relfreq_euclidean_dist_matrix(col_sums), _ => panic!("Columnar only") }
|
||||
}
|
||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), _ => panic!("Columnar only") }
|
||||
}
|
||||
|
||||
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||
ColumnarCompactIntMatrix::append_column(dir, value_of)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -240,24 +406,12 @@ impl ColumnWeights for PersistentCompactIntMatrix {
|
||||
}
|
||||
|
||||
impl CountPartials for PersistentCompactIntMatrix {
|
||||
fn partial_bray(&self) -> Array2<u64> {
|
||||
self.partial_bray_dist_matrix()
|
||||
}
|
||||
fn partial_euclidean(&self) -> Array2<f64> {
|
||||
self.partial_euclidean_dist_matrix()
|
||||
}
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
self.partial_threshold_jaccard_dist_matrix(threshold)
|
||||
}
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_bray_dist_matrix(global)
|
||||
}
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_euclidean_dist_matrix(global)
|
||||
}
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_hellinger_euclidean_dist_matrix(global)
|
||||
}
|
||||
fn partial_bray(&self) -> Array2<u64> { self.partial_bray_dist_matrix() }
|
||||
fn partial_euclidean(&self) -> Array2<f64> { self.partial_euclidean_dist_matrix() }
|
||||
fn partial_threshold_jaccard(&self, t: u32) -> (Array2<u64>, Array2<u64>) { self.partial_threshold_jaccard_dist_matrix(t) }
|
||||
fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_bray_dist_matrix(g) }
|
||||
fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_euclidean_dist_matrix(g) }
|
||||
fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_hellinger_euclidean_dist_matrix(g) }
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
@@ -287,3 +441,16 @@ impl PersistentCompactIntMatrixBuilder {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Lists every index pair `(i, j)` with `i < j < n`, ordered by `i` then `j`.
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
    let mut pairs = Vec::with_capacity(n.saturating_mul(n.saturating_sub(1)) / 2);
    for i in 0..n {
        for j in (i + 1)..n {
            pairs.push((i, j));
        }
    }
    pairs
}
|
||||
|
||||
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
|
||||
where T: Clone + Default {
|
||||
let mut m = Array2::from_elem((n, n), T::default());
|
||||
for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
|
||||
m
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user