refactor(obicompactvec): unify bit and int vector slice views
Refactors column and matrix access to use unified `BitSliceView` and `IntSliceView` abstractions, replacing legacy `PackedCol`/`IntColView` types. Introduces `BitSlice`/`IntSlice` traits for zero-copy, trait-based bitwise and arithmetic operations across persistent and temporary vector types. Removes deprecated in-memory `MemoryBitVec` and `MemoryIntVec` implementations and their tests, while updating dependent crates to use the new view-based API and `BitSliceMut` trait.
This commit is contained in:
@@ -1,5 +1,3 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, BufWriter, Write as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -10,14 +8,13 @@ use rayon::prelude::*;
|
||||
|
||||
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
|
||||
use crate::memoryintvec::MemoryIntVec;
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
||||
use crate::colgroup::{ColGroup, MatrixGroupOps};
|
||||
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE};
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||
use crate::views::IntSliceView;
|
||||
|
||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pciv"))
|
||||
@@ -48,9 +45,7 @@ impl ColumnarCompactIntMatrix {
|
||||
}
|
||||
|
||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
for (c, col) in self.cols.iter().enumerate() {
|
||||
buf[c] = col.get(slot);
|
||||
}
|
||||
for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); }
|
||||
}
|
||||
|
||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||
@@ -72,31 +67,22 @@ impl ColumnarCompactIntMatrix {
|
||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(
|
||||
&self, threshold: u32,
|
||||
) -> (Array2<u64>, Array2<u64>) {
|
||||
pairwise2_matrix(self.n_cols(), |i, j| {
|
||||
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)
|
||||
})
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols(), |i, j| {
|
||||
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols(), |i, j| {
|
||||
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols(), |i, j| {
|
||||
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||
@@ -111,7 +97,6 @@ impl ColumnarCompactIntMatrix {
|
||||
meta.n_cols += 1;
|
||||
meta.save(dir)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
|
||||
@@ -119,153 +104,12 @@ impl ColumnarCompactIntMatrix {
|
||||
const PCMX_MAGIC: [u8; 4] = *b"PCMX";
|
||||
const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)
|
||||
|
||||
/// Per-column metadata pre-parsed from the embedded PCIV header.
|
||||
struct ColInfo {
|
||||
primary_start: usize, // absolute mmap offset to primary array
|
||||
data_offset: usize, // absolute mmap offset to overflow array
|
||||
primary_start: usize,
|
||||
data_offset: usize,
|
||||
n_overflow: usize,
|
||||
step: usize,
|
||||
index: Vec<(usize, usize)>,
|
||||
}
|
||||
|
||||
// ── PackedIntCol — lightweight column view backed by the shared mmap ──────────
|
||||
|
||||
pub(crate) struct PackedIntCol<'a> {
|
||||
primary: &'a [u8],
|
||||
overflow: &'a [u8], // raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE
|
||||
n_overflow: usize,
|
||||
step: usize,
|
||||
index: &'a [(usize, usize)],
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl PackedIntCol<'_> {
|
||||
fn overflow_get(&self, slot: usize) -> u32 {
|
||||
let (pos_start, pos_end) = if self.step == 0 {
|
||||
(0, self.n_overflow)
|
||||
} else {
|
||||
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||
let start = self.index[i].1;
|
||||
let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
|
||||
(start, end)
|
||||
};
|
||||
let mut lo = pos_start;
|
||||
let mut hi = pos_end;
|
||||
while lo < hi {
|
||||
let mid = lo + (hi - lo) / 2;
|
||||
let (stored, val) = parse_overflow_entry(self.overflow, 0, mid);
|
||||
match stored.cmp(&slot) {
|
||||
Ordering::Equal => return val,
|
||||
Ordering::Less => lo = mid + 1,
|
||||
Ordering::Greater => hi = mid,
|
||||
}
|
||||
}
|
||||
panic!("slot {slot} marked overflow but not found")
|
||||
}
|
||||
}
|
||||
|
||||
impl IntSlice for PackedIntCol<'_> {
|
||||
fn len(&self) -> usize { self.n }
|
||||
|
||||
fn get(&self, slot: usize) -> u32 {
|
||||
let v = self.primary[slot];
|
||||
if v < 255 { v as u32 } else { self.overflow_get(slot) }
|
||||
}
|
||||
|
||||
fn primary_bytes(&self) -> &[u8] { self.primary }
|
||||
|
||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||
(0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i))
|
||||
}
|
||||
|
||||
fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
||||
PackedIntColIter {
|
||||
primary: self.primary,
|
||||
overflow: self.overflow,
|
||||
slot: 0,
|
||||
overflow_pos: 0,
|
||||
n: self.n,
|
||||
}
|
||||
}
|
||||
|
||||
fn sum(&self) -> u64 {
|
||||
byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1))
|
||||
}
|
||||
|
||||
fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) }
|
||||
}
|
||||
|
||||
struct PackedIntColIter<'a> {
|
||||
primary: &'a [u8],
|
||||
overflow: &'a [u8],
|
||||
slot: usize,
|
||||
overflow_pos: usize,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl Iterator for PackedIntColIter<'_> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.slot >= self.n { return None; }
|
||||
let v = self.primary[self.slot];
|
||||
self.slot += 1;
|
||||
if v < 255 {
|
||||
Some(v as u32)
|
||||
} else {
|
||||
let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos);
|
||||
self.overflow_pos += 1;
|
||||
Some(val)
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let rem = self.n - self.slot;
|
||||
(rem, Some(rem))
|
||||
}
|
||||
}
|
||||
|
||||
impl ExactSizeIterator for PackedIntColIter<'_> {}
|
||||
|
||||
// ── IntColView — uniform column access across Columnar and Packed ─────────────
|
||||
|
||||
enum IntColViewInner<'a> {
|
||||
Columnar(&'a PersistentCompactIntVec),
|
||||
Packed(PackedIntCol<'a>),
|
||||
}
|
||||
|
||||
/// Opaque column view returned by [`PersistentCompactIntMatrix::col_view`].
|
||||
/// Implements [`IntSlice`] uniformly for both Columnar and Packed matrix formats.
|
||||
pub struct IntColView<'a>(IntColViewInner<'a>);
|
||||
|
||||
impl IntSlice for IntColView<'_> {
|
||||
fn len(&self) -> usize {
|
||||
match &self.0 { IntColViewInner::Columnar(c) => c.len(), IntColViewInner::Packed(c) => c.len() }
|
||||
}
|
||||
fn get(&self, slot: usize) -> u32 {
|
||||
match &self.0 { IntColViewInner::Columnar(c) => c.get(slot), IntColViewInner::Packed(c) => c.get(slot) }
|
||||
}
|
||||
fn primary_bytes(&self) -> &[u8] {
|
||||
match &self.0 { IntColViewInner::Columnar(c) => c.primary_bytes(), IntColViewInner::Packed(c) => c.primary_bytes() }
|
||||
}
|
||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||
// Box<dyn Iterator> implements Iterator, satisfying RPITIT across two distinct types.
|
||||
let it: Box<dyn Iterator<Item = (usize, u32)> + '_> = match &self.0 {
|
||||
IntColViewInner::Columnar(c) => Box::new(c.overflow_entries()),
|
||||
IntColViewInner::Packed(c) => Box::new(c.overflow_entries()),
|
||||
};
|
||||
it
|
||||
}
|
||||
fn sum(&self) -> u64 {
|
||||
match &self.0 { IntColViewInner::Columnar(c) => c.sum(), IntColViewInner::Packed(c) => c.sum() }
|
||||
}
|
||||
fn count_nonzero(&self) -> u64 {
|
||||
match &self.0 { IntColViewInner::Columnar(c) => c.count_nonzero(), IntColViewInner::Packed(c) => c.count_nonzero() }
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PackedCompactIntMatrix {
|
||||
mmap: Mmap,
|
||||
n_rows: usize,
|
||||
@@ -289,52 +133,30 @@ impl PackedCompactIntMatrix {
|
||||
for c in 0..n_cols {
|
||||
let off_pos = PCMX_HEADER + c * 8;
|
||||
let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
|
||||
// Parse embedded PCIV header at col_base
|
||||
let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
|
||||
let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
|
||||
let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
|
||||
let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize;
|
||||
|
||||
let primary_start = col_base + HEADER_SIZE;
|
||||
let data_offset = primary_start + n_pciv;
|
||||
let index_offset = data_offset + n_ov * OVERFLOW_ENTRY_SIZE;
|
||||
|
||||
let mut index = Vec::with_capacity(n_idx);
|
||||
for i in 0..n_idx {
|
||||
index.push(parse_index_entry(&mmap, index_offset, i));
|
||||
}
|
||||
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
|
||||
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov });
|
||||
}
|
||||
|
||||
Ok(Self { mmap, n_rows, n_cols, columns })
|
||||
}
|
||||
|
||||
pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> {
|
||||
pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||
let ci = &self.columns[c];
|
||||
PackedIntCol {
|
||||
primary: &self.mmap[ci.primary_start..ci.primary_start + self.n_rows],
|
||||
overflow: &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE],
|
||||
n_overflow: ci.n_overflow,
|
||||
step: ci.step,
|
||||
index: &ci.index,
|
||||
n: self.n_rows,
|
||||
}
|
||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
||||
let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE];
|
||||
IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows)
|
||||
}
|
||||
|
||||
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||
let col = self.col_slice(c);
|
||||
let overflow: HashMap<usize, u32> = col.overflow_entries().collect();
|
||||
PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path)
|
||||
}
|
||||
|
||||
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
||||
MemoryIntVec::from(&self.col_slice(c))
|
||||
let view = self.col_view(c);
|
||||
let overflow: std::collections::HashMap<usize, u32> = view.overflow_entries().collect();
|
||||
PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
||||
self.col_slice(col).get(slot)
|
||||
}
|
||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) }
|
||||
|
||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
|
||||
@@ -346,86 +168,61 @@ impl PackedCompactIntMatrix {
|
||||
|
||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| self.col_slice(c).sum())
|
||||
.collect()
|
||||
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect()
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| self.col_slice(c).count_nonzero())
|
||||
.collect()
|
||||
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect()
|
||||
)
|
||||
}
|
||||
|
||||
// ── Pair primitives — sequential scan via col_slice().iter() ─────────────
|
||||
|
||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||
.map(|(a, b)| a.min(b) as u64)
|
||||
.sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||
.sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||
let ap = a >= t;
|
||||
let bp = b >= t;
|
||||
let ap = a >= t; let bp = b >= t;
|
||||
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||
})
|
||||
}
|
||||
|
||||
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj))
|
||||
.sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d })
|
||||
.sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum()
|
||||
}
|
||||
|
||||
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d })
|
||||
.sum()
|
||||
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum()
|
||||
}
|
||||
|
||||
// ── Matrix methods ────────────────────────────────────────────────────────
|
||||
|
||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
@@ -435,32 +232,21 @@ impl PackedCompactIntMatrix {
|
||||
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||
let packed_path = dir.join("matrix.pcmx");
|
||||
if packed_path.exists() {
|
||||
// Matrix complete; remove any leftover column files from a killed cleanup.
|
||||
if let Ok(meta) = MatrixMeta::load(dir) {
|
||||
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||
let _ = fs::remove_file(dir.join("meta.json"));
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let meta = MatrixMeta::load(dir)?;
|
||||
let meta = MatrixMeta::load(dir)?;
|
||||
let n_cols = meta.n_cols;
|
||||
|
||||
// Compute offsets from file sizes — no column data loaded into RAM.
|
||||
let col_sizes: Vec<u64> = (0..n_cols)
|
||||
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
||||
.collect::<io::Result<_>>()?;
|
||||
|
||||
let header_size = (PCMX_HEADER + n_cols * 8) as u64;
|
||||
let mut col_offset = header_size;
|
||||
let mut offsets = Vec::with_capacity(n_cols);
|
||||
for &size in &col_sizes {
|
||||
offsets.push(col_offset);
|
||||
col_offset += size;
|
||||
}
|
||||
|
||||
// Write to a temp file; rename atomically so a killed process never leaves
|
||||
// a truncated matrix.pcmx that would be mistaken for a complete file.
|
||||
for &size in &col_sizes { offsets.push(col_offset); col_offset += size; }
|
||||
let tmp_path = dir.join("matrix.pcmx.tmp");
|
||||
let mut out = BufWriter::new(File::create(&tmp_path)?);
|
||||
out.write_all(&PCMX_MAGIC)?;
|
||||
@@ -468,13 +254,10 @@ pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||
out.write_all(&(meta.n as u64).to_le_bytes())?;
|
||||
out.write_all(&(n_cols as u64).to_le_bytes())?;
|
||||
for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
|
||||
for c in 0..n_cols {
|
||||
io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
|
||||
}
|
||||
for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; }
|
||||
out.flush()?;
|
||||
drop(out);
|
||||
fs::rename(&tmp_path, &packed_path)?;
|
||||
|
||||
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
||||
fs::remove_file(dir.join("meta.json"))?;
|
||||
Ok(())
|
||||
@@ -488,18 +271,14 @@ pub enum PersistentCompactIntMatrix {
|
||||
}
|
||||
|
||||
impl PersistentCompactIntMatrix {
|
||||
/// Open from `layer_dir`, auto-detecting Packed or Columnar.
|
||||
pub fn open(layer_dir: &Path) -> io::Result<Self> {
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
|
||||
if counts_dir.join("matrix.pcmx").exists() {
|
||||
return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
|
||||
}
|
||||
|
||||
if MatrixMeta::load(&counts_dir).is_ok() {
|
||||
return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
|
||||
}
|
||||
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::NotFound,
|
||||
format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
|
||||
@@ -509,7 +288,6 @@ impl PersistentCompactIntMatrix {
|
||||
pub fn n(&self) -> usize {
|
||||
match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
|
||||
}
|
||||
|
||||
pub fn n_cols(&self) -> usize {
|
||||
match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
|
||||
}
|
||||
@@ -521,10 +299,10 @@ impl PersistentCompactIntMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn col_view(&self, c: usize) -> IntColView<'_> {
|
||||
pub fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||
match self {
|
||||
Self::Columnar(m) => IntColView(IntColViewInner::Columnar(m.col(c))),
|
||||
Self::Packed(m) => IntColView(IntColViewInner::Packed(m.col_slice(c))),
|
||||
Self::Columnar(m) => m.col(c).view(),
|
||||
Self::Packed(m) => m.col_view(c),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -535,29 +313,18 @@ impl PersistentCompactIntMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
||||
match self {
|
||||
Self::Columnar(m) => MemoryIntVec::from(m.col(c)),
|
||||
Self::Packed(m) => m.col_as_memory(c),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
|
||||
}
|
||||
|
||||
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||
match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
|
||||
}
|
||||
|
||||
pub fn sum(&self) -> Array1<u64> {
|
||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||
}
|
||||
|
||||
pub fn count_nonzero(&self) -> Array1<u64> {
|
||||
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
||||
}
|
||||
|
||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||
}
|
||||
@@ -576,7 +343,6 @@ impl PersistentCompactIntMatrix {
|
||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
|
||||
}
|
||||
|
||||
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||
ColumnarCompactIntMatrix::append_column(dir, value_of)
|
||||
}
|
||||
@@ -592,12 +358,12 @@ impl ColumnWeights for PersistentCompactIntMatrix {
|
||||
}
|
||||
|
||||
impl CountPartials for PersistentCompactIntMatrix {
|
||||
fn partial_bray(&self) -> Array2<u64> { self.partial_bray_dist_matrix() }
|
||||
fn partial_euclidean(&self) -> Array2<f64> { self.partial_euclidean_dist_matrix() }
|
||||
fn partial_bray(&self) -> Array2<u64> { self.partial_bray_dist_matrix() }
|
||||
fn partial_euclidean(&self) -> Array2<f64> { self.partial_euclidean_dist_matrix() }
|
||||
fn partial_threshold_jaccard(&self, t: u32) -> (Array2<u64>, Array2<u64>) { self.partial_threshold_jaccard_dist_matrix(t) }
|
||||
fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_bray_dist_matrix(g) }
|
||||
fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_euclidean_dist_matrix(g) }
|
||||
fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_hellinger_euclidean_dist_matrix(g) }
|
||||
fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_bray_dist_matrix(g) }
|
||||
fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_euclidean_dist_matrix(g) }
|
||||
fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_hellinger_euclidean_dist_matrix(g) }
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
@@ -613,16 +379,13 @@ impl PersistentCompactIntMatrixBuilder {
|
||||
fs::create_dir_all(dir)?;
|
||||
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
pub fn n_cols(&self) -> usize { self.n_cols }
|
||||
|
||||
pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
PersistentCompactIntVecBuilder::new(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
@@ -634,30 +397,20 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
if g.indices.len() < 255 {
|
||||
// Fast path: counts fit in u8 — accumulate directly into raw bytes.
|
||||
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||
{
|
||||
let primary = builder.primary_bytes_mut();
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
inc_primary_bits(primary, &mask);
|
||||
}
|
||||
for &c in &g.indices {
|
||||
builder.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||
}
|
||||
builder.freeze()
|
||||
} else {
|
||||
// Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks.
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for chunk in g.indices.chunks(254) {
|
||||
let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
|
||||
{
|
||||
let primary = chunk_builder.primary_bytes_mut();
|
||||
for &c in chunk {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
inc_primary_bits(primary, &mask);
|
||||
}
|
||||
let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in chunk {
|
||||
chunk_b.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||
}
|
||||
let chunk_frozen = chunk_builder.freeze()?;
|
||||
IntSliceMut::add(&mut result, &chunk_frozen);
|
||||
let frozen = chunk_b.freeze()?;
|
||||
result.add(frozen.view());
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
@@ -666,10 +419,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
let view = self.col_view(c);
|
||||
IntSliceMut::add(&mut result, &view);
|
||||
}
|
||||
for &c in &g.indices { result.add(self.col_view(c)); }
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
@@ -677,8 +427,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||
let n = self.n();
|
||||
let mut result = TempBitVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
result.or(&mask);
|
||||
result.or_where(self.col_view(c), |v| v >= threshold);
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user