refactor: replace in-memory vectors with temp-file-backed storage

Introduces `TempCompactIntVec` and `TempBitVec` as temporary, file-backed intermediates to replace eager in-memory vectors, enabling OS-level paging under memory pressure. Updates the `MatrixGroupOps` trait to return `io::Result` types, allowing proper error propagation and supporting chunked accumulation for large column groups. Includes builder patterns with `.freeze()` finalization, automatic `TempDir` cleanup on drop, and necessary test updates to handle the new fallible signatures. Also fixes `Cargo.toml` section ordering.
This commit is contained in:
Eric Coissac
2026-06-17 15:13:22 +02:00
parent 1d38d87ff9
commit fb4962c4fe
11 changed files with 399 additions and 131 deletions
+1 -1
View File
@@ -7,6 +7,6 @@ edition = "2024"
memmap2 = "0.9"
ndarray = "0.16"
rayon = "1"
tempfile = "3"
[dev-dependencies]
tempfile = "3"
+28 -17
View File
@@ -8,8 +8,9 @@ use rayon::prelude::*;
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
use crate::memoryintvec::MemoryIntVec;
use crate::memoryvec::MemoryBitVec;
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
use crate::traits::{BitSlice, BitSliceMut, IntSliceMut};
use crate::layer_meta::LayerMeta;
use crate::meta::MatrixMeta;
@@ -452,39 +453,49 @@ impl PersistentBitMatrixBuilder {
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
impl MatrixGroupOps for PersistentBitMatrix {
fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> MemoryIntVec {
fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempCompactIntVec> {
// Bit matrices store 0/1 — threshold is structurally always 1.
// Materialize each column to a MemoryBitVec and accumulate directly.
let n = self.n();
if g.indices.len() < 255 {
let mut primary = vec![0u8; n];
for &c in &g.indices {
let mbv = MemoryBitVec::from(&self.col_view(c));
inc_primary_bits(&mut primary, &mbv);
let mut builder = TempCompactIntVecBuilder::new(n)?;
{
let primary = builder.primary_bytes_mut();
for &c in &g.indices {
let mbv = MemoryBitVec::from(&self.col_view(c));
inc_primary_bits(primary, &mbv);
}
}
MemoryIntVec::from_primary(primary)
builder.freeze()
} else {
let mut result = MemoryIntVec::new(n);
for &c in &g.indices {
let mbv = MemoryBitVec::from(&self.col_view(c));
result.count_bits(&mbv);
let mut result = TempCompactIntVecBuilder::new(n)?;
for chunk in g.indices.chunks(254) {
let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
{
let primary = chunk_builder.primary_bytes_mut();
for &c in chunk {
let mbv = MemoryBitVec::from(&self.col_view(c));
inc_primary_bits(primary, &mbv);
}
}
let chunk_frozen = chunk_builder.freeze()?;
IntSliceMut::add(&mut result, &chunk_frozen);
}
result
result.freeze()
}
}
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
// For bit matrices, sum = count of 1-bits — identical to presence_count.
self.partial_group_presence_count(g, 1)
}
fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> MemoryBitVec {
fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempBitVec> {
let n = self.n();
let mut result = MemoryBitVec::new(n);
let mut result = TempBitVecBuilder::new(n)?;
for &c in &g.indices {
result.or(&self.col_view(c));
}
result
result.freeze()
}
}
+8 -8
View File
@@ -122,19 +122,19 @@ impl PersistentCompactIntVecBuilder {
/// Flush the primary mmap, then write sorted overflow data + index and fix the header.
pub fn close(self) -> io::Result<()> {
self.mmap.flush()?;
let Self {
path,
mmap,
n,
overflow,
} = self;
let Self { path, mmap, n, overflow } = self;
drop(mmap);
let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
entries.sort_unstable_by_key(|&(slot, _)| slot);
finalize_pciv(&path, n, &entries)
}
/// Finalize the file via [`close`](Self::close), then reopen it as a
/// read-only [`PersistentCompactIntVec`].
///
/// # Errors
/// Returns the first I/O error raised while closing the builder or while
/// opening the finished file.
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
    // `close` consumes `self`, so keep an owned copy of the path to reopen.
    let reopen_at = self.path.clone();
    self.close()
        .and_then(|()| PersistentCompactIntVec::open(&reopen_at))
}
}
// ── IntSlice / IntSliceMut impls ──────────────────────────────────────────────
+7 -4
View File
@@ -1,5 +1,8 @@
use crate::memoryintvec::MemoryIntVec;
use std::io;
use crate::memoryvec::MemoryBitVec;
use crate::tempbitvec::TempBitVec;
use crate::tempintvec::TempCompactIntVec;
use crate::traits::BitSlice;
// ── ColGroup ──────────────────────────────────────────────────────────────────
@@ -30,13 +33,13 @@ impl ColGroup {
/// — they are derived at the index level from these intermediates.
pub trait MatrixGroupOps {
/// Per-slot count of group columns whose value ≥ `threshold`.
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec;
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
/// Per-slot sum of values across all group columns.
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec;
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
/// Per-slot OR: true if any group column has value ≥ `threshold`.
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec;
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
}
// ── Internal helper ───────────────────────────────────────────────────────────
+32 -21
View File
@@ -12,7 +12,8 @@ use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
use crate::builder::PersistentCompactIntVecBuilder;
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
use crate::memoryintvec::MemoryIntVec;
use crate::memoryvec::MemoryBitVec;
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
@@ -630,45 +631,55 @@ impl PersistentCompactIntMatrixBuilder {
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
impl MatrixGroupOps for PersistentCompactIntMatrix {
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec {
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
let n = self.n();
if g.indices.len() < 255 {
// Fast path: counts fit in u8 — accumulate directly into raw bytes,
// no overflow map involved.
let mut primary = vec![0u8; n];
for &c in &g.indices {
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
inc_primary_bits(&mut primary, &mask);
// Fast path: counts fit in u8 — accumulate directly into raw bytes.
let mut builder = TempCompactIntVecBuilder::new(n)?;
{
let primary = builder.primary_bytes_mut();
for &c in &g.indices {
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
inc_primary_bits(primary, &mask);
}
}
MemoryIntVec::from_primary(primary)
builder.freeze()
} else {
// Slow path (rare): use IntSliceMut::count_bits which handles overflow.
let mut result = MemoryIntVec::new(n);
for &c in &g.indices {
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
result.count_bits(&mask);
// Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks.
let mut result = TempCompactIntVecBuilder::new(n)?;
for chunk in g.indices.chunks(254) {
let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
{
let primary = chunk_builder.primary_bytes_mut();
for &c in chunk {
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
inc_primary_bits(primary, &mask);
}
}
let chunk_frozen = chunk_builder.freeze()?;
IntSliceMut::add(&mut result, &chunk_frozen);
}
result
result.freeze()
}
}
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
let n = self.n();
let mut result = MemoryIntVec::new(n);
let mut result = TempCompactIntVecBuilder::new(n)?;
for &c in &g.indices {
let view = self.col_view(c);
IntSliceMut::add(&mut result, &view);
}
result
result.freeze()
}
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec {
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
let n = self.n();
let mut result = MemoryBitVec::new(n);
let mut result = TempBitVecBuilder::new(n)?;
for &c in &g.indices {
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
result.or(&mask);
}
result
result.freeze()
}
}
+4
View File
@@ -9,6 +9,8 @@ mod memoryintvec;
mod memoryvec;
mod meta;
mod reader;
mod tempbitvec;
mod tempintvec;
pub mod traits;
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
@@ -20,6 +22,8 @@ pub use layer_meta::LayerMeta;
pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
pub use memoryvec::MemoryBitVec;
pub use reader::PersistentCompactIntVec;
pub use tempbitvec::TempBitVec;
pub use tempintvec::TempCompactIntVec;
pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
#[cfg(test)]
+69
View File
@@ -0,0 +1,69 @@
use std::io;
use std::path::Path;
use tempfile::TempDir;
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
use crate::traits::{BitSlice, BitSliceMut};
// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
/// A frozen, read-only bit vector backed by a file inside a temporary directory.
///
/// Implements [`BitSlice`]. The temporary directory (and therefore the backing
/// file) is removed when this value is dropped.
/// Call [`make_persistent`](Self::make_persistent) to copy the contents to a
/// durable file.
pub struct TempBitVec {
// Vector opened over the temporary backing file.
vec: PersistentBitVec,
// Owns the temporary directory holding the backing file. Struct fields are
// dropped in declaration order, so `vec` is dropped first and the mmap is
// released before the temp directory is deleted.
_temp: TempDir,
}
impl TempBitVec {
    /// Copies the temporary backing file to `path` and opens that copy as a
    /// [`PersistentBitVec`].
    ///
    /// `self` is only borrowed: the temporary file is left untouched and is
    /// still removed when this value is dropped.
    ///
    /// # Errors
    /// Returns the I/O error from the file copy, or from opening `path`.
    pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
        let source = self.vec.path();
        std::fs::copy(source, path).and_then(|_copied| PersistentBitVec::open(path))
    }

    /// Length as reported by the backing vector.
    pub fn len(&self) -> usize {
        self.vec.len()
    }

    /// `true` when the backing vector reports itself as empty.
    pub fn is_empty(&self) -> bool {
        self.vec.is_empty()
    }
}
// Read-only access: every method forwards to the wrapped vector.
impl BitSlice for TempBitVec {
    /// Length as reported by the backing vector.
    fn len(&self) -> usize {
        self.vec.len()
    }

    /// The backing vector's storage viewed as `u64` words.
    fn words(&self) -> &[u64] {
        self.vec.words()
    }
}
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
/// Writable builder for a [`TempBitVec`].
///
/// Crate-private (`pub(crate)`): code outside the crate only ever receives the
/// frozen result produced by [`freeze`](Self::freeze).
pub(crate) struct TempBitVecBuilder {
// Builder writing to a file located inside `temp`.
builder: PersistentBitVecBuilder,
// Temporary directory holding the file; declared after `builder`, so it is
// dropped (and the directory removed) only after the builder is gone.
temp: TempDir,
}
impl TempBitVecBuilder {
    /// Creates a fresh temporary directory and starts a builder of size `n`
    /// on the file `data.pbiv` inside it.
    ///
    /// NOTE(review): `TempDir::new` places the directory under the system
    /// temp location — confirm that location suits the intended storage.
    ///
    /// # Errors
    /// Returns the I/O error from creating the directory or the builder.
    pub(crate) fn new(n: usize) -> io::Result<Self> {
        let scratch = TempDir::new()?;
        let builder = PersistentBitVecBuilder::new(n, &scratch.path().join("data.pbiv"))?;
        Ok(Self {
            builder,
            temp: scratch,
        })
    }

    /// Finishes the underlying builder and hands the temporary directory over
    /// to the returned read-only [`TempBitVec`].
    ///
    /// # Errors
    /// Returns the I/O error from finishing the builder; the temporary
    /// directory is dropped in that case.
    pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
        let frozen = self.builder.finish()?;
        Ok(TempBitVec {
            vec: frozen,
            _temp: self.temp,
        })
    }
}
// Read access while building: forwards to the wrapped builder.
impl BitSlice for TempBitVecBuilder {
    /// Length as reported by the wrapped builder.
    fn len(&self) -> usize {
        self.builder.len()
    }

    /// The wrapped builder's storage viewed as `u64` words.
    fn words(&self) -> &[u64] {
        self.builder.words()
    }
}
// Write access while building: forwards to the wrapped builder.
impl BitSliceMut for TempBitVecBuilder {
    /// Mutable view of the wrapped builder's `u64` words.
    fn words_mut(&mut self) -> &mut [u64] {
        self.builder.words_mut()
    }
}
+82
View File
@@ -0,0 +1,82 @@
use std::io;
use std::path::Path;
use tempfile::TempDir;
use crate::builder::PersistentCompactIntVecBuilder;
use crate::reader::PersistentCompactIntVec;
use crate::traits::{IntSlice, IntSliceMut};
// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
/// A frozen, read-only compact int vector backed by a file inside a temporary
/// directory.
///
/// Implements [`IntSlice`]. The temporary directory (and therefore the backing
/// file) is removed when this value is dropped.
/// Call [`make_persistent`](Self::make_persistent) to copy the contents to a
/// durable file.
pub struct TempCompactIntVec {
// Vector opened over the temporary backing file.
vec: PersistentCompactIntVec,
// Owns the temporary directory holding the backing file. Struct fields are
// dropped in declaration order, so `vec` is dropped first and the mmap is
// released before the temp directory is deleted.
_temp: TempDir,
}
impl TempCompactIntVec {
    /// Copies the temporary backing file to `path` and opens that copy as a
    /// [`PersistentCompactIntVec`].
    ///
    /// `self` is only borrowed: the temporary file is left untouched and is
    /// still removed when this value is dropped.
    ///
    /// # Errors
    /// Returns the I/O error from the file copy, or from opening `path`.
    pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
        let source = self.vec.path();
        std::fs::copy(source, path).and_then(|_copied| PersistentCompactIntVec::open(path))
    }

    /// Length as reported by the backing vector.
    pub fn len(&self) -> usize {
        self.vec.len()
    }

    /// `true` when the backing vector reports itself as empty.
    pub fn is_empty(&self) -> bool {
        self.vec.is_empty()
    }
}
// Read-only access: every method forwards to the wrapped vector.
impl IntSlice for TempCompactIntVec {
    /// Length as reported by the backing vector.
    fn len(&self) -> usize {
        self.vec.len()
    }

    /// Value stored at `slot`, as returned by the backing vector.
    fn get(&self, slot: usize) -> u32 {
        self.vec.get(slot)
    }

    /// The backing vector's primary byte array.
    fn primary_bytes(&self) -> &[u8] {
        self.vec.primary_bytes()
    }

    /// Iterator over the backing vector's overflow entries as
    /// `(usize, u32)` pairs.
    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
        self.vec.overflow_entries()
    }

    /// Sum as computed by the backing vector.
    fn sum(&self) -> u64 {
        self.vec.sum()
    }

    /// Non-zero count as computed by the backing vector.
    fn count_nonzero(&self) -> u64 {
        self.vec.count_nonzero()
    }
}
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
/// Writable builder for a [`TempCompactIntVec`].
///
/// Crate-private (`pub(crate)`): code outside the crate only ever receives the
/// frozen result produced by [`freeze`](Self::freeze).
pub(crate) struct TempCompactIntVecBuilder {
// Builder writing to a file located inside `temp`.
builder: PersistentCompactIntVecBuilder,
// Temporary directory holding the file; declared after `builder`, so it is
// dropped (and the directory removed) only after the builder is gone.
temp: TempDir,
}
impl TempCompactIntVecBuilder {
    /// Creates a fresh temporary directory and starts a builder of size `n`
    /// on the file `data.pciv` inside it.
    ///
    /// NOTE(review): `TempDir::new` places the directory under the system
    /// temp location — confirm that location suits the intended storage.
    ///
    /// # Errors
    /// Returns the I/O error from creating the directory or the builder.
    pub(crate) fn new(n: usize) -> io::Result<Self> {
        let scratch = TempDir::new()?;
        let builder =
            PersistentCompactIntVecBuilder::new(n, &scratch.path().join("data.pciv"))?;
        Ok(Self {
            builder,
            temp: scratch,
        })
    }

    /// Finishes the underlying builder and hands the temporary directory over
    /// to the returned read-only [`TempCompactIntVec`].
    ///
    /// # Errors
    /// Returns the I/O error from finishing the builder; the temporary
    /// directory is dropped in that case.
    pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
        let frozen = self.builder.finish()?;
        Ok(TempCompactIntVec {
            vec: frozen,
            _temp: self.temp,
        })
    }
}
// Read access while building: forwards to the wrapped builder.
impl IntSlice for TempCompactIntVecBuilder {
    /// Length as reported by the wrapped builder.
    fn len(&self) -> usize {
        self.builder.len()
    }

    /// Value currently stored at `slot`, as returned by the wrapped builder.
    fn get(&self, slot: usize) -> u32 {
        self.builder.get(slot)
    }

    /// The wrapped builder's primary byte array.
    fn primary_bytes(&self) -> &[u8] {
        self.builder.primary_bytes()
    }

    /// Iterator over the wrapped builder's overflow entries as
    /// `(usize, u32)` pairs.
    fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
        self.builder.overflow_entries()
    }
}
// Write access while building: forwards to the wrapped builder.
impl IntSliceMut for TempCompactIntVecBuilder {
    /// Stores `value` at `slot` through the wrapped builder.
    fn set(&mut self, slot: usize, value: u32) {
        self.builder.set(slot, value);
    }

    /// Mutable view of the wrapped builder's primary byte array.
    fn primary_bytes_mut(&mut self) -> &mut [u8] {
        self.builder.primary_bytes_mut()
    }

    /// Clears the wrapped builder's overflow data.
    fn clear_overflow(&mut self) {
        self.builder.clear_overflow();
    }
}
+10 -10
View File
@@ -5,7 +5,7 @@ use crate::{
PersistentBitMatrix, PersistentBitMatrixBuilder,
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
};
use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
use crate::traits::{BitSlice, BitSliceMut, IntSlice, IntSliceMut};
use crate::{MemoryBitVec, MemoryIntVec};
// ── helpers ───────────────────────────────────────────────────────────────────
@@ -47,7 +47,7 @@ fn int_partial_group_sum_basic() {
// group {0,2}: sum = [101, 2, 8]
let (_d, m) = make_int_matrix(&[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]]);
let g = ColGroup::new("g", vec![0, 2]);
let result = m.partial_group_sum(&g);
let result = m.partial_group_sum(&g).unwrap();
assert_eq!(result.get(0), 101);
assert_eq!(result.get(1), 2);
assert_eq!(result.get(2), 8);
@@ -58,7 +58,7 @@ fn int_partial_group_sum_with_overflow() {
// col0=[300,0], col1=[200,400]: group {0,1}: sum=[500, 400]
let (_d, m) = make_int_matrix(&[&[300, 0], &[200, 400]]);
let g = ColGroup::new("g", vec![0, 1]);
let result = m.partial_group_sum(&g);
let result = m.partial_group_sum(&g).unwrap();
assert_eq!(result.get(0), 500);
assert_eq!(result.get(1), 400);
assert_eq!(result.sum(), 900);
@@ -73,7 +73,7 @@ fn int_partial_group_presence_count() {
// group {0,1,2}: counts = [2, 1, 1, 2]
let (_d, m) = make_int_matrix(&[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]]);
let g = ColGroup::new("g", vec![0, 1, 2]);
let result = m.partial_group_presence_count(&g, 2);
let result = m.partial_group_presence_count(&g, 2).unwrap();
assert_eq!(result.get(0), 2);
assert_eq!(result.get(1), 1);
assert_eq!(result.get(2), 1);
@@ -87,7 +87,7 @@ fn int_partial_group_presence_count_with_overflow() {
// group {0,1,2}: counts = [1, 1, 3]
let (_d, m) = make_int_matrix(&[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]]);
let g = ColGroup::new("g", vec![0, 1, 2]);
let result = m.partial_group_presence_count(&g, 5);
let result = m.partial_group_presence_count(&g, 5).unwrap();
assert_eq!(result.get(0), 1);
assert_eq!(result.get(1), 1);
assert_eq!(result.get(2), 3);
@@ -102,7 +102,7 @@ fn int_partial_group_any() {
// group {0,1,2}: any = [T, T, T, F]
let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]);
let g = ColGroup::new("g", vec![0, 1, 2]);
let result = m.partial_group_any(&g, 2);
let result = m.partial_group_any(&g, 2).unwrap();
assert_eq!(result.get(0), true);
assert_eq!(result.get(1), true);
assert_eq!(result.get(2), true);
@@ -164,7 +164,7 @@ fn bit_partial_group_presence_count() {
&[false,true, true, false],
]);
let g = ColGroup::new("g", vec![0, 1, 2]);
let result = m.partial_group_presence_count(&g, 1);
let result = m.partial_group_presence_count(&g, 1).unwrap();
assert_eq!(result.get(0), 2);
assert_eq!(result.get(1), 2);
assert_eq!(result.get(2), 2);
@@ -181,7 +181,7 @@ fn bit_partial_group_any() {
&[false, false, true],
]);
let g = ColGroup::new("g", vec![0, 1]);
let result = m.partial_group_any(&g, 1);
let result = m.partial_group_any(&g, 1).unwrap();
assert_eq!(result.get(0), true);
assert_eq!(result.get(1), false);
assert_eq!(result.get(2), true);
@@ -200,8 +200,8 @@ fn int_presence_count_additive_across_split() {
let (_db, mb) = make_int_matrix(data_b);
let g = ColGroup::new("g", vec![0, 1]);
let pa = ma.partial_group_presence_count(&g, 2);
let pb = mb.partial_group_presence_count(&g, 2);
let pa = ma.partial_group_presence_count(&g, 2).unwrap();
let pb = mb.partial_group_presence_count(&g, 2).unwrap();
// Concatenate by adding (disjoint kmer ranges — here we just verify
// individual results match the expected per-partition counts).