feat: add memory vectors, slice traits, and column extraction methods

Introduce `MemoryBitVec` and `MemoryIntVec` for efficient in-memory storage with hybrid compression and overflow handling. Implement `BitSlice`, `BitSliceMut`, `IntSlice`, and `IntSliceMut` traits across persistent and memory-backed types to enable generic slice operations and bitwise/arithmetic overloads. Add `col_persist` and `col_as_memory` methods to `BitMatrix` and `IntMatrix` for efficient column extraction. Align with the new single-pass rebuild architecture by supporting fast k-mer filtering and matrix rebuilding. Include comprehensive tests and profiling instrumentation for the packing phase.
This commit is contained in:
Eric Coissac
2026-06-16 23:18:10 +02:00
parent b6fcbc545f
commit cde6457eea
15 changed files with 1120 additions and 70 deletions
+42
View File
@@ -1,4 +1,5 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{self, BufWriter, Write as _};
use std::path::{Path, PathBuf};
@@ -8,6 +9,7 @@ use ndarray::{Array1, Array2};
use rayon::prelude::*;
use crate::builder::PersistentCompactIntVecBuilder;
use crate::memoryintvec::MemoryIntVec;
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
@@ -194,6 +196,32 @@ impl PackedCompactIntMatrix {
Ok(Self { mmap, n_rows, n_cols, columns })
}
/// Decodes the overflow table of column `c` into a `slot -> value` map.
///
/// The table holds `n_overflow` entries laid out back to back from the
/// column's `data_offset`, one every `OVERFLOW_ENTRY_SIZE` bytes. Each entry
/// starts with a little-endian `u64` slot index followed by a little-endian
/// `u32` value. If a slot appears more than once, the last entry wins.
///
/// # Panics
///
/// Panics if `c` is not a valid column index or if an entry lies outside
/// the mapped bytes.
fn decode_column_overflow(&self, c: usize) -> HashMap<usize, u32> {
    let ci = &self.columns[c];
    (0..ci.n_overflow)
        .map(|i| {
            let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
            // The sub-slices are exactly 8 and 4 bytes long, so the array
            // conversions cannot fail.
            let slot = u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize;
            let value = u32::from_le_bytes(self.mmap[off + 8..off + 12].try_into().unwrap());
            (slot, value)
        })
        .collect()
}

/// Hands column `c` to a [`PersistentCompactIntVecBuilder`] targeting `path`.
///
/// The column's primary bytes (`n_rows` bytes starting at `primary_start`)
/// are borrowed straight from the mapping; the overflow table is decoded
/// into a map first. Both are forwarded to
/// `PersistentCompactIntVecBuilder::from_raw_primary`.
///
/// # Errors
///
/// Returns whatever error `from_raw_primary` reports.
///
/// # Panics
///
/// Panics if `c` is out of range or if the column's byte ranges fall
/// outside the mapping.
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
    let ci = &self.columns[c];
    let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
    let overflow = self.decode_column_overflow(c);
    PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path)
}

/// Copies column `c` out of the mapping into an owned [`MemoryIntVec`].
///
/// The primary bytes (`n_rows` bytes starting at `primary_start`) are copied
/// into a fresh `Vec`, and the overflow table is decoded into a map; both
/// are passed to `MemoryIntVec::from_primary_and_overflow`.
///
/// # Panics
///
/// Panics if `c` is out of range or if the column's byte ranges fall
/// outside the mapping.
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
    let ci = &self.columns[c];
    let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
    let overflow = self.decode_column_overflow(c);
    MemoryIntVec::from_primary_and_overflow(primary, overflow)
}
#[inline]
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
let ci = &self.columns[col];
@@ -442,6 +470,20 @@ impl PersistentCompactIntMatrix {
}
}
/// Produces a [`PersistentCompactIntVecBuilder`] for column `c`, targeting `path`.
///
/// Dispatches on the storage variant: a columnar matrix feeds its column
/// to `PersistentCompactIntVecBuilder::build_from`, while a packed matrix
/// delegates to its own `col_persist`.
///
/// # Errors
///
/// Propagates the `io::Error` returned by the selected variant.
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
    match self {
        Self::Packed(packed) => packed.col_persist(c, path),
        Self::Columnar(columnar) => {
            PersistentCompactIntVecBuilder::build_from(columnar.col(c), path)
        }
    }
}
/// Returns column `c` as a [`MemoryIntVec`].
///
/// Dispatches on the storage variant: a columnar matrix converts its column
/// through `MemoryIntVec::from`, while a packed matrix delegates to its own
/// `col_as_memory`.
pub fn col_as_memory(&self, c: usize) -> MemoryIntVec {
    match self {
        Self::Packed(packed) => packed.col_as_memory(c),
        Self::Columnar(columnar) => MemoryIntVec::from(columnar.col(c)),
    }
}
/// Returns the values stored at `slot` as a boxed slice of `u32`,
/// delegating to the `row` method of whichever storage variant is active.
pub fn row(&self, slot: usize) -> Box<[u32]> {
    match self {
        Self::Columnar(columnar) => columnar.row(slot),
        Self::Packed(packed) => packed.row(slot),
    }
}