feat(matrix): add partial group reductions and column persistence
Expands MatrixGroupOps with partial_group_min/max helpers for bitwise reductions and introduces add_col_from methods to persist external vectors as matrix columns. Refactors column aggregation in the partitioner to leverage these group operations directly, replacing iterative row processing with simplified builder lifecycle management and explicit metadata serialization.
This commit is contained in:
@@ -402,6 +402,26 @@ impl PersistentBitMatrixBuilder {
|
||||
PersistentBitVecBuilder::new(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> {
|
||||
src.make_persistent(&col_path(&self.dir, self.n_cols))?;
|
||||
self.n_cols += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
let mut b = PersistentBitVecBuilder::new(self.n, &path)?;
|
||||
let view = src.view();
|
||||
for slot in 0..self.n {
|
||||
if view.primary_bytes()[slot] > 0 { b.set(slot, true); }
|
||||
}
|
||||
for (slot, _) in view.overflow_entries() {
|
||||
b.set(slot, true);
|
||||
}
|
||||
b.close()
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
@@ -446,6 +466,26 @@ impl MatrixGroupOps for PersistentBitMatrix {
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
// min of 0/1 values = AND: 1 only if ALL columns are 1
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
if let Some((&first, rest)) = g.indices.split_first() {
|
||||
result.inc_present_fast(self.col_view(first));
|
||||
for &c in rest { result.mask_with(self.col_view(c)); }
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
// max of 0/1 values = OR: 1 if any column is 1
|
||||
let any = self.partial_group_any(g, 1)?;
|
||||
let n = any.len();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
result.inc_present(any.view());
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Shared matrix helpers (also used by intmatrix.rs) ─────────────────────────
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::io;
|
||||
|
||||
use crate::tempbitvec::TempBitVec;
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::TempCompactIntVec;
|
||||
|
||||
// ── ColGroup ──────────────────────────────────────────────────────────────────
|
||||
@@ -23,12 +23,14 @@ impl ColGroup {
|
||||
|
||||
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Per-matrix group aggregations that return **additive intermediates**.
|
||||
/// Per-matrix group aggregations.
|
||||
///
|
||||
/// Results must be composed by the caller (concat across partitions, add across
|
||||
/// layers) before applying final predicates (`geq`, `leq`, …). Non-additive
|
||||
/// predicates like `group_all` or `group_at_least(k)` are intentionally absent
|
||||
/// — they are derived at the index level from these intermediates.
|
||||
/// `partial_group_presence_count`, `partial_group_sum`, `partial_group_any`,
|
||||
/// `partial_group_min`, `partial_group_max` are the primitives; each impl must
|
||||
/// provide all five.
|
||||
///
|
||||
/// `partial_group_all` and `partial_group_none` have default implementations
|
||||
/// derived from `partial_group_presence_count` and should rarely need overriding.
|
||||
pub trait MatrixGroupOps {
|
||||
/// Per-slot count of group columns whose value ≥ `threshold`.
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
|
||||
@@ -36,6 +38,35 @@ pub trait MatrixGroupOps {
|
||||
/// Per-slot sum of values across all group columns.
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot OR: true if any group column has value ≥ `threshold`.
|
||||
/// Per-slot OR: 1 if any group column has value ≥ `threshold`.
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
|
||||
|
||||
/// Per-slot min value across all group columns (0 if group is empty).
|
||||
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot max value across all group columns (0 if group is empty).
|
||||
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot AND: 1 if ALL group columns have value ≥ `threshold`.
|
||||
fn partial_group_all(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||
let counts = self.partial_group_presence_count(g, threshold)?;
|
||||
let n = counts.len();
|
||||
let n_required = g.indices.len() as u32;
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
for slot in 0..n {
|
||||
if counts.get(slot) >= n_required { b.set(slot, true); }
|
||||
}
|
||||
b.freeze()
|
||||
}
|
||||
|
||||
/// Per-slot NOR: 1 if NO group column has value ≥ `threshold`.
|
||||
fn partial_group_none(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||
let counts = self.partial_group_presence_count(g, threshold)?;
|
||||
let n = counts.len();
|
||||
let mut b = TempBitVecBuilder::new(n)?;
|
||||
for slot in 0..n {
|
||||
if counts.get(slot) == 0 { b.set(slot, true); }
|
||||
}
|
||||
b.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -386,6 +386,21 @@ impl PersistentCompactIntMatrixBuilder {
|
||||
self.n_cols += 1;
|
||||
PersistentCompactIntVecBuilder::new(self.n, &path)
|
||||
}
|
||||
|
||||
pub fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
|
||||
src.make_persistent(&col_path(&self.dir, self.n_cols))?;
|
||||
self.n_cols += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> {
|
||||
let path = col_path(&self.dir, self.n_cols);
|
||||
self.n_cols += 1;
|
||||
let mut b = PersistentCompactIntVecBuilder::new(self.n, &path)?;
|
||||
b.inc_present(src.view());
|
||||
b.close()
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
@@ -431,4 +446,21 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
if let Some((&first, rest)) = g.indices.split_first() {
|
||||
result.add(self.col_view(first));
|
||||
for &c in rest { result.min(self.col_view(c)); }
|
||||
}
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in &g.indices { result.max(self.col_view(c)); }
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,8 +3,9 @@ use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use obicompactvec::{
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||
ColGroup, MatrixGroupOps,
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||
};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
use obilayeredmap::OLMError;
|
||||
@@ -40,52 +41,6 @@ pub struct OutputCol {
|
||||
pub op: AggOp,
|
||||
}
|
||||
|
||||
// ── Aggregation ───────────────────────────────────────────────────────────────
|
||||
|
||||
#[inline]
|
||||
fn aggregate(op: AggOp, indices: &[usize], src_row: &[u32], threshold: u32) -> u32 {
|
||||
match op {
|
||||
AggOp::Any => {
|
||||
if indices.iter().any(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::All => {
|
||||
if indices.is_empty() { return 0; }
|
||||
if indices.iter().all(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::None => {
|
||||
if indices.iter().all(|&i| src_row[i] <= threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::Sum => {
|
||||
indices.iter().map(|&i| src_row[i]).fold(0u32, |a, b| a.saturating_add(b))
|
||||
}
|
||||
AggOp::Min => indices.iter().map(|&i| src_row[i]).min().unwrap_or(0),
|
||||
AggOp::Max => indices.iter().map(|&i| src_row[i]).max().unwrap_or(0),
|
||||
}
|
||||
}
|
||||
|
||||
// ── ColBuilder ────────────────────────────────────────────────────────────────
|
||||
|
||||
enum ColBuilder {
|
||||
Bit(PersistentBitVecBuilder),
|
||||
Int(PersistentCompactIntVecBuilder),
|
||||
}
|
||||
|
||||
impl ColBuilder {
|
||||
fn set_val(&mut self, slot: usize, value: u32) {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
||||
ColBuilder::Int(b) => b.set(slot, value),
|
||||
}
|
||||
}
|
||||
|
||||
fn close(self) -> SKResult<()> {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
||||
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn olm_to_sk(e: OLMError) -> SKError {
|
||||
@@ -95,21 +50,6 @@ fn olm_to_sk(e: OLMError) -> SKError {
|
||||
}
|
||||
}
|
||||
|
||||
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pbiv"))
|
||||
}
|
||||
|
||||
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pciv"))
|
||||
}
|
||||
|
||||
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
|
||||
fs::write(
|
||||
dir.join("meta.json"),
|
||||
format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"),
|
||||
)
|
||||
}
|
||||
|
||||
/// Copy all plain files (not subdirectories) from `src_dir` to `dst_dir`.
|
||||
fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
|
||||
for entry in fs::read_dir(src_dir)? {
|
||||
@@ -125,30 +65,64 @@ fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
|
||||
// ── fill_builders ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn fill_builders(
|
||||
builders: &mut [ColBuilder],
|
||||
specs: &[OutputCol],
|
||||
n: usize,
|
||||
n_src: usize,
|
||||
src_layer_dir: &Path,
|
||||
src_is_count: bool,
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
mut dst_bit: Option<&mut PersistentBitMatrixBuilder>,
|
||||
mut dst_int: Option<&mut PersistentCompactIntMatrixBuilder>,
|
||||
) -> SKResult<()> {
|
||||
let mut src_buf = vec![0u32; n_src];
|
||||
|
||||
if src_is_count {
|
||||
let mat = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||
for slot in 0..n {
|
||||
mat.fill_row(slot, &mut src_buf);
|
||||
for (col, spec) in specs.iter().enumerate() {
|
||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
||||
for spec in specs {
|
||||
let g = ColGroup::new(&spec.label, spec.indices.clone());
|
||||
if output_presence {
|
||||
let b = dst_bit.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from (&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from (&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
} else {
|
||||
let b = dst_int.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from (&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let mat = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||
for slot in 0..n {
|
||||
mat.fill_row(slot, &mut src_buf);
|
||||
for (col, spec) in specs.iter().enumerate() {
|
||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
||||
for spec in specs {
|
||||
let g = ColGroup::new(&spec.label, spec.indices.clone());
|
||||
if output_presence {
|
||||
let b = dst_bit.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from (&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from (&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
} else {
|
||||
let b = dst_int.as_deref_mut().unwrap();
|
||||
match spec.op {
|
||||
AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?),
|
||||
AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?),
|
||||
AggOp::Max => b.add_col_from (&mat.partial_group_max (&g).map_err(SKError::Io)?),
|
||||
AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, 1).map_err(SKError::Io)?),
|
||||
AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, 1).map_err(SKError::Io)?),
|
||||
}.map_err(SKError::Io)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,7 +142,7 @@ impl KmerPartition {
|
||||
src: &KmerPartition,
|
||||
i: usize,
|
||||
specs: &[OutputCol],
|
||||
n_src_genomes: usize,
|
||||
_n_src_genomes: usize,
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
in_place: bool,
|
||||
@@ -188,7 +162,6 @@ impl KmerPartition {
|
||||
fs::create_dir_all(&dst_index_dir)?;
|
||||
}
|
||||
|
||||
let n_out = specs.len();
|
||||
let data_subdir = if output_presence { "presence" } else { "counts" };
|
||||
|
||||
for l in 0..src_meta.n_layers {
|
||||
@@ -201,7 +174,7 @@ impl KmerPartition {
|
||||
let presence_dir = src_layer_dir.join("presence");
|
||||
let src_is_count = counts_dir.exists() && !presence_dir.exists();
|
||||
|
||||
// Determine number of slots from the source matrix.
|
||||
// Determine number of slots and detect implicit layers.
|
||||
let n = if counts_dir.exists() {
|
||||
PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
|
||||
} else if presence_dir.exists() {
|
||||
@@ -216,7 +189,7 @@ impl KmerPartition {
|
||||
};
|
||||
|
||||
// Choose the output data directory (temp name for in-place).
|
||||
let (dst_data_dir, final_data_dir) = if in_place {
|
||||
let (dst_data_dir, final_data_dir): (PathBuf, PathBuf) = if in_place {
|
||||
let tmp = dst_layer_dir.join(format!("{data_subdir}_new"));
|
||||
let perm = dst_layer_dir.join(data_subdir);
|
||||
(tmp, perm)
|
||||
@@ -231,37 +204,22 @@ impl KmerPartition {
|
||||
}
|
||||
fs::create_dir_all(&dst_data_dir)?;
|
||||
|
||||
// Initialise packed-format skeleton.
|
||||
if output_presence {
|
||||
PersistentBitMatrixBuilder::new(n, &dst_data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
let (mut dst_bit, mut dst_int) = if output_presence {
|
||||
(Some(PersistentBitMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?), None)
|
||||
} else {
|
||||
PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
}
|
||||
|
||||
// Create column builders.
|
||||
let mut builders: Vec<ColBuilder> = (0..n_out)
|
||||
.map(|col| -> SKResult<ColBuilder> {
|
||||
if output_presence {
|
||||
Ok(ColBuilder::Bit(PersistentBitVecBuilder::new(
|
||||
n, &col_path_bit(&dst_data_dir, col),
|
||||
)?))
|
||||
} else {
|
||||
Ok(ColBuilder::Int(PersistentCompactIntVecBuilder::new(
|
||||
n, &col_path_int(&dst_data_dir, col),
|
||||
)?))
|
||||
}
|
||||
})
|
||||
.collect::<SKResult<_>>()?;
|
||||
(None, Some(PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?))
|
||||
};
|
||||
|
||||
fill_builders(
|
||||
&mut builders, specs, n, n_src_genomes,
|
||||
&src_layer_dir, src_is_count, threshold,
|
||||
specs, &src_layer_dir, src_is_count, threshold, output_presence,
|
||||
dst_bit.as_mut(), dst_int.as_mut(),
|
||||
)?;
|
||||
|
||||
for b in builders { b.close()?; }
|
||||
write_matrix_meta(&dst_data_dir, n, n_out).map_err(SKError::Io)?;
|
||||
if output_presence {
|
||||
dst_bit.unwrap().close().map_err(SKError::Io)?;
|
||||
} else {
|
||||
dst_int.unwrap().close().map_err(SKError::Io)?;
|
||||
}
|
||||
|
||||
// In-place: swap old data dir for new.
|
||||
if in_place {
|
||||
|
||||
Reference in New Issue
Block a user