Push mtzqmmrlmzzx #34
@@ -11,8 +11,11 @@ src/obicompactvec/src/
|
||||
reader.rs PersistentCompactIntVec (read-only)
|
||||
builder.rs PersistentCompactIntVecBuilder (read-write)
|
||||
memoryintvec.rs MemoryIntVec
|
||||
tempintvec.rs TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed)
|
||||
tempbitvec.rs TempBitVec, TempBitVecBuilder (temp-file-backed)
|
||||
bitmatrix.rs PersistentBitMatrix, PersistentBitMatrixBuilder
|
||||
intmatrix.rs PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder
|
||||
colgroup.rs ColGroup, MatrixGroupOps trait
|
||||
format.rs file format constants, encode/decode helpers
|
||||
layer_meta.rs LayerMeta (column metadata)
|
||||
meta.rs matrix metadata
|
||||
@@ -24,13 +27,22 @@ graph TD
|
||||
traits --> memoryintvec
|
||||
bitvec --> memoryvec
|
||||
bitvec --> bitmatrix
|
||||
bitvec --> tempbitvec
|
||||
format --> reader
|
||||
format --> builder
|
||||
reader --> intmatrix
|
||||
reader --> tempintvec
|
||||
builder --> intmatrix
|
||||
builder --> memoryintvec
|
||||
builder --> tempintvec
|
||||
memoryvec --> traits
|
||||
memoryintvec --> traits
|
||||
tempintvec --> intmatrix
|
||||
tempintvec --> bitmatrix
|
||||
tempbitvec --> intmatrix
|
||||
tempbitvec --> bitmatrix
|
||||
colgroup --> intmatrix
|
||||
colgroup --> bitmatrix
|
||||
layer_meta --> bitmatrix
|
||||
layer_meta --> intmatrix
|
||||
meta --> bitmatrix
|
||||
@@ -479,6 +491,8 @@ See `persistent_compact_int_vec.md` for file format and lifecycle.
|
||||
| `MemoryIntVec` | inherent merge-scan ✓ | `byte_sum` ✓ | `byte_count_nonzero` ✓ |
|
||||
| `PersistentCompactIntVecBuilder` | default (get-per-slot) | `byte_sum` on mmap ✓ | `byte_count_nonzero` on mmap ✓ |
|
||||
| `PersistentCompactIntVec` | inherent merge-scan Iter ✓ | inherent `sum()` ✓ | inherent `count_nonzero()` ✓ |
|
||||
| `TempCompactIntVec` | delegates to inner `PersistentCompactIntVec` | delegates | delegates |
|
||||
| `TempCompactIntVecBuilder` | default (get-per-slot) | delegates to builder | delegates to builder |
|
||||
| `PackedIntCol<'a>` | inherent PackedIntColIter ✓ | byte_sum ✓ | byte_count_nonzero ✓ |
|
||||
|
||||
`PackedIntCol` is used internally by `PersistentCompactIntMatrix` (packed format) for column views.
|
||||
@@ -557,45 +571,68 @@ Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)` (inter, union), `par
|
||||
|
||||
---
|
||||
|
||||
## Planned — Filter / Select API
|
||||
## Temp-file-backed types
|
||||
|
||||
### Composition across layers and partitions
|
||||
`MemoryBitVec` and `MemoryIntVec` are reserved for truly transient intra-method intermediates (e.g. a single `cmp_scalar` result that lives for one loop iteration). **All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory.
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph Index
|
||||
CG["ColGroup\nVec<usize> — valid everywhere"]
|
||||
ACC["MemoryIntVec\nglobal accumulator"]
|
||||
PRED["geq / leq / and / or\n→ MemoryBitVec mask"]
|
||||
end
|
||||
### Lifecycle
|
||||
|
||||
subgraph "Layer 1"
|
||||
subgraph "Partition A kmers 0..k/2"
|
||||
MA["Matrix A\npartial_group_presence_count"]
|
||||
end
|
||||
subgraph "Partition B kmers k/2..k"
|
||||
MB["Matrix B\npartial_group_presence_count"]
|
||||
end
|
||||
CONCAT1["concat → MemoryIntVec\[0..k\]"]
|
||||
end
|
||||
|
||||
subgraph "Layer 2"
|
||||
CONCAT2["concat → MemoryIntVec\[0..k\]"]
|
||||
end
|
||||
|
||||
CG -->|"same indices"| MA
|
||||
CG -->|"same indices"| MB
|
||||
MA -->|"kmer range A"| CONCAT1
|
||||
MB -->|"kmer range B"| CONCAT1
|
||||
CONCAT1 -->|"IntSliceMut::add"| ACC
|
||||
CONCAT2 -->|"IntSliceMut::add"| ACC
|
||||
ACC --> PRED
|
||||
```
|
||||
TempCompactIntVecBuilder::new(n) → writable mmap in TempDir
|
||||
↓ (set / add / count_bits / mask_with / …)
|
||||
.freeze() → TempCompactIntVec (read-only mmap + TempDir)
|
||||
↓ (optional)
|
||||
.make_persistent(path) → PersistentCompactIntVec (permanent file)
|
||||
```
|
||||
|
||||
Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`.
|
||||
|
||||
**Drop order**: in `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }`, Rust drops fields in declaration order — `vec` (mmap) is released before `_temp` (directory) is deleted. No explicit `drop()` needed.
|
||||
|
||||
### TempCompactIntVec / TempCompactIntVecBuilder
|
||||
|
||||
```rust
|
||||
pub struct TempCompactIntVec {
|
||||
vec: PersistentCompactIntVec,
|
||||
_temp: TempDir, // dropped after vec
|
||||
}
|
||||
|
||||
pub(crate) struct TempCompactIntVecBuilder {
|
||||
builder: PersistentCompactIntVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
```
|
||||
|
||||
`TempCompactIntVec` implements `IntSlice` (full delegation to inner `PersistentCompactIntVec`).
|
||||
`TempCompactIntVecBuilder` implements `IntSlice` + `IntSliceMut` (delegation to inner builder).
|
||||
`make_persistent(path)` copies the temp file to `path` and opens it as `PersistentCompactIntVec`.
|
||||
|
||||
### TempBitVec / TempBitVecBuilder
|
||||
|
||||
```rust
|
||||
pub struct TempBitVec {
|
||||
vec: PersistentBitVec,
|
||||
_temp: TempDir,
|
||||
}
|
||||
|
||||
pub(crate) struct TempBitVecBuilder {
|
||||
builder: PersistentBitVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
```
|
||||
|
||||
`TempBitVec` implements `BitSlice`.
|
||||
`TempBitVecBuilder` implements `BitSlice` + `BitSliceMut`.
|
||||
`make_persistent(path)` copies the temp file and opens as `PersistentBitVec`.
|
||||
|
||||
---
|
||||
|
||||
## Filter / Select API
|
||||
|
||||
### ColGroup
|
||||
|
||||
```rust
|
||||
struct ColGroup { name: String, indices: Vec<usize> }
|
||||
pub struct ColGroup { pub name: String, pub indices: Vec<usize> }
|
||||
```
|
||||
|
||||
Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions because column structure is identical across the entire hierarchy (same samples/genomes everywhere; only rows = kmer slots are partitioned).
|
||||
@@ -607,68 +644,75 @@ Defined **once at the index level** from column metadata. Valid in all matrices
|
||||
- **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges).
|
||||
- **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.).
|
||||
|
||||
### Additivity rules
|
||||
### MatrixGroupOps
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph "Matrix level — returns MemoryIntVec"
|
||||
PGP["partial_group_presence_count\npartial_group_sum\npartial_group_any → MemoryBitVec"]
|
||||
end
|
||||
subgraph "Index level — applies predicate"
|
||||
GA["group_at_least(k)\n= accumulate.geq(k)"]
|
||||
GALL["group_all\n= accumulate.geq(n_cols)"]
|
||||
GANY["group_any\n= OR of partial_group_any"]
|
||||
end
|
||||
PGP -->|"concat across partitions\nadd across layers"| GA
|
||||
PGP --> GALL
|
||||
PGP --> GANY
|
||||
```
|
||||
|
||||
Non-additive predicates (`group_all`, `group_at_least`) do **not** exist at matrix level — they require the global accumulated count.
|
||||
|
||||
### MatrixGroupOps (planned trait)
|
||||
|
||||
Group operations live on the matrix and expose only **additive intermediates** (`MemoryIntVec`). Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
|
||||
Group operations live on the matrix and expose only **additive intermediates** backed by temp files. Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
|
||||
|
||||
```rust
|
||||
trait MatrixGroupOps {
|
||||
// How many columns in group have value >= threshold, per kmer slot
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec;
|
||||
pub trait MatrixGroupOps {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32)
|
||||
-> io::Result<TempCompactIntVec>;
|
||||
|
||||
// Sum of values across group columns, per kmer slot
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec;
|
||||
fn partial_group_sum(&self, g: &ColGroup)
|
||||
-> io::Result<TempCompactIntVec>;
|
||||
|
||||
// Kmer present (value >= threshold) in at least one column of group
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec;
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32)
|
||||
-> io::Result<TempBitVec>;
|
||||
}
|
||||
```
|
||||
|
||||
Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — they are composed at the index level from the additive intermediates:
|
||||
Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`. For bit matrices, `partial_group_sum` delegates to `partial_group_presence_count(g, 1)` since values are 0/1.
|
||||
|
||||
**`partial_group_presence_count` — chunking for large groups:**
|
||||
|
||||
When `g.indices.len() < 255`, per-slot counts fit in a raw `u8` — fast path: accumulate directly into `primary_bytes_mut()` using `inc_primary_bits`, then `freeze()`. No overflow map needed.
|
||||
|
||||
When `g.indices.len() ≥ 255`, process in chunks of 254 columns — each chunk stays within `u8` range — then add chunks into a running `TempCompactIntVecBuilder` accumulator via `IntSliceMut::add`. This keeps peak memory proportional to one partition, not the number of columns × partitions.
|
||||
|
||||
```
|
||||
fast path (< 255 cols):
|
||||
builder = TempCompactIntVecBuilder::new(n)
|
||||
for c in group:
|
||||
mask = col_view(c).cmp_scalar(|v| v >= threshold) // MemoryBitVec
|
||||
inc_primary_bits(primary_bytes_mut, mask) // u8 safe
|
||||
builder.freeze()
|
||||
|
||||
slow path (≥ 255 cols):
|
||||
result = TempCompactIntVecBuilder::new(n)
|
||||
for chunk in group.chunks(254):
|
||||
chunk_builder = TempCompactIntVecBuilder::new(n)
|
||||
inc_primary_bits(chunk_builder, …)
|
||||
chunk_frozen = chunk_builder.freeze()
|
||||
IntSliceMut::add(&mut result, &chunk_frozen)
|
||||
result.freeze()
|
||||
```
|
||||
|
||||
Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — composed at the index level:
|
||||
|
||||
```
|
||||
// "present in >= 2 ingroup columns with count >= 3, absent from all outgroup"
|
||||
let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)).sum();
|
||||
let in_mask = presence.geq(2); // MemoryBitVec
|
||||
let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)?).add_all()?;
|
||||
let in_mask = presence.geq(2);
|
||||
|
||||
let out_sum = layers.map(|l| l.partial_group_sum(&outgroup)).sum();
|
||||
let out_mask = out_sum.leq(0); // MemoryBitVec
|
||||
let out_sum = layers.map(|l| l.partial_group_sum(&outgroup)?).add_all()?;
|
||||
let out_mask = out_sum.leq(0);
|
||||
|
||||
let mask = in_mask.and(&out_mask); // BitSliceMut::and — O(n/64)
|
||||
let mask = in_mask & &out_mask; // BitSliceMut::and — O(n/64)
|
||||
```
|
||||
|
||||
### mask_with (planned IntSliceMut method)
|
||||
### mask_with (IntSliceMut)
|
||||
|
||||
Apply a bit mask to a count vector: zero slots where the mask bit is 0. Iterates only zero bits — O(n_zeros), O(1) when mask is all-ones.
|
||||
Provided method on `IntSliceMut`. Zeros every slot where the corresponding mask bit is 0. Iterates only zero bits — O(n_zeros), O(1) when mask is all-ones.
|
||||
|
||||
```
|
||||
for (w_idx, word) in mask.words():
|
||||
if word == u64::MAX: continue
|
||||
if word == u64::MAX: continue // skip all-ones words
|
||||
zeros = !word
|
||||
while zeros != 0:
|
||||
bit = trailing_zeros(zeros)
|
||||
s = w_idx * 64 + bit
|
||||
self.set(s, 0)
|
||||
if primary[s] != 0: self.set(s, 0) // clears overflow entry too
|
||||
zeros &= zeros − 1
|
||||
```
|
||||
|
||||
This is the terminal operation for both Filter (zero non-selected kmer slots in a count matrix) and Select (positional selection without MPHF).
|
||||
Terminal operation for Filter (retain only selected kmer slots in a count vector) and Select (positional selection without MPHF).
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
# La crate obicompactvec
|
||||
|
||||
Le code actuel est ce qu'il est. Ce n'est pas la vérité absolue, c'est un premier effort d'implémentation, rien de plus. Ci-dessous, je vais décrire les objectifs et la structure qui devrait être : LA VÉRITÉ À ATTEINDRE.
|
||||
|
||||
La crate fournit des représentations les plus compactes possible en mémoire de matrices de comptage ou de présence de k-mers dans des génomes. Chaque colonne représente un génome, chaque ligne un k-mer. Une matrice est une collection de vecteurs où chacun des vecteurs est une colonne de la matrice.
|
||||
|
||||
Les matrices comme les colonnes ont vocation à être persistantes. Les données sont stockées dans des fichiers binaires et mappées en mémoire via `mmap`.
|
||||
|
||||
Les structures sont par essence immuables. Il existe des représentations mutables des colonnes qui permettent leur construction. À la fin de leur construction, les colonnes sont fermées, ce qui les rend immuables.
|
||||
|
||||
Les matrices peuvent être représentées de deux façons :
|
||||
- via un répertoire contenant une collection de fichiers colonnes
|
||||
- via un fichier matrice qui est la concaténation de plusieurs fichiers colonnes.
|
||||
|
||||
|
||||
## Les matrices de comptage
|
||||
|
||||
Ce sont des matrices d'entiers positifs, la plupart du temps de petites valeurs (inférieures à 255). On suppose que toutes les valeurs sont représentables sur un `u32`.
|
||||
|
||||
## Les matrices de présence
|
||||
|
||||
Ce sont des matrices de booléens représentées comme des champs de bits.
|
||||
|
||||
Il existe une forme implicite des vecteurs de présence, qui n'est représentée par aucun fichier et pour laquelle toutes les valeurs sont vraies.
|
||||
|
||||
## Représentation légère des colonnes
|
||||
|
||||
Les colonnes, qu'elles soient unitaires (fichier colonne) ou partie d'un fichier composite matrice, peuvent être représentées par un objet léger donnant accès à leurs valeurs ainsi qu'à la longueur du vecteur. Toutes les méthodes de calcul doivent uniquement travailler à partir de ces représentations légères unifiées des colonnes.
|
||||
|
||||
### Représentation légère d'un vecteur de présence
|
||||
|
||||
Le vecteur est représenté par
|
||||
- un champ de bits encodé comme un `[u64]`
|
||||
- un `usize` encodant la longueur du champ de bits
|
||||
|
||||
### Représentation légère d'un vecteur de comptage
|
||||
|
||||
Le vecteur est représenté par
|
||||
- un vecteur `[u8]` encodant directement les valeurs faibles du vecteur, dans l'intervalle [0, 255[
|
||||
La valeur 255 est une valeur sentinelle indiquant que la valeur vraie est >= 255
|
||||
et se trouve dans une structure d'overflow
|
||||
- un itérateur de `(usize, u32)` listant les valeurs d'overflow correspondant aux valeurs
|
||||
sentinelles (255) du `[u8]`
|
||||
- un `usize` encodant la longueur du vecteur
|
||||
@@ -7,6 +7,6 @@ edition = "2024"
|
||||
memmap2 = "0.9"
|
||||
ndarray = "0.16"
|
||||
rayon = "1"
|
||||
tempfile = "3"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
@@ -8,8 +8,9 @@ use rayon::prelude::*;
|
||||
|
||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
|
||||
use crate::memoryintvec::MemoryIntVec;
|
||||
use crate::memoryvec::MemoryBitVec;
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||
use crate::traits::{BitSlice, BitSliceMut, IntSliceMut};
|
||||
use crate::layer_meta::LayerMeta;
|
||||
use crate::meta::MatrixMeta;
|
||||
@@ -452,39 +453,49 @@ impl PersistentBitMatrixBuilder {
|
||||
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||
|
||||
impl MatrixGroupOps for PersistentBitMatrix {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> MemoryIntVec {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||
// Bit matrices store 0/1 — threshold is structurally always 1.
|
||||
// Materialize each column to a MemoryBitVec and accumulate directly.
|
||||
let n = self.n();
|
||||
if g.indices.len() < 255 {
|
||||
let mut primary = vec![0u8; n];
|
||||
for &c in &g.indices {
|
||||
let mbv = MemoryBitVec::from(&self.col_view(c));
|
||||
inc_primary_bits(&mut primary, &mbv);
|
||||
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||
{
|
||||
let primary = builder.primary_bytes_mut();
|
||||
for &c in &g.indices {
|
||||
let mbv = MemoryBitVec::from(&self.col_view(c));
|
||||
inc_primary_bits(primary, &mbv);
|
||||
}
|
||||
}
|
||||
MemoryIntVec::from_primary(primary)
|
||||
builder.freeze()
|
||||
} else {
|
||||
let mut result = MemoryIntVec::new(n);
|
||||
for &c in &g.indices {
|
||||
let mbv = MemoryBitVec::from(&self.col_view(c));
|
||||
result.count_bits(&mbv);
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for chunk in g.indices.chunks(254) {
|
||||
let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
|
||||
{
|
||||
let primary = chunk_builder.primary_bytes_mut();
|
||||
for &c in chunk {
|
||||
let mbv = MemoryBitVec::from(&self.col_view(c));
|
||||
inc_primary_bits(primary, &mbv);
|
||||
}
|
||||
}
|
||||
let chunk_frozen = chunk_builder.freeze()?;
|
||||
IntSliceMut::add(&mut result, &chunk_frozen);
|
||||
}
|
||||
result
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
// For bit matrices, sum = count of 1-bits — identical to presence_count.
|
||||
self.partial_group_presence_count(g, 1)
|
||||
}
|
||||
|
||||
fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> MemoryBitVec {
|
||||
fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempBitVec> {
|
||||
let n = self.n();
|
||||
let mut result = MemoryBitVec::new(n);
|
||||
let mut result = TempBitVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
result.or(&self.col_view(c));
|
||||
}
|
||||
result
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -122,19 +122,19 @@ impl PersistentCompactIntVecBuilder {
|
||||
/// Flush the primary mmap, then write sorted overflow data + index and fix the header.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.mmap.flush()?;
|
||||
let Self {
|
||||
path,
|
||||
mmap,
|
||||
n,
|
||||
overflow,
|
||||
} = self;
|
||||
let Self { path, mmap, n, overflow } = self;
|
||||
drop(mmap);
|
||||
|
||||
let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
|
||||
entries.sort_unstable_by_key(|&(slot, _)| slot);
|
||||
|
||||
finalize_pciv(&path, n, &entries)
|
||||
}
|
||||
|
||||
/// Close and reopen as a read-only [`PersistentCompactIntVec`].
|
||||
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
|
||||
let path = self.path.clone();
|
||||
self.close()?;
|
||||
PersistentCompactIntVec::open(&path)
|
||||
}
|
||||
}
|
||||
|
||||
// ── IntSlice / IntSliceMut impls ──────────────────────────────────────────────
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use crate::memoryintvec::MemoryIntVec;
|
||||
use std::io;
|
||||
|
||||
use crate::memoryvec::MemoryBitVec;
|
||||
use crate::tempbitvec::TempBitVec;
|
||||
use crate::tempintvec::TempCompactIntVec;
|
||||
use crate::traits::BitSlice;
|
||||
|
||||
// ── ColGroup ──────────────────────────────────────────────────────────────────
|
||||
@@ -30,13 +33,13 @@ impl ColGroup {
|
||||
/// — they are derived at the index level from these intermediates.
|
||||
pub trait MatrixGroupOps {
|
||||
/// Per-slot count of group columns whose value ≥ `threshold`.
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec;
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot sum of values across all group columns.
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec;
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
|
||||
|
||||
/// Per-slot OR: true if any group column has value ≥ `threshold`.
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec;
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
|
||||
}
|
||||
|
||||
// ── Internal helper ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -12,7 +12,8 @@ use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
|
||||
use crate::memoryintvec::MemoryIntVec;
|
||||
use crate::memoryvec::MemoryBitVec;
|
||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
@@ -630,45 +631,55 @@ impl PersistentCompactIntMatrixBuilder {
|
||||
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||
|
||||
impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
if g.indices.len() < 255 {
|
||||
// Fast path: counts fit in u8 — accumulate directly into raw bytes,
|
||||
// no overflow map involved.
|
||||
let mut primary = vec![0u8; n];
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
inc_primary_bits(&mut primary, &mask);
|
||||
// Fast path: counts fit in u8 — accumulate directly into raw bytes.
|
||||
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||
{
|
||||
let primary = builder.primary_bytes_mut();
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
inc_primary_bits(primary, &mask);
|
||||
}
|
||||
}
|
||||
MemoryIntVec::from_primary(primary)
|
||||
builder.freeze()
|
||||
} else {
|
||||
// Slow path (rare): use IntSliceMut::count_bits which handles overflow.
|
||||
let mut result = MemoryIntVec::new(n);
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
result.count_bits(&mask);
|
||||
// Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks.
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for chunk in g.indices.chunks(254) {
|
||||
let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
|
||||
{
|
||||
let primary = chunk_builder.primary_bytes_mut();
|
||||
for &c in chunk {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
inc_primary_bits(primary, &mask);
|
||||
}
|
||||
}
|
||||
let chunk_frozen = chunk_builder.freeze()?;
|
||||
IntSliceMut::add(&mut result, &chunk_frozen);
|
||||
}
|
||||
result
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||
let n = self.n();
|
||||
let mut result = MemoryIntVec::new(n);
|
||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
let view = self.col_view(c);
|
||||
IntSliceMut::add(&mut result, &view);
|
||||
}
|
||||
result
|
||||
result.freeze()
|
||||
}
|
||||
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec {
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
|
||||
let n = self.n();
|
||||
let mut result = MemoryBitVec::new(n);
|
||||
let mut result = TempBitVecBuilder::new(n)?;
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
result.or(&mask);
|
||||
}
|
||||
result
|
||||
result.freeze()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,8 @@ mod memoryintvec;
|
||||
mod memoryvec;
|
||||
mod meta;
|
||||
mod reader;
|
||||
mod tempbitvec;
|
||||
mod tempintvec;
|
||||
pub mod traits;
|
||||
|
||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||
@@ -20,6 +22,8 @@ pub use layer_meta::LayerMeta;
|
||||
pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
|
||||
pub use memoryvec::MemoryBitVec;
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
pub use tempbitvec::TempBitVec;
|
||||
pub use tempintvec::TempCompactIntVec;
|
||||
pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||
use crate::traits::{BitSlice, BitSliceMut};
|
||||
|
||||
// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
|
||||
|
||||
/// A bit vector backed by a temporary file.
|
||||
/// Implements [`BitSlice`]; the file is deleted when this value is dropped.
|
||||
/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
|
||||
pub struct TempBitVec {
|
||||
vec: PersistentBitVec,
|
||||
// Dropped after `vec` (field order), so the mmap is released before the
|
||||
// temp directory is deleted.
|
||||
_temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempBitVec {
|
||||
/// Copy to a permanent file and open as a [`PersistentBitVec`].
|
||||
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
|
||||
std::fs::copy(self.vec.path(), path)?;
|
||||
PersistentBitVec::open(path)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.vec.len() }
|
||||
pub fn is_empty(&self) -> bool { self.vec.is_empty() }
|
||||
}
|
||||
|
||||
impl BitSlice for TempBitVec {
|
||||
fn len(&self) -> usize { self.vec.len() }
|
||||
fn words(&self) -> &[u64] { self.vec.words() }
|
||||
}
|
||||
|
||||
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
|
||||
|
||||
/// Writable builder for a [`TempBitVec`]. `pub(crate)` — callers receive
|
||||
/// only the frozen result via [`freeze`](Self::freeze).
|
||||
pub(crate) struct TempBitVecBuilder {
|
||||
builder: PersistentBitVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempBitVecBuilder {
|
||||
pub(crate) fn new(n: usize) -> io::Result<Self> {
|
||||
let temp = TempDir::new()?;
|
||||
let path = temp.path().join("data.pbiv");
|
||||
let builder = PersistentBitVecBuilder::new(n, &path)?;
|
||||
Ok(Self { builder, temp })
|
||||
}
|
||||
|
||||
/// Finalize writes and return a frozen, read-only [`TempBitVec`].
|
||||
pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
|
||||
let Self { builder, temp } = self;
|
||||
let vec = builder.finish()?;
|
||||
Ok(TempBitVec { vec, _temp: temp })
|
||||
}
|
||||
}
|
||||
|
||||
impl BitSlice for TempBitVecBuilder {
|
||||
fn len(&self) -> usize { self.builder.len() }
|
||||
fn words(&self) -> &[u64] { self.builder.words() }
|
||||
}
|
||||
|
||||
impl BitSliceMut for TempBitVecBuilder {
|
||||
fn words_mut(&mut self) -> &mut [u64] { self.builder.words_mut() }
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
use crate::traits::{IntSlice, IntSliceMut};
|
||||
|
||||
// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
|
||||
|
||||
/// A compact int vector backed by a temporary file.
|
||||
/// Implements [`IntSlice`]; the file is deleted when this value is dropped.
|
||||
/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
|
||||
pub struct TempCompactIntVec {
|
||||
vec: PersistentCompactIntVec,
|
||||
// Dropped after `vec` (field order), so the mmap is released before the
|
||||
// temp directory is deleted.
|
||||
_temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempCompactIntVec {
|
||||
/// Copy to a permanent file and open as a [`PersistentCompactIntVec`].
|
||||
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
|
||||
std::fs::copy(self.vec.path(), path)?;
|
||||
PersistentCompactIntVec::open(path)
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.vec.len() }
|
||||
pub fn is_empty(&self) -> bool { self.vec.is_empty() }
|
||||
}
|
||||
|
||||
impl IntSlice for TempCompactIntVec {
|
||||
fn len(&self) -> usize { self.vec.len() }
|
||||
fn get(&self, slot: usize) -> u32 { self.vec.get(slot) }
|
||||
fn primary_bytes(&self) -> &[u8] { self.vec.primary_bytes() }
|
||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||
self.vec.overflow_entries()
|
||||
}
|
||||
fn sum(&self) -> u64 { self.vec.sum() }
|
||||
fn count_nonzero(&self) -> u64 { self.vec.count_nonzero() }
|
||||
}
|
||||
|
||||
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
|
||||
|
||||
/// Writable builder for a [`TempCompactIntVec`]. `pub(crate)` — callers
|
||||
/// receive only the frozen result via [`freeze`](Self::freeze).
|
||||
pub(crate) struct TempCompactIntVecBuilder {
|
||||
builder: PersistentCompactIntVecBuilder,
|
||||
temp: TempDir,
|
||||
}
|
||||
|
||||
impl TempCompactIntVecBuilder {
|
||||
pub(crate) fn new(n: usize) -> io::Result<Self> {
|
||||
let temp = TempDir::new()?;
|
||||
let path = temp.path().join("data.pciv");
|
||||
let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
|
||||
Ok(Self { builder, temp })
|
||||
}
|
||||
|
||||
/// Finalize writes and return a frozen, read-only [`TempCompactIntVec`].
|
||||
pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
|
||||
let Self { builder, temp } = self;
|
||||
let vec = builder.finish()?;
|
||||
Ok(TempCompactIntVec { vec, _temp: temp })
|
||||
}
|
||||
}
|
||||
|
||||
impl IntSlice for TempCompactIntVecBuilder {
|
||||
fn len(&self) -> usize { self.builder.len() }
|
||||
fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
|
||||
fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
|
||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||
self.builder.overflow_entries()
|
||||
}
|
||||
}
|
||||
|
||||
impl IntSliceMut for TempCompactIntVecBuilder {
|
||||
fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
|
||||
fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
|
||||
fn clear_overflow(&mut self) { self.builder.clear_overflow(); }
|
||||
}
|
||||
@@ -5,7 +5,7 @@ use crate::{
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||
};
|
||||
use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
|
||||
use crate::traits::{BitSlice, BitSliceMut, IntSlice, IntSliceMut};
|
||||
use crate::{MemoryBitVec, MemoryIntVec};
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
@@ -47,7 +47,7 @@ fn int_partial_group_sum_basic() {
|
||||
// group {0,2}: sum = [101, 2, 8]
|
||||
let (_d, m) = make_int_matrix(&[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]]);
|
||||
let g = ColGroup::new("g", vec![0, 2]);
|
||||
let result = m.partial_group_sum(&g);
|
||||
let result = m.partial_group_sum(&g).unwrap();
|
||||
assert_eq!(result.get(0), 101);
|
||||
assert_eq!(result.get(1), 2);
|
||||
assert_eq!(result.get(2), 8);
|
||||
@@ -58,7 +58,7 @@ fn int_partial_group_sum_with_overflow() {
|
||||
// col0=[300,0], col1=[200,400]: group {0,1}: sum=[500, 400]
|
||||
let (_d, m) = make_int_matrix(&[&[300, 0], &[200, 400]]);
|
||||
let g = ColGroup::new("g", vec![0, 1]);
|
||||
let result = m.partial_group_sum(&g);
|
||||
let result = m.partial_group_sum(&g).unwrap();
|
||||
assert_eq!(result.get(0), 500);
|
||||
assert_eq!(result.get(1), 400);
|
||||
assert_eq!(result.sum(), 900);
|
||||
@@ -73,7 +73,7 @@ fn int_partial_group_presence_count() {
|
||||
// group {0,1,2}: counts = [2, 1, 1, 2]
|
||||
let (_d, m) = make_int_matrix(&[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]]);
|
||||
let g = ColGroup::new("g", vec![0, 1, 2]);
|
||||
let result = m.partial_group_presence_count(&g, 2);
|
||||
let result = m.partial_group_presence_count(&g, 2).unwrap();
|
||||
assert_eq!(result.get(0), 2);
|
||||
assert_eq!(result.get(1), 1);
|
||||
assert_eq!(result.get(2), 1);
|
||||
@@ -87,7 +87,7 @@ fn int_partial_group_presence_count_with_overflow() {
|
||||
// group {0,1,2}: counts = [1, 1, 3]
|
||||
let (_d, m) = make_int_matrix(&[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]]);
|
||||
let g = ColGroup::new("g", vec![0, 1, 2]);
|
||||
let result = m.partial_group_presence_count(&g, 5);
|
||||
let result = m.partial_group_presence_count(&g, 5).unwrap();
|
||||
assert_eq!(result.get(0), 1);
|
||||
assert_eq!(result.get(1), 1);
|
||||
assert_eq!(result.get(2), 3);
|
||||
@@ -102,7 +102,7 @@ fn int_partial_group_any() {
|
||||
// group {0,1,2}: any = [T, T, T, F]
|
||||
let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]);
|
||||
let g = ColGroup::new("g", vec![0, 1, 2]);
|
||||
let result = m.partial_group_any(&g, 2);
|
||||
let result = m.partial_group_any(&g, 2).unwrap();
|
||||
assert_eq!(result.get(0), true);
|
||||
assert_eq!(result.get(1), true);
|
||||
assert_eq!(result.get(2), true);
|
||||
@@ -164,7 +164,7 @@ fn bit_partial_group_presence_count() {
|
||||
&[false,true, true, false],
|
||||
]);
|
||||
let g = ColGroup::new("g", vec![0, 1, 2]);
|
||||
let result = m.partial_group_presence_count(&g, 1);
|
||||
let result = m.partial_group_presence_count(&g, 1).unwrap();
|
||||
assert_eq!(result.get(0), 2);
|
||||
assert_eq!(result.get(1), 2);
|
||||
assert_eq!(result.get(2), 2);
|
||||
@@ -181,7 +181,7 @@ fn bit_partial_group_any() {
|
||||
&[false, false, true],
|
||||
]);
|
||||
let g = ColGroup::new("g", vec![0, 1]);
|
||||
let result = m.partial_group_any(&g, 1);
|
||||
let result = m.partial_group_any(&g, 1).unwrap();
|
||||
assert_eq!(result.get(0), true);
|
||||
assert_eq!(result.get(1), false);
|
||||
assert_eq!(result.get(2), true);
|
||||
@@ -200,8 +200,8 @@ fn int_presence_count_additive_across_split() {
|
||||
let (_db, mb) = make_int_matrix(data_b);
|
||||
let g = ColGroup::new("g", vec![0, 1]);
|
||||
|
||||
let pa = ma.partial_group_presence_count(&g, 2);
|
||||
let pb = mb.partial_group_presence_count(&g, 2);
|
||||
let pa = ma.partial_group_presence_count(&g, 2).unwrap();
|
||||
let pb = mb.partial_group_presence_count(&g, 2).unwrap();
|
||||
|
||||
// Concatenate by adding (disjoint kmer ranges — here we just verify
|
||||
// individual results match the expected per-partition counts).
|
||||
|
||||
Reference in New Issue
Block a user