Push mtzqmmrlmzzx #34
@@ -18,6 +18,25 @@ src/obicompactvec/src/
|
|||||||
meta.rs matrix metadata
|
meta.rs matrix metadata
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph TD
|
||||||
|
traits --> memoryvec
|
||||||
|
traits --> memoryintvec
|
||||||
|
bitvec --> memoryvec
|
||||||
|
bitvec --> bitmatrix
|
||||||
|
format --> reader
|
||||||
|
format --> builder
|
||||||
|
reader --> intmatrix
|
||||||
|
builder --> intmatrix
|
||||||
|
builder --> memoryintvec
|
||||||
|
memoryvec --> traits
|
||||||
|
memoryintvec --> traits
|
||||||
|
layer_meta --> bitmatrix
|
||||||
|
layer_meta --> intmatrix
|
||||||
|
meta --> bitmatrix
|
||||||
|
meta --> intmatrix
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Compact int encoding
|
## Compact int encoding
|
||||||
@@ -34,6 +53,15 @@ All integer vectors use the same two-tier encoding regardless of storage backend
|
|||||||
- In `MemoryIntVec` and `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
|
- In `MemoryIntVec` and `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
|
||||||
- In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
|
- In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
slot --> P["primary[slot]: u8"]
|
||||||
|
P -->|"< 255"| V["value = byte (0–254)"]
|
||||||
|
P -->|"= 255 sentinel"| OV["overflow store"]
|
||||||
|
OV -->|"MemoryIntVec / Builder"| HM["HashMap#lt;usize, u32#gt;\nin RAM"]
|
||||||
|
OV -->|"PersistentCompactIntVec"| SA["sorted [(slot,value)] in mmap\n+ sparse L1 index"]
|
||||||
|
```
|
||||||
|
|
||||||
**Key property — sentinel 255 = +∞ on `u8`:**
|
**Key property — sentinel 255 = +∞ on `u8`:**
|
||||||
|
|
||||||
This is exploited throughout the binary operations. On a `u8` comparison, 255 behaves as positive infinity:
|
This is exploited throughout the binary operations. On a `u8` comparison, 255 behaves as positive infinity:
|
||||||
@@ -47,6 +75,66 @@ In practice, k (overflow count) ≪ n (total slots). Observed genomic data: ~0.0
|
|||||||
|
|
||||||
## Trait hierarchy
|
## Trait hierarchy
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
classDiagram
|
||||||
|
class BitSlice {
|
||||||
|
<<trait>>
|
||||||
|
+len() usize
|
||||||
|
+words() &[u64]
|
||||||
|
+get(slot) bool
|
||||||
|
+count_ones() u64
|
||||||
|
+count_zeros() u64
|
||||||
|
+partial_jaccard_dist(other) (u64,u64)
|
||||||
|
+jaccard_dist(other) f64
|
||||||
|
+hamming_dist(other) u64
|
||||||
|
}
|
||||||
|
class BitSliceMut {
|
||||||
|
<<trait>>
|
||||||
|
+words_mut() &mut [u64]
|
||||||
|
+set(slot, value)
|
||||||
|
+copy_from(src)
|
||||||
|
+and(other)
|
||||||
|
+or(other)
|
||||||
|
+xor(other)
|
||||||
|
+not()
|
||||||
|
}
|
||||||
|
class IntSlice {
|
||||||
|
<<trait>>
|
||||||
|
+len() usize
|
||||||
|
+get(slot) u32
|
||||||
|
+primary_bytes() &[u8]
|
||||||
|
+overflow_entries() Iterator
|
||||||
|
+iter() Iterator
|
||||||
|
+sum() u64
|
||||||
|
+count_nonzero() u64
|
||||||
|
+cmp_scalar(pred) MemoryBitVec
|
||||||
|
+lt/leq/gt/geq(t) MemoryBitVec
|
||||||
|
}
|
||||||
|
class IntSliceMut {
|
||||||
|
<<trait>>
|
||||||
|
+set(slot, value)
|
||||||
|
+primary_bytes_mut() &mut [u8]
|
||||||
|
+clear_overflow()
|
||||||
|
+inc/dec/add_at(slot)
|
||||||
|
+copy_from(src)
|
||||||
|
+min/max/add/diff(other)
|
||||||
|
+count_bits(bits)
|
||||||
|
}
|
||||||
|
class IntToBit {
|
||||||
|
<<trait blanket>>
|
||||||
|
+to_bitvec(threshold) MemoryBitVec
|
||||||
|
+to_presence() MemoryBitVec
|
||||||
|
}
|
||||||
|
class BitToInt {
|
||||||
|
<<trait blanket>>
|
||||||
|
+to_intvec() MemoryIntVec
|
||||||
|
}
|
||||||
|
BitSliceMut --|> BitSlice : extends
|
||||||
|
IntSliceMut --|> IntSlice : extends
|
||||||
|
IntToBit --|> IntSlice : blanket T:IntSlice
|
||||||
|
BitToInt --|> BitSlice : blanket T:BitSlice
|
||||||
|
```
|
||||||
|
|
||||||
### BitSlice (read-only)
|
### BitSlice (read-only)
|
||||||
|
|
||||||
Required: `len()`, `words() -> &[u64]`.
|
Required: `len()`, `words() -> &[u64]`.
|
||||||
@@ -148,17 +236,17 @@ The required methods expose the encoding internals. All provided methods are imp
|
|||||||
|
|
||||||
Exploits 255 = +∞: `u8::min(a, 255) = a` and `u8::min(255, b) = b`. Only the case where both sides are ≥ 255 needs actual overflow values.
|
Exploits 255 = +∞: `u8::min(a, 255) = a` and `u8::min(255, b) = b`. Only the case where both sides are ≥ 255 needs actual overflow values.
|
||||||
|
|
||||||
```
|
```mermaid
|
||||||
1. Snapshot self's overflow: self_ov: Vec<(slot, value)>
|
flowchart TD
|
||||||
Snapshot other's overflow: other_ov: HashMap<slot, value>
|
A["min(self, other)"] --> B["snapshot self_ov: Vec#lt;(slot,val)#gt;\nsnapshot other_ov: HashMap#lt;slot,val#gt;"]
|
||||||
2. clear_overflow() — removes all self's overflow entries
|
B --> C["clear_overflow()"]
|
||||||
3. Pass 1 (byte min, SIMD-vectorizable):
|
C --> D["Pass 1 — byte min, SIMD-vectorizable\nprimary[s] = min(self[s], other[s]) ∀s"]
|
||||||
for each byte pair: self.primary[s] = min(self.primary[s], other.primary[s])
|
D --> E["Pass 2 — both-overflow fixup\nfor (slot, self_val) in self_ov"]
|
||||||
4. Pass 2 (both-overflow fixup):
|
E --> F{"slot ∈ other_ov?"}
|
||||||
for (slot, self_val) in self_ov:
|
F -->|yes| G["set(slot, min(self_val, other_ov[slot]))"]
|
||||||
if slot in other_ov:
|
F -->|no| H["byte pass wrote other.primary < 255\nclear_overflow removed stale entry\nno action"]
|
||||||
self.set(slot, min(self_val, other_ov[slot]))
|
G --> I[done]
|
||||||
// else: byte pass already wrote other.primary[slot] < 255 — correct
|
H --> I
|
||||||
```
|
```
|
||||||
|
|
||||||
Overflow entries where only self was overflow are correctly handled: after `clear_overflow` + byte pass, `self.primary[slot] = min(255, other.primary[slot]) = other.primary[slot]` (which is < 255). No overflow entry — correct.
|
Overflow entries where only self was overflow are correctly handled: after `clear_overflow` + byte pass, `self.primary[slot] = min(255, other.primary[slot]) = other.primary[slot]` (which is < 255). No overflow entry — correct.
|
||||||
@@ -169,15 +257,13 @@ Exploits 255 = +∞: `u8::max(a, 255) = 255` → any slot where either side is o
|
|||||||
|
|
||||||
Solution: read and update self's original value at other's overflow slots *before* the byte pass overwrites them.
|
Solution: read and update self's original value at other's overflow slots *before* the byte pass overwrites them.
|
||||||
|
|
||||||
```
|
```mermaid
|
||||||
Pre-pass (O(k_other)):
|
flowchart TD
|
||||||
for (slot, other_val) in other.overflow_entries():
|
A["max(self, other)"] --> B["Pre-pass O(k_other)\nfor (slot, other_val) in other.overflow_entries()"]
|
||||||
self_val = self.get(slot) // reads original value
|
B --> C["self_val = self.get(slot)\nself.set(slot, max(self_val, other_val))"]
|
||||||
self.set(slot, max(self_val, other_val))
|
C --> D["Pass 1 — byte max, SIMD-vectorizable\nprimary[s] = max(self[s], other[s]) ∀s"]
|
||||||
|
D --> E["Overflow slots: max(255,255)=255\nprimary unchanged\noverflow entry from pre-pass preserved"]
|
||||||
Pass 1 (byte max, SIMD-vectorizable):
|
E --> F[done]
|
||||||
for each byte pair: self.primary[s] = max(self.primary[s], other.primary[s])
|
|
||||||
// Overflow slots: max(255, 255) = 255 — primary unchanged, overflow entry from pre-pass preserved
|
|
||||||
```
|
```
|
||||||
|
|
||||||
After the pre-pass, self.primary[slot] = 255 for all slots in other's overflow. The byte pass leaves those 255s intact. Self's own overflow slots not in other's overflow are also 255 in primary — byte max(255, b < 255) = 255, unchanged. Correct in all cases.
|
After the pre-pass, self.primary[slot] = 255 for all slots in other's overflow. The byte pass leaves those 255s intact. Self's own overflow slots not in other's overflow are also 255 in primary — byte max(255, b < 255) = 255, unchanged. Correct in all cases.
|
||||||
@@ -198,6 +284,18 @@ for s in 0..n:
|
|||||||
self.set(s, self.get(s) + other.get(s))
|
self.set(s, self.get(s) + other.get(s))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
A["add(self, other)"] --> B{"sb < 255\nAND ob < 255"}
|
||||||
|
B -->|"yes — hot path\nno HashMap"| C{"sb + ob < 255"}
|
||||||
|
C -->|yes| D["primary[s] = sum as u8\nsingle byte write"]
|
||||||
|
C -->|no| E["set(s, sum)\ncreates overflow entry"]
|
||||||
|
B -->|"no — ≥1 side is overflow"| F["self_val = self.get(s)\nother_val = other.get(s)\nset(s, self_val + other_val)"]
|
||||||
|
D --> Z[next slot]
|
||||||
|
E --> Z
|
||||||
|
F --> Z
|
||||||
|
```
|
||||||
|
|
||||||
The `+` on `u32` values is exact (no `saturating_add`). Overflow at u32 level panics in debug — not a real risk for kmer counts. The hot path (both < 255, sum < 255) is a single byte write with no HashMap access.
|
The `+` on `u32` values is exact (no `saturating_add`). Overflow at u32 level panics in debug — not a real risk for kmer counts. The hot path (both < 255, sum < 255) is a single byte write with no HashMap access.
|
||||||
|
|
||||||
**`diff` (saturating sub) algorithm:**
|
**`diff` (saturating sub) algorithm:**
|
||||||
@@ -211,16 +309,21 @@ The `+` on `u32` values is exact (no `saturating_add`). Overflow at u32 level pa
|
|||||||
| 255 | < 255 | `self.get(s) − ob` | self only |
|
| 255 | < 255 | `self.get(s) − ob` | self only |
|
||||||
| 255 | 255 | `self.get(s) − other.get(s)` | both |
|
| 255 | 255 | `self.get(s) − other.get(s)` | both |
|
||||||
|
|
||||||
```
|
```mermaid
|
||||||
for s in 0..n:
|
flowchart TD
|
||||||
sb = self.primary[s]
|
A["diff(self, other)"] --> B{"sb < 255\nself not overflow"}
|
||||||
ob = other.primary[s]
|
B -->|"yes — hot path O(n)"| C{"ob < 255"}
|
||||||
if sb < 255: // hot path: O(n), no HashMap
|
C -->|yes| D["primary[s] = sb.saturating_sub(ob)\nbyte write, no HashMap"]
|
||||||
self.primary[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 }
|
C -->|"no: other value ≥ 255 > sb"| E["primary[s] = 0"]
|
||||||
else: // cold path: O(k_self)
|
B -->|"no — cold path O(k_self)"| F["self_val = self.get(s)"]
|
||||||
self_val = self.get(s)
|
F --> G{"ob < 255"}
|
||||||
other_val = if ob < 255 { ob as u32 } else { other.get(s) }
|
G -->|yes| H["other_val = ob as u32"]
|
||||||
self.set(s, self_val.saturating_sub(other_val))
|
G -->|no| I["other_val = other.get(s)"]
|
||||||
|
H --> J["set(s, self_val.saturating_sub(other_val))"]
|
||||||
|
I --> J
|
||||||
|
D --> Z[next slot]
|
||||||
|
E --> Z
|
||||||
|
J --> Z
|
||||||
```
|
```
|
||||||
|
|
||||||
Overflow entries that drop below 255 (case sb=255, result < 255) are removed by `set()`. Overflow entries that remain ≥ 255 are updated. Correct in all four cases.
|
Overflow entries that drop below 255 (case sb=255, result < 255) are removed by `set()`. Overflow entries that remain ≥ 255 are updated. Correct in all four cases.
|
||||||
@@ -243,6 +346,70 @@ for (w_idx, word) in bits.words():
|
|||||||
|
|
||||||
## Concrete types
|
## Concrete types
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
classDiagram
|
||||||
|
class MemoryBitVec {
|
||||||
|
-words: Vec~u64~
|
||||||
|
-n: usize
|
||||||
|
+iter() BitIter
|
||||||
|
+ones(n) Self
|
||||||
|
+persist(path) Builder
|
||||||
|
}
|
||||||
|
class MemoryIntVec {
|
||||||
|
-primary: Vec~u8~
|
||||||
|
-overflow: HashMap~usize,u32~
|
||||||
|
-n: usize
|
||||||
|
+iter() MemoryIntIter
|
||||||
|
+filled(n, value) Self
|
||||||
|
+persist(path) Builder
|
||||||
|
}
|
||||||
|
class PersistentBitVec {
|
||||||
|
-mmap: Mmap
|
||||||
|
-n: usize
|
||||||
|
+iter() BitIter
|
||||||
|
+count_ones() u64
|
||||||
|
}
|
||||||
|
class PersistentBitVecBuilder {
|
||||||
|
-mmap: MmapMut
|
||||||
|
-n: usize
|
||||||
|
+close()
|
||||||
|
+build_from(src, path)
|
||||||
|
+build_from_counts(src, t, path)
|
||||||
|
}
|
||||||
|
class PersistentCompactIntVec {
|
||||||
|
-mmap: Mmap
|
||||||
|
-n: usize
|
||||||
|
-n_overflow: usize
|
||||||
|
-step: usize
|
||||||
|
-index: Vec~(usize,usize)~
|
||||||
|
+iter() Iter
|
||||||
|
+get(slot) u32
|
||||||
|
+sum() u64
|
||||||
|
}
|
||||||
|
class PersistentCompactIntVecBuilder {
|
||||||
|
-mmap: MmapMut
|
||||||
|
-n: usize
|
||||||
|
-overflow: HashMap~usize,u32~
|
||||||
|
+set(slot, value)
|
||||||
|
+close()
|
||||||
|
+build_from(src, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
MemoryBitVec ..|> BitSlice
|
||||||
|
MemoryBitVec ..|> BitSliceMut
|
||||||
|
PersistentBitVec ..|> BitSlice
|
||||||
|
PersistentBitVecBuilder ..|> BitSlice
|
||||||
|
PersistentBitVecBuilder ..|> BitSliceMut
|
||||||
|
MemoryIntVec ..|> IntSlice
|
||||||
|
MemoryIntVec ..|> IntSliceMut
|
||||||
|
PersistentCompactIntVec ..|> IntSlice
|
||||||
|
PersistentCompactIntVecBuilder ..|> IntSlice
|
||||||
|
PersistentCompactIntVecBuilder ..|> IntSliceMut
|
||||||
|
|
||||||
|
PersistentBitVecBuilder --> PersistentBitVec : close() then open()
|
||||||
|
PersistentCompactIntVecBuilder --> PersistentCompactIntVec : close() then open()
|
||||||
|
```
|
||||||
|
|
||||||
### Memory types
|
### Memory types
|
||||||
|
|
||||||
**`MemoryBitVec`**
|
**`MemoryBitVec`**
|
||||||
@@ -392,6 +559,39 @@ Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)` (inter, union), `par
|
|||||||
|
|
||||||
## Planned — Filter / Select API
|
## Planned — Filter / Select API
|
||||||
|
|
||||||
|
### Composition across layers and partitions
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph TD
|
||||||
|
subgraph Index
|
||||||
|
CG["ColGroup\nVec#lt;usize#gt; — valid everywhere"]
|
||||||
|
ACC["MemoryIntVec\nglobal accumulator"]
|
||||||
|
PRED["geq / leq / and / or\n→ MemoryBitVec mask"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Layer 1"
|
||||||
|
subgraph "Partition A kmers 0..k/2"
|
||||||
|
MA["Matrix A\npartial_group_presence_count"]
|
||||||
|
end
|
||||||
|
subgraph "Partition B kmers k/2..k"
|
||||||
|
MB["Matrix B\npartial_group_presence_count"]
|
||||||
|
end
|
||||||
|
CONCAT1["concat → MemoryIntVec[0..k]"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph "Layer 2"
|
||||||
|
CONCAT2["concat → MemoryIntVec[0..k]"]
|
||||||
|
end
|
||||||
|
|
||||||
|
CG -->|"same indices"| MA
|
||||||
|
CG -->|"same indices"| MB
|
||||||
|
MA -->|"kmer range A"| CONCAT1
|
||||||
|
MB -->|"kmer range B"| CONCAT1
|
||||||
|
CONCAT1 -->|"IntSliceMut::add"| ACC
|
||||||
|
CONCAT2 -->|"IntSliceMut::add"| ACC
|
||||||
|
ACC --> PRED
|
||||||
|
```
|
||||||
|
|
||||||
### ColGroup
|
### ColGroup
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
@@ -407,6 +607,25 @@ Defined **once at the index level** from column metadata. Valid in all matrices
|
|||||||
- **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges).
|
- **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges).
|
||||||
- **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.).
|
- **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.).
|
||||||
|
|
||||||
|
### Additivity rules
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
subgraph "Matrix level — returns MemoryIntVec"
|
||||||
|
PGP["partial_group_presence_count\npartial_group_sum\npartial_group_any → MemoryBitVec"]
|
||||||
|
end
|
||||||
|
subgraph "Index level — applies predicate"
|
||||||
|
GA["group_at_least(k)\n= accumulate.geq(k)"]
|
||||||
|
GALL["group_all\n= accumulate.geq(n_cols)"]
|
||||||
|
GANY["group_any\n= OR of partial_group_any"]
|
||||||
|
end
|
||||||
|
PGP -->|"concat across partitions\nadd across layers"| GA
|
||||||
|
PGP --> GALL
|
||||||
|
PGP --> GANY
|
||||||
|
```
|
||||||
|
|
||||||
|
Non-additive predicates (`group_all`, `group_at_least`) do **not** exist at matrix level — they require the global accumulated count.
|
||||||
|
|
||||||
### MatrixGroupOps (planned trait)
|
### MatrixGroupOps (planned trait)
|
||||||
|
|
||||||
Group operations live on the matrix and expose only **additive intermediates** (`MemoryIntVec`). Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
|
Group operations live on the matrix and expose only **additive intermediates** (`MemoryIntVec`). Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation.
|
||||||
|
|||||||
Reference in New Issue
Block a user