diff --git a/docmd/implementation/obicompactvec.md b/docmd/implementation/obicompactvec.md index 2926443..71dc939 100644 --- a/docmd/implementation/obicompactvec.md +++ b/docmd/implementation/obicompactvec.md @@ -11,8 +11,11 @@ src/obicompactvec/src/ reader.rs PersistentCompactIntVec (read-only) builder.rs PersistentCompactIntVecBuilder (read-write) memoryintvec.rs MemoryIntVec + tempintvec.rs TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed) + tempbitvec.rs TempBitVec, TempBitVecBuilder (temp-file-backed) bitmatrix.rs PersistentBitMatrix, PersistentBitMatrixBuilder intmatrix.rs PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder + colgroup.rs ColGroup, MatrixGroupOps trait format.rs file format constants, encode/decode helpers layer_meta.rs LayerMeta (column metadata) meta.rs matrix metadata @@ -24,13 +27,22 @@ graph TD traits --> memoryintvec bitvec --> memoryvec bitvec --> bitmatrix + bitvec --> tempbitvec format --> reader format --> builder reader --> intmatrix + reader --> tempintvec builder --> intmatrix builder --> memoryintvec + builder --> tempintvec memoryvec --> traits memoryintvec --> traits + tempintvec --> intmatrix + tempintvec --> bitmatrix + tempbitvec --> intmatrix + tempbitvec --> bitmatrix + colgroup --> intmatrix + colgroup --> bitmatrix layer_meta --> bitmatrix layer_meta --> intmatrix meta --> bitmatrix @@ -479,6 +491,8 @@ See `persistent_compact_int_vec.md` for file format and lifecycle. 
| `MemoryIntVec` | inherent merge-scan ✓ | `byte_sum` ✓ | `byte_count_nonzero` ✓ | | `PersistentCompactIntVecBuilder` | default (get-per-slot) | `byte_sum` on mmap ✓ | `byte_count_nonzero` on mmap ✓ | | `PersistentCompactIntVec` | inherent merge-scan Iter ✓ | inherent `sum()` ✓ | inherent `count_nonzero()` ✓ | +| `TempCompactIntVec` | delegates to inner `PersistentCompactIntVec` | delegates | delegates | +| `TempCompactIntVecBuilder` | default (get-per-slot) | delegates to builder | delegates to builder | | `PackedIntCol<'a>` | inherent PackedIntColIter ✓ | byte_sum ✓ | byte_count_nonzero ✓ | `PackedIntCol` is used internally by `PersistentCompactIntMatrix` (packed format) for column views. @@ -557,45 +571,68 @@ Required: `partial_jaccard() -> (Array2, Array2)` (inter, union), `par --- -## Planned — Filter / Select API +## Temp-file-backed types -### Composition across layers and partitions +`MemoryBitVec` and `MemoryIntVec` are reserved for truly transient intra-method intermediates (e.g. a single `cmp_scalar` result that lives for one loop iteration). **All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory. 
-```mermaid -graph TD - subgraph Index - CG["ColGroup\nVec<usize> — valid everywhere"] - ACC["MemoryIntVec\nglobal accumulator"] - PRED["geq / leq / and / or\n→ MemoryBitVec mask"] - end +### Lifecycle - subgraph "Layer 1" - subgraph "Partition A kmers 0..k/2" - MA["Matrix A\npartial_group_presence_count"] - end - subgraph "Partition B kmers k/2..k" - MB["Matrix B\npartial_group_presence_count"] - end - CONCAT1["concat → MemoryIntVec\[0..k\]"] - end - - subgraph "Layer 2" - CONCAT2["concat → MemoryIntVec\[0..k\]"] - end - - CG -->|"same indices"| MA - CG -->|"same indices"| MB - MA -->|"kmer range A"| CONCAT1 - MB -->|"kmer range B"| CONCAT1 - CONCAT1 -->|"IntSliceMut::add"| ACC - CONCAT2 -->|"IntSliceMut::add"| ACC - ACC --> PRED ``` +TempCompactIntVecBuilder::new(n) → writable mmap in TempDir + ↓ (set / add / count_bits / mask_with / …) + .freeze() → TempCompactIntVec (read-only mmap + TempDir) + ↓ (optional) + .make_persistent(path) → PersistentCompactIntVec (permanent file) +``` + +Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`. + +**Drop order**: in `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }`, Rust drops fields in declaration order — `vec` (mmap) is released before `_temp` (directory) is deleted. No explicit `drop()` needed. + +### TempCompactIntVec / TempCompactIntVecBuilder + +```rust +pub struct TempCompactIntVec { + vec: PersistentCompactIntVec, + _temp: TempDir, // dropped after vec +} + +pub(crate) struct TempCompactIntVecBuilder { + builder: PersistentCompactIntVecBuilder, + temp: TempDir, +} +``` + +`TempCompactIntVec` implements `IntSlice` (full delegation to inner `PersistentCompactIntVec`). +`TempCompactIntVecBuilder` implements `IntSlice` + `IntSliceMut` (delegation to inner builder). +`make_persistent(path)` copies the temp file to `path` and opens it as `PersistentCompactIntVec`. 
+ +### TempBitVec / TempBitVecBuilder + +```rust +pub struct TempBitVec { + vec: PersistentBitVec, + _temp: TempDir, +} + +pub(crate) struct TempBitVecBuilder { + builder: PersistentBitVecBuilder, + temp: TempDir, +} +``` + +`TempBitVec` implements `BitSlice`. +`TempBitVecBuilder` implements `BitSlice` + `BitSliceMut`. +`make_persistent(path)` copies the temp file to `path` and opens it as `PersistentBitVec`. + +--- + +## Filter / Select API ### ColGroup ```rust -struct ColGroup { name: String, indices: Vec<usize> } +pub struct ColGroup { pub name: String, pub indices: Vec<usize> } ``` Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions because column structure is identical across the entire hierarchy (same samples/genomes everywhere; only rows = kmer slots are partitioned). @@ -607,68 +644,75 @@ Defined **once at the index level** from column metadata. Valid in all matrices - **Across partitions**: kmer space is partitioned → partial results are **concatenated** (disjoint kmer ranges). - **Across layers**: same kmer space, different counts → partial results are **aggregated** (add, OR, etc.). -### Additivity rules +### MatrixGroupOps -```mermaid -flowchart LR - subgraph "Matrix level — returns MemoryIntVec" - PGP["partial_group_presence_count\npartial_group_sum\npartial_group_any → MemoryBitVec"] - end - subgraph "Index level — applies predicate" - GA["group_at_least(k)\n= accumulate.geq(k)"] - GALL["group_all\n= accumulate.geq(n_cols)"] - GANY["group_any\n= OR of partial_group_any"] - end - PGP -->|"concat across partitions\nadd across layers"| GA - PGP --> GALL - PGP --> GANY -``` - -Non-additive predicates (`group_all`, `group_at_least`) do **not** exist at matrix level — they require the global accumulated count. - -### MatrixGroupOps (planned trait) - -Group operations live on the matrix and expose only **additive intermediates** (`MemoryIntVec`). 
Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation. +Group operations live on the matrix and expose only **additive intermediates** backed by temp files. Predicates (final thresholds → `MemoryBitVec`) are applied at the index level after accumulation. ```rust -trait MatrixGroupOps { - // How many columns in group have value >= threshold, per kmer slot - fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec; +pub trait MatrixGroupOps { + fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) + -> io::Result<TempCompactIntVec>; - // Sum of values across group columns, per kmer slot - fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec; + fn partial_group_sum(&self, g: &ColGroup) + -> io::Result<TempCompactIntVec>; - // Kmer present (value >= threshold) in at least one column of group - fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec; + fn partial_group_any(&self, g: &ColGroup, threshold: u32) + -> io::Result<TempBitVec>; } ``` -Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — they are composed at the index level from the additive intermediates: +Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`. For bit matrices, `partial_group_sum` delegates to `partial_group_presence_count(g, 1)` since values are 0/1. + +**`partial_group_presence_count` — chunking for large groups:** + +When `g.indices.len() < 255`, per-slot counts are at most 254 and stay below the `255` overflow sentinel of the primary `u8` array — fast path: accumulate directly into `primary_bytes_mut()` using `inc_primary_bits`, then `freeze()`. No overflow map needed. + +When `g.indices.len() ≥ 255`, process in chunks of 254 columns — each chunk's counts stay below the `255` sentinel — then add chunks into a running `TempCompactIntVecBuilder` accumulator via `IntSliceMut::add`. This keeps peak memory proportional to one partition, not the number of columns × partitions. 
+ +``` +fast path (< 255 cols): + builder = TempCompactIntVecBuilder::new(n) + for c in group: + mask = col_view(c).cmp_scalar(|v| v >= threshold) // MemoryBitVec + inc_primary_bits(primary_bytes_mut, mask) // u8 safe + builder.freeze() + +slow path (≥ 255 cols): + result = TempCompactIntVecBuilder::new(n) + for chunk in group.chunks(254): + chunk_builder = TempCompactIntVecBuilder::new(n) + inc_primary_bits(chunk_builder, …) + chunk_frozen = chunk_builder.freeze() + IntSliceMut::add(&mut result, &chunk_frozen) + result.freeze() +``` + +Non-additive predicates (`group_all`, `group_at_least(k)`) are **not** on the matrix — they are composed at the index level: ``` // "present in >= 2 ingroup columns with count >= 3, absent from all outgroup" -let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)).sum(); -let in_mask = presence.geq(2); // MemoryBitVec +let presence = layers.map(|l| l.partial_group_presence_count(&ingroup, 3)?).add_all()?; +let in_mask = presence.geq(2); -let out_sum = layers.map(|l| l.partial_group_sum(&outgroup)).sum(); -let out_mask = out_sum.leq(0); // MemoryBitVec +let out_sum = layers.map(|l| l.partial_group_sum(&outgroup)?).add_all()?; +let out_mask = out_sum.leq(0); -let mask = in_mask.and(&out_mask); // BitSliceMut::and — O(n/64) +let mask = in_mask & &out_mask; // BitSliceMut::and — O(n/64) ``` -### mask_with (planned IntSliceMut method) +### mask_with (IntSliceMut) -Apply a bit mask to a count vector: zero slots where the mask bit is 0. Iterates only zero bits — O(n_zeros), O(1) when mask is all-ones. +Provided method on `IntSliceMut`. Zeros every slot where the corresponding mask bit is 0. Visits only zero bits; all-ones words are skipped with a single comparison, so the cost is O(n/64 + n_zeros). 
``` for (w_idx, word) in mask.words(): - if word == u64::MAX: continue + if word == u64::MAX: continue // skip all-ones words zeros = !word while zeros != 0: bit = trailing_zeros(zeros) s = w_idx * 64 + bit - self.set(s, 0) + if primary[s] != 0: self.set(s, 0) // clears overflow entry too zeros &= zeros − 1 ``` -This is the terminal operation for both Filter (zero non-selected kmer slots in a count matrix) and Select (positional selection without MPHF). +Terminal operation for Filter (retain only selected kmer slots in a count vector) and Select (positional selection without MPHF). diff --git a/obicompactvector_reflexion.md b/obicompactvector_reflexion.md new file mode 100644 index 0000000..a8e2356 --- /dev/null +++ b/obicompactvector_reflexion.md @@ -0,0 +1,44 @@ +# La crate obicompactvector + +Le code actuel est ce qu'il est. Ce n'est pas la vérité absolue, c'est un premier effort d'implémentation, rien de plus. Ci-dessous je vais décrire les objectifs et la structure qui devrait être. LA VÉRITÉ À ATTEINDRE. + +La crate fournit des représentations les plus compactes possible en mémoire de matrices de comptage ou de présence de k-mers dans des génomes. Chaque colonne représente un génome, chaque ligne un k-mer. Une matrice est une collection de vecteurs où chacun des vecteurs est une colonne de la matrice. + +Les matrices comme les colonnes ont vocation à être persistantes. Les données sont stockées dans des fichiers binaires. Les données sont mappées en mémoire via `mmap`. + +Les structures sont par essence immutables. Il existe des représentations mutables des colonnes qui permettent leur construction. À la fin de leur construction, les colonnes sont fermées, ce qui les rend immutables. + +Les matrices peuvent être représentées de deux façons : + - via un répertoire contenant une collection de fichiers colonnes + - via un fichier matrix qui est la concaténation de plusieurs fichiers colonnes. 
+ + +## Les matrices de comptage + +Ce sont des matrices d'entiers positifs, la plupart du temps de petites valeurs (inférieures à 255). On assume que toutes les valeurs sont représentables sur un `u32`. + +## Les matrices de présence + +Ce sont des matrices de booléens représentées comme des champs de bits. + +Il existe une forme implicite des vecteurs de présence, qui n'est représentée par aucun fichier et pour laquelle toutes les valeurs sont vraies. + +## Représentation légère des colonnes + +Les colonnes, qu'elles soient unitaires (fichier colonne) ou partie d'un fichier composite matrice, peuvent être représentées par un objet léger donnant accès à leurs valeurs ainsi qu'à la longueur du vecteur. Toutes les méthodes de calcul doivent uniquement travailler à partir de ces représentations légères unifiées des colonnes. + +### Représentation légère d'un vecteur de présence + +Le vecteur est représenté par + - un champ de bits encodé comme un [u64] + - un usize encodant la longueur du champ de bits + +### Représentation légère d'un vecteur de comptage + +Le vecteur est représenté par + - un vecteur [u8] encodant directement les valeurs faibles du vecteur [0,255[ + La valeur 255 est une valeur sentinelle indiquant que la valeur vraie est >=255 + et se trouve dans une structure d'overflow + - un itérateur de (usize,u32) listant les valeurs d'overflow correspondant aux valeurs + sentinelles (255) du [u8] + - un usize encodant la longueur du vecteur diff --git a/src/obicompactvec/Cargo.toml b/src/obicompactvec/Cargo.toml index ddb1e40..777b606 100644 --- a/src/obicompactvec/Cargo.toml +++ b/src/obicompactvec/Cargo.toml @@ -7,6 +7,6 @@ edition = "2024" memmap2 = "0.9" ndarray = "0.16" rayon = "1" +tempfile = "3" [dev-dependencies] -tempfile = "3" diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index a51058e..8039e29 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -8,8 +8,9 @@ use rayon::prelude::*; 
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits}; -use crate::memoryintvec::MemoryIntVec; use crate::memoryvec::MemoryBitVec; +use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; +use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; use crate::traits::{BitSlice, BitSliceMut, IntSliceMut}; use crate::layer_meta::LayerMeta; use crate::meta::MatrixMeta; @@ -452,39 +453,49 @@ impl PersistentBitMatrixBuilder { // ── MatrixGroupOps ──────────────────────────────────────────────────────────── impl MatrixGroupOps for PersistentBitMatrix { - fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> MemoryIntVec { + fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result { // Bit matrices store 0/1 — threshold is structurally always 1. - // Materialize each column to a MemoryBitVec and accumulate directly. let n = self.n(); if g.indices.len() < 255 { - let mut primary = vec![0u8; n]; - for &c in &g.indices { - let mbv = MemoryBitVec::from(&self.col_view(c)); - inc_primary_bits(&mut primary, &mbv); + let mut builder = TempCompactIntVecBuilder::new(n)?; + { + let primary = builder.primary_bytes_mut(); + for &c in &g.indices { + let mbv = MemoryBitVec::from(&self.col_view(c)); + inc_primary_bits(primary, &mbv); + } } - MemoryIntVec::from_primary(primary) + builder.freeze() } else { - let mut result = MemoryIntVec::new(n); - for &c in &g.indices { - let mbv = MemoryBitVec::from(&self.col_view(c)); - result.count_bits(&mbv); + let mut result = TempCompactIntVecBuilder::new(n)?; + for chunk in g.indices.chunks(254) { + let mut chunk_builder = TempCompactIntVecBuilder::new(n)?; + { + let primary = chunk_builder.primary_bytes_mut(); + for &c in chunk { + let mbv = MemoryBitVec::from(&self.col_view(c)); + inc_primary_bits(primary, &mbv); + } + } + let chunk_frozen = chunk_builder.freeze()?; + IntSliceMut::add(&mut result, &chunk_frozen); } - 
result + result.freeze() } } - fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec { + fn partial_group_sum(&self, g: &ColGroup) -> io::Result { // For bit matrices, sum = count of 1-bits — identical to presence_count. self.partial_group_presence_count(g, 1) } - fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> MemoryBitVec { + fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result { let n = self.n(); - let mut result = MemoryBitVec::new(n); + let mut result = TempBitVecBuilder::new(n)?; for &c in &g.indices { result.or(&self.col_view(c)); } - result + result.freeze() } } diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs index 3e622d9..271b5d8 100644 --- a/src/obicompactvec/src/builder.rs +++ b/src/obicompactvec/src/builder.rs @@ -122,19 +122,19 @@ impl PersistentCompactIntVecBuilder { /// Flush the primary mmap, then write sorted overflow data + index and fix the header. pub fn close(self) -> io::Result<()> { self.mmap.flush()?; - let Self { - path, - mmap, - n, - overflow, - } = self; + let Self { path, mmap, n, overflow } = self; drop(mmap); - let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect(); entries.sort_unstable_by_key(|&(slot, _)| slot); - finalize_pciv(&path, n, &entries) } + + /// Close and reopen as a read-only [`PersistentCompactIntVec`]. 
+ pub fn finish(self) -> io::Result { + let path = self.path.clone(); + self.close()?; + PersistentCompactIntVec::open(&path) + } } // ── IntSlice / IntSliceMut impls ────────────────────────────────────────────── diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs index df4576f..9fe1659 100644 --- a/src/obicompactvec/src/colgroup.rs +++ b/src/obicompactvec/src/colgroup.rs @@ -1,5 +1,8 @@ -use crate::memoryintvec::MemoryIntVec; +use std::io; + use crate::memoryvec::MemoryBitVec; +use crate::tempbitvec::TempBitVec; +use crate::tempintvec::TempCompactIntVec; use crate::traits::BitSlice; // ── ColGroup ────────────────────────────────────────────────────────────────── @@ -30,13 +33,13 @@ impl ColGroup { /// — they are derived at the index level from these intermediates. pub trait MatrixGroupOps { /// Per-slot count of group columns whose value ≥ `threshold`. - fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec; + fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result; /// Per-slot sum of values across all group columns. - fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec; + fn partial_group_sum(&self, g: &ColGroup) -> io::Result; /// Per-slot OR: true if any group column has value ≥ `threshold`. 
- fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec; + fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result; } // ── Internal helper ─────────────────────────────────────────────────────────── diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index 172d7b0..fc64c48 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -12,7 +12,8 @@ use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix}; use crate::builder::PersistentCompactIntVecBuilder; use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits}; use crate::memoryintvec::MemoryIntVec; -use crate::memoryvec::MemoryBitVec; +use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; +use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry}; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; @@ -630,45 +631,55 @@ impl PersistentCompactIntMatrixBuilder { // ── MatrixGroupOps ──────────────────────────────────────────────────────────── impl MatrixGroupOps for PersistentCompactIntMatrix { - fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec { + fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result { let n = self.n(); if g.indices.len() < 255 { - // Fast path: counts fit in u8 — accumulate directly into raw bytes, - // no overflow map involved. - let mut primary = vec![0u8; n]; - for &c in &g.indices { - let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); - inc_primary_bits(&mut primary, &mask); + // Fast path: counts fit in u8 — accumulate directly into raw bytes. 
+ let mut builder = TempCompactIntVecBuilder::new(n)?; + { + let primary = builder.primary_bytes_mut(); + for &c in &g.indices { + let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); + inc_primary_bits(primary, &mask); + } } - MemoryIntVec::from_primary(primary) + builder.freeze() } else { - // Slow path (rare): use IntSliceMut::count_bits which handles overflow. - let mut result = MemoryIntVec::new(n); - for &c in &g.indices { - let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); - result.count_bits(&mask); + // Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks. + let mut result = TempCompactIntVecBuilder::new(n)?; + for chunk in g.indices.chunks(254) { + let mut chunk_builder = TempCompactIntVecBuilder::new(n)?; + { + let primary = chunk_builder.primary_bytes_mut(); + for &c in chunk { + let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); + inc_primary_bits(primary, &mask); + } + } + let chunk_frozen = chunk_builder.freeze()?; + IntSliceMut::add(&mut result, &chunk_frozen); } - result + result.freeze() } } - fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec { + fn partial_group_sum(&self, g: &ColGroup) -> io::Result { let n = self.n(); - let mut result = MemoryIntVec::new(n); + let mut result = TempCompactIntVecBuilder::new(n)?; for &c in &g.indices { let view = self.col_view(c); IntSliceMut::add(&mut result, &view); } - result + result.freeze() } - fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec { + fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result { let n = self.n(); - let mut result = MemoryBitVec::new(n); + let mut result = TempBitVecBuilder::new(n)?; for &c in &g.indices { let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); result.or(&mask); } - result + result.freeze() } } diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index c5f3705..6625ab6 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -9,6 
+9,8 @@ mod memoryintvec; mod memoryvec; mod meta; mod reader; +mod tempbitvec; +mod tempintvec; pub mod traits; pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder}; @@ -20,6 +22,8 @@ pub use layer_meta::LayerMeta; pub use memoryintvec::{MemoryIntIter, MemoryIntVec}; pub use memoryvec::MemoryBitVec; pub use reader::PersistentCompactIntVec; +pub use tempbitvec::TempBitVec; +pub use tempintvec::TempCompactIntVec; pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit}; #[cfg(test)] diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs new file mode 100644 index 0000000..3945075 --- /dev/null +++ b/src/obicompactvec/src/tempbitvec.rs @@ -0,0 +1,69 @@ +use std::io; +use std::path::Path; + +use tempfile::TempDir; + +use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; +use crate::traits::{BitSlice, BitSliceMut}; + +// ── TempBitVec — frozen read-only, auto-deleted on drop ────────────────────── + +/// A bit vector backed by a temporary file. +/// Implements [`BitSlice`]; the file is deleted when this value is dropped. +/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file. +pub struct TempBitVec { + vec: PersistentBitVec, + // Dropped after `vec` (field order), so the mmap is released before the + // temp directory is deleted. + _temp: TempDir, +} + +impl TempBitVec { + /// Copy to a permanent file and open as a [`PersistentBitVec`]. 
+ pub fn make_persistent(&self, path: &Path) -> io::Result { + std::fs::copy(self.vec.path(), path)?; + PersistentBitVec::open(path) + } + + pub fn len(&self) -> usize { self.vec.len() } + pub fn is_empty(&self) -> bool { self.vec.is_empty() } +} + +impl BitSlice for TempBitVec { + fn len(&self) -> usize { self.vec.len() } + fn words(&self) -> &[u64] { self.vec.words() } +} + +// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ──────────────── + +/// Writable builder for a [`TempBitVec`]. `pub(crate)` — callers receive +/// only the frozen result via [`freeze`](Self::freeze). +pub(crate) struct TempBitVecBuilder { + builder: PersistentBitVecBuilder, + temp: TempDir, +} + +impl TempBitVecBuilder { + pub(crate) fn new(n: usize) -> io::Result { + let temp = TempDir::new()?; + let path = temp.path().join("data.pbiv"); + let builder = PersistentBitVecBuilder::new(n, &path)?; + Ok(Self { builder, temp }) + } + + /// Finalize writes and return a frozen, read-only [`TempBitVec`]. + pub(crate) fn freeze(self) -> io::Result { + let Self { builder, temp } = self; + let vec = builder.finish()?; + Ok(TempBitVec { vec, _temp: temp }) + } +} + +impl BitSlice for TempBitVecBuilder { + fn len(&self) -> usize { self.builder.len() } + fn words(&self) -> &[u64] { self.builder.words() } +} + +impl BitSliceMut for TempBitVecBuilder { + fn words_mut(&mut self) -> &mut [u64] { self.builder.words_mut() } +} diff --git a/src/obicompactvec/src/tempintvec.rs b/src/obicompactvec/src/tempintvec.rs new file mode 100644 index 0000000..ced3cef --- /dev/null +++ b/src/obicompactvec/src/tempintvec.rs @@ -0,0 +1,82 @@ +use std::io; +use std::path::Path; + +use tempfile::TempDir; + +use crate::builder::PersistentCompactIntVecBuilder; +use crate::reader::PersistentCompactIntVec; +use crate::traits::{IntSlice, IntSliceMut}; + +// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ──────────────── + +/// A compact int vector backed by a temporary file. 
+/// Implements [`IntSlice`]; the file is deleted when this value is dropped. +/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file. +pub struct TempCompactIntVec { + vec: PersistentCompactIntVec, + // Dropped after `vec` (field order), so the mmap is released before the + // temp directory is deleted. + _temp: TempDir, +} + +impl TempCompactIntVec { + /// Copy to a permanent file and open as a [`PersistentCompactIntVec`]. + pub fn make_persistent(&self, path: &Path) -> io::Result { + std::fs::copy(self.vec.path(), path)?; + PersistentCompactIntVec::open(path) + } + + pub fn len(&self) -> usize { self.vec.len() } + pub fn is_empty(&self) -> bool { self.vec.is_empty() } +} + +impl IntSlice for TempCompactIntVec { + fn len(&self) -> usize { self.vec.len() } + fn get(&self, slot: usize) -> u32 { self.vec.get(slot) } + fn primary_bytes(&self) -> &[u8] { self.vec.primary_bytes() } + fn overflow_entries(&self) -> impl Iterator + '_ { + self.vec.overflow_entries() + } + fn sum(&self) -> u64 { self.vec.sum() } + fn count_nonzero(&self) -> u64 { self.vec.count_nonzero() } +} + +// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ── + +/// Writable builder for a [`TempCompactIntVec`]. `pub(crate)` — callers +/// receive only the frozen result via [`freeze`](Self::freeze). +pub(crate) struct TempCompactIntVecBuilder { + builder: PersistentCompactIntVecBuilder, + temp: TempDir, +} + +impl TempCompactIntVecBuilder { + pub(crate) fn new(n: usize) -> io::Result { + let temp = TempDir::new()?; + let path = temp.path().join("data.pciv"); + let builder = PersistentCompactIntVecBuilder::new(n, &path)?; + Ok(Self { builder, temp }) + } + + /// Finalize writes and return a frozen, read-only [`TempCompactIntVec`]. 
+ pub(crate) fn freeze(self) -> io::Result { + let Self { builder, temp } = self; + let vec = builder.finish()?; + Ok(TempCompactIntVec { vec, _temp: temp }) + } +} + +impl IntSlice for TempCompactIntVecBuilder { + fn len(&self) -> usize { self.builder.len() } + fn get(&self, slot: usize) -> u32 { self.builder.get(slot) } + fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() } + fn overflow_entries(&self) -> impl Iterator + '_ { + self.builder.overflow_entries() + } +} + +impl IntSliceMut for TempCompactIntVecBuilder { + fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); } + fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() } + fn clear_overflow(&mut self) { self.builder.clear_overflow(); } +} diff --git a/src/obicompactvec/src/tests/colgroup.rs b/src/obicompactvec/src/tests/colgroup.rs index 813d4fa..388508d 100644 --- a/src/obicompactvec/src/tests/colgroup.rs +++ b/src/obicompactvec/src/tests/colgroup.rs @@ -5,7 +5,7 @@ use crate::{ PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, }; -use crate::traits::{BitSliceMut, IntSlice, IntSliceMut}; +use crate::traits::{BitSlice, BitSliceMut, IntSlice, IntSliceMut}; use crate::{MemoryBitVec, MemoryIntVec}; // ── helpers ─────────────────────────────────────────────────────────────────── @@ -47,7 +47,7 @@ fn int_partial_group_sum_basic() { // group {0,2}: sum = [101, 2, 8] let (_d, m) = make_int_matrix(&[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]]); let g = ColGroup::new("g", vec![0, 2]); - let result = m.partial_group_sum(&g); + let result = m.partial_group_sum(&g).unwrap(); assert_eq!(result.get(0), 101); assert_eq!(result.get(1), 2); assert_eq!(result.get(2), 8); @@ -58,7 +58,7 @@ fn int_partial_group_sum_with_overflow() { // col0=[300,0], col1=[200,400]: group {0,1}: sum=[500, 400] let (_d, m) = make_int_matrix(&[&[300, 0], &[200, 400]]); let g = ColGroup::new("g", vec![0, 1]); - let result = 
m.partial_group_sum(&g); + let result = m.partial_group_sum(&g).unwrap(); assert_eq!(result.get(0), 500); assert_eq!(result.get(1), 400); assert_eq!(result.sum(), 900); @@ -73,7 +73,7 @@ fn int_partial_group_presence_count() { // group {0,1,2}: counts = [2, 1, 1, 2] let (_d, m) = make_int_matrix(&[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]]); let g = ColGroup::new("g", vec![0, 1, 2]); - let result = m.partial_group_presence_count(&g, 2); + let result = m.partial_group_presence_count(&g, 2).unwrap(); assert_eq!(result.get(0), 2); assert_eq!(result.get(1), 1); assert_eq!(result.get(2), 1); @@ -87,7 +87,7 @@ fn int_partial_group_presence_count_with_overflow() { // group {0,1,2}: counts = [1, 1, 3] let (_d, m) = make_int_matrix(&[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]]); let g = ColGroup::new("g", vec![0, 1, 2]); - let result = m.partial_group_presence_count(&g, 5); + let result = m.partial_group_presence_count(&g, 5).unwrap(); assert_eq!(result.get(0), 1); assert_eq!(result.get(1), 1); assert_eq!(result.get(2), 3); @@ -102,7 +102,7 @@ fn int_partial_group_any() { // group {0,1,2}: any = [T, T, T, F] let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]); let g = ColGroup::new("g", vec![0, 1, 2]); - let result = m.partial_group_any(&g, 2); + let result = m.partial_group_any(&g, 2).unwrap(); assert_eq!(result.get(0), true); assert_eq!(result.get(1), true); assert_eq!(result.get(2), true); @@ -164,7 +164,7 @@ fn bit_partial_group_presence_count() { &[false,true, true, false], ]); let g = ColGroup::new("g", vec![0, 1, 2]); - let result = m.partial_group_presence_count(&g, 1); + let result = m.partial_group_presence_count(&g, 1).unwrap(); assert_eq!(result.get(0), 2); assert_eq!(result.get(1), 2); assert_eq!(result.get(2), 2); @@ -181,7 +181,7 @@ fn bit_partial_group_any() { &[false, false, true], ]); let g = ColGroup::new("g", vec![0, 1]); - let result = m.partial_group_any(&g, 1); + let result = m.partial_group_any(&g, 1).unwrap(); 
assert_eq!(result.get(0), true); assert_eq!(result.get(1), false); assert_eq!(result.get(2), true); @@ -200,8 +200,8 @@ fn int_presence_count_additive_across_split() { let (_db, mb) = make_int_matrix(data_b); let g = ColGroup::new("g", vec![0, 1]); - let pa = ma.partial_group_presence_count(&g, 2); - let pb = mb.partial_group_presence_count(&g, 2); + let pa = ma.partial_group_presence_count(&g, 2).unwrap(); + let pb = mb.partial_group_presence_count(&g, 2).unwrap(); // Concatenate by adding (disjoint kmer ranges — here we just verify // individual results match the expected per-partition counts).