diff --git a/docmd/implementation/obilayeredmap.md b/docmd/implementation/obilayeredmap.md index 9a5ccbf..5568608 100644 --- a/docmd/implementation/obilayeredmap.md +++ b/docmd/implementation/obilayeredmap.md @@ -10,26 +10,26 @@ The MPHF + evidence infrastructure is fixed for all modes. The **payload** — data associated with each slot — is orthogonal and varies by mode. -| Mode | Description | Payload type | File | +| Mode | Description | Payload type | Storage | |---|---|---|---| | 1. Set | membership test only | `()` | — | -| 2. Set with count | occurrences per kmer per sample | `PersistentCompactIntVec` | `counts.pciv` | -| 3. Presence/absence matrix | which genomes contain each kmer | `PersistentBitVec` per genome | `presence_N.pbiv` | -| 4. Count matrix | occurrences per kmer per genome | `PersistentCompactIntVec` per genome | `counts_N.pciv` | +| 2. Count | occurrences per kmer per sample | `PersistentCompactIntMatrix` | `counts/` directory | +| 3. Presence/absence matrix | which genomes contain each kmer | `PersistentBitMatrix` | `presence/` directory | +| 4. Count matrix | occurrences per kmer per genome | `PersistentCompactIntMatrix` | `counts/` directory | -Both `PersistentCompactIntVec` and `PersistentBitVec` come from the `obicompactvec` crate. Modes 3 and 4 are not yet implemented; the per-genome multi-file layout and query API remain to be designed. +Both `PersistentCompactIntMatrix` and `PersistentBitMatrix` come from the `obicompactvec` crate. Mode 3 has a build path (`Layer::::build_presence`); mode 4 is not yet implemented. -### Payload for mode 2: PersistentCompactIntVec +### Payload for modes 2/4: PersistentCompactIntMatrix -`PersistentCompactIntVec` (PCIV) stores one `u32` count per MPHF slot in a single mmap'd `.pciv` file. Its encoding: a primary `u8` array (value 255 = overflow sentinel) backed by a sorted overflow section of `(slot: u64, value: u32)` entries and a sparse L1-fitting index for fast binary search. This handles the geometric count distribution efficiently — most values fit in 1 byte, overflow entries are rare. +`PersistentCompactIntMatrix` is a column-major matrix stored in a directory: one `col_NNNNNN.pciv` file per column, plus a `meta.json`. Each column is a `PersistentCompactIntVec` — a mmap'd PCIV file with a `u8` primary array (255 = overflow sentinel), a sorted overflow section of `(slot: u64, value: u32)` entries, and a sparse L1-fitting index. -Capacity: 0 to u32::MAX per slot. No separate decision needed on bit-width: PCIV adapts to the data. +Mode 2 writes 1 column per layer (one sample). Mode 4 writes G columns (one per genome). `read(slot)` returns `Box<[u32]>` — the full row across all columns. -### Payload for mode 3/4: PersistentBitVec / PersistentCompactIntVec +### Payload for mode 3: PersistentBitMatrix -`PersistentBitVec` (PBIV) stores one bit per MPHF slot in a mmap'd `.pbiv` file with u64 word-level bulk operations (AND, OR, XOR, NOT, POPCNT, Jaccard, Hamming). One PBIV per genome gives a column-major presence/absence matrix, making per-genome set operations cache-friendly. +`PersistentBitMatrix` is a column-major bit matrix stored in a directory: one `col_NNNNNN.pbiv` per genome, plus `meta.json`. Each column is a `PersistentBitVec` — a mmap'd PBIV file with u64 word-level bulk operations (AND, OR, XOR, NOT, POPCNT, Jaccard, Hamming). `read(slot)` returns `Box<[bool]>` — the presence vector across all genomes. -Mode 4 replaces PBIV with PCIV per genome. Multi-file layout and query API are not yet designed. +Column-major layout makes per-genome set operations cache-friendly; the full row is assembled on demand at query time. --- @@ -57,14 +57,15 @@ pub struct Hit { } ``` -`LayerData` covers the **read path only** (`open` + `read`). The write path (build) is intentionally not in the trait — build signatures differ between modes (mode 1 takes no extra argument, mode 2 takes a `count_of` closure) and forcing this into a trait would require an associated `Context` type with no benefit over specialized `impl` blocks. +`LayerData` covers the **read path only** (`open` + `read`). The write path (build) is intentionally not in the trait — build signatures differ between modes and forcing this into a trait would require an associated `Context` type with no benefit over specialized `impl` blocks. Implemented concrete types: | Type | `Item` | Description | |---|---|---| | `()` | `()` | mode 1 — membership only | -| `PersistentCompactIntVec` | `u32` | mode 2 — per-slot count | +| `PersistentCompactIntMatrix` | `Box<[u32]>` | modes 2/4 — one count per column | +| `PersistentBitMatrix` | `Box<[bool]>` | mode 3 — one presence bit per column | `LayeredMap` mirrors the same parameterisation: `LayeredMap`. @@ -81,8 +82,14 @@ index_root/ ← LayeredMap (collection) unitigs.bin unitigs.bin.idx evidence.bin - counts.pciv [mode 2 only] - presence_N.pbiv [mode 3/4, one per genome — not yet implemented] + counts/ [modes 2/4] + meta.json {"n": N, "n_cols": 1} + col_000000.pciv + presence/ [mode 3] + meta.json {"n": N, "n_cols": G} + col_000000.pbiv + col_000001.pbiv + ... layer_1/ ... part_00001/ @@ -106,7 +113,8 @@ layer_N/ unitigs.bin — packed 2-bit nucleotide sequences (obiskio binary format) unitigs.bin.idx — UIDX index: n_unitigs, n_kmers, seqls[], packed_offsets[] evidence.bin — u32 per MPHF slot: (unitig_id: 25 | rank: 7) - counts.pciv — [mode 2] PersistentCompactIntVec: one u32 per slot + counts/ — [modes 2/4] PersistentCompactIntMatrix + presence/ — [mode 3] PersistentBitMatrix ``` `unitigs.bin` is the packed-2-bit sequence file produced by `obiskio::UnitigFileWriter`. The companion `.idx` file stores: magic `UIDX`, `n_unitigs: u32`, `n_kmers: u64`, `seqls: [u8; n_unitigs]` (kmer count − 1 per chunk), and `packed_offsets: [u32; n_unitigs + 1]` (byte offsets into `unitigs.bin`, sentinel-terminated). This gives O(1) random access to any unitig and the total kmer count without scanning the sequence file. @@ -165,13 +173,24 @@ impl Layer<()> { pub fn build(out_dir: &Path) -> OLMResult } -// mode 2 -impl Layer { +// modes 2/4 +impl Layer { pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult pub fn build_from_map(out_dir: &Path, counts: &HashMap) -> OLMResult } + +// mode 3 +impl Layer { + pub fn build_presence( + out_dir: &Path, + n_genomes: usize, + present_in: impl Fn(CanonicalKmer, usize) -> bool, + ) -> OLMResult +} ``` +Mode 2 creates a `PersistentCompactIntMatrixBuilder` with 1 column and fills it via `build_second_pass`. Mode 3 creates a `PersistentBitMatrixBuilder` with `n_genomes` columns and fills all columns in a single pass. + Any duplicate slot or out-of-bounds index detected during `build_second_pass` returns `OLMError::Mphf`. `new_from_par_iter` avoids materialising all keys as `Vec`. --- @@ -196,6 +215,8 @@ fn query(kmer) -> Option<(usize, Hit)>: Expected probe depth: 1 for kmers in layer 0, increasing for later layers. +For mode 2, `hit.data` is `Box<[u32]>` with 1 element; `hit.data[0]` is the count. For mode 3, `hit.data` is `Box<[bool]>` with G elements, one per genome. + --- ## Add-layer algorithm @@ -221,15 +242,13 @@ Each partition's new layer is built independently; the operation is fully parall | `epserde 0.8` | zero-copy serialisation of MPHF | | `memmap2` | mmap of layer files | | `obiskio` | unitig file writer/reader | -| `obicompactvec` | payload types: `PersistentCompactIntVec`, `PersistentBitVec` | +| `obicompactvec` | payload types: `PersistentCompactIntMatrix`, `PersistentBitMatrix` | --- ## Open questions -- **Mode 3/4 multi-file layout**: one PBIV/PCIV per genome per layer means O(n_layers × n_genomes) files. Directory layout, open strategy, and query API are not yet designed. -- **Mode 4 scale**: count matrix (n_kmers × n_genomes × bytes_per_count) reaches hundreds of GB for large collections. A sparse representation may be required; access pattern and density threshold are not yet defined. -- **Presence matrix layout**: column-major (one PBIV per genome) favours per-genome operations; row-major favours per-kmer queries. Dominant access pattern not yet characterised. +- **Mode 4**: count matrix (n_kmers × n_genomes × bytes_per_count) is structurally identical to mode 3 but uses `PersistentCompactIntMatrix` with G columns. Build API not yet implemented. Scale concern: hundreds of GB for large collections — a sparse representation may be required at high genome counts. - **Layer merge**: merging two `LayeredMap` instances into a single-layer index requires full rebuild. Define API and cost model. - **Canonical kmer orientation**: evidence stores canonical kmer; strand recovery requires one 64-bit revcomp comparison at query time. - **`try_new_from_par_iter`**: `ptr_hash::new_from_par_iter` silently discards construction failure. Post-construction verification (current workaround) is correct but does not allow retry. A `try_new_from_par_iter` PR upstream would close this gap. diff --git a/docmd/implementation/persistent_bit_vec.md b/docmd/implementation/persistent_bit_vec.md index 74bd4c7..db8399e 100644 --- a/docmd/implementation/persistent_bit_vec.md +++ b/docmd/implementation/persistent_bit_vec.md @@ -1,4 +1,4 @@ -# PersistentBitVec +# PersistentBitVec and PersistentBitMatrix ## Purpose @@ -6,9 +6,13 @@ Typical use: converting k-mer count vectors to presence/absence vectors (with optional threshold), then computing set-theoretic distances (Jaccard) or edit distances (Hamming) between samples. +`PersistentBitMatrix` wraps multiple `PersistentBitVec` columns in a directory, exposing a column-major binary matrix with row-access API. A single-column bit matrix is a vector at the API level. + --- -## File format +## PersistentBitVec — single-column file + +### File format Single `.pbiv` file. @@ -28,11 +32,9 @@ offset 16: **Total file size**: `16 + ⌈n/64⌉ × 8` bytes. ---- +### Lifecycle -## Lifecycle - -### Builder (`PersistentBitVecBuilder`) +#### Builder (`PersistentBitVecBuilder`) ```rust struct PersistentBitVecBuilder { @@ -43,8 +45,6 @@ struct PersistentBitVecBuilder { The file and mmap are created immediately at construction. The header is written once at `new()` or copied from the source at `build_from*()`. `close()` is a single flush — there is no tail to append, unlike `PersistentCompactIntVec`. -#### Constructors - **`new(n: usize, path: &Path) -> io::Result`** Creates the file, writes the header, zero-extends to `16 + ⌈n/64⌉×8` bytes, mmaps immediately. All bits default to 0. @@ -68,16 +68,16 @@ Handles overflow values (≥ 255) transparently — the count iterator returns t Shorthand for `build_from_counts(source, 1, path)`. -#### Bit-level access +**Bit-level access** ```rust -fn get(&self, slot: u64) -> bool -fn set(&mut self, slot: u64, value: bool) +fn get(&self, slot: usize) -> bool +fn set(&mut self, slot: usize, value: bool) ``` Byte-level mmap access: `mmap[16 + slot/8]`, bit `slot % 8`. O(1). -#### Word-level bulk operations +**Word-level bulk operations** All operate on `⌈n/64⌉` u64 words. O(n/64) per call. @@ -90,13 +90,11 @@ builder.not(); // self[i] = !self[i], then re-zero padding bits `and`/`or`/`xor` read `other`'s word slice directly (no allocation). `not()` flips all words then masks the last word's padding bits to restore the invariant. -#### `close(self) -> io::Result<()>` +**`close(self) -> io::Result<()>`** Flushes the mmap. The header was written at construction and is never rewritten. O(1) in Rust code. ---- - -### Reader (`PersistentBitVec`) +#### Reader (`PersistentBitVec`) ```rust struct PersistentBitVec { @@ -106,19 +104,19 @@ struct PersistentBitVec { } ``` -#### `open(path: &Path) -> io::Result` +**`open(path: &Path) -> io::Result`** Mmaps the file, validates magic, reads `n` from bytes `[8..16]`. O(1). -#### `get(slot: u64) -> bool` +**`get(slot: usize) -> bool`** Byte-level read from `mmap[16 + slot/8]`. O(1). -#### `iter() -> BitIter<'_>` +**`iter() -> BitIter<'_>`** Sequential scan, byte by byte, yielding `bool` values in slot order. Implements `ExactSizeIterator`. O(n). -#### Aggregates +**Aggregates** ```rust fn count_ones(&self) -> u64 // popcount over all words; padding bits are 0 @@ -127,22 +125,20 @@ fn count_zeros(&self) -> u64 // n - count_ones() `count_ones` iterates `⌈n/64⌉` words and calls `u64::count_ones()` (maps to `POPCNT`). O(n/64). -#### Distance methods +**Distance methods** Both operate word by word. O(n/64). | Method | Formula | Notes | |---|---|---| -| `jaccard_dist(&other) -> f64` | `1 − |A∩B| / |A∪B|` | `(a&b).count_ones()`, `(a\|b).count_ones()` per word | +| `jaccard_dist(&other) -> f64` | `1 − \|A∩B\| / \|A∪B\|` | `(a&b).count_ones()`, `(a\|b).count_ones()` per word | | `hamming_dist(&other) -> u64` | number of differing bits | `(a^b).count_ones()` per word | Edge case (both all-zero → union = 0): `jaccard_dist` returns 0.0. ---- +### Implementation notes -## Implementation notes - -### u64 word view +#### u64 word view The unsafe cast from `&[u8]` to `&[u64]` is sound because: @@ -152,13 +148,11 @@ The unsafe cast from `&[u8]` to `&[u64]` is sound because: This gives zero-copy word-level access with no intermediate allocation. -### Padding invariant +#### Padding invariant Writing `not()` without masking the last word would corrupt `count_ones()`, `hamming_dist()`, and `jaccard_dist()`. The mask applied after flipping is `(1u64 << (n % 64)) - 1` (no-op if `n % 64 == 0`). All other operations (`and`, `or`, `xor`) preserve existing zero padding since they can only clear or preserve bits already set by `not()`. ---- - -## Complexity +### Complexity | Operation | Time | Notes | |---|---|---| @@ -171,3 +165,74 @@ Writing `not()` without masking the last word would corrupt `count_ones()`, `ham | `build_from` | O(file_size) | OS copy | | `build_from_counts` / `build_from_presence` | O(n) | count iter + word fill | | `close` | O(1) | flush only | + +--- + +## PersistentBitMatrix — column-major directory + +### Design + +A directory containing `meta.json` and N column files `col_000000.pbiv`, `col_000001.pbiv`, …, each a `PersistentBitVec`. Used for presence/absence matrices: one column per genome, one bit per MPHF slot. + +``` +presence/ + meta.json {"n": , "n_cols": } + col_000000.pbiv genome 0 + col_000001.pbiv genome 1 + ... +``` + +Column-major layout makes per-genome set operations (Jaccard, Hamming, AND/OR) cache-friendly — each genome is a contiguous file. Row access (which genomes contain a given kmer) requires one O(1) read per column. + +### Builder (`PersistentBitMatrixBuilder`) + +```rust +struct PersistentBitMatrixBuilder { + dir: PathBuf, + n: usize, + n_cols: usize, +} +``` + +**`new(n: usize, dir: &Path) -> io::Result`** + +Creates the directory (including parents). + +**`add_col(&mut self) -> io::Result`** + +Creates `col_NNNNNN.pbiv` for the next column and returns its builder. The caller fills the column and calls `builder.close()` before calling `add_col` again. + +**`close(self) -> io::Result<()>`** + +Writes `meta.json` with the final `n` and `n_cols`. + +### Reader (`PersistentBitMatrix`) + +```rust +struct PersistentBitMatrix { + cols: Vec, + n: usize, +} +``` + +**`open(dir: &Path) -> io::Result`** + +Reads `meta.json`, opens all `col_NNNNNN.pbiv` files. + +**`row(slot: usize) -> Box<[bool]>`** + +Returns the presence vector: `[col_0[slot], col_1[slot], …, col_{G-1}[slot]]`. One byte read per column. O(G). + +**`col(c: usize) -> &PersistentBitVec`** + +Direct access to a single column for column-oriented operations. + +### LayerData implementation + +```rust +impl LayerData for PersistentBitMatrix { + type Item = Box<[bool]>; + fn open(layer_dir: &Path) -> OLMResult { /* opens layer_dir/presence/ */ } + fn read(&self, slot: usize) -> Box<[bool]> { self.row(slot) } +} +``` diff --git a/docmd/implementation/persistent_compact_int_vec.md b/docmd/implementation/persistent_compact_int_vec.md index 00b5ecf..3808262 100644 --- a/docmd/implementation/persistent_compact_int_vec.md +++ b/docmd/implementation/persistent_compact_int_vec.md @@ -1,4 +1,4 @@ -# PersistentCompactIntVec +# PersistentCompactIntVec and PersistentCompactIntMatrix ## Purpose @@ -6,78 +6,81 @@ Motivation from observed count distributions in genomics data: 99.9% of k-mer counts fit in a u8; overflow (count ≥ 255) affects ~0.07% of distinct k-mers but can reach values above 10⁶ (chloroplast, ribosomal repeats). +`PersistentCompactIntMatrix` wraps multiple `PersistentCompactIntVec` columns in a directory, exposing a column-major matrix with row-access API. A vector is a matrix with 1 column. + --- -## Design +## PersistentCompactIntVec — single-column file + +### Design Two-tier structure: -1. **Primary array** — `[u8; n]`, stored at offset 24 in the PCIV file and mmap'd. Values 0–254 are stored directly. Value **255 is a sentinel** meaning "look in overflow". -2. **Overflow section** — sorted list of `(slot: u32, value: u32)` pairs for all slots where the true value ≥ 255, with a **sparse L1-fitting index** for fast lookup. +1. **Primary array** — `[u8; n]`, stored at offset 40 in the PCIV file and mmap'd. Values 0–254 are stored directly. Value **255 is a sentinel** meaning "look in overflow". +2. **Overflow section** — sorted list of `(slot: u64, value: u32)` pairs for all slots where the true value ≥ 255, with a **sparse L1-fitting index** for fast lookup. ``` primary[slot] < 255 → return primary[slot] primary[slot] == 255 → binary search in overflow ``` ---- +### File format -## Single-file format - -Everything is stored in a single `.pciv` file. Write order matches computation order: the header placeholder is written first, then primary (known at `new()`), then overflow data and index (known at `close()`), then the header is overwritten at offset 0. +Single `.pciv` file. Write order: header placeholder → primary → overflow + index → header overwrite at offset 0. ``` offset 0: - magic: [u8; 4] = b"PCIV" - n: u64 number of slots - n_overflow: u32 number of overflow entries - step: u32 sparse index step (0 = no index) - n_index: u32 number of index entries + magic: [u8; 4] = b"PCIV" + _pad: [u8; 4] = 0 + n: u64 number of slots + n_overflow: u64 number of overflow entries + n_index: u64 number of sparse index entries + step: u64 sparse index step (0 = no index) -offset 24: - primary: [u8; n] one byte per slot, 255 = overflow sentinel +offset 40: + primary: [u8; n] one byte per slot, 255 = overflow sentinel -offset 24 + n: - data: [(slot: u32, value: u32); n_overflow] sorted by slot +offset 40 + n: + data: [(slot: u64, value: u32); n_overflow] 12 bytes each, sorted by slot -offset 24 + n + n_overflow × 8: - index: [(slot: u32, pos: u32); n_index] sparse index +offset 40 + n + n_overflow × 12: + index: [(slot: u64, pos: u64); n_index] 16 bytes each, sparse index ``` The index entries point into `data`: `index[i] = (slot of data[i×step], i×step)`. ---- +All integer fields are little-endian. Slot indices are stored as `u64` in the file; they are `usize` in Rust code. -## Lifecycle +### Lifecycle -### Builder (`PersistentCompactIntVecBuilder`) +#### Builder (`PersistentCompactIntVecBuilder`) -Used during construction. The primary section is **mmap'd immediately** at construction time (both for `new` and `build_from`), so the file exists and is addressable from the start. The overflow is held in a `HashMap` in RAM. +Used during construction. The primary section is **mmap'd immediately** at construction time (both for `new` and `build_from`), so the file exists and is addressable from the start. The overflow is held in a `HashMap` in RAM. ```rust struct PersistentCompactIntVecBuilder { path: PathBuf, - mmap: MmapMut, // primary section live in the file from the start + mmap: MmapMut, // primary section live in the file from the start n: usize, - overflow: HashMap, // values ≥ 255 + overflow: HashMap, // values ≥ 255 } ``` -#### `new(n: usize, path: &Path) -> io::Result` +**`new(n: usize, path: &Path) -> io::Result`** Creates the file, pre-allocates `HEADER_SIZE + n` zero bytes, mmaps it. The primary is zero-initialised (all slots = 0). Returns immediately ready for `set` / `get`. -#### `build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result` +**`build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result`** Copies the source PCIV file to `path` (OS-level copy — no per-slot iteration), mmaps the copy, then loads the overflow section into a `HashMap`. Initialisation cost: O(file copy) + O(n_overflow), not O(n). At `close()`, the primary section is **not rewritten**: it is already in the file via mmap. Only the overflow data, the sparse index, and the header are updated. -#### `set(slot: u64, value: u32)` / `get(slot: u64) -> u32` +**`set(slot: usize, value: u32)` / `get(slot: usize) -> u32`** Direct mmap byte access for the primary; HashMap for the overflow. Both O(1). Mutations can move a slot between tiers freely (downward mutation removes the HashMap entry; upward mutation adds it). -#### Element-wise operations — `min`, `max`, `add`, `diff` +**Element-wise operations — `min`, `max`, `add`, `diff`** Each takes a `&PersistentCompactIntVec` of equal length and updates `self` in place via `set`: @@ -90,17 +93,15 @@ builder.diff(&other); // self[i] = self[i].saturating_sub(other[i]) All iterate `other` with `other.iter()` (merge-scan, O(n_other)). -#### `close(self) -> io::Result<()>` +**`close(self) -> io::Result<()>`** 1. Flush and drop the mmap (primary changes are now on disk). -2. Sort the overflow HashMap into `Vec<(u32, u32)>`. +2. Sort the overflow HashMap into `Vec<(usize, u32)>`. 3. Truncate the file to `HEADER_SIZE + n` (removes old data+index if `build_from` was used). 4. Append sorted overflow data, then sparse index. 5. Seek to offset 0, overwrite the header with final values. ---- - -### Reader (`PersistentCompactIntVec`) +#### Reader (`PersistentCompactIntVec`) Used at query time. The whole file is mmap'd; only the sparse index is copied into a `Vec` at open time (≤ 32 KB, L1-resident). @@ -109,19 +110,19 @@ struct PersistentCompactIntVec { mmap: Mmap, n: usize, n_overflow: usize, - step: u32, - index: Vec<(u32, u32)>, // L1-resident - primary_offset: usize, // = 24 (HEADER_SIZE) - data_offset: usize, // = 24 + n + step: usize, + index: Vec<(usize, usize)>, // (slot, pos) — L1-resident + primary_offset: usize, // = 40 (HEADER_SIZE) + data_offset: usize, // = 40 + n path: PathBuf, } ``` -#### `open(path: &Path) -> io::Result` +**`open(path: &Path) -> io::Result`** -Mmaps the file, parses the 24-byte header, copies the sparse index entries into a `Vec`. The primary and data sections stay mmap'd. +Mmaps the file, parses the 40-byte header, copies the sparse index entries into a `Vec`. The primary and data sections stay mmap'd. -#### `get(slot: u64) -> u32` — random access +**`get(slot: usize) -> u32` — random access** ``` primary[slot] < 255 → return it directly @@ -134,19 +135,19 @@ step > 0: binary_search(data[index[i].pos .. index[i+1].pos], slot) ``` -#### `iter() -> Iter<'_>` — sequential scan, O(n) +**`iter() -> Iter<'_>` — sequential scan, O(n)** Merge-scan: reads primary bytes in order; on sentinel 255, advances a sequential pointer into the sorted data section rather than doing a binary search. This gives O(n + n_overflow) with no random access into the data section. `Iter` implements `ExactSizeIterator`. `&PersistentCompactIntVec` implements `IntoIterator`. -#### Aggregate +**Aggregate** ```rust fn sum(&self) -> u64 // Σ self[i] as u64, via iter() ``` -#### Distance methods +**Distance methods** All take `&other` of equal length, iterate both with `zip(self.iter(), other.iter())`, and return `f64`. @@ -158,29 +159,23 @@ All take `&other` of equal length, iterate both with `zip(self.iter(), other.ite | `relfreq_euclidean_dist` | Euclidean on relative frequencies | | `hellinger_euclidean_dist` | `√Σ(√pᵢ − √qᵢ)²` — Euclidean on sqrt(relfreq) | | `hellinger_dist` | `hellinger_euclidean_dist / √2` — standard Hellinger distance ∈ [0, 1] | -| `threshold_jaccard_dist(&other, threshold: u32)` | `1 − |A∩B| / |A∪B|` where presence iff count ≥ threshold | +| `threshold_jaccard_dist(&other, threshold: u32)` | `1 − \|A∩B\| / \|A∪B\|` where presence iff count ≥ threshold | | `jaccard_dist` | `threshold_jaccard_dist(&other, 1)` | Edge cases (both vectors all-zero, or union empty for Jaccard): distance = 0.0. ---- - -## Step computation +### Step computation Chosen at `close()` once `n_overflow` is known: ``` -L1_entries = 32 768 / 8 = 4096 +L1_INDEX_ENTRIES = 2048 -step = 0 if n_overflow ≤ 4096 -step = ⌈n_overflow / 4096⌉ otherwise +step = 0 if n_overflow ≤ 2048 +step = ⌈n_overflow / 2048⌉ otherwise ``` -For the Betula nana reference (359 044 overflows): step = 88, n_index = 4 080 entries = 31.9 KB. - ---- - -## Complexity +### Complexity | Operation | Time | Notes | |---|---|---| @@ -194,3 +189,72 @@ For the Betula nana reference (359 044 overflows): step = 88, n_index = 4 080 en | `close` | O(n_overflow log n_overflow) | sort + sequential write | | `open` | O(n_index) | index copy into Vec | | `build_from` | O(file_size) + O(n_overflow) | OS copy + HashMap load | + +--- + +## PersistentCompactIntMatrix — column-major directory + +### Design + +A directory containing `meta.json` and N column files `col_000000.pciv`, `col_000001.pciv`, …, each a `PersistentCompactIntVec`. This is the type used by `LayerData` — a single-column matrix is functionally equivalent to a vector but shares the same interface as multi-column matrices. + +``` +counts/ + meta.json {"n": , "n_cols": } + col_000000.pciv + col_000001.pciv + ... +``` + +### Builder (`PersistentCompactIntMatrixBuilder`) + +```rust +struct PersistentCompactIntMatrixBuilder { + dir: PathBuf, + n: usize, + n_cols: usize, +} +``` + +**`new(n: usize, dir: &Path) -> io::Result`** + +Creates the directory (including parents). Does not write `meta.json` yet. + +**`add_col(&mut self) -> io::Result`** + +Creates `col_NNNNNN.pciv` for the next column and returns its builder. The caller fills the column and calls `builder.close()` before calling `add_col` again. + +**`close(self) -> io::Result<()>`** + +Writes `meta.json` with the final `n` and `n_cols`. Must be called after all column builders are closed. + +### Reader (`PersistentCompactIntMatrix`) + +```rust +struct PersistentCompactIntMatrix { + cols: Vec, + n: usize, +} +``` + +**`open(dir: &Path) -> io::Result`** + +Reads `meta.json`, opens all `col_NNNNNN.pciv` files. + +**`row(slot: usize) -> Box<[u32]>`** + +Returns the full row: `[col_0[slot], col_1[slot], …, col_{N-1}[slot]]`. One mmap access per column. O(N). + +**`col(c: usize) -> &PersistentCompactIntVec`** + +Direct access to a single column for column-oriented operations (distance computations, iteration). + +### LayerData implementation + +```rust +impl LayerData for PersistentCompactIntMatrix { + type Item = Box<[u32]>; + fn open(layer_dir: &Path) -> OLMResult { /* opens layer_dir/counts/ */ } + fn read(&self, slot: usize) -> Box<[u32]> { self.row(slot) } +} +``` diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs new file mode 100644 index 0000000..e646ba8 --- /dev/null +++ b/src/obicompactvec/src/bitmatrix.rs @@ -0,0 +1,57 @@ +use std::{fs, io, path::{Path, PathBuf}}; + +use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; +use crate::meta::MatrixMeta; + +fn col_path(dir: &Path, col: usize) -> PathBuf { + dir.join(format!("col_{col:06}.pbiv")) +} + +pub struct PersistentBitMatrix { + cols: Vec, + n: usize, +} + +impl PersistentBitMatrix { + pub fn open(dir: &Path) -> io::Result { + let meta = MatrixMeta::load(dir)?; + let cols = (0..meta.n_cols) + .map(|c| PersistentBitVec::open(&col_path(dir, c))) + .collect::>>()?; + Ok(Self { cols, n: meta.n }) + } + + pub fn n(&self) -> usize { self.n } + pub fn n_cols(&self) -> usize { self.cols.len() } + pub fn col(&self, c: usize) -> &PersistentBitVec { &self.cols[c] } + + pub fn row(&self, slot: usize) -> Box<[bool]> { + self.cols.iter().map(|c| c.get(slot)).collect() + } +} + +pub struct PersistentBitMatrixBuilder { + dir: PathBuf, + n: usize, + n_cols: usize, +} + +impl PersistentBitMatrixBuilder { + pub fn new(n: usize, dir: &Path) -> io::Result { + fs::create_dir_all(dir)?; + Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 }) + } + + pub fn n(&self) -> usize { self.n } + pub fn n_cols(&self) -> usize { self.n_cols } + + pub fn add_col(&mut self) -> io::Result { + let path = col_path(&self.dir, self.n_cols); + self.n_cols += 1; + PersistentBitVecBuilder::new(self.n, &path) + } + + pub fn close(self) -> io::Result<()> { + MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir) + } +} diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs new file mode 100644 index 0000000..f86e1a2 --- /dev/null +++ b/src/obicompactvec/src/intmatrix.rs @@ -0,0 +1,58 @@ +use std::{fs, io, path::{Path, PathBuf}}; + +use crate::builder::PersistentCompactIntVecBuilder; +use crate::meta::MatrixMeta; +use crate::reader::PersistentCompactIntVec; + +fn col_path(dir: &Path, col: usize) -> PathBuf { + dir.join(format!("col_{col:06}.pciv")) +} + +pub struct PersistentCompactIntMatrix { + cols: Vec, + n: usize, +} + +impl PersistentCompactIntMatrix { + pub fn open(dir: &Path) -> io::Result { + let meta = MatrixMeta::load(dir)?; + let cols = (0..meta.n_cols) + .map(|c| PersistentCompactIntVec::open(&col_path(dir, c))) + .collect::>>()?; + Ok(Self { cols, n: meta.n }) + } + + pub fn n(&self) -> usize { self.n } + pub fn n_cols(&self) -> usize { self.cols.len() } + pub fn col(&self, c: usize) -> &PersistentCompactIntVec { &self.cols[c] } + + pub fn row(&self, slot: usize) -> Box<[u32]> { + self.cols.iter().map(|c| c.get(slot)).collect() + } +} + +pub struct PersistentCompactIntMatrixBuilder { + dir: PathBuf, + n: usize, + n_cols: usize, +} + +impl PersistentCompactIntMatrixBuilder { + pub fn new(n: usize, dir: &Path) -> io::Result { + fs::create_dir_all(dir)?; + Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 }) + } + + pub fn n(&self) -> usize { self.n } + pub fn n_cols(&self) -> usize { self.n_cols } + + pub fn add_col(&mut self) -> io::Result { + let path = col_path(&self.dir, self.n_cols); + self.n_cols += 1; + PersistentCompactIntVecBuilder::new(self.n, &path) + } + + pub fn close(self) -> io::Result<()> { + MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir) + } +} diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index 73ec076..df02fb8 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -1,10 +1,15 @@ mod bitvec; +mod bitmatrix; mod builder; mod format; +mod intmatrix; +mod meta; mod reader; pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder}; +pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder}; pub use builder::PersistentCompactIntVecBuilder; +pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; pub use reader::PersistentCompactIntVec; #[cfg(test)] diff --git a/src/obicompactvec/src/meta.rs b/src/obicompactvec/src/meta.rs new file mode 100644 index 0000000..d8d8466 --- /dev/null +++ b/src/obicompactvec/src/meta.rs @@ -0,0 +1,32 @@ +use std::{fs, io, path::Path}; + +pub struct MatrixMeta { + pub n: usize, + pub n_cols: usize, +} + +impl MatrixMeta { + pub fn load(dir: &Path) -> io::Result { + let s = fs::read_to_string(dir.join("meta.json"))?; + parse(&s).ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "bad meta.json")) + } + + pub fn save(&self, dir: &Path) -> io::Result<()> { + fs::write( + dir.join("meta.json"), + format!("{{\"n\":{},\"n_cols\":{}}}\n", self.n, self.n_cols), + ) + } +} + +fn parse(s: &str) -> Option { + Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? }) +} + +fn field(s: &str, name: &str) -> Option { + let key = format!("\"{}\":", name); + let pos = s.find(&key)? + key.len(); + let rest = s[pos..].trim_start(); + let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len()); + rest[..end].parse().ok() +} diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index a3fc606..8a23667 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -7,41 +7,45 @@ use memmap2::Mmap; use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE}; pub struct PersistentCompactIntVec { - mmap: Mmap, - n: usize, - n_overflow: usize, - pub step: usize, - index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index - primary_offset: usize, // = HEADER_SIZE - data_offset: usize, // = HEADER_SIZE + n - path: PathBuf, + mmap: Mmap, + n: usize, + n_overflow: usize, + pub step: usize, + index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index + primary_offset: usize, // = HEADER_SIZE + data_offset: usize, // = HEADER_SIZE + n + path: PathBuf, } impl PersistentCompactIntVec { + /// Opens a persistent compact int vector from the given path. pub fn open(path: &Path) -> io::Result { let mmap = unsafe { Mmap::map(&File::open(path)?)? }; if mmap.len() < HEADER_SIZE { - return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short")); + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "PCIV file too short", + )); } if &mmap[0..4] != &MAGIC { return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic")); } - let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; + let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize; - let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize; - let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize; + let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize; + let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize; let primary_offset = HEADER_SIZE; - let data_offset = primary_offset + n; - let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE; + let data_offset = primary_offset + n; + let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE; let mut index = Vec::with_capacity(n_index); for i in 0..n_index { - let off = index_offset + i * INDEX_ENTRY_SIZE; + let off = index_offset + i * INDEX_ENTRY_SIZE; let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize; - let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize; + let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize; index.push((slot, pos)); } @@ -57,36 +61,44 @@ impl PersistentCompactIntVec { }) } + /// Returns the path of the compact int vector file. pub fn path(&self) -> &Path { &self.path } + /// Returns the length of the compact int vector. pub fn len(&self) -> usize { self.n } + /// Returns whether the compact int vector is empty. pub fn is_empty(&self) -> bool { self.n == 0 } + /// Returns the value at the given slot. pub fn get(&self, slot: usize) -> u32 { match self.mmap[self.primary_offset + slot] { 255 => self.overflow_get(slot), - v => v as u32, + v => v as u32, } } + /// Returns the value at the given slot from the overflow region. fn overflow_get(&self, slot: usize) -> u32 { let pos_start; let pos_end; if self.step == 0 { pos_start = 0; - pos_end = self.n_overflow; + pos_end = self.n_overflow; } else { - let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); + let i = self + .index + .partition_point(|&(s, _)| s <= slot) + .saturating_sub(1); pos_start = self.index[i].1; - pos_end = if i + 1 < self.index.len() { + pos_end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow @@ -98,8 +110,8 @@ impl PersistentCompactIntVec { while lo < hi { let mid = lo + (hi - lo) / 2; match self.data_slot(mid).cmp(&slot) { - std::cmp::Ordering::Equal => return self.data_value(mid), - std::cmp::Ordering::Less => lo = mid + 1, + std::cmp::Ordering::Equal => return self.data_value(mid), + std::cmp::Ordering::Less => lo = mid + 1, std::cmp::Ordering::Greater => hi = mid, } } @@ -107,85 +119,203 @@ impl PersistentCompactIntVec { } #[inline] + /// Returns the slot at the given index in the overflow region. fn data_slot(&self, i: usize) -> usize { let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE; u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize } #[inline] + /// Returns the value at the given index in the overflow region. fn data_value(&self, i: usize) -> u32 { let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8; u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) } + #[inline] + /// Returns the sum of all values in the compact int vector. pub fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() } + #[inline] + /// Returns the Bray-Curtis distance between two compact int vectors. pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); - let (sum_min, sum_a, sum_b) = self.iter().zip(other.iter()).fold( - (0u64, 0u64, 0u64), - |(sm, sa, sb), (a, b)| (sm + a.min(b) as u64, sa + a as u64, sb + b as u64), - ); - let denom = sum_a + sum_b; - if denom == 0 { return 0.0; } + let (sum_min, denom) = self.partial_bray_dist(other); + if denom == 0 { + return 0.0; + } 1.0 - 2.0 * sum_min as f64 / denom as f64 } + /// Returns the partial Bray-Curtis distance between two compact int vectors. + /// + /// Returns a tuple `(sum_min, denom)` where `sum_min` is the sum of the minimum values + /// at each index, and `denom` is the sum of the values in both vectors. + /// This is used internally by [`bray_dist`] and to easily compute the Bray-Curtis distance + /// over a set of vector pairs. + /// + /// Returns the tuple `(sum_min, sum_a + sum_b)` where `sum_min` is the sum of the minimum + /// values at each index, `sum_a` is the sum of the first vector's counts, and `sum_b` is + /// the sum of the second vector's counts. + pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> (u64, u64) { + assert_eq!(self.n, other.len(), "length mismatch"); + let (sum_min, sum_a, sum_b) = self + .iter() + .zip(other.iter()) + .fold((0u64, 0u64, 0u64), |(sm, sa, sb), (a, b)| { + (sm + a.min(b) as u64, sa + a as u64, sb + b as u64) + }); + (sum_min, sum_a + sum_b) + } + + /// Returns the relative frequency Bray-Curtis distance between two compact int vectors. + /// + /// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts. pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); let sum_a = self.sum() as f64; let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { return 0.0; } - let sum_min: f64 = self.iter().zip(other.iter()) + if sum_a == 0.0 && sum_b == 0.0 { + return 0.0; + } + let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b); + 1.0 - sum_min + } + + /// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors. + /// + /// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency + /// Bray-Curtis distance over a set of vector pairs. + /// + /// Arguments: + /// - `other`: the other compact int vector to compare with + /// - `sum_a`: the sum of the first vector's counts + /// - `sum_b`: the sum of the second vector's counts + /// + /// Returns the sum of the minimum relative frequencies at each index. + pub fn partial_relfreq_bray_dist( + &self, + other: &PersistentCompactIntVec, + sum_a: f64, + sum_b: f64, + ) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + let sum_min: f64 = self + .iter() + .zip(other.iter()) .map(|(a, b)| { let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; pa.min(pb) }) .sum(); - 1.0 - sum_min + sum_min } + /// Returns the euclidean distance between two compact int vectors. pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); - let sq: f64 = self.iter().zip(other.iter()) - .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) - .sum(); - sq.sqrt() + self.partial_euclidean_dist(other).sqrt() } + /// Returns the partial euclidean distance between two compact int vectors. + /// + /// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance + /// over a set of vector pairs. + /// + /// The result is the sum of the squared differences between corresponding elements of the two + /// vectors. + pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + self.iter() + .zip(other.iter()) + .map(|(a, b)| { + let d = a as f64 - b as f64; + d * d + }) + .sum() + } + + /// Returns the relative frequency euclidean distance between two compact int vectors. + /// + /// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts. pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); let sum_a = self.sum() as f64; let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { return 0.0; } - let sq: f64 = self.iter().zip(other.iter()) + if sum_a == 0.0 && sum_b == 0.0 { + return 0.0; + } + self.partial_relfreq_euclidean_dist(other, sum_a, sum_b) + .sqrt() + } + + /// Returns the partial relative frequency euclidean distance between two compact int vectors. + /// + /// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency + /// euclidean distance over a set of vector pairs. + pub fn partial_relfreq_euclidean_dist( + &self, + other: &PersistentCompactIntVec, + sum_a: f64, + sum_b: f64, + ) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + self.iter() + .zip(other.iter()) .map(|(a, b)| { let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; let d = pa - pb; d * d }) - .sum(); - sq.sqrt() + .sum() } + /// Returns the Euclidean distance between two compact int vectors using the Hellinger transform. + /// + /// The Hellinger transform is applied to the raw counts of each vector, and the result is + /// the Euclidean distance between the transformed vectors. The Hellinger transform is defined + /// as the square root of the relative frequencies. pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); let sum_a = self.sum() as f64; let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { return 0.0; } - let sq: f64 = self.iter().zip(other.iter()) + if sum_a == 0.0 && sum_b == 0.0 { + return 0.0; + } + self.partial_hellinger_euclidean_dist(other, sum_a, sum_b) + .sqrt() + } + + /// Returns the partial Hellinger Euclidean distance between two compact int vectors. + /// + /// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger + /// Euclidean distance over a set of vector pairs. + pub fn partial_hellinger_euclidean_dist( + &self, + other: &PersistentCompactIntVec, + sum_a: f64, + sum_b: f64, + ) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + self.iter() + .zip(other.iter()) .map(|(a, b)| { - let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 }; - let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 }; + let pa = if sum_a > 0.0 { + (a as f64 / sum_a).sqrt() + } else { + 0.0 + }; + let pb = if sum_b > 0.0 { + (b as f64 / sum_b).sqrt() + } else { + 0.0 + }; let d = pa - pb; d * d }) - .sum(); - sq.sqrt() + .sum() } pub fn hellinger_dist(&self, other: &PersistentCompactIntVec) -> f64 { @@ -194,16 +324,26 @@ impl PersistentCompactIntVec { pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - let (intersection, union) = self.iter().zip(other.iter()).fold( - (0u64, 0u64), - |(inter, uni), (a, b)| { + let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold); + if union == 0 { + return 0.0; + } + 1.0 - intersection as f64 / union as f64 + } + + pub fn partial_threshold_jaccard_dist( + &self, + other: &PersistentCompactIntVec, + threshold: u32, + ) -> (u64, u64) { + assert_eq!(self.n, other.len(), "length mismatch"); + self.iter() + .zip(other.iter()) + .fold((0u64, 0u64), |(inter, uni), (a, b)| { let ap = a >= threshold; let bp = b >= threshold; (inter + (ap & bp) as u64, uni + (ap | bp) as u64) - }, - ); - if union == 0 { return 0.0; } - 1.0 - intersection as f64 / union as f64 + }) } pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 { @@ -211,7 +351,11 @@ impl PersistentCompactIntVec { } pub fn iter(&self) -> Iter<'_> { - Iter { pciv: self, slot: 0, overflow_pos: 0 } + Iter { + pciv: self, + slot: 0, + overflow_pos: 0, + } } } @@ -225,8 +369,8 @@ impl<'a> IntoIterator for &'a PersistentCompactIntVec { } pub struct Iter<'a> { - pciv: &'a PersistentCompactIntVec, - slot: usize, + pciv: &'a PersistentCompactIntVec, + slot: usize, overflow_pos: usize, } diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs new file mode 100644 index 0000000..1fab8ff --- /dev/null +++ b/src/obicompactvec/src/tests/bitmatrix.rs @@ -0,0 +1,69 @@ +use tempfile::tempdir; + +use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder}; + +#[test] +fn single_col_roundtrip() { + let dir = tempdir().unwrap(); + let mut b = PersistentBitMatrixBuilder::new(4, dir.path()).unwrap(); + let mut col = b.add_col().unwrap(); + col.set(0, true); + col.set(1, false); + col.set(2, true); + col.set(3, true); + col.close().unwrap(); + b.close().unwrap(); + + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + assert_eq!(m.n_cols(), 1); + assert_eq!(m.n(), 4); + assert_eq!(&*m.row(0), &[true]); + assert_eq!(&*m.row(1), &[false]); + assert_eq!(&*m.row(2), &[true]); + assert_eq!(&*m.row(3), &[true]); +} + +#[test] +fn two_cols_roundtrip() { + let dir = tempdir().unwrap(); + let mut b = PersistentBitMatrixBuilder::new(3, dir.path()).unwrap(); + let mut col0 = b.add_col().unwrap(); + col0.set(0, true); col0.set(1, false); col0.set(2, true); + col0.close().unwrap(); + let mut col1 = b.add_col().unwrap(); + col1.set(0, false); col1.set(1, true); col1.set(2, false); + col1.close().unwrap(); + b.close().unwrap(); + + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + assert_eq!(m.n_cols(), 2); + assert_eq!(&*m.row(0), &[true, false]); + assert_eq!(&*m.row(1), &[false, true]); + assert_eq!(&*m.row(2), &[true, false]); +} + +#[test] +fn col_accessor() { + let dir = tempdir().unwrap(); + let mut b = PersistentBitMatrixBuilder::new(3, dir.path()).unwrap(); + let mut col = b.add_col().unwrap(); + col.set(0, true); col.set(1, false); col.set(2, true); + col.close().unwrap(); + b.close().unwrap(); + + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + assert!(m.col(0).get(0)); + assert!(!m.col(0).get(1)); + assert!(m.col(0).get(2)); +} + +#[test] +fn zero_cols_roundtrip() { + let dir = tempdir().unwrap(); + let b = PersistentBitMatrixBuilder::new(8, dir.path()).unwrap(); + b.close().unwrap(); + + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + assert_eq!(m.n_cols(), 0); + assert_eq!(m.n(), 8); +} diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs new file mode 100644 index 0000000..8af7d6e --- /dev/null +++ b/src/obicompactvec/src/tests/intmatrix.rs @@ -0,0 +1,68 @@ +use tempfile::tempdir; + +use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; + +#[test] +fn single_col_roundtrip() { + let dir = tempdir().unwrap(); + let mut b = PersistentCompactIntMatrixBuilder::new(4, dir.path()).unwrap(); + let mut col = b.add_col().unwrap(); + col.set(0, 10); + col.set(1, 200); + col.set(2, 300); + col.set(3, 1000); + col.close().unwrap(); + b.close().unwrap(); + + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + assert_eq!(m.n_cols(), 1); + assert_eq!(m.n(), 4); + assert_eq!(&*m.row(0), &[10u32]); + assert_eq!(&*m.row(1), &[200u32]); + assert_eq!(&*m.row(2), &[300u32]); + assert_eq!(&*m.row(3), &[1000u32]); +} + +#[test] +fn two_cols_roundtrip() { + let dir = tempdir().unwrap(); + let mut b = PersistentCompactIntMatrixBuilder::new(3, dir.path()).unwrap(); + let mut col0 = b.add_col().unwrap(); + col0.set(0, 1); col0.set(1, 2); col0.set(2, 3); + col0.close().unwrap(); + let mut col1 = b.add_col().unwrap(); + col1.set(0, 10); col1.set(1, 20); col1.set(2, 30); + col1.close().unwrap(); + b.close().unwrap(); + + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + assert_eq!(m.n_cols(), 2); + assert_eq!(&*m.row(0), &[1u32, 10]); + assert_eq!(&*m.row(1), &[2u32, 20]); + assert_eq!(&*m.row(2), &[3u32, 30]); +} + +#[test] +fn col_accessor() { + let dir = tempdir().unwrap(); + let mut b = PersistentCompactIntMatrixBuilder::new(2, dir.path()).unwrap(); + let mut col0 = b.add_col().unwrap(); + col0.set(0, 5); col0.set(1, 7); + col0.close().unwrap(); + b.close().unwrap(); + + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + assert_eq!(m.col(0).get(0), 5); + assert_eq!(m.col(0).get(1), 7); +} + +#[test] +fn zero_cols_roundtrip() { + let dir = tempdir().unwrap(); + let b = PersistentCompactIntMatrixBuilder::new(10, dir.path()).unwrap(); + b.close().unwrap(); + + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + assert_eq!(m.n_cols(), 0); + assert_eq!(m.n(), 10); +} diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs index 0eb8adb..4d2d9ad 100644 --- a/src/obicompactvec/src/tests/mod.rs +++ b/src/obicompactvec/src/tests/mod.rs @@ -1,4 +1,6 @@ +mod bitmatrix; mod bitvec; +mod intmatrix; use tempfile::tempdir; diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index 281d907..6520fe6 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -4,7 +4,10 @@ use std::path::Path; use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; -use obicompactvec::{PersistentCompactIntVec, PersistentCompactIntVecBuilder}; +use obicompactvec::{ + PersistentBitMatrix, PersistentBitMatrixBuilder, + PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, +}; use obikseq::CanonicalKmer; use obiskio::{UnitigFileReader, UnitigFileWriter}; use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; @@ -15,7 +18,8 @@ use crate::evidence::{Evidence, EvidenceWriter}; pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; const EVIDENCE_FILE: &str = "evidence.bin"; -const COUNTS_FILE: &str = "counts.pciv"; +const COUNTS_DIR: &str = "counts"; +const PRESENCE_DIR: &str = "presence"; type Mphf = PtrHash>, Xx64, Vec>; @@ -33,12 +37,20 @@ impl LayerData for () { fn read(&self, _slot: usize) {} } -impl LayerData for PersistentCompactIntVec { - type Item = u32; +impl LayerData for PersistentCompactIntMatrix { + type Item = Box<[u32]>; fn open(layer_dir: &Path) -> OLMResult { - PersistentCompactIntVec::open(&layer_dir.join(COUNTS_FILE)).map_err(OLMError::Io) + PersistentCompactIntMatrix::open(&layer_dir.join(COUNTS_DIR)).map_err(OLMError::Io) } - fn read(&self, slot: usize) -> u32 { self.get(slot) } + fn read(&self, slot: usize) -> Box<[u32]> { self.row(slot) } +} + +impl LayerData for PersistentBitMatrix { + type Item = Box<[bool]>; + fn open(layer_dir: &Path) -> OLMResult { + PersistentBitMatrix::open(&layer_dir.join(PRESENCE_DIR)).map_err(OLMError::Io) + } + fn read(&self, slot: usize) -> Box<[bool]> { self.row(slot) } } // ── Structures ──────────────────────────────────────────────────────────────── @@ -151,27 +163,31 @@ impl Layer<()> { } } -// ── Mode 2 — counts (PersistentCompactIntVec) ───────────────────────────────── +// ── Mode 2 — count matrix (1 column per layer) ──────────────────────────────── -impl Layer { +impl Layer { pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?; let n = unitigs.n_kmers(); + let counts_dir = out_dir.join(COUNTS_DIR); if n == 0 { empty_layer(out_dir)?; - PersistentCompactIntVecBuilder::new(0, &out_dir.join(COUNTS_FILE)) - .and_then(|b| b.close()) + let mut mb = PersistentCompactIntMatrixBuilder::new(0, &counts_dir) .map_err(OLMError::Io)?; + mb.add_col().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?; + mb.close().map_err(OLMError::Io)?; return Ok(0); } let mphf = build_mphf(out_dir, n)?; - let mut cnt = PersistentCompactIntVecBuilder::new(n, &out_dir.join(COUNTS_FILE)) + let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir) .map_err(OLMError::Io)?; + let mut col = mb.add_col().map_err(OLMError::Io)?; build_second_pass(out_dir, n, &mphf, &mut |slot, kmer| { - cnt.set(slot, count_of(kmer)); + col.set(slot, count_of(kmer)); Ok(()) })?; - cnt.close().map_err(OLMError::Io)?; + col.close().map_err(OLMError::Io)?; + mb.close().map_err(OLMError::Io)?; Ok(n) } @@ -183,6 +199,49 @@ impl Layer { } } +// ── Mode 3 — presence/absence matrix (1 column per genome) ─────────────────── + +impl Layer { + pub fn build_presence( + out_dir: &Path, + n_genomes: usize, + present_in: impl Fn(CanonicalKmer, usize) -> bool, + ) -> OLMResult { + let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?; + let n = unitigs.n_kmers(); + let presence_dir = out_dir.join(PRESENCE_DIR); + if n == 0 { + empty_layer(out_dir)?; + let mut mb = PersistentBitMatrixBuilder::new(0, &presence_dir) + .map_err(OLMError::Io)?; + for _ in 0..n_genomes { + mb.add_col().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?; + } + mb.close().map_err(OLMError::Io)?; + return Ok(0); + } + let mphf = build_mphf(out_dir, n)?; + + let mut mb = PersistentBitMatrixBuilder::new(n, &presence_dir).map_err(OLMError::Io)?; + let mut cols: Vec<_> = (0..n_genomes) + .map(|_| mb.add_col().map_err(OLMError::Io)) + .collect::>()?; + + build_second_pass(out_dir, n, &mphf, &mut |slot, kmer| { + for (g, col) in cols.iter_mut().enumerate() { + col.set(slot, present_in(kmer, g)); + } + Ok(()) + })?; + + for col in cols { + col.close().map_err(OLMError::Io)?; + } + mb.close().map_err(OLMError::Io)?; + Ok(n) + } +} + #[cfg(test)] #[path = "tests/layer.rs"] mod tests; diff --git a/src/obilayeredmap/src/map.rs b/src/obilayeredmap/src/map.rs index a2cc43b..e9cb591 100644 --- a/src/obilayeredmap/src/map.rs +++ b/src/obilayeredmap/src/map.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; -use obicompactvec::PersistentCompactIntVec; +use obicompactvec::PersistentCompactIntMatrix; use obikseq::CanonicalKmer; use obiskio::UnitigFileWriter; @@ -96,13 +96,13 @@ impl LayeredMap<()> { } } -// ── Mode 2 — counts ─────────────────────────────────────────────────────────── +// ── Mode 2 — count matrix ───────────────────────────────────────────────────── -impl LayeredMap { +impl LayeredMap { pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - Layer::::build(&dir, count_of)?; + Layer::::build(&dir, count_of)?; self.append_layer()?; Ok(i) } diff --git a/src/obilayeredmap/src/tests/layer.rs b/src/obilayeredmap/src/tests/layer.rs index d32a386..7b0ec32 100644 --- a/src/obilayeredmap/src/tests/layer.rs +++ b/src/obilayeredmap/src/tests/layer.rs @@ -1,5 +1,5 @@ use super::*; -use obicompactvec::PersistentCompactIntVec; +use obicompactvec::PersistentCompactIntMatrix; use obikseq::{set_k, Kmer, Sequence as _, Unitig}; use tempfile::tempdir; @@ -44,14 +44,14 @@ fn counts_are_stored_and_retrieved() { let kmers = all_canonical_kmers(dir.path(), 4); let count_map: HashMap = kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect(); - Layer::::build( + Layer::::build( dir.path(), |kmer| count_map.get(&kmer).copied().unwrap_or(0), ).unwrap(); - let layer = Layer::::open(dir.path()).unwrap(); + let layer = Layer::::open(dir.path()).unwrap(); for kmer in &kmers { let hit = layer.query(*kmer).expect("kmer must be present"); - assert_eq!(hit.data, count_map[kmer]); + assert_eq!(hit.data[0], count_map[kmer]); } } @@ -71,10 +71,10 @@ fn open_after_build_is_consistent() { set_k(4); let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); - let n = Layer::::build(dir.path(), |_| 7).unwrap(); + let n = Layer::::build(dir.path(), |_| 7).unwrap(); assert_eq!(n, 4); - let layer = Layer::::open(dir.path()).unwrap(); + let layer = Layer::::open(dir.path()).unwrap(); let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical(); let hit = layer.query(kmer).expect("AAAA must be present"); - assert_eq!(hit.data, 7); + assert_eq!(hit.data[0], 7); } diff --git a/src/obilayeredmap/src/tests/map.rs b/src/obilayeredmap/src/tests/map.rs index 57e2450..30f9530 100644 --- a/src/obilayeredmap/src/tests/map.rs +++ b/src/obilayeredmap/src/tests/map.rs @@ -1,10 +1,10 @@ use super::*; -use obicompactvec::PersistentCompactIntVec; +use obicompactvec::PersistentCompactIntMatrix; use obikseq::{set_k, Sequence as _, Unitig}; use tempfile::tempdir; fn push_unitigs_and_layer( - map: &mut LayeredMap, + map: &mut LayeredMap, seqs: &[&[u8]], count: u32, ) { @@ -33,10 +33,10 @@ fn open_reloads_layer_count() { set_k(4); let dir = tempdir().unwrap(); { - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path()).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1); } - let map = LayeredMap::::open(dir.path()).unwrap(); + let map = LayeredMap::::open(dir.path()).unwrap(); assert_eq!(map.n_layers(), 1); } @@ -44,37 +44,37 @@ fn open_reloads_layer_count() { fn query_finds_kmer_in_layer_zero() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path()).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3); let kmer = canonical(b"AAAC"); let (layer_idx, hit) = map.query(kmer).expect("kmer must be found"); assert_eq!(layer_idx, 0); - assert_eq!(hit.data, 3); + assert_eq!(hit.data[0], 3); } #[test] fn query_finds_kmer_in_correct_layer() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path()).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1); push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2); assert_eq!(map.n_layers(), 2); let (li, hit) = map.query(canonical(b"AAAA")).expect("AAAA must be found"); assert_eq!(li, 0); - assert_eq!(hit.data, 1); + assert_eq!(hit.data[0], 1); let (li, hit) = map.query(canonical(b"GGGA")).expect("GGGA must be found"); assert_eq!(li, 1); - assert_eq!(hit.data, 2); + assert_eq!(hit.data[0], 2); } #[test] fn query_absent_returns_none() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path()).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1); let absent = canonical(b"CCCC"); assert!(map.query(absent).is_none()); @@ -84,7 +84,7 @@ fn query_absent_returns_none() { fn push_layer_from_map_convenience() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path()).unwrap(); let mut w = map.next_layer_writer().unwrap(); w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap(); w.close().unwrap(); @@ -93,5 +93,5 @@ fn push_layer_from_map_convenience() { ].into_iter().collect(); map.push_layer_from_map(&counts).unwrap(); let (_, hit) = map.query(canonical(b"AAAA")).unwrap(); - assert_eq!(hit.data, 10); + assert_eq!(hit.data[0], 10); }