From 0b3fcf3cf04b0ad3452cb79259e8a41a5e7dd5ea Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 13 May 2026 21:39:08 +0800 Subject: [PATCH] feat: add PersistentBitVec and upgrade PersistentCompactIntVec format Introduces PersistentBitVec, a dense, memory-mapped bit vector optimized for bulk u64-word operations and SIMD acceleration, complete with bitwise operators and Jaccard/Hamming distance metrics. Upgrades PersistentCompactIntVec to a unified .pciv format using 64-bit indices and offsets, consolidating the binary layout and updating builder/reader lifecycles accordingly. Adds corresponding documentation, updates MkDocs navigation, and implements a comprehensive test suite for persistence round-trips, edge cases, and metric accuracy. --- docmd/implementation/persistent_bit_vec.md | 173 +++++++++++++ .../persistent_compact_int_vec.md | 236 +++++++++-------- mkdocs.yml | 1 + src/obicompactvec/src/bitvec.rs | 241 ++++++++++++++++++ src/obicompactvec/src/builder.rs | 93 ++++--- src/obicompactvec/src/format.rs | 45 ++-- src/obicompactvec/src/lib.rs | 2 + src/obicompactvec/src/reader.rs | 134 ++++++++-- src/obicompactvec/src/tests/bitvec.rs | 216 ++++++++++++++++ src/obicompactvec/src/tests/mod.rs | 120 ++++++++- 10 files changed, 1064 insertions(+), 197 deletions(-) create mode 100644 docmd/implementation/persistent_bit_vec.md create mode 100644 src/obicompactvec/src/bitvec.rs create mode 100644 src/obicompactvec/src/tests/bitvec.rs diff --git a/docmd/implementation/persistent_bit_vec.md b/docmd/implementation/persistent_bit_vec.md new file mode 100644 index 0000000..74bd4c7 --- /dev/null +++ b/docmd/implementation/persistent_bit_vec.md @@ -0,0 +1,173 @@ +# PersistentBitVec + +## Purpose + +`PersistentBitVec` stores a dense bit vector (presence/absence per slot) backed by a single mmap'd file. It is the binary counterpart of `PersistentCompactIntVec` and shares the same lifecycle pattern (builder → close → reader). All bulk operations work on u64 words rather than bytes, giving 8× fewer iterations and enabling the compiler to emit POPCNT and SIMD instructions. + +Typical use: converting k-mer count vectors to presence/absence vectors (with optional threshold), then computing set-theoretic distances (Jaccard) or edit distances (Hamming) between samples. + +--- + +## File format + +Single `.pbiv` file. + +``` +offset 0: + magic: [u8; 4] = b"PBIV" + _pad: [u8; 4] = 0 alignment padding + n: u64 number of bits + +offset 16: + data: [u64; ⌈n/64⌉] bit words, LSB-first, zero-padded +``` + +**Header is 16 bytes**, so data starts at an offset divisible by 8. Since `mmap` returns page-aligned memory (≥ 4096-byte aligned), the data slice is u64-aligned, enabling a zero-copy `&[u8] → &[u64]` reinterpretation. + +**Bit layout**: bit `i` is in `data[i >> 6]` at bit position `i & 63` (LSB-first). Bits `[n, ⌈n/64⌉×64)` are **always zero** (padding). This invariant is maintained by all write operations and must be restored by `not()` after flipping. + +**Total file size**: `16 + ⌈n/64⌉ × 8` bytes. + +--- + +## Lifecycle + +### Builder (`PersistentBitVecBuilder`) + +```rust +struct PersistentBitVecBuilder { + mmap: MmapMut, + n: usize, +} +``` + +The file and mmap are created immediately at construction. The header is written once at `new()` or copied from the source at `build_from*()`. `close()` is a single flush — there is no tail to append, unlike `PersistentCompactIntVec`. + +#### Constructors + +**`new(n: usize, path: &Path) -> io::Result`** + +Creates the file, writes the header, zero-extends to `16 + ⌈n/64⌉×8` bytes, mmaps immediately. All bits default to 0. + +**`build_from(source: &PersistentBitVec, path: &Path) -> io::Result`** + +OS-level file copy (no per-bit iteration), then mmap. Initialisation cost: O(file_size). + +**`build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result`** + +Creates a new file, iterates `source` with its merge-scan iterator (O(n)), and writes bits directly into u64 words: + +```rust +// bit i = 1 iff source[i] >= threshold +words[slot >> 6] |= 1u64 << (slot & 63); +``` + +Handles overflow values (≥ 255) transparently — the count iterator returns the true u32 value regardless. + +**`build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result`** + +Shorthand for `build_from_counts(source, 1, path)`. + +#### Bit-level access + +```rust +fn get(&self, slot: u64) -> bool +fn set(&mut self, slot: u64, value: bool) +``` + +Byte-level mmap access: `mmap[16 + slot/8]`, bit `slot % 8`. O(1). + +#### Word-level bulk operations + +All operate on `⌈n/64⌉` u64 words. O(n/64) per call. + +```rust +builder.and(&other); // self[i] &= other[i] for all i +builder.or(&other); // self[i] |= other[i] +builder.xor(&other); // self[i] ^= other[i] +builder.not(); // self[i] = !self[i], then re-zero padding bits +``` + +`and`/`or`/`xor` read `other`'s word slice directly (no allocation). `not()` flips all words then masks the last word's padding bits to restore the invariant. + +#### `close(self) -> io::Result<()>` + +Flushes the mmap. The header was written at construction and is never rewritten. O(1) in Rust code. + +--- + +### Reader (`PersistentBitVec`) + +```rust +struct PersistentBitVec { + mmap: Mmap, + n: usize, + path: PathBuf, +} +``` + +#### `open(path: &Path) -> io::Result` + +Mmaps the file, validates magic, reads `n` from bytes `[8..16]`. O(1). + +#### `get(slot: u64) -> bool` + +Byte-level read from `mmap[16 + slot/8]`. O(1). + +#### `iter() -> BitIter<'_>` + +Sequential scan, byte by byte, yielding `bool` values in slot order. Implements `ExactSizeIterator`. O(n). + +#### Aggregates + +```rust +fn count_ones(&self) -> u64 // popcount over all words; padding bits are 0 +fn count_zeros(&self) -> u64 // n - count_ones() +``` + +`count_ones` iterates `⌈n/64⌉` words and calls `u64::count_ones()` (maps to `POPCNT`). O(n/64). + +#### Distance methods + +Both operate word by word. O(n/64). + +| Method | Formula | Notes | +|---|---|---| +| `jaccard_dist(&other) -> f64` | `1 − |A∩B| / |A∪B|` | `(a&b).count_ones()`, `(a\|b).count_ones()` per word | +| `hamming_dist(&other) -> u64` | number of differing bits | `(a^b).count_ones()` per word | + +Edge case (both all-zero → union = 0): `jaccard_dist` returns 0.0. + +--- + +## Implementation notes + +### u64 word view + +The unsafe cast from `&[u8]` to `&[u64]` is sound because: + +1. `mmap` base is page-aligned (≥ 4096-byte boundary). +2. Data offset = 16, and `16 % 8 == 0` → the data pointer is 8-byte aligned. +3. Data length = `⌈n/64⌉ × 8` bytes — always a multiple of 8. + +This gives zero-copy word-level access with no intermediate allocation. + +### Padding invariant + +Writing `not()` without masking the last word would corrupt `count_ones()`, `hamming_dist()`, and `jaccard_dist()`. The mask applied after flipping is `(1u64 << (n % 64)) - 1` (no-op if `n % 64 == 0`). All other operations (`and`, `or`, `xor`) preserve existing zero padding since they can only clear or preserve bits already set by `not()`. + +--- + +## Complexity + +| Operation | Time | Notes | +|---|---|---| +| `new` / `open` | O(1) | mmap setup + header parse | +| `get` / `set` (builder or reader) | O(1) | byte-level mmap | +| `iter()` | O(n) | byte-by-byte scan | +| `count_ones` / `count_zeros` | O(n/64) | POPCNT per u64 word | +| `and` / `or` / `xor` / `not` | O(n/64) | word-level bitwise ops | +| `jaccard_dist` / `hamming_dist` | O(n/64) | word AND/OR/XOR + POPCNT | +| `build_from` | O(file_size) | OS copy | +| `build_from_counts` / `build_from_presence` | O(n) | count iter + word fill | +| `close` | O(1) | flush only | diff --git a/docmd/implementation/persistent_compact_int_vec.md b/docmd/implementation/persistent_compact_int_vec.md index ba3b4b6..00b5ecf 100644 --- a/docmd/implementation/persistent_compact_int_vec.md +++ b/docmd/implementation/persistent_compact_int_vec.md @@ -2,7 +2,7 @@ ## Purpose -`PersistentCompactIntVec` stores a dense array of non-negative integers indexed by MPHF slot where the vast majority of values are small (0–254) and large values are rare. It is designed for mmap-compatible random access with minimal memory footprint and optimal cache behaviour. +`PersistentCompactIntVec` stores a dense array of non-negative integers indexed by MPHF slot where the vast majority of values are small (0–254) and large values are rare. It is designed for mmap-compatible random and sequential access with minimal memory footprint and optimal cache behaviour. Motivation from observed count distributions in genomics data: 99.9% of k-mer counts fit in a u8; overflow (count ≥ 255) affects ~0.07% of distinct k-mers but can reach values above 10⁶ (chloroplast, ribosomal repeats). @@ -12,8 +12,8 @@ Motivation from observed count distributions in genomics data: 99.9% of k-mer co Two-tier structure: -1. **Primary array** — `[u8; n]`, mmap'd as a flat file. Values 0–254 are stored directly. Value **255 is a sentinel** meaning "look in overflow". -2. **Overflow structure** — sorted list of `(slot: u32, value: u32)` pairs for all slots where the true value ≥ 255, with a **sparse L1-fitting index** for fast lookup. +1. **Primary array** — `[u8; n]`, stored at offset 24 in the PCIV file and mmap'd. Values 0–254 are stored directly. Value **255 is a sentinel** meaning "look in overflow". +2. **Overflow section** — sorted list of `(slot: u32, value: u32)` pairs for all slots where the true value ≥ 255, with a **sparse L1-fitting index** for fast lookup. ``` primary[slot] < 255 → return primary[slot] @@ -22,149 +22,161 @@ primary[slot] == 255 → binary search in overflow --- -## Lifecycle +## Single-file format -The structure has two distinct runtime roles with different APIs. +Everything is stored in a single `.pciv` file. Write order matches computation order: the header placeholder is written first, then primary (known at `new()`), then overflow data and index (known at `close()`), then the header is overwritten at offset 0. + +``` +offset 0: + magic: [u8; 4] = b"PCIV" + n: u64 number of slots + n_overflow: u32 number of overflow entries + step: u32 sparse index step (0 = no index) + n_index: u32 number of index entries + +offset 24: + primary: [u8; n] one byte per slot, 255 = overflow sentinel + +offset 24 + n: + data: [(slot: u32, value: u32); n_overflow] sorted by slot + +offset 24 + n + n_overflow × 8: + index: [(slot: u32, pos: u32); n_index] sparse index +``` + +The index entries point into `data`: `index[i] = (slot of data[i×step], i×step)`. + +--- + +## Lifecycle ### Builder (`PersistentCompactIntVecBuilder`) -Used during layer construction. Holds the primary array and overflow map in memory; supports arbitrary reads and writes before finalisation. +Used during construction. The primary section is **mmap'd immediately** at construction time (both for `new` and `build_from`), so the file exists and is addressable from the start. The overflow is held in a `HashMap` in RAM. ```rust struct PersistentCompactIntVecBuilder { - primary: Vec, // in memory; written to disk at close() - overflow: HashMap, // O(1) get/set for values ≥ 255 + path: PathBuf, + mmap: MmapMut, // primary section live in the file from the start + n: usize, + overflow: HashMap, // values ≥ 255 } ``` -**Phase 1 — `new(n: usize)`** -Allocates `primary` of length `n` initialised to 0. `overflow` is empty. +#### `new(n: usize, path: &Path) -> io::Result` -**Phase 2 — fill (repeated `set` / `get`)** +Creates the file, pre-allocates `HEADER_SIZE + n` zero bytes, mmaps it. The primary is zero-initialised (all slots = 0). Returns immediately ready for `set` / `get`. + +#### `build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result` + +Copies the source PCIV file to `path` (OS-level copy — no per-slot iteration), mmaps the copy, then loads the overflow section into a `HashMap`. Initialisation cost: O(file copy) + O(n_overflow), not O(n). + +At `close()`, the primary section is **not rewritten**: it is already in the file via mmap. Only the overflow data, the sparse index, and the header are updated. + +#### `set(slot: u64, value: u32)` / `get(slot: u64) -> u32` + +Direct mmap byte access for the primary; HashMap for the overflow. Both O(1). Mutations can move a slot between tiers freely (downward mutation removes the HashMap entry; upward mutation adds it). + +#### Element-wise operations — `min`, `max`, `add`, `diff` + +Each takes a `&PersistentCompactIntVec` of equal length and updates `self` in place via `set`: ```rust -fn set(&mut self, slot: u64, value: u32) { - if value < 255 { - self.primary[slot] = value as u8; - self.overflow.remove(&slot); // in case of downward mutation - } else { - self.primary[slot] = 255; // sentinel - self.overflow.insert(slot, value); - } -} - -fn get(&self, slot: u64) -> u32 { - match self.primary[slot] { - 255 => *self.overflow.get(&slot).unwrap(), - v => v as u32, - } -} +builder.min(&other); // self[i] = min(self[i], other[i]) +builder.max(&other); // self[i] = max(self[i], other[i]) +builder.add(&other); // self[i] = self[i].checked_add(other[i]) (panics on u32 overflow) +builder.diff(&other); // self[i] = self[i].saturating_sub(other[i]) ``` -Reads and mutations are both O(1). Overflow entries can be created, updated, or removed freely during this phase. +All iterate `other` with `other.iter()` (merge-scan, O(n_other)). -**Phase 3 — `close(primary_path, overflow_path)`** +#### `close(self) -> io::Result<()>` -1. Write `primary` as raw bytes to `counts_primary.bin`. -2. Collect `overflow` into `Vec<(u32, u32)>`, sort by slot. -3. Compute `step` from `n_overflow` (see below). -4. Build sparse index. -5. Write `counts_overflow.bin`. -6. Drop all in-memory state. - -The `HashMap` is the only extra allocation: bounded by `n_overflow × (8 + 4 + overhead)` bytes, typically a few MB in practice. +1. Flush and drop the mmap (primary changes are now on disk). +2. Sort the overflow HashMap into `Vec<(u32, u32)>`. +3. Truncate the file to `HEADER_SIZE + n` (removes old data+index if `build_from` was used). +4. Append sorted overflow data, then sparse index. +5. Seek to offset 0, overwrite the header with final values. --- ### Reader (`PersistentCompactIntVec`) -Used at query time. Both files are mmap'd; the sparse index is loaded into a `Vec` at open time (≤ 32 KB, L1-resident). +Used at query time. The whole file is mmap'd; only the sparse index is copied into a `Vec` at open time (≤ 32 KB, L1-resident). ```rust struct PersistentCompactIntVec { - primary: Mmap, // mmap of counts_primary.bin - index: Vec<(u32, u32)>, // sparse index, loaded into RAM at open - data: Mmap, // mmap of overflow data region - n_overflow: u32, - step: u32, + mmap: Mmap, + n: usize, + n_overflow: usize, + step: u32, + index: Vec<(u32, u32)>, // L1-resident + primary_offset: usize, // = 24 (HEADER_SIZE) + data_offset: usize, // = 24 + n + path: PathBuf, } ``` -**`open(primary_path, overflow_path)`** -Mmaps both files. Parses the overflow file header; copies the sparse index into a `Vec` (tiny, warm in cache). The data region stays mmap'd. +#### `open(path: &Path) -> io::Result` -**`get(slot: u64) -> u32`** — see Lookup section. +Mmaps the file, parses the 24-byte header, copies the sparse index entries into a `Vec`. The primary and data sections stay mmap'd. ---- - -## Overflow file format +#### `get(slot: u64) -> u32` — random access ``` -magic: [u8; 4] = b"PCIV" -n_overflow: u32 -step: u32 (0 if n_overflow ≤ L1_entries → no sparse index) -[if step > 0] - n_index: u32 = ⌈n_overflow / step⌉ - index: [(slot: u32, pos: u32); n_index] ← loaded into RAM at open -data: [(slot: u32, value: u32); n_overflow] sorted by slot, mmap'd +primary[slot] < 255 → return it directly + +step == 0: + binary_search(data[0..n_overflow], slot) + +step > 0: + i = upper_bound(index[..].slot, slot) − 1 // in L1-resident Vec + binary_search(data[index[i].pos .. index[i+1].pos], slot) ``` -`index[i]` stores the slot value and data-array position of the `i × step`-th overflow entry. +#### `iter() -> Iter<'_>` — sequential scan, O(n) + +Merge-scan: reads primary bytes in order; on sentinel 255, advances a sequential pointer into the sorted data section rather than doing a binary search. This gives O(n + n_overflow) with no random access into the data section. + +`Iter` implements `ExactSizeIterator`. `&PersistentCompactIntVec` implements `IntoIterator`. + +#### Aggregate + +```rust +fn sum(&self) -> u64 // Σ self[i] as u64, via iter() +``` + +#### Distance methods + +All take `&other` of equal length, iterate both with `zip(self.iter(), other.iter())`, and return `f64`. + +| Method | Formula | +|---|---| +| `bray_dist` | `1 − 2·Σmin(aᵢ,bᵢ) / (Σaᵢ + Σbᵢ)` | +| `relfreq_bray_dist` | Bray-Curtis on relative frequencies: `1 − Σmin(pᵢ,qᵢ)` where `pᵢ = aᵢ/Σa` | +| `euclidean_dist` | `√Σ(aᵢ − bᵢ)²` | +| `relfreq_euclidean_dist` | Euclidean on relative frequencies | +| `hellinger_euclidean_dist` | `√Σ(√pᵢ − √qᵢ)²` — Euclidean on sqrt(relfreq) | +| `hellinger_dist` | `hellinger_euclidean_dist / √2` — standard Hellinger distance ∈ [0, 1] | +| `threshold_jaccard_dist(&other, threshold: u32)` | `1 − |A∩B| / |A∪B|` where presence iff count ≥ threshold | +| `jaccard_dist` | `threshold_jaccard_dist(&other, 1)` | + +Edge cases (both vectors all-zero, or union empty for Jaccard): distance = 0.0. --- ## Step computation -The step is chosen at `close()` time, once `n_overflow` is known: +Chosen at `close()` once `n_overflow` is known: ``` -L1_SIZE = 32 * 1024 // 32 KB conservative target -INDEX_ENTRY = 8 // bytes: (u32, u32) -L1_entries = L1_SIZE / INDEX_ENTRY = 4096 +L1_entries = 32 768 / 8 = 4096 -if n_overflow ≤ L1_entries: - step = 0 // no sparse index; data itself fits in a few cache lines -else: - step = ⌈n_overflow / L1_entries⌉ +step = 0 if n_overflow ≤ 4096 +step = ⌈n_overflow / 4096⌉ otherwise ``` -For the Betula nana reference (359 044 overflows): step = 88, index = 4 080 entries = 31.9 KB. - ---- - -## Lookup - -``` -fn get(slot: u64) -> u32: - if primary[slot] < 255: - return primary[slot] as u32 - - if step == 0: - return binary_search(data[0..n_overflow], slot) - - // 1. binary search in index (Vec, L1-resident) - i = upper_bound(index[..].slot, slot) - 1 - pos_start = index[i].pos - pos_end = if i+1 < n_index { index[i+1].pos } else { n_overflow } - - // 2. binary search in contiguous block (mmap'd) - return binary_search(data[pos_start..pos_end], slot) -``` - -Cache behaviour: step 1 is entirely within the L1-resident `Vec<(u32,u32)>`; step 2 loads a contiguous block of ≤ `step × 8` bytes from the mmap. - ---- - -## Files - -``` -layer_N/ - counts_primary.bin — [u8; n_slots], raw bytes - counts_overflow.bin — PCIV header + sparse index + sorted data - (absent if n_overflow == 0) -``` - -If `counts_overflow.bin` is absent, no slot has value ≥ 255; all reads go directly to the primary array. +For the Betula nana reference (359 044 overflows): step = 88, n_index = 4 080 entries = 31.9 KB. --- @@ -172,15 +184,13 @@ If `counts_overflow.bin` is absent, no slot has value ≥ 255; all reads go dire | Operation | Time | Notes | |---|---|---| -| `set` / `get` (builder) | O(1) | HashMap for overflow | -| `get` (no overflow) | O(1) | single byte read | -| `get` (overflow, with index) | O(log step) | ~2 memory regions | -| `get` (overflow, no index) | O(log n_overflow) | data fits in a few cache lines | -| `close` | O(n_overflow log n_overflow) | sort + index build | +| `set` / `get` (builder) | O(1) | mmap byte + HashMap | +| `get` (reader, no overflow) | O(1) | single mmap byte | +| `get` (reader, with index) | O(log step) | ≤ 2 memory regions | +| `get` (reader, no index) | O(log n_overflow) | data fits in a few cache lines | +| `iter()` full scan | O(n + n_overflow) | merge-scan, no binary search | +| `sum`, distances | O(n) | via `iter()` / `zip(iter(), iter())` | +| `min` / `max` / `add` / `diff` | O(n) | via `other.iter()` + builder `set` | +| `close` | O(n_overflow log n_overflow) | sort + sequential write | | `open` | O(n_index) | index copy into Vec | - ---- - -## Generalisation - -The sentinel (255) and primary type (u8) are fixed. The overflow value type is u32, sufficient for any realistic k-mer count. For a count matrix (mode 4), one `PersistentCompactIntVec` per genome column shares the primary array layout. +| `build_from` | O(file_size) + O(n_overflow) | OS copy + HashMap load | diff --git a/mkdocs.yml b/mkdocs.yml index c4aac85..43fd737 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,6 +46,7 @@ nav: - Unitig evidence encoding: implementation/unitig_evidence.md - obilayeredmap crate: implementation/obilayeredmap.md - PersistentCompactIntVec: implementation/persistent_compact_int_vec.md + - PersistentBitVec: implementation/persistent_bit_vec.md - Architecture: - Sequences: architecture/sequences/invariant.md diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs new file mode 100644 index 0000000..4afdc5f --- /dev/null +++ b/src/obicompactvec/src/bitvec.rs @@ -0,0 +1,241 @@ +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Seek, SeekFrom, Write as _}; +use std::path::{Path, PathBuf}; + +use memmap2::{Mmap, MmapMut}; + +use crate::reader::PersistentCompactIntVec; + +const MAGIC: [u8; 4] = *b"PBIV"; + +// Header: magic(4) + _pad(4) + n(8) = 16 bytes. +// Data starts at offset 16, which is divisible by 8 → u64-aligned +// (mmap base is page-aligned, 16 % 8 == 0). +const HEADER_SIZE: usize = 16; + +#[inline] +fn n_words(n: usize) -> usize { n.div_ceil(64) } + +#[inline] +fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 } + +// ── Reader ──────────────────────────────────────────────────────────────────── + +pub struct PersistentBitVec { + mmap: Mmap, + n: usize, + path: PathBuf, +} + +impl PersistentBitVec { + pub fn open(path: &Path) -> io::Result { + let mmap = unsafe { Mmap::map(&File::open(path)?)? }; + if mmap.len() < HEADER_SIZE { + return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short")); + } + if &mmap[0..4] != &MAGIC { + return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic")); + } + let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; + Ok(Self { mmap, n, path: path.to_path_buf() }) + } + + pub fn path(&self) -> &Path { &self.path } + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + pub fn get(&self, slot: usize) -> bool { + (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 + } + + // Used by iter() and get(): exact byte window, no padding. + fn data_bytes(&self) -> &[u8] { + &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n.div_ceil(8)] + } + + // Bulk word view. SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8, + // so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes. + fn data_words(&self) -> &[u64] { + let nw = n_words(self.n); + let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; + unsafe { std::slice::from_raw_parts(ptr, nw) } + } + + pub fn count_ones(&self) -> u64 { + // Padding bits in the last word are 0, so no masking needed. + self.data_words().iter().map(|w| w.count_ones() as u64).sum() + } + + pub fn count_zeros(&self) -> u64 { + self.n as u64 - self.count_ones() + } + + pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + let (inter, union) = self.data_words().iter().zip(other.data_words()).fold( + (0u64, 0u64), + |(i, u), (&a, &b)| (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64), + ); + if union == 0 { return 0.0; } + 1.0 - inter as f64 / union as f64 + } + + pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.data_words().iter().zip(other.data_words()) + .map(|(&a, &b)| (a ^ b).count_ones() as u64) + .sum() + } + + pub fn iter(&self) -> BitIter<'_> { + BitIter { bytes: self.data_bytes(), slot: 0, n: self.n } + } +} + +impl<'a> IntoIterator for &'a PersistentBitVec { + type Item = bool; + type IntoIter = BitIter<'a>; + fn into_iter(self) -> BitIter<'a> { self.iter() } +} + +pub struct BitIter<'a> { + bytes: &'a [u8], + slot: usize, + n: usize, +} + +impl ExactSizeIterator for BitIter<'_> {} + +impl Iterator for BitIter<'_> { + type Item = bool; + + fn next(&mut self) -> Option { + if self.slot >= self.n { return None; } + let v = (self.bytes[self.slot >> 3] >> (self.slot & 7)) & 1 != 0; + self.slot += 1; + Some(v) + } + + fn size_hint(&self) -> (usize, Option) { + let rem = self.n - self.slot; + (rem, Some(rem)) + } +} + +// ── Builder ─────────────────────────────────────────────────────────────────── + +pub struct PersistentBitVecBuilder { + mmap: MmapMut, + n: usize, +} + +impl PersistentBitVecBuilder { + pub fn new(n: usize, path: &Path) -> io::Result { + let file_size = HEADER_SIZE + n_bytes_for_words(n); + let mut file = OpenOptions::new() + .read(true).write(true).create(true).truncate(true) + .open(path)?; + file.write_all(&MAGIC)?; + file.write_all(&[0u8; 4])?; // padding + file.write_all(&(n as u64).to_le_bytes())?; + file.seek(SeekFrom::Start(0))?; + file.set_len(file_size as u64)?; + let mmap = unsafe { MmapMut::map_mut(&file)? }; + Ok(Self { mmap, n }) + } + + pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result { + fs::copy(source.path(), path)?; + let file = OpenOptions::new().read(true).write(true).open(path)?; + let mmap = unsafe { MmapMut::map_mut(&file)? }; + let n = source.len(); + Ok(Self { mmap, n }) + } + + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + pub fn get(&self, slot: usize) -> bool { + (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 + } + + pub fn set(&mut self, slot: usize, value: bool) { + let byte = HEADER_SIZE + (slot >> 3); + let bit = 1u8 << (slot & 7); + if value { self.mmap[byte] |= bit; } else { self.mmap[byte] &= !bit; } + } + + // SAFETY: same alignment argument as PersistentBitVec::data_words. + fn data_words_mut(&mut self) -> &mut [u64] { + let nw = n_words(self.n); + let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; + unsafe { std::slice::from_raw_parts_mut(ptr, nw) } + } + + pub fn and(&mut self, other: &PersistentBitVec) { + assert_eq!(self.n, other.n, "length mismatch"); + for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { *sw &= ow; } + } + + pub fn or(&mut self, other: &PersistentBitVec) { + assert_eq!(self.n, other.n, "length mismatch"); + for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { *sw |= ow; } + } + + pub fn xor(&mut self, other: &PersistentBitVec) { + assert_eq!(self.n, other.n, "length mismatch"); + for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { *sw ^= ow; } + } + + pub fn not(&mut self) { + let rem = self.n % 64; + let words = self.data_words_mut(); + for w in words.iter_mut() { *w ^= u64::MAX; } + // Zero padding bits in the last word so count_ones / jaccard remain correct. + if rem != 0 { + if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; } + } + } + + /// Convert a count vector to a bit vector: bit set iff count >= threshold. + /// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead. + pub fn build_from_counts( + source: &PersistentCompactIntVec, + threshold: u32, + path: &Path, + ) -> io::Result { + let n = source.len(); + let file_size = HEADER_SIZE + n_bytes_for_words(n); + let mut file = OpenOptions::new() + .read(true).write(true).create(true).truncate(true) + .open(path)?; + file.write_all(&MAGIC)?; + file.write_all(&[0u8; 4])?; + file.write_all(&(n as u64).to_le_bytes())?; + file.seek(SeekFrom::Start(0))?; + file.set_len(file_size as u64)?; + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + + { + let nw = n_words(n); + let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; + let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) }; + for (slot, count) in source.iter().enumerate() { + if count >= threshold { + words[slot >> 6] |= 1u64 << (slot & 63); + } + } + } + + Ok(Self { mmap, n }) + } + + /// Convert a count vector to a presence/absence bit vector (threshold = 1). + pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result { + Self::build_from_counts(source, 1, path) + } + + pub fn close(self) -> io::Result<()> { + self.mmap.flush() + } +} diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs index afb18d0..32d711f 100644 --- a/src/obicompactvec/src/builder.rs +++ b/src/obicompactvec/src/builder.rs @@ -5,23 +5,33 @@ use std::path::{Path, PathBuf}; use memmap2::MmapMut; -use crate::format::{finalize_pciv, HEADER_SIZE}; +use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, finalize_pciv}; use crate::reader::PersistentCompactIntVec; pub struct PersistentCompactIntVecBuilder { - path: PathBuf, - mmap: MmapMut, - n: usize, - overflow: HashMap, + path: PathBuf, + mmap: MmapMut, + n: usize, + overflow: HashMap, } impl PersistentCompactIntVecBuilder { /// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately. pub fn new(n: usize, path: &Path) -> io::Result { - let file = OpenOptions::new().read(true).write(true).create(true).truncate(true).open(path)?; + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(path)?; file.set_len((HEADER_SIZE + n) as u64)?; let mmap = unsafe { MmapMut::map_mut(&file)? }; - Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() }) + Ok(Self { + path: path.to_path_buf(), + mmap, + n, + overflow: HashMap::new(), + }) } /// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM. @@ -32,47 +42,60 @@ impl PersistentCompactIntVecBuilder { let file = OpenOptions::new().read(true).write(true).open(path)?; let mmap = unsafe { MmapMut::map_mut(&file)? }; - let n = source.len(); - let n_overflow = u32::from_le_bytes(mmap[12..16].try_into().unwrap()) as usize; + let n = source.len(); + let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize; let data_offset = HEADER_SIZE + n; let mut overflow = HashMap::with_capacity(n_overflow); for i in 0..n_overflow { - let off = data_offset + i * 8; - let slot = u32::from_le_bytes(mmap[off..off + 4].try_into().unwrap()) as u64; - let value = u32::from_le_bytes(mmap[off + 4..off + 8].try_into().unwrap()); + let off = data_offset + i * OVERFLOW_ENTRY_SIZE; + let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize; + let value = u32::from_le_bytes(mmap[off + 8..off + 12].try_into().unwrap()); overflow.insert(slot, value); } - Ok(Self { path: path.to_path_buf(), mmap, n, overflow }) + Ok(Self { + path: path.to_path_buf(), + mmap, + n, + overflow, + }) } - pub fn get(&self, slot: u64) -> u32 { - match self.mmap[HEADER_SIZE + slot as usize] { - 255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"), - v => v as u32, + /// Get the value at the given slot, handling overflow if necessary. + pub fn get(&self, slot: usize) -> u32 { + match self.mmap[HEADER_SIZE + slot] { + 255 => *self + .overflow + .get(&slot) + .expect("sentinel without overflow entry"), + v => v as u32, } } - pub fn set(&mut self, slot: u64, value: u32) { + pub fn set(&mut self, slot: usize, value: u32) { if value < 255 { - self.mmap[HEADER_SIZE + slot as usize] = value as u8; + self.mmap[HEADER_SIZE + slot] = value as u8; self.overflow.remove(&slot); } else { - self.mmap[HEADER_SIZE + slot as usize] = 255; + self.mmap[HEADER_SIZE + slot] = 255; self.overflow.insert(slot, value); } } - pub fn len(&self) -> usize { self.n } + pub fn len(&self) -> usize { + self.n + } - pub fn is_empty(&self) -> bool { self.n == 0 } + pub fn is_empty(&self) -> bool { + self.n == 0 + } pub fn min(&mut self, other: &PersistentCompactIntVec) { assert_eq!(self.n, other.len(), "length mismatch"); for (slot, other_val) in other.iter().enumerate() { - if other_val < self.get(slot as u64) { - self.set(slot as u64, other_val); + if other_val < self.get(slot) { + self.set(slot, other_val); } } } @@ -80,8 +103,8 @@ impl PersistentCompactIntVecBuilder { pub fn max(&mut self, other: &PersistentCompactIntVec) { assert_eq!(self.n, other.len(), "length mismatch"); for (slot, other_val) in other.iter().enumerate() { - if other_val > self.get(slot as u64) { - self.set(slot as u64, other_val); + if other_val > self.get(slot) { + self.set(slot, other_val); } } } @@ -89,28 +112,30 @@ impl PersistentCompactIntVecBuilder { pub fn add(&mut self, other: &PersistentCompactIntVec) { assert_eq!(self.n, other.len(), "length mismatch"); for (slot, other_val) in other.iter().enumerate() { - let cur = self.get(slot as u64); - self.set(slot as u64, cur.checked_add(other_val).expect("u32 overflow in add")); + let cur = self.get(slot); + self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add")); } } pub fn diff(&mut self, other: &PersistentCompactIntVec) { assert_eq!(self.n, other.len(), "length mismatch"); for (slot, other_val) in other.iter().enumerate() { - self.set(slot as u64, self.get(slot as u64).saturating_sub(other_val)); + self.set(slot, self.get(slot).saturating_sub(other_val)); } } /// Flush the primary mmap, then write sorted overflow data + index and fix the header. pub fn close(self) -> io::Result<()> { self.mmap.flush()?; - let Self { path, mmap, n, overflow } = self; + let Self { + path, + mmap, + n, + overflow, + } = self; drop(mmap); - let mut entries: Vec<(u32, u32)> = overflow - .into_iter() - .map(|(slot, val)| (slot as u32, val)) - .collect(); + let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect(); entries.sort_unstable_by_key(|&(slot, _)| slot); finalize_pciv(&path, n, &entries) diff --git a/src/obicompactvec/src/format.rs b/src/obicompactvec/src/format.rs index 4931c7c..08f0079 100644 --- a/src/obicompactvec/src/format.rs +++ b/src/obicompactvec/src/format.rs @@ -3,18 +3,26 @@ use std::io::{self, BufWriter, Seek, SeekFrom, Write as _}; use std::path::Path; pub const MAGIC: [u8; 4] = *b"PCIV"; -pub const HEADER_SIZE: usize = 24; // magic(4) + n(8) + n_overflow(4) + step(4) + n_index(4) -// Sparse index target: fits in 32 KB L1 cache / 8 bytes per entry. -pub const L1_INDEX_ENTRIES: usize = 4096; +// magic(4) + _pad(4) + n(8) + n_overflow(8) + n_index(8) + step(8) +pub const HEADER_SIZE: usize = 40; + +// Overflow entry: slot(u64) + value(u32) = 12 bytes. +pub const OVERFLOW_ENTRY_SIZE: usize = 12; + +// Index entry: slot(u64) + pos(u64) = 16 bytes. +pub const INDEX_ENTRY_SIZE: usize = 16; + +// Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries). +pub const L1_INDEX_ENTRIES: usize = 2048; /// Finalise a PCIV file whose placeholder header and primary section are already on disk. /// /// Truncates the file to `HEADER_SIZE + n`, then appends: -/// data n_overflow × 8 B (slot: u32, value: u32) sorted by slot -/// index n_index × 8 B (slot: u32, pos: u32) sparse index -/// and overwrites the header placeholder at offset 0. -pub fn finalize_pciv(path: &Path, n: usize, entries: &[(u32, u32)]) -> io::Result<()> { +/// overflow n_overflow × 12 B (slot: u64, value: u32) sorted by slot +/// index n_index × 16 B (slot: u64, pos: u64) sparse index +/// and overwrites the placeholder header at offset 0. +pub fn finalize_pciv(path: &Path, n: usize, entries: &[(usize, u32)]) -> io::Result<()> { let n_overflow = entries.len(); let file = OpenOptions::new().read(true).write(true).open(path)?; @@ -24,21 +32,21 @@ pub fn finalize_pciv(path: &Path, n: usize, entries: &[(u32, u32)]) -> io::Resul w.seek(SeekFrom::End(0))?; for &(slot, value) in entries { - w.write_all(&slot.to_le_bytes())?; + w.write_all(&(slot as u64).to_le_bytes())?; w.write_all(&value.to_le_bytes())?; } - let step: u32 = if n_overflow <= L1_INDEX_ENTRIES { + let step: usize = if n_overflow <= L1_INDEX_ENTRIES { 0 } else { - n_overflow.div_ceil(L1_INDEX_ENTRIES) as u32 + n_overflow.div_ceil(L1_INDEX_ENTRIES) }; - let n_index: u32 = if step > 0 { - let count = n_overflow.div_ceil(step as usize) as u32; - for (block, chunk) in entries.chunks(step as usize).enumerate() { - let slot = chunk[0].0; - let pos = (block * step as usize) as u32; + let n_index: usize = if step > 0 { + let count = n_overflow.div_ceil(step); + for (block, chunk) in entries.chunks(step).enumerate() { + let slot = chunk[0].0 as u64; + let pos = (block * step) as u64; w.write_all(&slot.to_le_bytes())?; w.write_all(&pos.to_le_bytes())?; } @@ -51,9 +59,10 @@ pub fn finalize_pciv(path: &Path, n: usize, entries: &[(u32, u32)]) -> io::Resul let mut file = w.into_inner().map_err(|e| e.into_error())?; file.seek(SeekFrom::Start(0))?; file.write_all(&MAGIC)?; + file.write_all(&[0u8; 4])?; file.write_all(&(n as u64).to_le_bytes())?; - file.write_all(&(n_overflow as u32).to_le_bytes())?; - file.write_all(&step.to_le_bytes())?; - file.write_all(&n_index.to_le_bytes())?; + file.write_all(&(n_overflow as u64).to_le_bytes())?; + file.write_all(&(n_index as u64).to_le_bytes())?; + file.write_all(&(step as u64).to_le_bytes())?; file.flush() } diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index e91eaa6..73ec076 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -1,7 +1,9 @@ +mod bitvec; mod builder; mod format; mod reader; +pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder}; pub use builder::PersistentCompactIntVecBuilder; pub use reader::PersistentCompactIntVec; diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index 0680066..a3fc606 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -4,16 +4,16 @@ use std::path::{Path, PathBuf}; use memmap2::Mmap; -use crate::format::{HEADER_SIZE, MAGIC}; +use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE}; pub struct PersistentCompactIntVec { mmap: Mmap, n: usize, n_overflow: usize, - pub step: u32, - index: Vec<(u32, u32)>, // (slot, pos) — L1-resident sparse index - primary_offset: usize, // = HEADER_SIZE - data_offset: usize, // = HEADER_SIZE + n + pub step: usize, + index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index + primary_offset: usize, // = HEADER_SIZE + data_offset: usize, // = HEADER_SIZE + n path: PathBuf, } @@ -28,20 +28,20 @@ impl PersistentCompactIntVec { return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic")); } - let n = u64::from_le_bytes(mmap[4..12].try_into().unwrap()) as usize; - let n_overflow = u32::from_le_bytes(mmap[12..16].try_into().unwrap()) as usize; - let step = u32::from_le_bytes(mmap[16..20].try_into().unwrap()); - let n_index = u32::from_le_bytes(mmap[20..24].try_into().unwrap()) as usize; + let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; + let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize; + let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize; + let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize; let primary_offset = HEADER_SIZE; let data_offset = primary_offset + n; - let index_offset = data_offset + n_overflow * 8; + let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE; let mut index = Vec::with_capacity(n_index); for i in 0..n_index { - let off = index_offset + i * 8; - let slot = u32::from_le_bytes(mmap[off..off + 4].try_into().unwrap()); - let pos = u32::from_le_bytes(mmap[off + 4..off + 8].try_into().unwrap()); + let off = index_offset + i * INDEX_ENTRY_SIZE; + let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize; + let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize; index.push((slot, pos)); } @@ -69,14 +69,14 @@ impl PersistentCompactIntVec { self.n == 0 } - pub fn get(&self, slot: u64) -> u32 { - match self.mmap[self.primary_offset + slot as usize] { - 255 => self.overflow_get(slot as u32), + pub fn get(&self, slot: usize) -> u32 { + match self.mmap[self.primary_offset + slot] { + 255 => self.overflow_get(slot), v => v as u32, } } - fn overflow_get(&self, slot: u32) -> u32 { + fn overflow_get(&self, slot: usize) -> u32 { let pos_start; let pos_end; @@ -85,9 +85,9 @@ impl PersistentCompactIntVec { pos_end = self.n_overflow; } else { let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); - pos_start = self.index[i].1 as usize; + pos_start = self.index[i].1; pos_end = if i + 1 < self.index.len() { - self.index[i + 1].1 as usize + self.index[i + 1].1 } else { self.n_overflow }; @@ -107,14 +107,14 @@ impl PersistentCompactIntVec { } #[inline] - fn data_slot(&self, i: usize) -> u32 { - let off = self.data_offset + i * 8; - u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) + fn data_slot(&self, i: usize) -> usize { + let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE; + u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize } #[inline] fn data_value(&self, i: usize) -> u32 { - let off = self.data_offset + i * 8 + 4; + let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8; u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) } @@ -122,6 +122,94 @@ impl PersistentCompactIntVec { self.iter().map(|v| v as u64).sum() } + pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + let (sum_min, sum_a, sum_b) = self.iter().zip(other.iter()).fold( + (0u64, 0u64, 0u64), + |(sm, sa, sb), (a, b)| (sm + a.min(b) as u64, sa + a as u64, sb + b as u64), + ); + let denom = sum_a + sum_b; + if denom == 0 { return 0.0; } + 1.0 - 2.0 * sum_min as f64 / denom as f64 + } + + pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + let sum_a = self.sum() as f64; + let sum_b = other.sum() as f64; + if sum_a == 0.0 && sum_b == 0.0 { return 0.0; } + let sum_min: f64 = self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; + let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; + pa.min(pb) + }) + .sum(); + 1.0 - sum_min + } + + pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + let sq: f64 = self.iter().zip(other.iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) + .sum(); + sq.sqrt() + } + + pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + let sum_a = self.sum() as f64; + let sum_b = other.sum() as f64; + if sum_a == 0.0 && sum_b == 0.0 { return 0.0; } + let sq: f64 = self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; + let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; + let d = pa - pb; + d * d + }) + .sum(); + sq.sqrt() + } + + pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + let sum_a = self.sum() as f64; + let sum_b = other.sum() as f64; + if sum_a == 0.0 && sum_b == 0.0 { return 0.0; } + let sq: f64 = self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 }; + let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 }; + let d = pa - pb; + d * d + }) + .sum(); + sq.sqrt() + } + + pub fn hellinger_dist(&self, other: &PersistentCompactIntVec) -> f64 { + self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2 + } + + pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 { + assert_eq!(self.n, other.len(), "length mismatch"); + let (intersection, union) = self.iter().zip(other.iter()).fold( + (0u64, 0u64), + |(inter, uni), (a, b)| { + let ap = a >= threshold; + let bp = b >= threshold; + (inter + (ap & bp) as u64, uni + (ap | bp) as u64) + }, + ); + if union == 0 { return 0.0; } + 1.0 - intersection as f64 / union as f64 + } + + pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 { + self.threshold_jaccard_dist(other, 1) + } + pub fn iter(&self) -> Iter<'_> { Iter { pciv: self, slot: 0, overflow_pos: 0 } } diff --git a/src/obicompactvec/src/tests/bitvec.rs b/src/obicompactvec/src/tests/bitvec.rs new file mode 100644 index 0000000..6b20568 --- /dev/null +++ b/src/obicompactvec/src/tests/bitvec.rs @@ -0,0 +1,216 @@ +use tempfile::tempdir; + +use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder}; + +fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) { + let dir = tempdir().unwrap(); + let path = dir.path().join("v.pbiv"); + let mut b = PersistentBitVecBuilder::new(bits.len(), &path).unwrap(); + for (i, &v) in bits.iter().enumerate() { + b.set(i, v); + } + b.close().unwrap(); + let r = PersistentBitVec::open(&path).unwrap(); + (dir, r) +} + +#[test] +fn all_false_by_default() { + let dir = tempdir().unwrap(); + let path = dir.path().join("v.pbiv"); + let b = PersistentBitVecBuilder::new(16, &path).unwrap(); + b.close().unwrap(); + let r = PersistentBitVec::open(&path).unwrap(); + assert!(r.iter().all(|v| !v)); + assert_eq!(r.count_ones(), 0); +} + +#[test] +fn set_get_roundtrip() { + let bits = vec![true, false, true, true, false, false, true, false, true]; + let (_dir, r) = make_bv(&bits); + for (i, &expected) in bits.iter().enumerate() { + assert_eq!(r.get(i), expected, "slot {i}"); + } +} + +#[test] +fn iter_matches_get() { + let bits = vec![true, false, true, false, true, true, false, true, false, true]; + let (_dir, r) = make_bv(&bits); + let via_iter: Vec = r.iter().collect(); + let via_get: Vec = (0..bits.len()).map(|s| r.get(s)).collect(); + assert_eq!(via_iter, via_get); +} + +#[test] +fn count_ones_and_zeros() { + let bits = vec![true, false, true, true, false]; + let (_dir, r) = make_bv(&bits); + assert_eq!(r.count_ones(), 3); + assert_eq!(r.count_zeros(), 2); +} + +#[test] +fn non_byte_aligned_length() { + // 13 bits — last byte has 5 padding zeros + let bits: Vec = (0..13).map(|i| i % 3 == 0).collect(); + let (_dir, r) = make_bv(&bits); + assert_eq!(r.iter().collect::>(), bits); +} + +#[test] +fn build_from_roundtrip() { + let bits = vec![true, false, true, false, true, true]; + let (_da, ra) = make_bv(&bits); + let dir_b = tempdir().unwrap(); + let path_b = dir_b.path().join("b.pbiv"); + PersistentBitVecBuilder::build_from(&ra, &path_b).unwrap().close().unwrap(); + let rb = PersistentBitVec::open(&path_b).unwrap(); + assert_eq!(ra.iter().collect::>(), rb.iter().collect::>()); +} + +#[test] +fn op_and() { + let (_da, ra) = make_bv(&[true, true, false, false]); + let (_db, rb) = make_bv(&[true, false, true, false]); + let dir = tempdir().unwrap(); + let path = dir.path().join("out.pbiv"); + let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); + b.and(&rb); + b.close().unwrap(); + let r = PersistentBitVec::open(&path).unwrap(); + assert_eq!(r.iter().collect::>(), vec![true, false, false, false]); +} + +#[test] +fn op_or() { + let (_da, ra) = make_bv(&[true, true, false, false]); + let (_db, rb) = make_bv(&[true, false, true, false]); + let dir = tempdir().unwrap(); + let path = dir.path().join("out.pbiv"); + let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); + b.or(&rb); + b.close().unwrap(); + let r = PersistentBitVec::open(&path).unwrap(); + assert_eq!(r.iter().collect::>(), vec![true, true, true, false]); +} + +#[test] +fn op_xor() { + let (_da, ra) = make_bv(&[true, true, false, false]); + let (_db, rb) = make_bv(&[true, false, true, false]); + let dir = tempdir().unwrap(); + let path = dir.path().join("out.pbiv"); + let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); + b.xor(&rb); + b.close().unwrap(); + let r = PersistentBitVec::open(&path).unwrap(); + assert_eq!(r.iter().collect::>(), vec![false, true, true, false]); +} + +#[test] +fn op_not_byte_aligned() { + let bits = vec![true, false, true, false, true, false, false, true]; // 8 bits, no padding + let (_da, ra) = make_bv(&bits); + let dir = tempdir().unwrap(); + let path = dir.path().join("out.pbiv"); + let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); + b.not(); + b.close().unwrap(); + let r = PersistentBitVec::open(&path).unwrap(); + let expected: Vec = bits.iter().map(|&v| !v).collect(); + assert_eq!(r.iter().collect::>(), expected); +} + +#[test] +fn op_not_non_aligned_no_padding_leak() { + // 5 bits: [T,F,T,F,T] → NOT → [F,T,F,T,F]; padding bits must stay 0 + let bits = vec![true, false, true, false, true]; + let (_da, ra) = make_bv(&bits); + let dir = tempdir().unwrap(); + let path = dir.path().join("out.pbiv"); + let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); + b.not(); + b.close().unwrap(); + let r = PersistentBitVec::open(&path).unwrap(); + assert_eq!(r.iter().collect::>(), vec![false, true, false, true, false]); + // count_ones must reflect exactly the 2 set bits, not 2 + stray padding bits + assert_eq!(r.count_ones(), 2); +} + +#[test] +fn jaccard_dist_basic() { + // A={0,2} B={0,1} → inter=1 union=3 → dist=2/3 + let (_da, ra) = make_bv(&[true, false, true, false]); + let (_db, rb) = make_bv(&[true, true, false, false]); + let d = ra.jaccard_dist(&rb); + assert!((d - 2.0 / 3.0).abs() < 1e-12, "got {d}"); +} + +#[test] +fn jaccard_dist_identical() { + let (_d, r) = make_bv(&[true, false, true]); + assert_eq!(r.jaccard_dist(&r), 0.0); +} + +#[test] +fn jaccard_dist_disjoint() { + let (_da, ra) = make_bv(&[true, false]); + let (_db, rb) = make_bv(&[false, true]); + assert_eq!(ra.jaccard_dist(&rb), 1.0); +} + +fn make_pciv(counts: &[u32]) -> (tempfile::TempDir, PersistentCompactIntVec) { + let dir = tempdir().unwrap(); + let path = dir.path().join("c.pciv"); + let mut b = PersistentCompactIntVecBuilder::new(counts.len(), &path).unwrap(); + for (i, &v) in counts.iter().enumerate() { b.set(i, v); } + b.close().unwrap(); + let r = PersistentCompactIntVec::open(&path).unwrap(); + (dir, r) +} + +#[test] +fn build_from_presence_basic() { + let counts = vec![0u32, 3, 0, 7, 1, 0]; + let (_dc, rc) = make_pciv(&counts); + let dir = tempdir().unwrap(); + let path = dir.path().join("bv.pbiv"); + PersistentBitVecBuilder::build_from_presence(&rc, &path).unwrap().close().unwrap(); + let bv = PersistentBitVec::open(&path).unwrap(); + assert_eq!(bv.iter().collect::>(), vec![false, true, false, true, true, false]); + assert_eq!(bv.count_ones(), 3); +} + +#[test] +fn build_from_counts_with_threshold() { + // threshold=5: present if count >= 5 + let counts = vec![0u32, 3, 5, 10, 4, 6]; + let (_dc, rc) = make_pciv(&counts); + let dir = tempdir().unwrap(); + let path = dir.path().join("bv.pbiv"); + PersistentBitVecBuilder::build_from_counts(&rc, 5, &path).unwrap().close().unwrap(); + let bv = PersistentBitVec::open(&path).unwrap(); + assert_eq!(bv.iter().collect::>(), vec![false, false, true, true, false, true]); +} + +#[test] +fn build_from_counts_with_overflow_values() { + // overflow value (>= 255) must pass threshold + let counts = vec![0u32, 100, 1_000_000, 50]; + let (_dc, rc) = make_pciv(&counts); + let dir = tempdir().unwrap(); + let path = dir.path().join("bv.pbiv"); + PersistentBitVecBuilder::build_from_counts(&rc, 200, &path).unwrap().close().unwrap(); + let bv = PersistentBitVec::open(&path).unwrap(); + assert_eq!(bv.iter().collect::>(), vec![false, false, true, false]); +} + +#[test] +fn hamming_dist_basic() { + // differ at positions 1 and 2 + let (_da, ra) = make_bv(&[true, true, false, true]); + let (_db, rb) = make_bv(&[true, false, true, true]); + assert_eq!(ra.hamming_dist(&rb), 2); +} diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs index 3a64fd7..0eb8adb 100644 --- a/src/obicompactvec/src/tests/mod.rs +++ b/src/obicompactvec/src/tests/mod.rs @@ -1,8 +1,10 @@ +mod bitvec; + use tempfile::tempdir; use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder}; -fn roundtrip(values: &[(u64, u32)], n: usize) -> Vec { +fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec { let dir = tempdir().unwrap(); let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(n, &path).unwrap(); @@ -11,7 +13,7 @@ fn roundtrip(values: &[(u64, u32)], n: usize) -> Vec { } b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); - (0..n as u64).map(|s| r.get(s)).collect() + (0..n).map(|s| r.get(s)).collect() } fn make_pciv(values: &[u32]) -> (tempfile::TempDir, PersistentCompactIntVec) { @@ -19,7 +21,7 @@ fn make_pciv(values: &[u32]) -> (tempfile::TempDir, PersistentCompactIntVec) { let path = dir.path().join("v.pciv"); let mut b = PersistentCompactIntVecBuilder::new(values.len(), &path).unwrap(); for (i, &v) in values.iter().enumerate() { - b.set(i as u64, v); + b.set(i, v); } b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); @@ -91,19 +93,19 @@ fn sparse_index_built_for_many_overflows() { let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(n, &path).unwrap(); for i in 0..n { - b.set(i as u64, 1000 + i as u32); + b.set(i, 1000 + i as u32); } b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert!(r.step > 0, "sparse index should have been built"); for i in 0..n { - assert_eq!(r.get(i as u64), 1000 + i as u32, "mismatch at slot {i}"); + assert_eq!(r.get(i), 1000 + i as u32, "mismatch at slot {i}"); } } #[test] fn iter_matches_get() { - let values = [(0u64, 255u32), (1, 1000), (2, 50), (3, 1_313_691), (4, 7), (5, 42)]; + let values = [(0usize, 255u32), (1, 1000), (2, 50), (3, 1_313_691), (4, 7), (5, 42)]; let n = 6; let dir = tempdir().unwrap(); let path = dir.path().join("test.pciv"); @@ -115,7 +117,7 @@ fn iter_matches_get() { let r = PersistentCompactIntVec::open(&path).unwrap(); let via_iter: Vec = r.iter().collect(); - let via_get: Vec = (0..n as u64).map(|s| r.get(s)).collect(); + let via_get: Vec = (0..n).map(|s| r.get(s)).collect(); assert_eq!(via_iter, via_get); } @@ -222,6 +224,106 @@ fn combine_diff() { assert_eq!(r.iter().collect::>(), vec![10, 700, 0, 0]); } +#[test] +fn bray_dist_basic() { + // min=[1,0,0], sum_a=2, sum_b=2 → 1 - 2*1/4 = 0.5 + let (_da, ra) = make_pciv(&[1, 0, 1]); + let (_db, rb) = make_pciv(&[1, 1, 0]); + let d = ra.bray_dist(&rb); + assert!((d - 0.5).abs() < 1e-12, "got {d}"); +} + +#[test] +fn bray_dist_identical() { + let (_d, r) = make_pciv(&[3, 7, 2]); + assert_eq!(r.bray_dist(&r), 0.0); +} + +#[test] +fn bray_dist_disjoint() { + // no overlap → min=0 → BC=1 + let (_da, ra) = make_pciv(&[1, 0]); + let (_db, rb) = make_pciv(&[0, 1]); + assert_eq!(ra.bray_dist(&rb), 1.0); +} + +#[test] +fn relfreq_bray_dist_basic() { + // [2,0] vs [0,2]: rel_freqs [1,0] vs [0,1], min=0 → dist=1 + let (_da, ra) = make_pciv(&[2, 0]); + let (_db, rb) = make_pciv(&[0, 2]); + assert_eq!(ra.relfreq_bray_dist(&rb), 1.0); + // [2,0] vs [1,0]: same direction → dist=0 + let (_dc, rc) = make_pciv(&[1, 0]); + assert_eq!(ra.relfreq_bray_dist(&rc), 0.0); +} + +#[test] +fn euclidean_dist_basic() { + // [3,4] vs [0,0] → 5 + let (_da, ra) = make_pciv(&[3, 4]); + let (_db, rb) = make_pciv(&[0, 0]); + assert!((ra.euclidean_dist(&rb) - 5.0).abs() < 1e-12); +} + +#[test] +fn euclidean_dist_identical() { + let (_d, r) = make_pciv(&[3, 7, 2]); + assert_eq!(r.euclidean_dist(&r), 0.0); +} + +#[test] +fn hellinger_euclidean_dist_disjoint() { + // [4,0] vs [0,4]: sqrt(relfreq) = [1,0] vs [0,1] → euclidean = sqrt(2) + let (_da, ra) = make_pciv(&[4, 0]); + let (_db, rb) = make_pciv(&[0, 4]); + assert!((ra.hellinger_euclidean_dist(&rb) - 2f64.sqrt()).abs() < 1e-12); +} + +#[test] +fn hellinger_euclidean_dist_identical() { + let (_d, r) = make_pciv(&[3, 7, 2]); + assert_eq!(r.hellinger_euclidean_dist(&r), 0.0); +} + +#[test] +fn jaccard_dist_basic() { + // ref={0,2,3} other={0,1,2} → inter=2 union=4 → dist=0.5 + let (_da, ra) = make_pciv(&[1, 0, 1, 1]); + let (_db, rb) = make_pciv(&[1, 1, 1, 0]); + assert!((ra.jaccard_dist(&rb) - 0.5).abs() < 1e-12); +} + +#[test] +fn jaccard_dist_identical() { + let (_d, r) = make_pciv(&[3, 0, 7]); + assert_eq!(r.jaccard_dist(&r), 0.0); +} + +#[test] +fn hellinger_dist_disjoint() { + // disjoint → hellinger_euclidean = sqrt(2) → hellinger = 1.0 + let (_da, ra) = make_pciv(&[4, 0]); + let (_db, rb) = make_pciv(&[0, 4]); + assert!((ra.hellinger_dist(&rb) - 1.0).abs() < 1e-12); +} + +#[test] +fn hellinger_dist_identical() { + let (_d, r) = make_pciv(&[3, 7, 2]); + assert_eq!(r.hellinger_dist(&r), 0.0); +} + +#[test] +fn threshold_jaccard_dist_with_threshold() { + // threshold=3: ref={0,2} (values 5,4≥3) other={1,2} (values 3,4≥3) + // inter={2}=1, union={0,1,2}=3 → dist=2/3 + let (_da, ra) = make_pciv(&[5, 1, 4]); + let (_db, rb) = make_pciv(&[2, 3, 4]); + let d = ra.threshold_jaccard_dist(&rb, 3); + assert!((d - 2.0 / 3.0).abs() < 1e-12, "got {d}"); +} + #[test] fn mixed_large_dataset() { let n = 1000usize; @@ -230,12 +332,12 @@ fn mixed_large_dataset() { let mut b = PersistentCompactIntVecBuilder::new(n, &path).unwrap(); for i in 0..n { let v = if i % 100 == 0 { 100_000 + i as u32 } else { i as u32 % 200 }; - b.set(i as u64, v); + b.set(i, v); } b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); for i in 0..n { let expected = if i % 100 == 0 { 100_000 + i as u32 } else { i as u32 % 200 }; - assert_eq!(r.get(i as u64), expected, "slot {i}"); + assert_eq!(r.get(i), expected, "slot {i}"); } }