Files
obikmer/src/obicompactvec/src/views.rs
T
Eric Coissac f91c5a3f79 refactor(obicompactvec): unify bit and int vector slice views
Refactors column and matrix access to use unified `BitSliceView` and `IntSliceView` abstractions, replacing legacy `PackedCol`/`IntColView` types. Introduces `BitSlice`/`IntSlice` traits for zero-copy, trait-based bitwise and arithmetic operations across persistent and temporary vector types. Removes deprecated in-memory `MemoryBitVec` and `MemoryIntVec` implementations and their tests, while updating dependent crates to use the new view-based API and `BitSliceMut` trait.
2026-06-17 23:51:32 +02:00

279 lines
10 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry};
// ── BitSliceView ──────────────────────────────────────────────────────────────
/// Read-only, `Copy` window onto a borrowed slice of `u64` words that is
/// interpreted as a sequence of `n` bits.
///
/// Bit `i` is stored in `words[i >> 6]` at bit position `i & 63`. Bits beyond
/// `n` (padding) are expected to be zero: the whole-word methods
/// (`count_ones`, `count_zeros` and the distance helpers) do not mask them.
#[derive(Clone, Copy)]
pub struct BitSliceView<'a> {
    pub(crate) words: &'a [u64],
    pub(crate) n: usize,
}

impl<'a> BitSliceView<'a> {
    /// Wraps `words` as a view over `n` bits. Nothing is validated here.
    #[inline]
    pub fn new(words: &'a [u64], n: usize) -> Self {
        BitSliceView { words, n }
    }

    /// Number of bits covered by the view.
    pub fn len(&self) -> usize {
        self.n
    }

    /// `true` when the view covers zero bits.
    pub fn is_empty(&self) -> bool {
        self.n == 0
    }

    /// The underlying word slice, with the view's full lifetime.
    pub fn words(&self) -> &'a [u64] {
        self.words
    }

    /// Value of bit `slot`.
    ///
    /// # Panics
    /// Panics if `slot >> 6` is not a valid index into the word slice.
    /// `slot` is not compared against `len()`.
    #[inline]
    pub fn get(&self, slot: usize) -> bool {
        let word = self.words[slot >> 6];
        let mask = 1u64 << (slot & 63);
        (word & mask) != 0
    }

    /// Total number of set bits across every word of the slice.
    pub fn count_ones(&self) -> u64 {
        let mut total = 0u64;
        for word in self.words {
            total += u64::from(word.count_ones());
        }
        total
    }

    /// `len()` minus `count_ones()`.
    pub fn count_zeros(&self) -> u64 {
        let ones = self.count_ones();
        self.n as u64 - ones
    }

    /// Iterator over the `n` bits in slot order, starting at slot 0.
    pub fn iter(&self) -> BitSliceIter<'a> {
        let Self { words, n } = *self;
        BitSliceIter { words, slot: 0, n }
    }

    /// Returns `(|A ∩ B|, |A ∪ B|)` as popcounts of the word-wise AND and OR.
    ///
    /// # Panics
    /// Panics if the two views do not have the same bit length.
    pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) {
        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
        let mut inter = 0u64;
        let mut uni = 0u64;
        for (&lhs, &rhs) in self.words.iter().zip(other.words) {
            inter += u64::from((lhs & rhs).count_ones());
            uni += u64::from((lhs | rhs).count_ones());
        }
        (inter, uni)
    }

    /// Jaccard distance `1 - |A ∩ B| / |A ∪ B|`; `0.0` when the union is empty.
    ///
    /// # Panics
    /// Panics if the two views do not have the same bit length.
    pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 {
        match self.partial_jaccard_dist(other) {
            (_, 0) => 0.0,
            (inter, uni) => 1.0 - inter as f64 / uni as f64,
        }
    }

    /// Number of differing bits (popcount of the word-wise XOR).
    ///
    /// # Panics
    /// Panics if the two views do not have the same bit length.
    pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 {
        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
        self.words
            .iter()
            .zip(other.words)
            .fold(0u64, |acc, (&lhs, &rhs)| acc + u64::from((lhs ^ rhs).count_ones()))
    }
}
// ── BitSliceIter ──────────────────────────────────────────────────────────────
/// Bit-by-bit iterator over a word slice: yields one `bool` per slot, from
/// the current `slot` up to (but excluding) `n`.
pub struct BitSliceIter<'a> {
    words: &'a [u64],
    slot: usize,
    n: usize,
}

impl Iterator for BitSliceIter<'_> {
    type Item = bool;

    /// Yields the bit at the current slot and advances by one; `None` once
    /// `slot` has reached `n`.
    fn next(&mut self) -> Option<bool> {
        let idx = self.slot;
        if idx < self.n {
            // Same layout as the view: word `idx / 64`, bit `idx % 64`.
            let bit = (self.words[idx / 64] & (1u64 << (idx % 64))) != 0;
            self.slot = idx + 1;
            Some(bit)
        } else {
            None
        }
    }

    /// Exact number of bits still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.n - self.slot;
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for BitSliceIter<'_> {}
// ── IntSliceView ──────────────────────────────────────────────────────────────
/// Read-only, `Copy` view over a compact integer vector: one primary byte per
/// slot plus a raw table of overflow entries.
///
/// A primary byte below 255 is the slot's value; the byte 255 marks a slot
/// whose value is looked up in the overflow table. The view only borrows its
/// data (zero-copy; the data is described as living in the caller's mmap).
#[derive(Clone, Copy)]
pub struct IntSliceView<'a> {
    pub(crate) primary: &'a [u8],
    /// `n_overflow` × `OVERFLOW_ENTRY_SIZE` bytes, sorted by slot.
    pub(crate) overflow_raw: &'a [u8],
    pub(crate) n_overflow: usize,
    pub(crate) n: usize,
}

impl<'a> IntSliceView<'a> {
    /// Assembles a view from its parts. Nothing is validated here.
    #[inline]
    pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self {
        IntSliceView { primary, overflow_raw, n_overflow, n }
    }

    /// Number of slots in the view.
    pub fn len(&self) -> usize {
        self.n
    }

    /// `true` when the view has zero slots.
    pub fn is_empty(&self) -> bool {
        self.n == 0
    }

    /// The primary byte array, with the view's full lifetime.
    pub fn primary_bytes(&self) -> &'a [u8] {
        self.primary
    }

    /// Number of entries in the overflow table.
    pub fn n_overflow(&self) -> usize {
        self.n_overflow
    }

    /// Iterates over the overflow table in stored order, decoding each entry
    /// with `parse_overflow_entry` into a `(slot, value)` pair.
    pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + 'a {
        let table = self.overflow_raw;
        (0..self.n_overflow).map(move |idx| parse_overflow_entry(table, 0, idx))
    }

    /// Value stored at `slot`.
    ///
    /// Primary bytes below 255 are returned directly. Otherwise the overflow
    /// table is binary-searched by slot, which is O(log n_overflow) because
    /// the table is always sorted by slot.
    ///
    /// # Panics
    /// Panics if `slot` is out of range for the primary array, or if the slot
    /// is marked as overflow but has no entry in the table.
    pub fn get(&self, slot: usize) -> u32 {
        let byte = self.primary[slot];
        if byte != 255 {
            return u32::from(byte);
        }
        let (mut lo, mut hi) = (0usize, self.n_overflow);
        while lo < hi {
            let mid = lo + (hi - lo) / 2;
            let (entry_slot, value) = parse_overflow_entry(self.overflow_raw, 0, mid);
            if entry_slot == slot {
                return value;
            }
            if entry_slot < slot {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        panic!("slot {slot} marked overflow but not found")
    }

    /// Sequential merge scan: yields all `n` values in slot order.
    pub fn iter(&self) -> IntSliceViewIter<'a> {
        let Self { primary, overflow_raw, n, .. } = *self;
        IntSliceViewIter { primary, overflow_raw, slot: 0, overflow_pos: 0, n }
    }

    /// Sum over the vector, computed by `byte_sum` from the primary bytes and
    /// the overflow values.
    pub fn sum(&self) -> u64 {
        let overflow_values = self.overflow_entries().map(|(_, value)| value);
        byte_sum(self.primary, overflow_values)
    }

    /// Result of `byte_count_nonzero` applied to the primary bytes.
    pub fn count_nonzero(&self) -> u64 {
        byte_count_nonzero(self.primary)
    }

    // ── Distance methods ──────────────────────────────────────────────────────
    // All of them panic with "length mismatch" when the two views differ in
    // `len()`.

    /// Σ min(a, b) over paired slots — the shared-abundance term of Bray-Curtis.
    pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 {
        assert_eq!(self.n, other.n, "length mismatch");
        let mut shared = 0u64;
        for (a, b) in self.iter().zip(other.iter()) {
            shared += u64::from(a.min(b));
        }
        shared
    }

    /// Bray-Curtis distance `1 - 2·Σmin(a, b) / (sum(a) + sum(b))`; `0.0`
    /// when both sums are zero.
    pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 {
        let shared = self.partial_bray_dist(other);
        match self.sum() + other.sum() {
            0 => 0.0,
            total => {
                let numer = 2.0 * shared as f64;
                1.0 - numer / total as f64
            }
        }
    }

    /// Σ min(a/sa, b/sb) over paired slots. A side whose total is not
    /// strictly positive contributes a relative frequency of 0.
    pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
        assert_eq!(self.n, other.n, "length mismatch");
        let freq = |count: u32, total: f64| {
            if total > 0.0 { count as f64 / total } else { 0.0 }
        };
        self.iter()
            .zip(other.iter())
            .map(|(a, b)| freq(a, sa).min(freq(b, sb)))
            .sum()
    }

    /// Bray-Curtis distance on relative frequencies; `0.0` when both sums
    /// are zero.
    pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 {
        let (sa, sb) = (self.sum() as f64, other.sum() as f64);
        if sa == 0.0 && sb == 0.0 {
            0.0
        } else {
            1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
        }
    }

    /// Σ (a − b)² over paired slots (squared Euclidean distance).
    pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
        assert_eq!(self.n, other.n, "length mismatch");
        self.iter()
            .zip(other.iter())
            .map(|(a, b)| {
                let delta = f64::from(a) - f64::from(b);
                delta * delta
            })
            .sum()
    }

    /// Euclidean distance between the raw values.
    pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
        let squared = self.partial_euclidean_dist(other);
        squared.sqrt()
    }

    /// Σ (a/sa − b/sb)² over paired slots. A side whose total is not strictly
    /// positive contributes a relative frequency of 0.
    pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
        assert_eq!(self.n, other.n, "length mismatch");
        let freq = |count: u32, total: f64| {
            if total > 0.0 { count as f64 / total } else { 0.0 }
        };
        self.iter()
            .zip(other.iter())
            .map(|(a, b)| {
                let delta = freq(a, sa) - freq(b, sb);
                delta * delta
            })
            .sum()
    }

    /// Euclidean distance on relative frequencies; `0.0` when both sums are
    /// zero.
    pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
        let (sa, sb) = (self.sum() as f64, other.sum() as f64);
        if sa == 0.0 && sb == 0.0 {
            0.0
        } else {
            self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
        }
    }

    /// Σ (√(a/sa) − √(b/sb))² over paired slots. A side whose total is not
    /// strictly positive contributes 0.
    pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
        assert_eq!(self.n, other.n, "length mismatch");
        let root_freq = |count: u32, total: f64| {
            if total > 0.0 { (count as f64 / total).sqrt() } else { 0.0 }
        };
        self.iter()
            .zip(other.iter())
            .map(|(a, b)| {
                let delta = root_freq(a, sa) - root_freq(b, sb);
                delta * delta
            })
            .sum()
    }

    /// Euclidean distance between square-rooted relative frequencies; `0.0`
    /// when both sums are zero.
    pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
        let (sa, sb) = (self.sum() as f64, other.sum() as f64);
        if sa == 0.0 && sb == 0.0 {
            0.0
        } else {
            self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
        }
    }

    /// `hellinger_euclidean_dist` divided by √2.
    pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 {
        let unscaled = self.hellinger_euclidean_dist(other);
        unscaled / std::f64::consts::SQRT_2
    }

    /// Returns `(intersection, union)` slot counts, where a slot is "present"
    /// on a side when its value is `>= threshold`.
    pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) {
        assert_eq!(self.n, other.n, "length mismatch");
        let (mut inter, mut uni) = (0u64, 0u64);
        for (a, b) in self.iter().zip(other.iter()) {
            let in_a = a >= threshold;
            let in_b = b >= threshold;
            inter += u64::from(in_a && in_b);
            uni += u64::from(in_a || in_b);
        }
        (inter, uni)
    }

    /// Jaccard distance on thresholded presence; `0.0` when the union is
    /// empty.
    pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 {
        match self.partial_threshold_jaccard_dist(other, threshold) {
            (_, 0) => 0.0,
            (inter, uni) => 1.0 - inter as f64 / uni as f64,
        }
    }

    /// Jaccard distance with presence defined as value `>= 1`.
    pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 {
        self.threshold_jaccard_dist(other, 1)
    }
}
// ── IntSliceViewIter ──────────────────────────────────────────────────────────
/// Sequential iterator over a compact-int layout: walks the primary bytes in
/// slot order and consumes the overflow table front to back, one entry per
/// primary byte equal to 255.
pub struct IntSliceViewIter<'a> {
    primary: &'a [u8],
    overflow_raw: &'a [u8],
    slot: usize,
    overflow_pos: usize,
    n: usize,
}

impl Iterator for IntSliceViewIter<'_> {
    type Item = u32;

    /// Yields the value of the current slot and advances; `None` once `slot`
    /// has reached `n`.
    fn next(&mut self) -> Option<u32> {
        let idx = self.slot;
        if idx >= self.n {
            return None;
        }
        let byte = self.primary[idx];
        self.slot = idx + 1;
        let value = match byte {
            // 255 is the overflow marker: take the next overflow entry's
            // value (its slot field is not checked here).
            255 => {
                let (_, stored) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos);
                self.overflow_pos += 1;
                stored
            }
            small => u32::from(small),
        };
        Some(value)
    }

    /// Exact number of values still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.n - self.slot;
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for IntSliceViewIter<'_> {}