f91c5a3f79
Refactors column and matrix access to use unified `BitSliceView` and `IntSliceView` abstractions, replacing legacy `PackedCol`/`IntColView` types. Introduces `BitSlice`/`IntSlice` traits for zero-copy, trait-based bitwise and arithmetic operations across persistent and temporary vector types. Removes deprecated in-memory `MemoryBitVec` and `MemoryIntVec` implementations and their tests, while updating dependent crates to use the new view-based API and `BitSliceMut` trait.
279 lines
10 KiB
Rust
279 lines
10 KiB
Rust
use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry};
|
||
|
||
// ── BitSliceView ──────────────────────────────────────────────────────────────
|
||
|
||
/// Lightweight, copy-able read-only view over a u64 word array.
|
||
/// Bit `i` is in `words[i >> 6]` at position `i & 63`. Padding bits are zero.
|
||
#[derive(Clone, Copy)]
|
||
pub struct BitSliceView<'a> {
|
||
pub(crate) words: &'a [u64],
|
||
pub(crate) n: usize,
|
||
}
|
||
|
||
impl<'a> BitSliceView<'a> {
|
||
#[inline]
|
||
pub fn new(words: &'a [u64], n: usize) -> Self { Self { words, n } }
|
||
|
||
pub fn len(&self) -> usize { self.n }
|
||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||
pub fn words(&self) -> &'a [u64] { self.words }
|
||
|
||
#[inline]
|
||
pub fn get(&self, slot: usize) -> bool {
|
||
(self.words[slot >> 6] >> (slot & 63)) & 1 != 0
|
||
}
|
||
|
||
pub fn count_ones(&self) -> u64 {
|
||
self.words.iter().map(|w| w.count_ones() as u64).sum()
|
||
}
|
||
pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() }
|
||
|
||
pub fn iter(&self) -> BitSliceIter<'a> {
|
||
BitSliceIter { words: self.words, slot: 0, n: self.n }
|
||
}
|
||
|
||
pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) {
|
||
assert_eq!(self.n, other.n, "BitSliceView length mismatch");
|
||
self.words.iter().zip(other.words)
|
||
.fold((0u64, 0u64), |(i, u), (&a, &b)| {
|
||
(i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64)
|
||
})
|
||
}
|
||
|
||
pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 {
|
||
let (inter, union) = self.partial_jaccard_dist(other);
|
||
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
||
}
|
||
|
||
pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 {
|
||
assert_eq!(self.n, other.n, "BitSliceView length mismatch");
|
||
self.words.iter().zip(other.words)
|
||
.map(|(&a, &b)| (a ^ b).count_ones() as u64)
|
||
.sum()
|
||
}
|
||
}
|
||
|
||
// ── BitSliceIter ──────────────────────────────────────────────────────────────
|
||
|
||
/// Iterator over the logical bits of a [`BitSliceView`], yielding one `bool`
/// per slot in increasing slot order.
#[derive(Clone, Debug)]
pub struct BitSliceIter<'a> {
    /// Backing words; bit `slot & 63` of word `slot >> 6` holds logical bit `slot`.
    words: &'a [u64],
    /// Next slot to yield.
    slot: usize,
    /// One past the last slot to yield.
    n: usize,
}

impl Iterator for BitSliceIter<'_> {
    type Item = bool;

    /// Yields the bit at the current slot and advances by one.
    ///
    /// # Panics
    /// Panics if the backing slice is too short to contain the current slot.
    #[inline]
    fn next(&mut self) -> Option<bool> {
        if self.slot >= self.n {
            return None;
        }
        let bit = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
        self.slot += 1;
        Some(bit)
    }

    /// Exact number of bits still to be yielded.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.n - self.slot;
        (remaining, Some(remaining))
    }
}

impl ExactSizeIterator for BitSliceIter<'_> {}

// Once `slot` reaches `n`, `next` keeps returning `None`, so the iterator is fused.
impl std::iter::FusedIterator for BitSliceIter<'_> {}
|
||
|
||
// ── IntSliceView ──────────────────────────────────────────────────────────────
|
||
|
||
/// Lightweight, copy-able read-only view over a compact-int primary array plus
|
||
/// its sorted raw overflow bytes. Zero-copy: all data lives in the caller's mmap.
|
||
#[derive(Clone, Copy)]
|
||
pub struct IntSliceView<'a> {
|
||
pub(crate) primary: &'a [u8],
|
||
pub(crate) overflow_raw: &'a [u8], // n_overflow × OVERFLOW_ENTRY_SIZE bytes, sorted by slot
|
||
pub(crate) n_overflow: usize,
|
||
pub(crate) n: usize,
|
||
}
|
||
|
||
impl<'a> IntSliceView<'a> {
|
||
#[inline]
|
||
pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self {
|
||
Self { primary, overflow_raw, n_overflow, n }
|
||
}
|
||
|
||
pub fn len(&self) -> usize { self.n }
|
||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||
pub fn primary_bytes(&self) -> &'a [u8] { self.primary }
|
||
pub fn n_overflow(&self) -> usize { self.n_overflow }
|
||
|
||
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + 'a {
|
||
let raw = self.overflow_raw;
|
||
let n_ov = self.n_overflow;
|
||
(0..n_ov).map(move |i| parse_overflow_entry(raw, 0, i))
|
||
}
|
||
|
||
/// O(log n_overflow) via binary search (overflow is always sorted by slot).
|
||
pub fn get(&self, slot: usize) -> u32 {
|
||
let b = self.primary[slot];
|
||
if b < 255 { return b as u32; }
|
||
let mut lo = 0usize;
|
||
let mut hi = self.n_overflow;
|
||
while lo < hi {
|
||
let mid = lo + (hi - lo) / 2;
|
||
let (s, v) = parse_overflow_entry(self.overflow_raw, 0, mid);
|
||
match s.cmp(&slot) {
|
||
std::cmp::Ordering::Equal => return v,
|
||
std::cmp::Ordering::Less => lo = mid + 1,
|
||
std::cmp::Ordering::Greater => hi = mid,
|
||
}
|
||
}
|
||
panic!("slot {slot} marked overflow but not found")
|
||
}
|
||
|
||
/// Sequential merge scan: yields all n values in slot order.
|
||
pub fn iter(&self) -> IntSliceViewIter<'a> {
|
||
IntSliceViewIter {
|
||
primary: self.primary,
|
||
overflow_raw: self.overflow_raw,
|
||
slot: 0,
|
||
overflow_pos: 0,
|
||
n: self.n,
|
||
}
|
||
}
|
||
|
||
pub fn sum(&self) -> u64 {
|
||
byte_sum(self.primary, self.overflow_entries().map(|(_, v)| v))
|
||
}
|
||
|
||
pub fn count_nonzero(&self) -> u64 {
|
||
byte_count_nonzero(self.primary)
|
||
}
|
||
|
||
// ── Distance methods ──────────────────────────────────────────────────────
|
||
|
||
pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 {
|
||
assert_eq!(self.n, other.n, "length mismatch");
|
||
self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||
}
|
||
|
||
pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
let sum_min = self.partial_bray_dist(other);
|
||
let denom = self.sum() + other.sum();
|
||
if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
|
||
}
|
||
|
||
pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||
assert_eq!(self.n, other.n, "length mismatch");
|
||
self.iter().zip(other.iter())
|
||
.map(|(a, b)| {
|
||
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||
pa.min(pb)
|
||
})
|
||
.sum()
|
||
}
|
||
|
||
pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
let sa = self.sum() as f64;
|
||
let sb = other.sum() as f64;
|
||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||
1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
|
||
}
|
||
|
||
pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
assert_eq!(self.n, other.n, "length mismatch");
|
||
self.iter().zip(other.iter())
|
||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||
.sum()
|
||
}
|
||
|
||
pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
self.partial_euclidean_dist(other).sqrt()
|
||
}
|
||
|
||
pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||
assert_eq!(self.n, other.n, "length mismatch");
|
||
self.iter().zip(other.iter())
|
||
.map(|(a, b)| {
|
||
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||
let d = pa - pb;
|
||
d * d
|
||
})
|
||
.sum()
|
||
}
|
||
|
||
pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
let sa = self.sum() as f64;
|
||
let sb = other.sum() as f64;
|
||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||
self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
|
||
}
|
||
|
||
pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||
assert_eq!(self.n, other.n, "length mismatch");
|
||
self.iter().zip(other.iter())
|
||
.map(|(a, b)| {
|
||
let pa = if sa > 0.0 { (a as f64 / sa).sqrt() } else { 0.0 };
|
||
let pb = if sb > 0.0 { (b as f64 / sb).sqrt() } else { 0.0 };
|
||
let d = pa - pb;
|
||
d * d
|
||
})
|
||
.sum()
|
||
}
|
||
|
||
pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
let sa = self.sum() as f64;
|
||
let sb = other.sum() as f64;
|
||
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||
self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
|
||
}
|
||
|
||
pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2
|
||
}
|
||
|
||
pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) {
|
||
assert_eq!(self.n, other.n, "length mismatch");
|
||
self.iter().zip(other.iter())
|
||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||
let ap = a >= threshold;
|
||
let bp = b >= threshold;
|
||
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||
})
|
||
}
|
||
|
||
pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 {
|
||
let (inter, union) = self.partial_threshold_jaccard_dist(other, threshold);
|
||
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
||
}
|
||
|
||
pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 {
|
||
self.threshold_jaccard_dist(other, 1)
|
||
}
|
||
}
|
||
|
||
// ── IntSliceViewIter ──────────────────────────────────────────────────────────
|
||
|
||
pub struct IntSliceViewIter<'a> {
|
||
primary: &'a [u8],
|
||
overflow_raw: &'a [u8],
|
||
slot: usize,
|
||
overflow_pos: usize,
|
||
n: usize,
|
||
}
|
||
|
||
impl Iterator for IntSliceViewIter<'_> {
|
||
type Item = u32;
|
||
fn next(&mut self) -> Option<u32> {
|
||
if self.slot >= self.n { return None; }
|
||
let v = self.primary[self.slot];
|
||
self.slot += 1;
|
||
if v < 255 {
|
||
Some(v as u32)
|
||
} else {
|
||
let (_, val) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos);
|
||
self.overflow_pos += 1;
|
||
Some(val)
|
||
}
|
||
}
|
||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||
let rem = self.n - self.slot;
|
||
(rem, Some(rem))
|
||
}
|
||
}
|
||
impl ExactSizeIterator for IntSliceViewIter<'_> {}
|