From d1717688d20b08fb2990b79f03ab7bfffeabc8cc Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 16 Jun 2026 23:36:25 +0200 Subject: [PATCH] refactor: extract matrix helpers and improve bit iteration ergonomics Refactor parallel matrix construction by extracting reusable `pairwise_matrix` and `pairwise2_matrix` helpers, and consolidate binary record deserialization into dedicated parsing functions. Add `set` and `iter` methods to `BitSliceMut` and `MemoryBitVec` for ergonomic bit manipulation and iteration. Standardize JSON field extraction via `meta::field`, expose `MemoryBitIter`, and improve test reliability by automatically cleaning up temporary directories. --- src/obicompactvec/src/bitmatrix.rs | 76 ++++++------- src/obicompactvec/src/bitvec.rs | 12 +-- src/obicompactvec/src/builder.rs | 6 +- src/obicompactvec/src/format.rs | 18 ++++ src/obicompactvec/src/intmatrix.rs | 129 +++++------------------ src/obicompactvec/src/layer_meta.rs | 7 +- src/obicompactvec/src/lib.rs | 2 +- src/obicompactvec/src/memoryvec.rs | 48 +++++++-- src/obicompactvec/src/meta.rs | 2 +- src/obicompactvec/src/reader.rs | 7 +- src/obicompactvec/src/tests/bitmatrix.rs | 2 +- src/obicompactvec/src/traits.rs | 6 ++ src/obikpartitionner/src/common.rs | 1 + src/obikpartitionner/src/select_layer.rs | 1 + src/obilayeredmap/src/layer.rs | 1 + src/obilayeredmap/src/layered_store.rs | 1 + 16 files changed, 136 insertions(+), 183 deletions(-) diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index 2dbc266..cd7e0e9 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -56,34 +56,11 @@ impl ColumnarBitMatrix { } pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { - let n = self.n_cols(); - let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| { - let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j)); - (i, j, inter, union) - }) - .collect(); - let mut inter_m = Array2::zeros((n, n)); - let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_jaccard_dist(self.col(j))) } pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2 { - self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j))) - } - - fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { - let n = self.n_cols(); - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| (i, j, f(i, j))) - .collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + pairwise_matrix(self.n_cols(), |i, j| self.col(i).hamming_dist(self.col(j))) } pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> { @@ -228,27 +205,11 @@ impl PackedBitMatrix { } pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { - let n = self.n_cols; - let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| { let (inter, union) = self.partial_jaccard_col(i, j); (i, j, inter, union) }) - .collect(); - let mut inter_m = Array2::zeros((n, n)); - let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pairwise2_matrix(self.n_cols, |i, j| self.partial_jaccard_col(i, j)) } pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2 { - let n = self.n_cols; - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| (i, j, self.pair_op(i, j, false))) - .collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + pairwise_matrix(self.n_cols, |i, j| self.pair_op(i, j, false)) } } @@ -488,7 +449,7 @@ impl PersistentBitMatrixBuilder { } } -// ── Helpers ─────────────────────────────────────────────────────────────────── +// ── Shared matrix helpers (also used by intmatrix.rs) ───────────────────────── fn upper_pairs(n: usize) -> Vec<(usize, usize)> { (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect() @@ -500,3 +461,30 @@ where T: Clone + Default { for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; } m } + +/// Compute a symmetric `n×n` matrix in parallel by evaluating `f(i,j)` for +/// all upper-triangle pairs. `T: Copy` avoids the `.clone()` needed for the +/// lower-triangle mirror. +pub(crate) fn pairwise_matrix(n: usize, f: impl Fn(usize, usize) -> T + Sync) -> Array2 +where T: Copy + Default + Send { + let results: Vec<(usize, usize, T)> = upper_pairs(n) + .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); + fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) +} + +/// Same as `pairwise_matrix` but `f` returns two values that fill two +/// symmetric matrices simultaneously (e.g. intersection + union for Jaccard). +pub(crate) fn pairwise2_matrix(n: usize, f: impl Fn(usize, usize) -> (T, T) + Sync) -> (Array2, Array2) +where T: Copy + Default + Send { + let results: Vec<(usize, usize, T, T)> = upper_pairs(n) + .into_par_iter() + .map(|(i, j)| { let (a, b) = f(i, j); (i, j, a, b) }) + .collect(); + let mut m0 = Array2::from_elem((n, n), T::default()); + let mut m1 = Array2::from_elem((n, n), T::default()); + for (i, j, a, b) in results { + m0[[i, j]] = a; m0[[j, i]] = a; + m1[[i, j]] = b; m1[[j, i]] = b; + } + (m0, m1) +} diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs index dc95512..dcb52ba 100644 --- a/src/obicompactvec/src/bitvec.rs +++ b/src/obicompactvec/src/bitvec.rs @@ -14,7 +14,7 @@ const MAGIC: [u8; 4] = *b"PBIV"; const HEADER_SIZE: usize = 16; #[inline] -fn n_words(n: usize) -> usize { +pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) } @@ -222,16 +222,6 @@ impl PersistentBitVecBuilder { (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 } - pub fn set(&mut self, slot: usize, value: bool) { - let byte = HEADER_SIZE + (slot >> 3); - let bit = 1u8 << (slot & 7); - if value { - self.mmap[byte] |= bit; - } else { - self.mmap[byte] &= !bit; - } - } - fn data_words(&self) -> &[u64] { let nw = n_words(self.n); let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs index f2b5326..4885216 100644 --- a/src/obicompactvec/src/builder.rs +++ b/src/obicompactvec/src/builder.rs @@ -5,7 +5,7 @@ use std::path::{Path, PathBuf}; use memmap2::MmapMut; -use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, finalize_pciv}; +use crate::format::{HEADER_SIZE, finalize_pciv, parse_overflow_entry}; use crate::reader::PersistentCompactIntVec; pub struct PersistentCompactIntVecBuilder { @@ -78,9 +78,7 @@ impl PersistentCompactIntVecBuilder { let mut overflow = HashMap::with_capacity(n_overflow); for i in 0..n_overflow { - let off = data_offset + i * OVERFLOW_ENTRY_SIZE; - let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize; - let value = u32::from_le_bytes(mmap[off + 8..off + 12].try_into().unwrap()); + let (slot, value) = parse_overflow_entry(&mmap, data_offset, i); overflow.insert(slot, value); } diff --git a/src/obicompactvec/src/format.rs b/src/obicompactvec/src/format.rs index 08f0079..265167d 100644 --- a/src/obicompactvec/src/format.rs +++ b/src/obicompactvec/src/format.rs @@ -13,6 +13,24 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12; // Index entry: slot(u64) + pos(u64) = 16 bytes. pub const INDEX_ENTRY_SIZE: usize = 16; +/// Parse a single overflow entry `(slot, value)` from a byte slice. +#[inline] +pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) { + let off = base + i * OVERFLOW_ENTRY_SIZE; + let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize; + let value = u32::from_le_bytes(data[off+8..off+12].try_into().unwrap()); + (slot, value) +} + +/// Parse a single sparse-index entry `(slot, pos)` from a byte slice. +#[inline] +pub fn parse_index_entry(data: &[u8], base: usize, i: usize) -> (usize, usize) { + let off = base + i * INDEX_ENTRY_SIZE; + let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize; + let pos = u64::from_le_bytes(data[off+8..off+16].try_into().unwrap()) as usize; + (slot, pos) +} + // Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries). pub const L1_INDEX_ENTRIES: usize = 2048; diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index 91c3b1b..9d97f8e 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -8,9 +8,10 @@ use memmap2::Mmap; use ndarray::{Array1, Array2}; use rayon::prelude::*; +use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix}; use crate::builder::PersistentCompactIntVecBuilder; use crate::memoryintvec::MemoryIntVec; -use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE}; +use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry}; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; @@ -65,49 +66,35 @@ impl ColumnarCompactIntMatrix { } pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { - self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j))) + pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j))) } pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j))) + pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j))) } pub(crate) fn partial_threshold_jaccard_dist_matrix( &self, threshold: u32, ) -> (Array2, Array2) { - let n = self.n_cols(); - let pairs = upper_pairs(n); - let results: Vec<(usize, usize, u64, u64)> = pairs - .into_par_iter() - .map(|(i, j)| { - let (inter, union) = - self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold); - (i, j, inter, union) - }) - .collect(); - let mut inter_m = Array2::zeros((n, n)); - let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pairwise2_matrix(self.n_cols(), |i, j| { + self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold) + }) } pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| { + pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| { + pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| { + pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } @@ -121,19 +108,6 @@ impl ColumnarCompactIntMatrix { meta.save(dir) } - fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2 { - let n = self.n_cols(); - let results: Vec<(usize, usize, f64)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } - - fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { - let n = self.n_cols(); - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } } // ── PackedCompactIntMatrix ──────────────────────────────────────────────────── @@ -185,10 +159,7 @@ impl PackedCompactIntMatrix { let mut index = Vec::with_capacity(n_idx); for i in 0..n_idx { - let ioff = index_offset + i * INDEX_ENTRY_SIZE; - let slot = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap()) as usize; - let pos = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize; - index.push((slot, pos)); + index.push(parse_index_entry(&mmap, index_offset, i)); } columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index }); } @@ -196,30 +167,25 @@ impl PackedCompactIntMatrix { Ok(Self { mmap, n_rows, n_cols, columns }) } + fn col_overflow_map(&self, ci: &ColInfo) -> HashMap { + let mut overflow = HashMap::with_capacity(ci.n_overflow); + for i in 0..ci.n_overflow { + let (slot, value) = parse_overflow_entry(&self.mmap, ci.data_offset, i); + overflow.insert(slot, value); + } + overflow + } + pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { let ci = &self.columns[c]; let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; - let mut overflow = HashMap::with_capacity(ci.n_overflow); - for i in 0..ci.n_overflow { - let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE; - let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize; - let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()); - overflow.insert(slot, value); - } - PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path) + PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path) } pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec { let ci = &self.columns[c]; let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec(); - let mut overflow = HashMap::with_capacity(ci.n_overflow); - for i in 0..ci.n_overflow { - let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE; - let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize; - let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()); - overflow.insert(slot, value); - } - MemoryIntVec::from_primary_and_overflow(primary, overflow) + MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci)) } #[inline] @@ -327,55 +293,28 @@ impl PackedCompactIntMatrix { // ── Matrix methods ──────────────────────────────────────────────────────── - fn pairwise(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2 - where T: Clone + Default + Send { - let n = self.n_cols; - let results: Vec<(usize, usize, T)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| { let w = v.clone(); (i, j, v, w) })) - } - - fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { - let n = self.n_cols; - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } - pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { - self.pairwise_u64(|i, j| self.pair_partial_bray(i, j)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j)) } - pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.pair_partial_euclidean(i, j)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j)) } pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2, Array2) { - let n = self.n_cols; - let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) }) - .collect(); - let mut inter_m = Array2::zeros((n, n)); - let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t)) } pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64)) } pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64)) } pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64)) } } @@ -570,15 +509,3 @@ impl PersistentCompactIntMatrixBuilder { } } -// ── Helpers ─────────────────────────────────────────────────────────────────── - -fn upper_pairs(n: usize) -> Vec<(usize, usize)> { - (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect() -} - -fn fill_symmetric(n: usize, vals: impl Iterator) -> Array2 -where T: Clone + Default { - let mut m = Array2::from_elem((n, n), T::default()); - for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; } - m -} diff --git a/src/obicompactvec/src/layer_meta.rs b/src/obicompactvec/src/layer_meta.rs index 65dc5bc..28fff0c 100644 --- a/src/obicompactvec/src/layer_meta.rs +++ b/src/obicompactvec/src/layer_meta.rs @@ -23,11 +23,6 @@ impl LayerMeta { } fn parse(s: &str) -> Option { - let key = "\"n\":"; - let pos = s.find(key)? + key.len(); - let rest = s[pos..].trim_start(); - let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len()); - let n = rest[..end].parse().ok()?; - Some(Self { n }) + Some(Self { n: crate::meta::field(s, "n")? }) } } diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index b3c2ff4..fb2d5e2 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -16,7 +16,7 @@ pub use builder::PersistentCompactIntVecBuilder; pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use layer_meta::LayerMeta; pub use memoryintvec::MemoryIntVec; -pub use memoryvec::MemoryBitVec; +pub use memoryvec::{MemoryBitIter, MemoryBitVec}; pub use reader::PersistentCompactIntVec; pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit}; diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs index 102a6d6..9195982 100644 --- a/src/obicompactvec/src/memoryvec.rs +++ b/src/obicompactvec/src/memoryvec.rs @@ -2,12 +2,9 @@ use std::io; use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not}; use std::path::Path; -use crate::bitvec::PersistentBitVecBuilder; +use crate::bitvec::{PersistentBitVecBuilder, n_words}; use crate::traits::{BitSlice, BitSliceMut}; -#[inline] -fn n_words(n: usize) -> usize { n.div_ceil(64) } - // ── MemoryBitVec ────────────────────────────────────────────────────────────── #[derive(Clone)] @@ -41,11 +38,6 @@ impl MemoryBitVec { (self.words[slot >> 6] >> (slot & 63)) & 1 != 0 } - pub fn set(&mut self, slot: usize, value: bool) { - let bit = 1u64 << (slot & 63); - if value { self.words[slot >> 6] |= bit; } else { self.words[slot >> 6] &= !bit; } - } - pub fn count_ones(&self) -> u64 { self.words.iter().map(|w| w.count_ones() as u64).sum() } @@ -136,3 +128,41 @@ impl BitOrAssign<&B> for MemoryBitVec { impl BitXorAssign<&B> for MemoryBitVec { fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); } } + +// ── Iterator ────────────────────────────────────────────────────────────────── + +pub struct MemoryBitIter<'a> { + words: &'a [u64], + slot: usize, + n: usize, +} + +impl Iterator for MemoryBitIter<'_> { + type Item = bool; + + fn next(&mut self) -> Option { + if self.slot >= self.n { return None; } + let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0; + self.slot += 1; + Some(v) + } + + fn size_hint(&self) -> (usize, Option) { + let rem = self.n - self.slot; + (rem, Some(rem)) + } +} + +impl ExactSizeIterator for MemoryBitIter<'_> {} + +impl MemoryBitVec { + pub fn iter(&self) -> MemoryBitIter<'_> { + MemoryBitIter { words: &self.words, slot: 0, n: self.n } + } +} + +impl<'a> IntoIterator for &'a MemoryBitVec { + type Item = bool; + type IntoIter = MemoryBitIter<'a>; + fn into_iter(self) -> MemoryBitIter<'a> { self.iter() } +} diff --git a/src/obicompactvec/src/meta.rs b/src/obicompactvec/src/meta.rs index d8d8466..09deedc 100644 --- a/src/obicompactvec/src/meta.rs +++ b/src/obicompactvec/src/meta.rs @@ -23,7 +23,7 @@ fn parse(s: &str) -> Option { Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? }) } -fn field(s: &str, name: &str) -> Option { +pub(crate) fn field(s: &str, name: &str) -> Option { let key = format!("\"{}\":", name); let pos = s.find(&key)? + key.len(); let rest = s[pos..].trim_start(); diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index e4c59e4..bd3d7d7 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -4,7 +4,7 @@ use std::path::{Path, PathBuf}; use memmap2::Mmap; -use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE}; +use crate::format::{HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry}; pub struct PersistentCompactIntVec { mmap: Mmap, @@ -43,10 +43,7 @@ impl PersistentCompactIntVec { let mut index = Vec::with_capacity(n_index); for i in 0..n_index { - let off = index_offset + i * INDEX_ENTRY_SIZE; - let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize; - let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize; - index.push((slot, pos)); + index.push(parse_index_entry(&mmap, index_offset, i)); } Ok(Self { diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs index 741a07c..3304410 100644 --- a/src/obicompactvec/src/tests/bitmatrix.rs +++ b/src/obicompactvec/src/tests/bitmatrix.rs @@ -1,7 +1,7 @@ use tempfile::tempdir; use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder}; -use crate::traits::BitPartials; +use crate::traits::{BitPartials, BitSliceMut}; fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { let n = cols.first().map_or(0, |c| c.len()); diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index 91ee8d8..32e40a1 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -20,6 +20,11 @@ pub trait BitSlice { pub trait BitSliceMut: BitSlice { fn words_mut(&mut self) -> &mut [u64]; + fn set(&mut self, slot: usize, value: bool) { + let bit = 1u64 << (slot & 63); + if value { self.words_mut()[slot >> 6] |= bit; } else { self.words_mut()[slot >> 6] &= !bit; } + } + fn copy_from(&mut self, src: &S) -> &mut Self { assert_eq!(self.len(), src.len(), "BitSlice length mismatch"); self.words_mut().copy_from_slice(src.words()); @@ -62,6 +67,7 @@ pub trait IntSlice { fn len(&self) -> usize; fn get(&self, slot: usize) -> u32; fn is_empty(&self) -> bool { self.len() == 0 } + fn iter(&self) -> impl Iterator + '_ { (0..self.len()).map(|i| self.get(i)) } fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() } fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 } diff --git a/src/obikpartitionner/src/common.rs b/src/obikpartitionner/src/common.rs index 99e345e..76d3bf3 100644 --- a/src/obikpartitionner/src/common.rs +++ b/src/obikpartitionner/src/common.rs @@ -3,6 +3,7 @@ use std::io; use std::path::{Path, PathBuf}; use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder}; +use obicompactvec::traits::BitSliceMut; use obilayeredmap::meta::PartitionMeta; use obilayeredmap::{IndexMode, OLMError}; use obiskio::{SKError, SKResult}; diff --git a/src/obikpartitionner/src/select_layer.rs b/src/obikpartitionner/src/select_layer.rs index 36286c0..56b2ac7 100644 --- a/src/obikpartitionner/src/select_layer.rs +++ b/src/obikpartitionner/src/select_layer.rs @@ -6,6 +6,7 @@ use obicompactvec::{ PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder, }; +use obicompactvec::traits::BitSliceMut; use obilayeredmap::meta::PartitionMeta; use obilayeredmap::OLMError; use obiskio::{SKError, SKResult}; diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index 72b38ea..c79e781 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -6,6 +6,7 @@ use obicompactvec::{ PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, }; +use obicompactvec::traits::BitSliceMut; use obikseq::CanonicalKmer; use obiskio::{UnitigFileReader, UnitigFileWriter}; diff --git a/src/obilayeredmap/src/layered_store.rs b/src/obilayeredmap/src/layered_store.rs index 433183e..6ebf343 100644 --- a/src/obilayeredmap/src/layered_store.rs +++ b/src/obilayeredmap/src/layered_store.rs @@ -102,6 +102,7 @@ mod tests { PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, }; + use obicompactvec::traits::BitSliceMut; use tempfile::tempdir; fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {