diff --git a/.DS_Store b/.DS_Store index 2b2a356..5f0cbf7 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/docmd/architecture/rebuild_filter.md b/docmd/architecture/rebuild_filter.md new file mode 100644 index 0000000..443aa75 --- /dev/null +++ b/docmd/architecture/rebuild_filter.md @@ -0,0 +1,105 @@ +# Rebuild / filter — column-first design + +## Problem with the current two-pass design + +`rebuild_partition` currently makes **two full passes** over source data: + +**Pass 1** — read unitigs → MPHF lookup (source) → read row (108 values) → apply filter → push kmer into `GraphDeBruijn`, **discard row**. + +**Pass 2** — read unitigs again → MPHF lookup again → read row again → for each passing kmer, look up slot in new MPHF → fill column builders. + +Both passes do random access into the source matrix: for each kmer, the MPHF returns a slot, then we read 108 values scattered across 108 column positions. This is cache-hostile even with a packed matrix (`.pbmx`), because the matrix is column-major: consecutive row reads jump across the file. + +## Memory budget + +The `keep` bitvector costs **1 bit per slot**. With 256 partitions and realistic kmer counts, each partition holds at most a few tens of millions of slots → a few MB per bitvector. Even in the absolute worst case (800 M slots), it stays under 100 MB. This is negligible. + +The `slot_map` option (Option B, 8–16 bytes per slot) is heavier but still bounded: at 15 M slots and 8 bytes, that is 120 MB per partition, acceptable for a single worker. + +## Key observation + +**The filter operates on column values, not on kmers.** A filter like `--max-outgroup-count 0` only needs to know, for each slot, whether any outgroup column is non-zero. It does not need to know which kmer occupies that slot. + +This means filtering can be done as a **sequential column scan** that produces a `keep: BitVec[n_slots]` — no MPHF lookups, no kmer knowledge, perfectly cache-friendly. + +## Proposed single-scan design + +### Step 1 — column scan → `keep` bitvector + +``` +for each column c in source matrix: + read column c sequentially (one mmap range) + update keep[slot] according to filter contribution of column c +``` + +For `GroupQuorumFilter` with ingroup/outgroup: +- ingroup columns: count presence per slot → `ingroup_count[slot]` +- outgroup columns: `keep[slot] &= (value[slot] == 0)` (early-exit possible) + +Result: `keep: BitVec` of size `n_slots`, computed with purely sequential IO. + +### Step 2 — unitig scan → kept kmers + new MPHF + +``` +for each kmer in unitig files: + old_slot = old_MPHF(kmer) + if keep[old_slot]: + push kmer into new GraphDeBruijn + record (old_slot, kmer) ← or just old_slot in order +``` + +Build new MPHF from `GraphDeBruijn` via `materialize_layer`. + +### Step 3 — fill new matrix + +Two sub-options: + +**Option A — from recorded (old_slot, kmer) pairs:** + +``` +for each (old_slot, kmer) in recorded list: + new_slot = new_MPHF(kmer) + for each column c: + new_matrix[new_slot, c] = old_matrix[old_slot, c] +``` + +Memory cost: `n_kept × (8 + 8)` bytes for `(old_slot: usize, kmer: CanonicalKmer)`. +For species-specific filters, `n_kept` is small. For unfiltered rebuild, `n_kept = n_slots`. + +**Option B — column-by-column copy using old→new slot mapping:** + +Precompute `slot_map: Vec>` of size `n_slots`: +- For each kmer in unitig file: `slot_map[old_MPHF(kmer)] = Some(new_MPHF(kmer))` + +Then for each source column: +``` +read source column sequentially +for each slot where slot_map[slot] = Some(new_slot): + write value to new column at new_slot +``` + +Memory cost: `n_slots × sizeof(usize)` for the slot map (one usize per source slot). +IO pattern: sequential read of each source column → random write into new column builders. + +Option B avoids storing kmer values and works uniformly regardless of filter selectivity. + +## Comparison + +| | Current | Proposed | +|---|---|---| +| Disk reads | 2× unitigs + 2× random matrix | 1× columns (sequential) + 1× unitigs | +| MPHF lookups (source) | 2× N_kmers | 1× N_kept (step 2) or 0 (option B, col scan only) | +| Cache behavior | poor (random row access) | good (sequential column scan) | +| Extra memory | none | slot_map (option B) or (old_slot, kmer) list (option A) | + +## Files to modify + +- `src/obikpartitionner/src/rebuild_layer.rs` — `rebuild_partition` and `iter_src_layers` +- Possibly `src/obicompactvec/` — add column iterator API if not already present +- `src/obilayeredmap/` — check if per-column sequential access is exposed on `SrcLayerData` + +## Open questions + +- Does `SrcLayerData` expose per-column sequential iteration, or only `lookup(kmer, n_genomes)` random access? +- For option B: are new column builders writable in random-slot order (i.e. `set_val(slot, value)` without sequential constraint)? +- For `GroupQuorumFilter` specifically: can the filter be decomposed into independent per-column contributions, or does it need the full row? diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index ca1b393..2dbc266 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -7,6 +7,8 @@ use ndarray::{Array1, Array2}; use rayon::prelude::*; use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; +use crate::memoryvec::MemoryBitVec; +use crate::traits::BitSliceMut; use crate::layer_meta::LayerMeta; use crate::meta::MatrixMeta; @@ -154,6 +156,28 @@ impl PackedBitMatrix { &self.mmap[start..start + len] } + pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { + PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path) + } + + pub(crate) fn col_as_memory(&self, c: usize) -> MemoryBitVec { + let bytes = self.col_bytes(c); + let n = self.n_rows; + let n_words = n.div_ceil(64); + let mut words = vec![0u64; n_words]; + let full = bytes.len() / 8; + for (i, chunk) in bytes[..full * 8].chunks_exact(8).enumerate() { + words[i] = u64::from_le_bytes(chunk.try_into().unwrap()); + } + let rem = bytes.len() % 8; + if rem > 0 { + let mut last = [0u8; 8]; + last[..rem].copy_from_slice(&bytes[full * 8..]); + words[full] = u64::from_le_bytes(last); + } + MemoryBitVec::from_words(words, n) + } + fn count_ones_col(&self, c: usize) -> u64 { let bytes = self.col_bytes(c); let full = self.n_rows / 8; @@ -343,6 +367,26 @@ impl PersistentBitMatrix { } } + pub fn col_persist(&self, c: usize, path: &Path) -> io::Result { + match self { + Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path), + Self::Packed(m) => m.col_persist(c, path), + Self::Implicit { n_rows, .. } => { + let mut b = PersistentBitVecBuilder::new(*n_rows, path)?; + b.not(); + Ok(b) + } + } + } + + pub fn col_as_memory(&self, c: usize) -> MemoryBitVec { + match self { + Self::Columnar(m) => MemoryBitVec::from(m.col(c)), + Self::Packed(m) => m.col_as_memory(c), + Self::Implicit { n_rows, .. } => MemoryBitVec::ones(*n_rows), + } + } + pub fn row(&self, slot: usize) -> Box<[bool]> { match self { Self::Columnar(m) => m.row(slot), diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs index cfc26aa..dc95512 100644 --- a/src/obicompactvec/src/bitvec.rs +++ b/src/obicompactvec/src/bitvec.rs @@ -188,6 +188,21 @@ impl PersistentBitVecBuilder { Ok(Self { mmap, n }) } + /// Create a PBIV file from raw packed bit-bytes, zero-padding to the next word boundary. + /// `bytes` is `n.div_ceil(8)` bytes; `n` is the number of bits. + pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result { + let file_size = HEADER_SIZE + n_bytes_for_words(n); + let file = OpenOptions::new() + .read(true).write(true).create(true).truncate(true) + .open(path)?; + file.set_len(file_size as u64)?; + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + mmap[0..4].copy_from_slice(&MAGIC); + mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes()); + mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()].copy_from_slice(bytes); + Ok(Self { mmap, n }) + } + pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result { fs::copy(source.path(), path)?; let file = OpenOptions::new().read(true).write(true).open(path)?; @@ -217,6 +232,12 @@ impl PersistentBitVecBuilder { } } + fn data_words(&self) -> &[u64] { + let nw = n_words(self.n); + let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; + unsafe { std::slice::from_raw_parts(ptr, nw) } + } + // SAFETY: same alignment argument as PersistentBitVec::data_words. fn data_words_mut(&mut self) -> &mut [u64] { let nw = n_words(self.n); @@ -224,41 +245,6 @@ impl PersistentBitVecBuilder { unsafe { std::slice::from_raw_parts_mut(ptr, nw) } } - pub fn and(&mut self, other: &PersistentBitVec) { - assert_eq!(self.n, other.n, "length mismatch"); - for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { - *sw &= ow; - } - } - - pub fn or(&mut self, other: &PersistentBitVec) { - assert_eq!(self.n, other.n, "length mismatch"); - for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { - *sw |= ow; - } - } - - pub fn xor(&mut self, other: &PersistentBitVec) { - assert_eq!(self.n, other.n, "length mismatch"); - for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { - *sw ^= ow; - } - } - - pub fn not(&mut self) { - let rem = self.n % 64; - let words = self.data_words_mut(); - for w in words.iter_mut() { - *w ^= u64::MAX; - } - // Zero padding bits in the last word so count_ones / jaccard remain correct. - if rem != 0 { - if let Some(last) = words.last_mut() { - *last &= (1u64 << rem) - 1; - } - } - } - /// Convert a count vector to a bit vector: bit set iff count >= threshold. /// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead. pub fn build_from_counts( @@ -304,3 +290,21 @@ impl PersistentBitVecBuilder { self.mmap.flush() } } + +// ── BitSlice / BitSliceMut impls ────────────────────────────────────────────── + +use crate::traits::{BitSlice, BitSliceMut}; + +impl BitSlice for PersistentBitVec { + fn len(&self) -> usize { self.n } + fn words(&self) -> &[u64] { self.data_words() } +} + +impl BitSlice for PersistentBitVecBuilder { + fn len(&self) -> usize { self.n } + fn words(&self) -> &[u64] { self.data_words() } +} + +impl BitSliceMut for PersistentBitVecBuilder { + fn words_mut(&mut self) -> &mut [u64] { self.data_words_mut() } +} diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs index 32d711f..f2b5326 100644 --- a/src/obicompactvec/src/builder.rs +++ b/src/obicompactvec/src/builder.rs @@ -34,6 +34,36 @@ impl PersistentCompactIntVecBuilder { }) } + /// Create from a [`MemoryIntVec`], copying primary bytes directly into the mmap. + /// O(n) memcpy + O(n_overflow) HashMap clone — no per-slot `set` overhead. + pub fn from_memory(src: &crate::memoryintvec::MemoryIntVec, path: &Path) -> io::Result { + let n = src.len(); + let file = OpenOptions::new() + .read(true).write(true).create(true).truncate(true) + .open(path)?; + file.set_len((HEADER_SIZE + n) as u64)?; + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(src.primary_bytes()); + Ok(Self { + path: path.to_path_buf(), + mmap, + n, + overflow: src.overflow_map().clone(), + }) + } + + /// Create from raw primary bytes + an already-built overflow map (no per-slot overhead). + pub(crate) fn from_raw_primary(primary: &[u8], overflow: HashMap, path: &Path) -> io::Result { + let n = primary.len(); + let file = OpenOptions::new() + .read(true).write(true).create(true).truncate(true) + .open(path)?; + file.set_len((HEADER_SIZE + n) as u64)?; + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(primary); + Ok(Self { path: path.to_path_buf(), mmap, n, overflow }) + } + /// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM. /// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow). pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result { @@ -91,39 +121,6 @@ impl PersistentCompactIntVecBuilder { self.n == 0 } - pub fn min(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - if other_val < self.get(slot) { - self.set(slot, other_val); - } - } - } - - pub fn max(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - if other_val > self.get(slot) { - self.set(slot, other_val); - } - } - } - - pub fn add(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - let cur = self.get(slot); - self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add")); - } - } - - pub fn diff(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - self.set(slot, self.get(slot).saturating_sub(other_val)); - } - } - /// Flush the primary mmap, then write sorted overflow data + index and fix the header. pub fn close(self) -> io::Result<()> { self.mmap.flush()?; @@ -141,3 +138,16 @@ impl PersistentCompactIntVecBuilder { finalize_pciv(&path, n, &entries) } } + +// ── IntSlice / IntSliceMut impls ────────────────────────────────────────────── + +use crate::traits::{IntSlice, IntSliceMut}; + +impl IntSlice for PersistentCompactIntVecBuilder { + fn len(&self) -> usize { self.n } + fn get(&self, slot: usize) -> u32 { self.get(slot) } +} + +impl IntSliceMut for PersistentCompactIntVecBuilder { + fn set(&mut self, slot: usize, value: u32) { self.set(slot, value); } +} diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index b563335..91c3b1b 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -1,4 +1,5 @@ use std::cmp::Ordering; +use std::collections::HashMap; use std::fs::{self, File}; use std::io::{self, BufWriter, Write as _}; use std::path::{Path, PathBuf}; @@ -8,6 +9,7 @@ use ndarray::{Array1, Array2}; use rayon::prelude::*; use crate::builder::PersistentCompactIntVecBuilder; +use crate::memoryintvec::MemoryIntVec; use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE}; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; @@ -194,6 +196,32 @@ impl PackedCompactIntMatrix { Ok(Self { mmap, n_rows, n_cols, columns }) } + pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { + let ci = &self.columns[c]; + let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; + let mut overflow = HashMap::with_capacity(ci.n_overflow); + for i in 0..ci.n_overflow { + let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE; + let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize; + let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()); + overflow.insert(slot, value); + } + PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path) + } + + pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec { + let ci = &self.columns[c]; + let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec(); + let mut overflow = HashMap::with_capacity(ci.n_overflow); + for i in 0..ci.n_overflow { + let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE; + let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize; + let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()); + overflow.insert(slot, value); + } + MemoryIntVec::from_primary_and_overflow(primary, overflow) + } + #[inline] pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { let ci = &self.columns[col]; @@ -442,6 +470,20 @@ impl PersistentCompactIntMatrix { } } + pub fn col_persist(&self, c: usize, path: &Path) -> io::Result { + match self { + Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path), + Self::Packed(m) => m.col_persist(c, path), + } + } + + pub fn col_as_memory(&self, c: usize) -> MemoryIntVec { + match self { + Self::Columnar(m) => MemoryIntVec::from(m.col(c)), + Self::Packed(m) => m.col_as_memory(c), + } + } + pub fn row(&self, slot: usize) -> Box<[u32]> { match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) } } diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index 8a1e5bb..b3c2ff4 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -4,6 +4,8 @@ mod builder; mod format; mod intmatrix; mod layer_meta; +mod memoryintvec; +mod memoryvec; mod meta; mod reader; pub mod traits; @@ -13,8 +15,10 @@ pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_ma pub use builder::PersistentCompactIntVecBuilder; pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use layer_meta::LayerMeta; +pub use memoryintvec::MemoryIntVec; +pub use memoryvec::MemoryBitVec; pub use reader::PersistentCompactIntVec; -pub use traits::{BitPartials, ColumnWeights, CountPartials}; +pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit}; #[cfg(test)] #[path = "tests/mod.rs"] diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs new file mode 100644 index 0000000..486a0f1 --- /dev/null +++ b/src/obicompactvec/src/memoryintvec.rs @@ -0,0 +1,120 @@ +use std::collections::HashMap; +use std::io; +use std::ops::{Add, AddAssign, Sub, SubAssign}; +use std::path::Path; + +use crate::builder::PersistentCompactIntVecBuilder; +use crate::traits::{IntSlice, IntSliceMut}; + +// ── MemoryIntVec ────────────────────────────────────────────────────────────── + +#[derive(Clone)] +pub struct MemoryIntVec { + primary: Vec, + overflow: HashMap, + n: usize, +} + +impl MemoryIntVec { + pub fn new(n: usize) -> Self { + Self { primary: vec![0u8; n], overflow: HashMap::new(), n } + } + + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + /// Construct directly from a pre-built primary array (no overflow — all values < 255). + pub(crate) fn from_primary(primary: Vec) -> Self { + let n = primary.len(); + Self { primary, overflow: HashMap::new(), n } + } + + pub(crate) fn from_primary_and_overflow(primary: Vec, overflow: HashMap) -> Self { + let n = primary.len(); + Self { primary, overflow, n } + } + + pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary } + pub(crate) fn overflow_map(&self) -> &HashMap { &self.overflow } + + pub fn iter(&self) -> impl Iterator + '_ { + (0..self.n).map(move |slot| self.get(slot)) + } + + /// Write to disk and return a writable builder at `path`. + pub fn persist(&self, path: &Path) -> io::Result { + PersistentCompactIntVecBuilder::from_memory(self, path) + } +} + +// ── IntSlice / IntSliceMut ──────────────────────────────────────────────────── + +impl IntSlice for MemoryIntVec { + fn len(&self) -> usize { self.n } + + fn get(&self, slot: usize) -> u32 { + match self.primary[slot] { + 255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"), + v => v as u32, + } + } +} + +impl IntSliceMut for MemoryIntVec { + fn set(&mut self, slot: usize, value: u32) { + if value < 255 { + self.primary[slot] = value as u8; + self.overflow.remove(&slot); + } else { + self.primary[slot] = 255; + self.overflow.insert(slot, value); + } + } +} + +// ── From conversions ────────────────────────────────────────────────────────── + +impl From<&S> for MemoryIntVec { + fn from(src: &S) -> Self { + let mut v = Self::new(src.len()); + for slot in 0..src.len() { + let val = src.get(slot); + if val != 0 { v.set(slot, val); } + } + v + } +} + +// ── std::ops — owned (consumes lhs) ────────────────────────────────────────── + +impl Add<&B> for MemoryIntVec { + type Output = MemoryIntVec; + fn add(mut self, rhs: &B) -> MemoryIntVec { IntSliceMut::add(&mut self, rhs); self } +} + +impl Sub<&B> for MemoryIntVec { + type Output = MemoryIntVec; + fn sub(mut self, rhs: &B) -> MemoryIntVec { self.diff(rhs); self } +} + +// ── std::ops — borrowed (clones lhs) ───────────────────────────────────────── + +impl Add<&B> for &MemoryIntVec { + type Output = MemoryIntVec; + fn add(self, rhs: &B) -> MemoryIntVec { self.clone().add(rhs) } +} + +impl Sub<&B> for &MemoryIntVec { + type Output = MemoryIntVec; + fn sub(self, rhs: &B) -> MemoryIntVec { self.clone().sub(rhs) } +} + +// ── std::ops — in-place assign ──────────────────────────────────────────────── + +impl AddAssign<&B> for MemoryIntVec { + fn add_assign(&mut self, rhs: &B) { IntSliceMut::add(self, rhs); } +} + +impl SubAssign<&B> for MemoryIntVec { + fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); } +} diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs new file mode 100644 index 0000000..102a6d6 --- /dev/null +++ b/src/obicompactvec/src/memoryvec.rs @@ -0,0 +1,138 @@ +use std::io; +use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not}; +use std::path::Path; + +use crate::bitvec::PersistentBitVecBuilder; +use crate::traits::{BitSlice, BitSliceMut}; + +#[inline] +fn n_words(n: usize) -> usize { n.div_ceil(64) } + +// ── MemoryBitVec ────────────────────────────────────────────────────────────── + +#[derive(Clone)] +pub struct MemoryBitVec { + words: Vec, + n: usize, +} + +impl MemoryBitVec { + pub fn new(n: usize) -> Self { + Self { words: vec![0u64; n_words(n)], n } + } + + pub fn ones(n: usize) -> Self { + let rem = n % 64; + let mut words = vec![u64::MAX; n_words(n)]; + if rem != 0 { + if let Some(last) = words.last_mut() { *last = (1u64 << rem) - 1; } + } + Self { words, n } + } + + pub(crate) fn from_words(words: Vec, n: usize) -> Self { + Self { words, n } + } + + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + pub fn get(&self, slot: usize) -> bool { + (self.words[slot >> 6] >> (slot & 63)) & 1 != 0 + } + + pub fn set(&mut self, slot: usize, value: bool) { + let bit = 1u64 << (slot & 63); + if value { self.words[slot >> 6] |= bit; } else { self.words[slot >> 6] &= !bit; } + } + + pub fn count_ones(&self) -> u64 { + self.words.iter().map(|w| w.count_ones() as u64).sum() + } + + pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() } + + /// Write to disk and return a writable builder positioned at the same path. + pub fn persist(&self, path: &Path) -> io::Result { + let mut b = PersistentBitVecBuilder::new(self.n, path)?; + b.copy_from(self); + Ok(b) + } +} + +// ── BitSlice / BitSliceMut ──────────────────────────────────────────────────── + +impl BitSlice for MemoryBitVec { + fn len(&self) -> usize { self.n } + fn words(&self) -> &[u64] { &self.words } +} + +impl BitSliceMut for MemoryBitVec { + fn words_mut(&mut self) -> &mut [u64] { &mut self.words } +} + +// ── From conversions ────────────────────────────────────────────────────────── + +impl From<&S> for MemoryBitVec { + fn from(src: &S) -> Self { + Self { words: src.words().to_vec(), n: src.len() } + } +} + +// ── std::ops — owned (consumes lhs) ────────────────────────────────────────── + +impl BitAnd<&B> for MemoryBitVec { + type Output = MemoryBitVec; + fn bitand(mut self, rhs: &B) -> MemoryBitVec { self.and(rhs); self } +} + +impl BitOr<&B> for MemoryBitVec { + type Output = MemoryBitVec; + fn bitor(mut self, rhs: &B) -> MemoryBitVec { self.or(rhs); self } +} + +impl BitXor<&B> for MemoryBitVec { + type Output = MemoryBitVec; + fn bitxor(mut self, rhs: &B) -> MemoryBitVec { self.xor(rhs); self } +} + +impl Not for MemoryBitVec { + type Output = MemoryBitVec; + fn not(mut self) -> MemoryBitVec { BitSliceMut::not(&mut self); self } +} + +// ── std::ops — borrowed (clones lhs) ───────────────────────────────────────── + +impl BitAnd<&B> for &MemoryBitVec { + type Output = MemoryBitVec; + fn bitand(self, rhs: &B) -> MemoryBitVec { self.clone().bitand(rhs) } +} + +impl BitOr<&B> for &MemoryBitVec { + type Output = MemoryBitVec; + fn bitor(self, rhs: &B) -> MemoryBitVec { self.clone().bitor(rhs) } +} + +impl BitXor<&B> for &MemoryBitVec { + type Output = MemoryBitVec; + fn bitxor(self, rhs: &B) -> MemoryBitVec { self.clone().bitxor(rhs) } +} + +impl Not for &MemoryBitVec { + type Output = MemoryBitVec; + fn not(self) -> MemoryBitVec { !self.clone() } +} + +// ── std::ops — in-place assign ──────────────────────────────────────────────── + +impl BitAndAssign<&B> for MemoryBitVec { + fn bitand_assign(&mut self, rhs: &B) { self.and(rhs); } +} + +impl BitOrAssign<&B> for MemoryBitVec { + fn bitor_assign(&mut self, rhs: &B) { self.or(rhs); } +} + +impl BitXorAssign<&B> for MemoryBitVec { + fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); } +} diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index 057ce29..e4c59e4 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -353,6 +353,17 @@ impl PersistentCompactIntVec { } } +// ── IntSlice impl ───────────────────────────────────────────────────────────── + +use crate::traits::IntSlice; + +impl IntSlice for PersistentCompactIntVec { + fn len(&self) -> usize { self.n } + fn get(&self, slot: usize) -> u32 { self.get(slot) } + fn sum(&self) -> u64 { self.sum() } + fn count_nonzero(&self) -> u64 { self.count_nonzero() } +} + impl<'a> IntoIterator for &'a PersistentCompactIntVec { type Item = u32; type IntoIter = Iter<'a>; diff --git a/src/obicompactvec/src/tests/bitvec.rs b/src/obicompactvec/src/tests/bitvec.rs index 6b20568..a408e7d 100644 --- a/src/obicompactvec/src/tests/bitvec.rs +++ b/src/obicompactvec/src/tests/bitvec.rs @@ -1,5 +1,6 @@ use tempfile::tempdir; +use crate::traits::BitSliceMut; use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder}; fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) { diff --git a/src/obicompactvec/src/tests/memoryvec.rs b/src/obicompactvec/src/tests/memoryvec.rs new file mode 100644 index 0000000..21c12c9 --- /dev/null +++ b/src/obicompactvec/src/tests/memoryvec.rs @@ -0,0 +1,359 @@ +use tempfile::tempdir; + +use crate::traits::{BitSlice, BitSliceMut, BitToInt, IntSlice, IntSliceMut, IntToBit}; +use crate::{MemoryBitVec, MemoryIntVec, PersistentBitVec, PersistentBitVecBuilder}; + +// ── MemoryBitVec ────────────────────────────────────────────────────────────── + +#[test] +fn mbv_new_all_zero() { + let v = MemoryBitVec::new(10); + assert_eq!(v.len(), 10); + assert!(!(0..10).any(|s| v.get(s))); + assert_eq!(v.count_ones(), 0); +} + +#[test] +fn mbv_ones_all_set() { + let v = MemoryBitVec::ones(10); + assert!((0..10).all(|s| v.get(s))); + assert_eq!(v.count_ones(), 10); + assert_eq!(v.count_zeros(), 0); +} + +#[test] +fn mbv_ones_no_padding_leak() { + // 5 bits: padding bits in last word must stay 0 + let v = MemoryBitVec::ones(5); + assert_eq!(v.words()[0], 0b11111); +} + +#[test] +fn mbv_set_get_roundtrip() { + let mut v = MemoryBitVec::new(64); + v.set(0, true); + v.set(63, true); + assert!(v.get(0)); + assert!(!v.get(1)); + assert!(v.get(63)); + assert_eq!(v.count_ones(), 2); +} + +#[test] +fn mbv_and() { + let mut a = MemoryBitVec::new(4); + a.set(0, true); a.set(1, true); + let mut b = MemoryBitVec::new(4); + b.set(0, true); b.set(2, true); + a.and(&b); + assert!(a.get(0)); assert!(!a.get(1)); assert!(!a.get(2)); +} + +#[test] +fn mbv_or() { + let mut a = MemoryBitVec::new(4); + a.set(0, true); a.set(1, true); + let mut b = MemoryBitVec::new(4); + b.set(0, true); b.set(2, true); + a.or(&b); + assert!(a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3)); +} + +#[test] +fn mbv_xor() { + let mut a = MemoryBitVec::new(4); + a.set(0, true); a.set(1, true); + let mut b = MemoryBitVec::new(4); + b.set(0, true); b.set(2, true); + a.xor(&b); + assert!(!a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3)); +} + +#[test] +fn mbv_not() { + let mut a = MemoryBitVec::new(4); + a.set(0, true); a.set(2, true); + a.not(); + assert!(!a.get(0)); assert!(a.get(1)); assert!(!a.get(2)); assert!(a.get(3)); +} + +#[test] +fn mbv_not_no_padding_leak() { + let mut v = MemoryBitVec::new(5); + v.not(); + assert_eq!(v.count_ones(), 5); + assert_eq!(v.words()[0], 0b11111); +} + +#[test] +fn mbv_ops_chaining() { + let mut a = MemoryBitVec::ones(8); + let b = MemoryBitVec::new(8); // all zeros + a.and(&b).or(&b).not(); + assert_eq!(a.count_ones(), 8); +} + +#[test] +fn mbv_std_ops_owned() { + let mut a = MemoryBitVec::new(4); + a.set(0, true); a.set(1, true); + let mut b = MemoryBitVec::new(4); + b.set(1, true); b.set(2, true); + let c = a & &b; + assert!(!c.get(0)); assert!(c.get(1)); assert!(!c.get(2)); +} + +#[test] +fn mbv_std_ops_assign() { + let mut a = MemoryBitVec::new(4); + a.set(0, true); a.set(1, true); + let mut b = MemoryBitVec::new(4); + b.set(1, true); b.set(2, true); + a &= &b; + assert!(!a.get(0)); assert!(a.get(1)); +} + +#[test] +fn mbv_from_persistent() { + let dir = tempdir().unwrap(); + let path = dir.path().join("v.pbiv"); + let mut builder = PersistentBitVecBuilder::new(4, &path).unwrap(); + builder.set(1, true); builder.set(3, true); + builder.close().unwrap(); + let pv = PersistentBitVec::open(&path).unwrap(); + let mv = MemoryBitVec::from(&pv); + assert!(!mv.get(0)); assert!(mv.get(1)); assert!(!mv.get(2)); assert!(mv.get(3)); +} + +#[test] +fn mbv_persist_roundtrip() { + let dir = tempdir().unwrap(); + let path = dir.path().join("out.pbiv"); + let mut v = MemoryBitVec::new(8); + v.set(2, true); v.set(5, true); + let builder = v.persist(&path).unwrap(); + builder.close().unwrap(); + let pv = PersistentBitVec::open(&path).unwrap(); + assert!(pv.get(2)); assert!(pv.get(5)); + assert_eq!(pv.count_ones(), 2); +} + +// ── MemoryIntVec ────────────────────────────────────────────────────────────── + +#[test] +fn miv_new_all_zero() { + let v = MemoryIntVec::new(10); + assert_eq!(v.len(), 10); + assert!((0..10).all(|s| v.get(s) == 0)); +} + +#[test] +fn miv_set_get_roundtrip() { + let mut v = MemoryIntVec::new(4); + v.set(0, 42); v.set(3, 200); + assert_eq!(v.get(0), 42); + assert_eq!(v.get(1), 0); + assert_eq!(v.get(3), 200); +} + +#[test] +fn miv_overflow_roundtrip() { + let mut v = MemoryIntVec::new(4); + v.set(1, 1000); + assert_eq!(v.get(1), 1000); + assert_eq!(v.get(0), 0); +} + +#[test] +fn miv_inc_dec() { + let mut v = MemoryIntVec::new(4); + v.inc(2); v.inc(2); v.inc(2); + assert_eq!(v.get(2), 3); + v.dec(2); + assert_eq!(v.get(2), 2); +} + +#[test] +fn miv_dec_saturates_at_zero() { + let mut v = MemoryIntVec::new(4); + v.dec(0); + assert_eq!(v.get(0), 0); +} + +#[test] +fn miv_add_at() { + let mut v = MemoryIntVec::new(4); + v.add_at(1, 100); v.add_at(1, 200); + assert_eq!(v.get(1), 300); +} + +#[test] +fn miv_min_max() { + let mut a = MemoryIntVec::new(4); + a.set(0, 5); a.set(1, 2); a.set(2, 8); + let mut b = MemoryIntVec::new(4); + b.set(0, 3); b.set(1, 7); b.set(2, 8); + let mut c = MemoryIntVec::from(&a); + IntSliceMut::min(&mut c, &b); + assert_eq!(c.get(0), 3); assert_eq!(c.get(1), 2); assert_eq!(c.get(2), 8); + let mut d = MemoryIntVec::from(&a); + IntSliceMut::max(&mut d, &b); + assert_eq!(d.get(0), 5); assert_eq!(d.get(1), 7); assert_eq!(d.get(2), 8); +} + +#[test] +fn miv_add_diff() { + let mut a = MemoryIntVec::new(3); + a.set(0, 10); a.set(1, 5); + let mut b = MemoryIntVec::new(3); + b.set(0, 3); b.set(1, 8); + let mut c = MemoryIntVec::from(&a); + c.add(&b); + assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13); + let mut d = MemoryIntVec::from(&a); + d.diff(&b); + assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); // saturating sub +} + +#[test] +fn miv_std_ops() { + let mut a = MemoryIntVec::new(3); + a.set(0, 10); a.set(1, 5); + let mut b = MemoryIntVec::new(3); + b.set(0, 3); b.set(1, 8); + let c = &a + &b; + assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13); + let d = &a - &b; + assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); +} + +#[test] +fn miv_from_persistent() { + use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder}; + let dir = tempdir().unwrap(); + let path = dir.path().join("v.pciv"); + let mut b = PersistentCompactIntVecBuilder::new(4, &path).unwrap(); + b.set(1, 42); b.set(3, 1000); + b.close().unwrap(); + let pv = PersistentCompactIntVec::open(&path).unwrap(); + let mv = MemoryIntVec::from(&pv); + assert_eq!(mv.get(0), 0); assert_eq!(mv.get(1), 42); assert_eq!(mv.get(3), 1000); +} + +// ── Cross-type conversions ──────────────────────────────────────────────────── + +#[test] +fn to_bitvec_threshold() { + let mut v = MemoryIntVec::new(5); + v.set(0, 0); v.set(1, 1); v.set(2, 5); v.set(3, 10); v.set(4, 3); + let bv = v.to_bitvec(4); // > 4: slots 2 (5) and 3 (10) pass + assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2)); + assert!(bv.get(3)); assert!(!bv.get(4)); +} + +#[test] +fn to_presence() { + let mut v = MemoryIntVec::new(4); + v.set(1, 1); v.set(3, 100); + let bv = v.to_presence(); + assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3)); +} + +#[test] +fn to_intvec_roundtrip() { + let mut bv = MemoryBitVec::new(8); + bv.set(0, true); bv.set(3, true); bv.set(7, true); + let iv = bv.to_intvec(); + assert_eq!(iv.get(0), 1); assert_eq!(iv.get(1), 0); + assert_eq!(iv.get(3), 1); assert_eq!(iv.get(7), 1); +} + +#[test] +fn to_intvec_word_boundary() { + // 65 bits: spans two words + let mut bv = MemoryBitVec::new(65); + bv.set(63, true); bv.set(64, true); + let iv = bv.to_intvec(); + assert_eq!(iv.get(63), 1); assert_eq!(iv.get(64), 1); assert_eq!(iv.get(62), 0); +} + +#[test] +fn count_bits_accumulates() { + let mut count = MemoryIntVec::new(8); + let mut b1 = MemoryBitVec::new(8); + b1.set(0, true); b1.set(2, true); + let mut b2 = MemoryBitVec::new(8); + b2.set(0, true); b2.set(3, true); + let mut b3 = MemoryBitVec::new(8); + b3.set(2, true); b3.set(3, true); + count.count_bits(&b1).count_bits(&b2).count_bits(&b3); + assert_eq!(count.get(0), 2); + assert_eq!(count.get(2), 2); + assert_eq!(count.get(3), 2); + assert_eq!(count.get(1), 0); +} + +#[test] +fn count_bits_skips_zero_words() { + // Entire first word is zero — should not touch those slots + let mut count = MemoryIntVec::new(128); + let mut bv = MemoryBitVec::new(128); + bv.set(64, true); bv.set(127, true); + count.count_bits(&bv); + assert_eq!(count.get(0), 0); + assert_eq!(count.get(64), 1); + assert_eq!(count.get(127), 1); +} + +// ── Comparison operators ────────────────────────────────────────────────────── + +#[test] +fn cmp_gt() { + let mut v = MemoryIntVec::new(5); + v.set(0, 0); v.set(1, 3); v.set(2, 5); v.set(3, 3); v.set(4, 10); + let bv = v.gt(3); + assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2)); + assert!(!bv.get(3)); assert!(bv.get(4)); +} + +#[test] +fn cmp_geq() { + let mut v = MemoryIntVec::new(4); + v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 1); + let bv = v.geq(3); + assert!(!bv.get(0)); assert!(bv.get(1)); assert!(bv.get(2)); assert!(!bv.get(3)); +} + +#[test] +fn cmp_lt() { + let mut v = MemoryIntVec::new(4); + v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 0); + let bv = v.lt(3); + assert!(bv.get(0)); assert!(!bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3)); +} + +#[test] +fn cmp_leq() { + let mut v = MemoryIntVec::new(4); + v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 3); + let bv = v.leq(3); + assert!(bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3)); +} + +#[test] +fn filter_pattern() { + // Typical filter: ingroup >= min_count AND outgroup <= max_outgroup + let mut ingroup = MemoryIntVec::new(6); + let mut outgroup = MemoryIntVec::new(6); + // slot 2: ingroup=3, outgroup=0 → keep + // slot 4: ingroup=2, outgroup=1 → drop (outgroup > 0) + // slot 5: ingroup=1, outgroup=0 → drop (ingroup < 2) + ingroup.set(2, 3); ingroup.set(4, 2); ingroup.set(5, 1); + outgroup.set(4, 1); + let out_mask = outgroup.leq(0); + let mut in_mask = ingroup.geq(2); + let keep = in_mask.and(&out_mask); + assert!(!keep.get(0)); assert!(!keep.get(1)); + assert!(keep.get(2)); + assert!(!keep.get(4)); assert!(!keep.get(5)); +} diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs index 4d2d9ad..c0be93a 100644 --- a/src/obicompactvec/src/tests/mod.rs +++ b/src/obicompactvec/src/tests/mod.rs @@ -1,9 +1,12 @@ mod bitmatrix; mod bitvec; mod intmatrix; +mod memoryvec; use tempfile::tempdir; +use crate::traits::IntSliceMut; + use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder}; fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec { diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index b61e69b..91ee8d8 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -1,6 +1,213 @@ use ndarray::{Array1, Array2}; -/// Column-level weight statistic — total count or presence count per column. +// ── BitSlice / BitSliceMut ──────────────────────────────────────────────────── + +/// Read-only view over the u64 word array of a bit vector. +/// +/// Bit `i` is in `words()[i >> 6]` at position `i & 63`. +/// Padding bits in the last word are zero. +pub trait BitSlice { + fn len(&self) -> usize; + fn words(&self) -> &[u64]; + fn is_empty(&self) -> bool { self.len() == 0 } + fn get(&self, slot: usize) -> bool { + (self.words()[slot >> 6] >> (slot & 63)) & 1 != 0 + } +} + +/// Mutable view over a bit-vector word array; default methods maintain the +/// zero-padding invariant on the last word. +pub trait BitSliceMut: BitSlice { + fn words_mut(&mut self) -> &mut [u64]; + + fn copy_from(&mut self, src: &S) -> &mut Self { + assert_eq!(self.len(), src.len(), "BitSlice length mismatch"); + self.words_mut().copy_from_slice(src.words()); + self + } + + fn and(&mut self, other: &S) -> &mut Self { + assert_eq!(self.len(), other.len(), "BitSlice length mismatch"); + for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w &= o; } + self + } + + fn or(&mut self, other: &S) -> &mut Self { + assert_eq!(self.len(), other.len(), "BitSlice length mismatch"); + for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w |= o; } + self + } + + fn xor(&mut self, other: &S) -> &mut Self { + assert_eq!(self.len(), other.len(), "BitSlice length mismatch"); + for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w ^= o; } + self + } + + fn not(&mut self) -> &mut Self { + let rem = self.len() % 64; + let words = self.words_mut(); + for w in words.iter_mut() { *w ^= u64::MAX; } + if rem != 0 { + if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; } + } + self + } +} + +// ── IntSlice / IntSliceMut ──────────────────────────────────────────────────── + +/// Read-only access to a compact integer vector (values encoded as u32). +pub trait IntSlice { + fn len(&self) -> usize; + fn get(&self, slot: usize) -> u32; + fn is_empty(&self) -> bool { self.len() == 0 } + fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() } + fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 } + + fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v < threshold) } + fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) } + fn gt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v > threshold) } + fn geq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v >= threshold) } + + fn cmp_scalar(&self, pred: impl Fn(u32) -> bool) -> MemoryBitVec { + let n = self.len(); + let mut words = vec![0u64; n.div_ceil(64)]; + for s in 0..n { + if pred(self.get(s)) { words[s >> 6] |= 1u64 << (s & 63); } + } + MemoryBitVec::from_words(words, n) + } +} + +/// Mutable access; default methods use only `get` / `set` and maintain the +/// compact encoding invariants on the implementor's side. +pub trait IntSliceMut: IntSlice { + fn set(&mut self, slot: usize, value: u32); + + fn inc(&mut self, slot: usize) -> &mut Self { + let v = self.get(slot); + self.set(slot, v.saturating_add(1)); + self + } + + fn dec(&mut self, slot: usize) -> &mut Self { + let v = self.get(slot); + self.set(slot, v.saturating_sub(1)); + self + } + + fn add_at(&mut self, slot: usize, delta: u32) -> &mut Self { + let v = self.get(slot); + self.set(slot, v.saturating_add(delta)); + self + } + + fn copy_from(&mut self, src: &S) -> &mut Self { + assert_eq!(self.len(), src.len(), "IntSlice length mismatch"); + for s in 0..src.len() { self.set(s, src.get(s)); } + self + } + + fn min(&mut self, other: &S) -> &mut Self { + assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); + for s in 0..other.len() { self.set(s, self.get(s).min(other.get(s))); } + self + } + + fn max(&mut self, other: &S) -> &mut Self { + assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); + for s in 0..other.len() { self.set(s, self.get(s).max(other.get(s))); } + self + } + + fn add(&mut self, other: &S) -> &mut Self { + assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); + for s in 0..other.len() { self.set(s, self.get(s).saturating_add(other.get(s))); } + self + } + + fn diff(&mut self, other: &S) -> &mut Self { + assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); + for s in 0..other.len() { self.set(s, self.get(s).saturating_sub(other.get(s))); } + self + } + + /// For each slot where `bits` is true, increment `self` by 1. + /// Skips zero words entirely — O(n_ones) rather than O(n). + fn count_bits(&mut self, bits: &B) -> &mut Self { + assert_eq!(self.len(), bits.len(), "IntSlice/BitSlice length mismatch"); + for (w_idx, &word) in bits.words().iter().enumerate() { + if word == 0 { continue; } + let base = w_idx * 64; + let mut w = word; + while w != 0 { + let bit = w.trailing_zeros() as usize; + let slot = base + bit; + if slot < self.len() { self.inc(slot); } + w &= w - 1; + } + } + self + } +} + +// ── IntSlice → MemoryBitVec conversions ─────────────────────────────────────── + +use crate::memoryvec::MemoryBitVec; + +pub trait IntToBit: IntSlice { + /// Bit set iff value >= threshold. Consistent with `geq` and `build_from_counts`. + fn to_bitvec(&self, threshold: u32) -> MemoryBitVec { self.geq(threshold) } + + /// Bit set iff value >= 1 (slot is present). + fn to_presence(&self) -> MemoryBitVec { self.geq(1) } +} + +impl IntToBit for T {} + +// ── BitSlice → MemoryIntVec conversion ─────────────────────────────────────── + +use crate::memoryintvec::MemoryIntVec; + +pub trait BitToInt: BitSlice { + fn to_intvec(&self) -> MemoryIntVec { + let n = self.len(); + let mut primary = vec![0u8; n]; + + // Unpack u64 words: each byte within a word yields 8 output bytes. + // Values are always 0 or 1 → no overflow entries needed. + let words = self.words(); + let full_words = n / 64; + + for (w_idx, &word) in words[..full_words].iter().enumerate() { + let base = w_idx * 64; + for byte_off in 0..8usize { + let byte = (word >> (byte_off * 8)) as u8; + let out = &mut primary[base + byte_off * 8..base + byte_off * 8 + 8]; + for bit in 0..8usize { + out[bit] = (byte >> bit) & 1; + } + } + } + + // Remaining bits in the last partial word + let rem = n % 64; + if rem > 0 { + let word = words[full_words]; + let base = full_words * 64; + for bit in 0..rem { + primary[base + bit] = ((word >> bit) & 1) as u8; + } + } + + MemoryIntVec::from_primary(primary) + } +} + +impl BitToInt for T {} + +// ── Column-level weight statistic — total count or presence count per column. /// Additive across layers and partitions; used as denominator in normalised distances. /// /// `partial_kmer_counts` returns the number of **distinct k-mers** present per diff --git a/src/obikindex/src/rebuild.rs b/src/obikindex/src/rebuild.rs index b1a8b5c..83a416d 100644 --- a/src/obikindex/src/rebuild.rs +++ b/src/obikindex/src/rebuild.rs @@ -98,7 +98,9 @@ impl KmerIndex { fs::File::create(output.join(SENTINEL_INDEXED))?; let idx = KmerIndex::open(output)?; + let t_pack = Stage::start("pack"); idx.pack_matrices()?; + rep.push(t_pack.stop()); Ok(idx) } }