refactor: introduce PackedIntCol view and use iterators
Centralizes overflow handling and improves modularity by replacing manual mmap indexing and row loops with composable iterator patterns. This change leverages Rust's iterator traits for efficient, idiomatic column traversal while encapsulating data access in a dedicated view struct.
This commit is contained in:
@@ -14,6 +14,7 @@ use crate::memoryintvec::MemoryIntVec;
|
|||||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
||||||
use crate::meta::MatrixMeta;
|
use crate::meta::MatrixMeta;
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::traits::IntSlice;
|
||||||
|
|
||||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||||
dir.join(format!("col_{col:06}.pciv"))
|
dir.join(format!("col_{col:06}.pciv"))
|
||||||
@@ -124,6 +125,107 @@ struct ColInfo {
|
|||||||
index: Vec<(usize, usize)>,
|
index: Vec<(usize, usize)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── PackedIntCol — lightweight column view backed by the shared mmap ──────────
|
||||||
|
|
||||||
|
pub(crate) struct PackedIntCol<'a> {
|
||||||
|
primary: &'a [u8],
|
||||||
|
overflow: &'a [u8], // raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE
|
||||||
|
n_overflow: usize,
|
||||||
|
step: usize,
|
||||||
|
index: &'a [(usize, usize)],
|
||||||
|
n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PackedIntCol<'_> {
|
||||||
|
fn overflow_get(&self, slot: usize) -> u32 {
|
||||||
|
let (pos_start, pos_end) = if self.step == 0 {
|
||||||
|
(0, self.n_overflow)
|
||||||
|
} else {
|
||||||
|
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||||
|
let start = self.index[i].1;
|
||||||
|
let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
|
||||||
|
(start, end)
|
||||||
|
};
|
||||||
|
let mut lo = pos_start;
|
||||||
|
let mut hi = pos_end;
|
||||||
|
while lo < hi {
|
||||||
|
let mid = lo + (hi - lo) / 2;
|
||||||
|
let (stored, val) = parse_overflow_entry(self.overflow, 0, mid);
|
||||||
|
match stored.cmp(&slot) {
|
||||||
|
Ordering::Equal => return val,
|
||||||
|
Ordering::Less => lo = mid + 1,
|
||||||
|
Ordering::Greater => hi = mid,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
panic!("slot {slot} marked overflow but not found")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IntSlice for PackedIntCol<'_> {
|
||||||
|
fn len(&self) -> usize { self.n }
|
||||||
|
|
||||||
|
fn get(&self, slot: usize) -> u32 {
|
||||||
|
let v = self.primary[slot];
|
||||||
|
if v < 255 { v as u32 } else { self.overflow_get(slot) }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn primary_bytes(&self) -> &[u8] { self.primary }
|
||||||
|
|
||||||
|
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||||
|
(0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
||||||
|
PackedIntColIter {
|
||||||
|
primary: self.primary,
|
||||||
|
overflow: self.overflow,
|
||||||
|
slot: 0,
|
||||||
|
overflow_pos: 0,
|
||||||
|
n: self.n,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sum(&self) -> u64 {
|
||||||
|
byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) }
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PackedIntColIter<'a> {
|
||||||
|
primary: &'a [u8],
|
||||||
|
overflow: &'a [u8],
|
||||||
|
slot: usize,
|
||||||
|
overflow_pos: usize,
|
||||||
|
n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for PackedIntColIter<'_> {
|
||||||
|
type Item = u32;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<u32> {
|
||||||
|
if self.slot >= self.n { return None; }
|
||||||
|
let v = self.primary[self.slot];
|
||||||
|
self.slot += 1;
|
||||||
|
if v < 255 {
|
||||||
|
Some(v as u32)
|
||||||
|
} else {
|
||||||
|
let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos);
|
||||||
|
self.overflow_pos += 1;
|
||||||
|
Some(val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
|
let rem = self.n - self.slot;
|
||||||
|
(rem, Some(rem))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExactSizeIterator for PackedIntColIter<'_> {}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct PackedCompactIntMatrix {
|
pub struct PackedCompactIntMatrix {
|
||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
n_rows: usize,
|
n_rows: usize,
|
||||||
@@ -167,57 +269,31 @@ impl PackedCompactIntMatrix {
|
|||||||
Ok(Self { mmap, n_rows, n_cols, columns })
|
Ok(Self { mmap, n_rows, n_cols, columns })
|
||||||
}
|
}
|
||||||
|
|
||||||
fn col_overflow_map(&self, ci: &ColInfo) -> HashMap<usize, u32> {
|
pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> {
|
||||||
let mut overflow = HashMap::with_capacity(ci.n_overflow);
|
let ci = &self.columns[c];
|
||||||
for i in 0..ci.n_overflow {
|
PackedIntCol {
|
||||||
let (slot, value) = parse_overflow_entry(&self.mmap, ci.data_offset, i);
|
primary: &self.mmap[ci.primary_start..ci.primary_start + self.n_rows],
|
||||||
overflow.insert(slot, value);
|
overflow: &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE],
|
||||||
|
n_overflow: ci.n_overflow,
|
||||||
|
step: ci.step,
|
||||||
|
index: &ci.index,
|
||||||
|
n: self.n_rows,
|
||||||
}
|
}
|
||||||
overflow
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||||
let ci = &self.columns[c];
|
let col = self.col_slice(c);
|
||||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
let overflow: HashMap<usize, u32> = col.overflow_entries().collect();
|
||||||
PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path)
|
PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
||||||
let ci = &self.columns[c];
|
MemoryIntVec::from(&self.col_slice(c))
|
||||||
let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
|
|
||||||
MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
||||||
let ci = &self.columns[col];
|
self.col_slice(col).get(slot)
|
||||||
let v = self.mmap[ci.primary_start + slot];
|
|
||||||
if v < 255 { return v as u32; }
|
|
||||||
self.overflow_get(ci, slot)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 {
|
|
||||||
let (pos_start, pos_end) = if ci.step == 0 {
|
|
||||||
(0, ci.n_overflow)
|
|
||||||
} else {
|
|
||||||
let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
|
||||||
let start = ci.index[i].1;
|
|
||||||
let end = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow };
|
|
||||||
(start, end)
|
|
||||||
};
|
|
||||||
let mut lo = pos_start;
|
|
||||||
let mut hi = pos_end;
|
|
||||||
while lo < hi {
|
|
||||||
let mid = lo + (hi - lo) / 2;
|
|
||||||
let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE;
|
|
||||||
let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
|
|
||||||
match stored.cmp(&slot) {
|
|
||||||
Ordering::Equal => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()),
|
|
||||||
Ordering::Less => lo = mid + 1,
|
|
||||||
Ordering::Greater => hi = mid,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
panic!("slot {slot} marked overflow but not found")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
@@ -230,73 +306,62 @@ impl PackedCompactIntMatrix {
|
|||||||
|
|
||||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
self.columns.par_iter()
|
(0..self.n_cols).into_par_iter()
|
||||||
.map(|ci| {
|
.map(|c| self.col_slice(c).sum())
|
||||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
|
||||||
let overflow = (0..ci.n_overflow)
|
|
||||||
.map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1);
|
|
||||||
byte_sum(primary, overflow)
|
|
||||||
})
|
|
||||||
.collect()
|
.collect()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
self.columns.par_iter()
|
(0..self.n_cols).into_par_iter()
|
||||||
.map(|ci| {
|
.map(|c| self.col_slice(c).count_nonzero())
|
||||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
|
||||||
byte_count_nonzero(primary)
|
|
||||||
})
|
|
||||||
.collect()
|
.collect()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Pair primitives ───────────────────────────────────────────────────────
|
// ── Pair primitives — sequential scan via col_slice().iter() ─────────────
|
||||||
|
|
||||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||||
(0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum()
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
|
.map(|(a, b)| a.min(b) as u64)
|
||||||
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
let d = self.get(i, s) as f64 - self.get(j, s) as f64;
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||||
d * d
|
.sum()
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
||||||
let (mut inter, mut union) = (0u64, 0u64);
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
for s in 0..self.n_rows {
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
let a = self.get(i, s) >= t;
|
let ap = a >= t;
|
||||||
let b = self.get(j, s) >= t;
|
let bp = b >= t;
|
||||||
if a && b { inter += 1; }
|
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||||
if a || b { union += 1; }
|
})
|
||||||
}
|
|
||||||
(inter, union)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
(self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj)
|
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj))
|
||||||
}).sum()
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj;
|
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d })
|
||||||
d * d
|
.sum()
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt();
|
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d })
|
||||||
d * d
|
.sum()
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Matrix methods ────────────────────────────────────────────────────────
|
// ── Matrix methods ────────────────────────────────────────────────────────
|
||||||
@@ -324,7 +389,6 @@ impl PackedCompactIntMatrix {
|
|||||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||||
@@ -516,4 +580,3 @@ impl PersistentCompactIntMatrixBuilder {
|
|||||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user