Push mtzqmmrlmzzx #34
@@ -14,6 +14,7 @@ use crate::memoryintvec::MemoryIntVec;
|
|||||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
||||||
use crate::meta::MatrixMeta;
|
use crate::meta::MatrixMeta;
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::traits::IntSlice;
|
||||||
|
|
||||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||||
dir.join(format!("col_{col:06}.pciv"))
|
dir.join(format!("col_{col:06}.pciv"))
|
||||||
@@ -124,6 +125,107 @@ struct ColInfo {
|
|||||||
index: Vec<(usize, usize)>,
|
index: Vec<(usize, usize)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── PackedIntCol — lightweight column view backed by the shared mmap ──────────
|
||||||
|
|
||||||
|
pub(crate) struct PackedIntCol<'a> {
|
||||||
|
primary: &'a [u8],
|
||||||
|
overflow: &'a [u8], // raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE
|
||||||
|
n_overflow: usize,
|
||||||
|
step: usize,
|
||||||
|
index: &'a [(usize, usize)],
|
||||||
|
n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PackedIntCol<'_> {
|
||||||
|
fn overflow_get(&self, slot: usize) -> u32 {
|
||||||
|
let (pos_start, pos_end) = if self.step == 0 {
|
||||||
|
(0, self.n_overflow)
|
||||||
|
} else {
|
||||||
|
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||||
|
let start = self.index[i].1;
|
||||||
|
let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
|
||||||
|
(start, end)
|
||||||
|
};
|
||||||
|
let mut lo = pos_start;
|
||||||
|
let mut hi = pos_end;
|
||||||
|
while lo < hi {
|
||||||
|
let mid = lo + (hi - lo) / 2;
|
||||||
|
let (stored, val) = parse_overflow_entry(self.overflow, 0, mid);
|
||||||
|
match stored.cmp(&slot) {
|
||||||
|
Ordering::Equal => return val,
|
||||||
|
Ordering::Less => lo = mid + 1,
|
||||||
|
Ordering::Greater => hi = mid,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
panic!("slot {slot} marked overflow but not found")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IntSlice for PackedIntCol<'_> {
|
||||||
|
fn len(&self) -> usize { self.n }
|
||||||
|
|
||||||
|
fn get(&self, slot: usize) -> u32 {
|
||||||
|
let v = self.primary[slot];
|
||||||
|
if v < 255 { v as u32 } else { self.overflow_get(slot) }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn primary_bytes(&self) -> &[u8] { self.primary }
|
||||||
|
|
||||||
|
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||||
|
(0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
||||||
|
PackedIntColIter {
|
||||||
|
primary: self.primary,
|
||||||
|
overflow: self.overflow,
|
||||||
|
slot: 0,
|
||||||
|
overflow_pos: 0,
|
||||||
|
n: self.n,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sum(&self) -> u64 {
|
||||||
|
byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) }
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PackedIntColIter<'a> {
|
||||||
|
primary: &'a [u8],
|
||||||
|
overflow: &'a [u8],
|
||||||
|
slot: usize,
|
||||||
|
overflow_pos: usize,
|
||||||
|
n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for PackedIntColIter<'_> {
|
||||||
|
type Item = u32;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<u32> {
|
||||||
|
if self.slot >= self.n { return None; }
|
||||||
|
let v = self.primary[self.slot];
|
||||||
|
self.slot += 1;
|
||||||
|
if v < 255 {
|
||||||
|
Some(v as u32)
|
||||||
|
} else {
|
||||||
|
let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos);
|
||||||
|
self.overflow_pos += 1;
|
||||||
|
Some(val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
|
let rem = self.n - self.slot;
|
||||||
|
(rem, Some(rem))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExactSizeIterator for PackedIntColIter<'_> {}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct PackedCompactIntMatrix {
|
pub struct PackedCompactIntMatrix {
|
||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
n_rows: usize,
|
n_rows: usize,
|
||||||
@@ -167,57 +269,31 @@ impl PackedCompactIntMatrix {
|
|||||||
Ok(Self { mmap, n_rows, n_cols, columns })
|
Ok(Self { mmap, n_rows, n_cols, columns })
|
||||||
}
|
}
|
||||||
|
|
||||||
fn col_overflow_map(&self, ci: &ColInfo) -> HashMap<usize, u32> {
|
pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> {
|
||||||
let mut overflow = HashMap::with_capacity(ci.n_overflow);
|
let ci = &self.columns[c];
|
||||||
for i in 0..ci.n_overflow {
|
PackedIntCol {
|
||||||
let (slot, value) = parse_overflow_entry(&self.mmap, ci.data_offset, i);
|
primary: &self.mmap[ci.primary_start..ci.primary_start + self.n_rows],
|
||||||
overflow.insert(slot, value);
|
overflow: &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE],
|
||||||
|
n_overflow: ci.n_overflow,
|
||||||
|
step: ci.step,
|
||||||
|
index: &ci.index,
|
||||||
|
n: self.n_rows,
|
||||||
}
|
}
|
||||||
overflow
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||||
let ci = &self.columns[c];
|
let col = self.col_slice(c);
|
||||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
let overflow: HashMap<usize, u32> = col.overflow_entries().collect();
|
||||||
PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path)
|
PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
||||||
let ci = &self.columns[c];
|
MemoryIntVec::from(&self.col_slice(c))
|
||||||
let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
|
|
||||||
MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
||||||
let ci = &self.columns[col];
|
self.col_slice(col).get(slot)
|
||||||
let v = self.mmap[ci.primary_start + slot];
|
|
||||||
if v < 255 { return v as u32; }
|
|
||||||
self.overflow_get(ci, slot)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 {
|
|
||||||
let (pos_start, pos_end) = if ci.step == 0 {
|
|
||||||
(0, ci.n_overflow)
|
|
||||||
} else {
|
|
||||||
let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
|
||||||
let start = ci.index[i].1;
|
|
||||||
let end = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow };
|
|
||||||
(start, end)
|
|
||||||
};
|
|
||||||
let mut lo = pos_start;
|
|
||||||
let mut hi = pos_end;
|
|
||||||
while lo < hi {
|
|
||||||
let mid = lo + (hi - lo) / 2;
|
|
||||||
let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE;
|
|
||||||
let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
|
|
||||||
match stored.cmp(&slot) {
|
|
||||||
Ordering::Equal => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()),
|
|
||||||
Ordering::Less => lo = mid + 1,
|
|
||||||
Ordering::Greater => hi = mid,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
panic!("slot {slot} marked overflow but not found")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
@@ -230,73 +306,62 @@ impl PackedCompactIntMatrix {
|
|||||||
|
|
||||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
self.columns.par_iter()
|
(0..self.n_cols).into_par_iter()
|
||||||
.map(|ci| {
|
.map(|c| self.col_slice(c).sum())
|
||||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
|
||||||
let overflow = (0..ci.n_overflow)
|
|
||||||
.map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1);
|
|
||||||
byte_sum(primary, overflow)
|
|
||||||
})
|
|
||||||
.collect()
|
.collect()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
self.columns.par_iter()
|
(0..self.n_cols).into_par_iter()
|
||||||
.map(|ci| {
|
.map(|c| self.col_slice(c).count_nonzero())
|
||||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
|
||||||
byte_count_nonzero(primary)
|
|
||||||
})
|
|
||||||
.collect()
|
.collect()
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Pair primitives ───────────────────────────────────────────────────────
|
// ── Pair primitives — sequential scan via col_slice().iter() ─────────────
|
||||||
|
|
||||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||||
(0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum()
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
|
.map(|(a, b)| a.min(b) as u64)
|
||||||
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
let d = self.get(i, s) as f64 - self.get(j, s) as f64;
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||||
d * d
|
.sum()
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
||||||
let (mut inter, mut union) = (0u64, 0u64);
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
for s in 0..self.n_rows {
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
let a = self.get(i, s) >= t;
|
let ap = a >= t;
|
||||||
let b = self.get(j, s) >= t;
|
let bp = b >= t;
|
||||||
if a && b { inter += 1; }
|
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||||
if a || b { union += 1; }
|
})
|
||||||
}
|
|
||||||
(inter, union)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
(self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj)
|
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj))
|
||||||
}).sum()
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj;
|
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d })
|
||||||
d * d
|
.sum()
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
(0..self.n_rows).map(|s| {
|
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
||||||
let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt();
|
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d })
|
||||||
d * d
|
.sum()
|
||||||
}).sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Matrix methods ────────────────────────────────────────────────────────
|
// ── Matrix methods ────────────────────────────────────────────────────────
|
||||||
@@ -324,7 +389,6 @@ impl PackedCompactIntMatrix {
|
|||||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||||
@@ -516,4 +580,3 @@ impl PersistentCompactIntMatrixBuilder {
|
|||||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user