fix: validate packed matrix columns before repacking
Add header parsing helpers to extract column counts without memory mapping. Update packing functions to verify existing files match current metadata, preventing stale or widened-column artifacts. Extract inline tests in obilayeredmap to an external module and add comprehensive aggregation tests. Bump obikmer to 1.1.35 and clean up repository configuration.
This commit is contained in:
@@ -15,6 +15,7 @@ benchmark/simulated_data
|
|||||||
benchmark/specimen_index_presence
|
benchmark/specimen_index_presence
|
||||||
benchmark/specimen_index_count
|
benchmark/specimen_index_count
|
||||||
benchmark/global_index_presence
|
benchmark/global_index_presence
|
||||||
|
benchmark/all_specific
|
||||||
benchmark/global_index_count
|
benchmark/global_index_count
|
||||||
benchmark/stats
|
benchmark/stats
|
||||||
benchmark/reference_index
|
benchmark/reference_index
|
||||||
|
|||||||
Generated
+1
-1
@@ -1704,7 +1704,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "obikmer"
|
name = "obikmer"
|
||||||
version = "1.1.34"
|
version = "1.1.35"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"csv",
|
"csv",
|
||||||
|
|||||||
Binary file not shown.
@@ -1,5 +1,5 @@
|
|||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, BufWriter, Write as _};
|
use std::io::{self, BufWriter, Read as _, Write as _};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
@@ -171,19 +171,43 @@ impl PackedBitMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads just the `n_cols` field from an existing packed matrix's header,
|
||||||
|
/// without mapping the file. Used by `pack_bit_matrix` to tell a genuinely
|
||||||
|
/// complete pack from a stale one that predates a later column-widening.
|
||||||
|
fn packed_bit_matrix_n_cols(path: &Path) -> io::Result<usize> {
|
||||||
|
let mut f = File::open(path)?;
|
||||||
|
let mut header = [0u8; PBMX_HEADER];
|
||||||
|
f.read_exact(&mut header)?;
|
||||||
|
Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize)
|
||||||
|
}
|
||||||
|
|
||||||
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
||||||
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
||||||
let packed_path = dir.join("matrix.pbmx");
|
let packed_path = dir.join("matrix.pbmx");
|
||||||
if packed_path.exists() {
|
|
||||||
// Matrix complete; remove any leftover column files from a killed cleanup.
|
let meta = match MatrixMeta::load(dir) {
|
||||||
if let Ok(meta) = MatrixMeta::load(dir) {
|
Ok(meta) => meta,
|
||||||
|
Err(e) => {
|
||||||
|
// No columnar data pending: either this layer was already
|
||||||
|
// packed and cleaned up (matrix.pbmx complete, nothing left to
|
||||||
|
// do), or genuinely nothing was ever written here.
|
||||||
|
return if packed_path.exists() { Ok(()) } else { Err(e) };
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// A `matrix.pbmx` can already exist here even though columnar data is
|
||||||
|
// still pending — e.g. copied verbatim from a merge's base source
|
||||||
|
// before this layer was widened with more genome columns (see
|
||||||
|
// `obikpartitionner::merge_partition`). Only skip (re-)packing if the
|
||||||
|
// existing file already reflects the current column count; otherwise
|
||||||
|
// the columnar files are newer and must be (re-)packed, overwriting the
|
||||||
|
// stale one — never silently discarded as "leftover cleanup".
|
||||||
|
if packed_bit_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) {
|
||||||
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||||
let _ = fs::remove_file(dir.join("meta.json"));
|
let _ = fs::remove_file(dir.join("meta.json"));
|
||||||
}
|
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let meta = MatrixMeta::load(dir)?;
|
|
||||||
let n_cols = meta.n_cols;
|
let n_cols = meta.n_cols;
|
||||||
|
|
||||||
// Compute offsets from file sizes — no column data loaded into RAM.
|
// Compute offsets from file sizes — no column data loaded into RAM.
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, BufWriter, Write as _};
|
use std::io::{self, BufWriter, Read as _, Write as _};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
@@ -228,17 +228,44 @@ impl PackedCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads just the `n_cols` field from an existing packed matrix's header,
|
||||||
|
/// without mapping the file. Used by `pack_compact_int_matrix` to tell a
|
||||||
|
/// genuinely complete pack from a stale one that predates a later
|
||||||
|
/// column-widening.
|
||||||
|
fn packed_int_matrix_n_cols(path: &Path) -> io::Result<usize> {
|
||||||
|
let mut f = File::open(path)?;
|
||||||
|
let mut header = [0u8; PCMX_HEADER];
|
||||||
|
f.read_exact(&mut header)?;
|
||||||
|
Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize)
|
||||||
|
}
|
||||||
|
|
||||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||||
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||||
let packed_path = dir.join("matrix.pcmx");
|
let packed_path = dir.join("matrix.pcmx");
|
||||||
if packed_path.exists() {
|
|
||||||
if let Ok(meta) = MatrixMeta::load(dir) {
|
let meta = match MatrixMeta::load(dir) {
|
||||||
|
Ok(meta) => meta,
|
||||||
|
Err(e) => {
|
||||||
|
// No columnar data pending: either this layer was already
|
||||||
|
// packed and cleaned up (matrix.pcmx complete, nothing left to
|
||||||
|
// do), or genuinely nothing was ever written here.
|
||||||
|
return if packed_path.exists() { Ok(()) } else { Err(e) };
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// A `matrix.pcmx` can already exist here even though columnar data is
|
||||||
|
// still pending — e.g. copied verbatim from a merge's base source
|
||||||
|
// before this layer was widened with more genome columns (see
|
||||||
|
// `obikpartitionner::merge_partition`). Only skip (re-)packing if the
|
||||||
|
// existing file already reflects the current column count; otherwise
|
||||||
|
// the columnar files are newer and must be (re-)packed, overwriting the
|
||||||
|
// stale one — never silently discarded as "leftover cleanup".
|
||||||
|
if packed_int_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) {
|
||||||
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||||
let _ = fs::remove_file(dir.join("meta.json"));
|
let _ = fs::remove_file(dir.join("meta.json"));
|
||||||
}
|
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
let meta = MatrixMeta::load(dir)?;
|
|
||||||
let n_cols = meta.n_cols;
|
let n_cols = meta.n_cols;
|
||||||
let col_sizes: Vec<u64> = (0..n_cols)
|
let col_sizes: Vec<u64> = (0..n_cols)
|
||||||
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "obikmer"
|
name = "obikmer"
|
||||||
version = "1.1.34"
|
version = "1.1.35"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
|
|||||||
@@ -96,162 +96,5 @@ impl<S: BitPartials> BitPartials for LayeredStore<S> {
|
|||||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
#[path = "tests/layered_store.rs"]
|
||||||
use super::*;
|
mod tests;
|
||||||
use obicompactvec::{
|
|
||||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
|
||||||
};
|
|
||||||
use tempfile::tempdir;
|
|
||||||
|
|
||||||
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
|
||||||
let n = cols.first().map_or(0, |c| c.len());
|
|
||||||
let dir = tempdir().unwrap();
|
|
||||||
let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap();
|
|
||||||
for &col in cols {
|
|
||||||
let mut cb = b.add_col().unwrap();
|
|
||||||
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
|
||||||
cb.close().unwrap();
|
|
||||||
}
|
|
||||||
b.close().unwrap();
|
|
||||||
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
|
|
||||||
(dir, m)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
|
||||||
let n = cols.first().map_or(0, |c| c.len());
|
|
||||||
let dir = tempdir().unwrap();
|
|
||||||
let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap();
|
|
||||||
for &col in cols {
|
|
||||||
let mut cb = b.add_col().unwrap();
|
|
||||||
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
|
||||||
cb.close().unwrap();
|
|
||||||
}
|
|
||||||
b.close().unwrap();
|
|
||||||
let m = PersistentBitMatrix::open(dir.path()).unwrap();
|
|
||||||
(dir, m)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── ColumnWeights ─────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn col_weights_sums_across_layers() {
|
|
||||||
// layer 0: col0=[1,2], col1=[3,4] → weights [3, 7]
|
|
||||||
// layer 1: col0=[10,0], col1=[0,10] → weights [10, 10]
|
|
||||||
// combined: [13, 17]
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]);
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
let w = store.col_weights();
|
|
||||||
assert_eq!(w[0], 13);
|
|
||||||
assert_eq!(w[1], 17);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn col_weights_bit_sums_across_layers() {
|
|
||||||
// layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2]
|
|
||||||
// layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2]
|
|
||||||
// combined: [3, 4]
|
|
||||||
let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]);
|
|
||||||
let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
let w = store.col_weights();
|
|
||||||
assert_eq!(w[0], 3);
|
|
||||||
assert_eq!(w[1], 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── CountPartials — layered (one partition) ───────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_bray_matches_combined() {
|
|
||||||
// Split [1,2,3,4,5] across two layers; bray dist should equal direct computation
|
|
||||||
// on [1,2,3,4,5] for each column pair.
|
|
||||||
// col0=[1,2,3,4,5], col1=[5,4,3,2,1]
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
// direct on full data
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
|
||||||
let expected = CountPartials::bray_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::bray_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]");
|
|
||||||
assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_relfreq_bray_matches_combined() {
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]);
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
|
||||||
let expected = CountPartials::relfreq_bray_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::relfreq_bray_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_euclidean_matches_combined() {
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]);
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]);
|
|
||||||
let expected = CountPartials::euclidean_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::euclidean_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── CountPartials — partitioned (LayeredStore<LayeredStore<_>>) ───────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn partitioned_bray_matches_combined() {
|
|
||||||
// partition 0: slots [1,2,3,4,5] col0 vs col1
|
|
||||||
// partition 1: slots [10,20] col0 vs col1
|
|
||||||
let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
|
||||||
let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]);
|
|
||||||
|
|
||||||
let partitioned = LayeredStore::new(vec![
|
|
||||||
LayeredStore::new(vec![p0]),
|
|
||||||
LayeredStore::new(vec![p1]),
|
|
||||||
]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]);
|
|
||||||
let expected = CountPartials::bray_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::bray_dist_matrix(&partitioned);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── BitPartials ───────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_jaccard_matches_combined() {
|
|
||||||
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
|
||||||
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[true, false]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_bit_matrix(&[
|
|
||||||
&[true, false, true, true],
|
|
||||||
&[false, true, true, false],
|
|
||||||
]);
|
|
||||||
let expected = BitPartials::jaccard_dist_matrix(&mf);
|
|
||||||
let got = BitPartials::jaccard_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "jaccard [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_hamming_matches_combined() {
|
|
||||||
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
|
||||||
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[false, false]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_bit_matrix(&[
|
|
||||||
&[true, false, true, true],
|
|
||||||
&[false, true, false, false],
|
|
||||||
]);
|
|
||||||
let expected = BitPartials::hamming_dist_matrix(&mf);
|
|
||||||
let got = BitPartials::hamming_dist_matrix(&store);
|
|
||||||
assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -0,0 +1,381 @@
|
|||||||
|
use super::*;
|
||||||
|
use obicompactvec::{
|
||||||
|
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||||
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||||
|
};
|
||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||||
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap();
|
||||||
|
for &col in cols {
|
||||||
|
let mut cb = b.add_col().unwrap();
|
||||||
|
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||||
|
cb.close().unwrap();
|
||||||
|
}
|
||||||
|
b.close().unwrap();
|
||||||
|
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
|
||||||
|
(dir, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||||
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap();
|
||||||
|
for &col in cols {
|
||||||
|
let mut cb = b.add_col().unwrap();
|
||||||
|
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||||
|
cb.close().unwrap();
|
||||||
|
}
|
||||||
|
b.close().unwrap();
|
||||||
|
let m = PersistentBitMatrix::open(dir.path()).unwrap();
|
||||||
|
(dir, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── ColumnWeights ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn col_weights_sums_across_layers() {
|
||||||
|
// layer 0: col0=[1,2], col1=[3,4] → weights [3, 7]
|
||||||
|
// layer 1: col0=[10,0], col1=[0,10] → weights [10, 10]
|
||||||
|
// combined: [13, 17]
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]);
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
let w = store.col_weights();
|
||||||
|
assert_eq!(w[0], 13);
|
||||||
|
assert_eq!(w[1], 17);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn col_weights_bit_sums_across_layers() {
|
||||||
|
// layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2]
|
||||||
|
// layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2]
|
||||||
|
// combined: [3, 4]
|
||||||
|
let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]);
|
||||||
|
let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
let w = store.col_weights();
|
||||||
|
assert_eq!(w[0], 3);
|
||||||
|
assert_eq!(w[1], 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── CountPartials — layered (one partition) ───────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_bray_matches_combined() {
|
||||||
|
// Split [1,2,3,4,5] across two layers; bray dist should equal direct computation
|
||||||
|
// on [1,2,3,4,5] for each column pair.
|
||||||
|
// col0=[1,2,3,4,5], col1=[5,4,3,2,1]
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
// direct on full data
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
||||||
|
let expected = CountPartials::bray_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::bray_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]");
|
||||||
|
assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_relfreq_bray_matches_combined() {
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]);
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
||||||
|
let expected = CountPartials::relfreq_bray_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::relfreq_bray_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_euclidean_matches_combined() {
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]);
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]);
|
||||||
|
let expected = CountPartials::euclidean_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::euclidean_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── CountPartials — partitioned (LayeredStore<LayeredStore<_>>) ───────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_bray_matches_combined() {
|
||||||
|
// partition 0: slots [1,2,3,4,5] col0 vs col1
|
||||||
|
// partition 1: slots [10,20] col0 vs col1
|
||||||
|
let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
||||||
|
let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]);
|
||||||
|
let expected = CountPartials::bray_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::bray_dist_matrix(&partitioned);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_threshold_jaccard_off_diagonal_is_pairwise() {
|
||||||
|
// 3 genomes, 2 partitions, 1 layer each — mirrors distance.rs's
|
||||||
|
// LayeredStore<LayeredStore<PersistentCompactIntMatrix>> shape.
|
||||||
|
// partition 0: col0=[3,0], col1=[0,3], col2=[3,3]
|
||||||
|
// partition 1: col0=[1,1], col1=[1,0], col2=[0,1]
|
||||||
|
let (_d0, p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
|
||||||
|
let (_d1, p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]);
|
||||||
|
let threshold = 1u32;
|
||||||
|
let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold);
|
||||||
|
let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold);
|
||||||
|
|
||||||
|
let n = 3;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_threshold_jaccard_packed_off_diagonal_is_pairwise() {
|
||||||
|
// Same as `partitioned_threshold_jaccard_off_diagonal_is_pairwise` but
|
||||||
|
// each partition matrix is packed into a single .pcmx file first —
|
||||||
|
// the on-disk format actually used in production after `pack_matrices`.
|
||||||
|
use obicompactvec::pack_compact_int_matrix;
|
||||||
|
|
||||||
|
let (d0, _p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
|
||||||
|
pack_compact_int_matrix(&d0.path().join("counts")).unwrap();
|
||||||
|
let p0 = PersistentCompactIntMatrix::open(d0.path()).unwrap();
|
||||||
|
|
||||||
|
let (d1, _p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
|
||||||
|
pack_compact_int_matrix(&d1.path().join("counts")).unwrap();
|
||||||
|
let p1 = PersistentCompactIntMatrix::open(d1.path()).unwrap();
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]);
|
||||||
|
let threshold = 1u32;
|
||||||
|
let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold);
|
||||||
|
let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold);
|
||||||
|
|
||||||
|
let n = 3;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_multilayer_threshold_jaccard_off_diagonal_is_pairwise() {
|
||||||
|
// 2 partitions, 2 layers each — the shape production indexes actually
|
||||||
|
// have (MPHF collision layers within a partition).
|
||||||
|
// partition 0, layer 0: col0=[3,0], col1=[0,3], col2=[3,3]
|
||||||
|
// partition 0, layer 1: col0=[2,0], col1=[0,0], col2=[2,0]
|
||||||
|
// partition 1, layer 0: col0=[1,1], col1=[1,0], col2=[0,1]
|
||||||
|
// partition 1, layer 1: col0=[0,5], col1=[5,5], col2=[0,0]
|
||||||
|
let (_d0a, p0a) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
|
||||||
|
let (_d0b, p0b) = make_int_matrix(&[&[2, 0], &[0, 0], &[2, 0]]);
|
||||||
|
let (_d1a, p1a) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
|
||||||
|
let (_d1b, p1b) = make_int_matrix(&[&[0, 5], &[5, 5], &[0, 0]]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0a, p0b]),
|
||||||
|
LayeredStore::new(vec![p1a, p1b]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Flattened equivalent: concatenate every layer's slots into one matrix.
|
||||||
|
let (_df, mf) = make_int_matrix(&[
|
||||||
|
&[3, 0, 2, 0, 1, 1, 0, 5],
|
||||||
|
&[0, 3, 0, 0, 1, 0, 5, 5],
|
||||||
|
&[3, 3, 2, 0, 0, 1, 0, 0],
|
||||||
|
]);
|
||||||
|
let threshold = 1u32;
|
||||||
|
let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold);
|
||||||
|
let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold);
|
||||||
|
|
||||||
|
let n = 3;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── BitPartials ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_jaccard_matches_combined() {
|
||||||
|
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
||||||
|
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[true, false]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true],
|
||||||
|
&[false, true, true, false],
|
||||||
|
]);
|
||||||
|
let expected = BitPartials::jaccard_dist_matrix(&mf);
|
||||||
|
let got = BitPartials::jaccard_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "jaccard [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_hamming_matches_combined() {
|
||||||
|
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
||||||
|
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[false, false]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true],
|
||||||
|
&[false, true, false, false],
|
||||||
|
]);
|
||||||
|
let expected = BitPartials::hamming_dist_matrix(&mf);
|
||||||
|
let got = BitPartials::hamming_dist_matrix(&store);
|
||||||
|
assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_bit_jaccard_off_diagonal_is_pairwise() {
|
||||||
|
// Same shape as the count-based `partitioned_multilayer_threshold_jaccard_*`
|
||||||
|
// tests, but for the presence/bit path (`with_counts = false` — what
|
||||||
|
// `all_specifics` actually uses in production).
|
||||||
|
// 4 genomes, 3 partitions, 2 layers in the last one.
|
||||||
|
let (_d0, p0) = make_bit_matrix(&[
|
||||||
|
&[true, false, true],
|
||||||
|
&[false, true, true],
|
||||||
|
&[true, true, false],
|
||||||
|
&[false, false, true],
|
||||||
|
]);
|
||||||
|
let (_d1, p1) = make_bit_matrix(&[
|
||||||
|
&[true, true],
|
||||||
|
&[false, true],
|
||||||
|
&[true, false],
|
||||||
|
&[true, true],
|
||||||
|
]);
|
||||||
|
let (_d2a, p2a) = make_bit_matrix(&[
|
||||||
|
&[false, true],
|
||||||
|
&[true, true],
|
||||||
|
&[false, false],
|
||||||
|
&[true, false],
|
||||||
|
]);
|
||||||
|
let (_d2b, p2b) = make_bit_matrix(&[
|
||||||
|
&[true],
|
||||||
|
&[false],
|
||||||
|
&[true],
|
||||||
|
&[true],
|
||||||
|
]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
LayeredStore::new(vec![p2a, p2b]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Flattened equivalent: concatenate every partition/layer's slots.
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true, true, false, true, true],
|
||||||
|
&[false, true, true, false, true, true, true, false],
|
||||||
|
&[true, true, false, true, false, false, false, true],
|
||||||
|
&[false, false, true, true, true, true, false, true],
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (inter_p, union_p) = BitPartials::partial_jaccard(&partitioned);
|
||||||
|
let (inter_f, union_f) = BitPartials::partial_jaccard(&mf);
|
||||||
|
|
||||||
|
let n = 4;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_bit_jaccard_packed_off_diagonal_is_pairwise() {
|
||||||
|
// Same as `partitioned_bit_jaccard_off_diagonal_is_pairwise` but every
|
||||||
|
// partition's presence matrix is packed into a single .pbmx file —
|
||||||
|
// the on-disk format actually used in production after `pack_matrices`.
|
||||||
|
use obicompactvec::pack_bit_matrix;
|
||||||
|
|
||||||
|
let (d0, _p0) = make_bit_matrix(&[
|
||||||
|
&[true, false, true],
|
||||||
|
&[false, true, true],
|
||||||
|
&[true, true, false],
|
||||||
|
&[false, false, true],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d0.path().join("presence")).unwrap();
|
||||||
|
let p0 = PersistentBitMatrix::open(d0.path()).unwrap();
|
||||||
|
|
||||||
|
let (d1, _p1) = make_bit_matrix(&[
|
||||||
|
&[true, true],
|
||||||
|
&[false, true],
|
||||||
|
&[true, false],
|
||||||
|
&[true, true],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d1.path().join("presence")).unwrap();
|
||||||
|
let p1 = PersistentBitMatrix::open(d1.path()).unwrap();
|
||||||
|
|
||||||
|
let (d2a, _p2a) = make_bit_matrix(&[
|
||||||
|
&[false, true],
|
||||||
|
&[true, true],
|
||||||
|
&[false, false],
|
||||||
|
&[true, false],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d2a.path().join("presence")).unwrap();
|
||||||
|
let p2a = PersistentBitMatrix::open(d2a.path()).unwrap();
|
||||||
|
|
||||||
|
let (d2b, _p2b) = make_bit_matrix(&[
|
||||||
|
&[true],
|
||||||
|
&[false],
|
||||||
|
&[true],
|
||||||
|
&[true],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d2b.path().join("presence")).unwrap();
|
||||||
|
let p2b = PersistentBitMatrix::open(d2b.path()).unwrap();
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
LayeredStore::new(vec![p2a, p2b]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true, true, false, true, true],
|
||||||
|
&[false, true, true, false, true, true, true, false],
|
||||||
|
&[true, true, false, true, false, false, false, true],
|
||||||
|
&[false, false, true, true, true, true, false, true],
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (inter_p, union_p) = BitPartials::partial_jaccard(&partitioned);
|
||||||
|
let (inter_f, union_f) = BitPartials::partial_jaccard(&mf);
|
||||||
|
|
||||||
|
let n = 4;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user