Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5bdc0f826a | |||
| cd2f2f9417 | |||
| 7844239a8e |
@@ -15,6 +15,7 @@ benchmark/simulated_data
|
|||||||
benchmark/specimen_index_presence
|
benchmark/specimen_index_presence
|
||||||
benchmark/specimen_index_count
|
benchmark/specimen_index_count
|
||||||
benchmark/global_index_presence
|
benchmark/global_index_presence
|
||||||
|
benchmark/all_specific
|
||||||
benchmark/global_index_count
|
benchmark/global_index_count
|
||||||
benchmark/stats
|
benchmark/stats
|
||||||
benchmark/reference_index
|
benchmark/reference_index
|
||||||
|
|||||||
Generated
+1
-1
@@ -1704,7 +1704,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "obikmer"
|
name = "obikmer"
|
||||||
version = "1.1.34"
|
version = "1.1.35"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"csv",
|
"csv",
|
||||||
|
|||||||
Binary file not shown.
@@ -1,5 +1,5 @@
|
|||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, BufWriter, Write as _};
|
use std::io::{self, BufWriter, Read as _, Write as _};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
@@ -171,19 +171,43 @@ impl PackedBitMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads just the `n_cols` field from an existing packed matrix's header,
|
||||||
|
/// without mapping the file. Used by `pack_bit_matrix` to tell a genuinely
|
||||||
|
/// complete pack from a stale one that predates a later column-widening.
|
||||||
|
fn packed_bit_matrix_n_cols(path: &Path) -> io::Result<usize> {
|
||||||
|
let mut f = File::open(path)?;
|
||||||
|
let mut header = [0u8; PBMX_HEADER];
|
||||||
|
f.read_exact(&mut header)?;
|
||||||
|
Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize)
|
||||||
|
}
|
||||||
|
|
||||||
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
||||||
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
||||||
let packed_path = dir.join("matrix.pbmx");
|
let packed_path = dir.join("matrix.pbmx");
|
||||||
if packed_path.exists() {
|
|
||||||
// Matrix complete; remove any leftover column files from a killed cleanup.
|
let meta = match MatrixMeta::load(dir) {
|
||||||
if let Ok(meta) = MatrixMeta::load(dir) {
|
Ok(meta) => meta,
|
||||||
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
Err(e) => {
|
||||||
let _ = fs::remove_file(dir.join("meta.json"));
|
// No columnar data pending: either this layer was already
|
||||||
|
// packed and cleaned up (matrix.pbmx complete, nothing left to
|
||||||
|
// do), or genuinely nothing was ever written here.
|
||||||
|
return if packed_path.exists() { Ok(()) } else { Err(e) };
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// A `matrix.pbmx` can already exist here even though columnar data is
|
||||||
|
// still pending — e.g. copied verbatim from a merge's base source
|
||||||
|
// before this layer was widened with more genome columns (see
|
||||||
|
// `obikpartitionner::merge_partition`). Only skip (re-)packing if the
|
||||||
|
// existing file already reflects the current column count; otherwise
|
||||||
|
// the columnar files are newer and must be (re-)packed, overwriting the
|
||||||
|
// stale one — never silently discarded as "leftover cleanup".
|
||||||
|
if packed_bit_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) {
|
||||||
|
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||||
|
let _ = fs::remove_file(dir.join("meta.json"));
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let meta = MatrixMeta::load(dir)?;
|
|
||||||
let n_cols = meta.n_cols;
|
let n_cols = meta.n_cols;
|
||||||
|
|
||||||
// Compute offsets from file sizes — no column data loaded into RAM.
|
// Compute offsets from file sizes — no column data loaded into RAM.
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, BufWriter, Write as _};
|
use std::io::{self, BufWriter, Read as _, Write as _};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
@@ -228,17 +228,44 @@ impl PackedCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads just the `n_cols` field from an existing packed matrix's header,
|
||||||
|
/// without mapping the file. Used by `pack_compact_int_matrix` to tell a
|
||||||
|
/// genuinely complete pack from a stale one that predates a later
|
||||||
|
/// column-widening.
|
||||||
|
fn packed_int_matrix_n_cols(path: &Path) -> io::Result<usize> {
|
||||||
|
let mut f = File::open(path)?;
|
||||||
|
let mut header = [0u8; PCMX_HEADER];
|
||||||
|
f.read_exact(&mut header)?;
|
||||||
|
Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize)
|
||||||
|
}
|
||||||
|
|
||||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||||
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||||
let packed_path = dir.join("matrix.pcmx");
|
let packed_path = dir.join("matrix.pcmx");
|
||||||
if packed_path.exists() {
|
|
||||||
if let Ok(meta) = MatrixMeta::load(dir) {
|
let meta = match MatrixMeta::load(dir) {
|
||||||
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
Ok(meta) => meta,
|
||||||
let _ = fs::remove_file(dir.join("meta.json"));
|
Err(e) => {
|
||||||
|
// No columnar data pending: either this layer was already
|
||||||
|
// packed and cleaned up (matrix.pcmx complete, nothing left to
|
||||||
|
// do), or genuinely nothing was ever written here.
|
||||||
|
return if packed_path.exists() { Ok(()) } else { Err(e) };
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// A `matrix.pcmx` can already exist here even though columnar data is
|
||||||
|
// still pending — e.g. copied verbatim from a merge's base source
|
||||||
|
// before this layer was widened with more genome columns (see
|
||||||
|
// `obikpartitionner::merge_partition`). Only skip (re-)packing if the
|
||||||
|
// existing file already reflects the current column count; otherwise
|
||||||
|
// the columnar files are newer and must be (re-)packed, overwriting the
|
||||||
|
// stale one — never silently discarded as "leftover cleanup".
|
||||||
|
if packed_int_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) {
|
||||||
|
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||||
|
let _ = fs::remove_file(dir.join("meta.json"));
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
let meta = MatrixMeta::load(dir)?;
|
|
||||||
let n_cols = meta.n_cols;
|
let n_cols = meta.n_cols;
|
||||||
let col_sizes: Vec<u64> = (0..n_cols)
|
let col_sizes: Vec<u64> = (0..n_cols)
|
||||||
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "obikmer"
|
name = "obikmer"
|
||||||
version = "1.1.34"
|
version = "1.1.35"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
|
|||||||
@@ -96,162 +96,5 @@ impl<S: BitPartials> BitPartials for LayeredStore<S> {
|
|||||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
#[path = "tests/layered_store.rs"]
|
||||||
use super::*;
|
mod tests;
|
||||||
use obicompactvec::{
|
|
||||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
|
||||||
};
|
|
||||||
use tempfile::tempdir;
|
|
||||||
|
|
||||||
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
|
||||||
let n = cols.first().map_or(0, |c| c.len());
|
|
||||||
let dir = tempdir().unwrap();
|
|
||||||
let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap();
|
|
||||||
for &col in cols {
|
|
||||||
let mut cb = b.add_col().unwrap();
|
|
||||||
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
|
||||||
cb.close().unwrap();
|
|
||||||
}
|
|
||||||
b.close().unwrap();
|
|
||||||
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
|
|
||||||
(dir, m)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
|
||||||
let n = cols.first().map_or(0, |c| c.len());
|
|
||||||
let dir = tempdir().unwrap();
|
|
||||||
let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap();
|
|
||||||
for &col in cols {
|
|
||||||
let mut cb = b.add_col().unwrap();
|
|
||||||
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
|
||||||
cb.close().unwrap();
|
|
||||||
}
|
|
||||||
b.close().unwrap();
|
|
||||||
let m = PersistentBitMatrix::open(dir.path()).unwrap();
|
|
||||||
(dir, m)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── ColumnWeights ─────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn col_weights_sums_across_layers() {
|
|
||||||
// layer 0: col0=[1,2], col1=[3,4] → weights [3, 7]
|
|
||||||
// layer 1: col0=[10,0], col1=[0,10] → weights [10, 10]
|
|
||||||
// combined: [13, 17]
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]);
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
let w = store.col_weights();
|
|
||||||
assert_eq!(w[0], 13);
|
|
||||||
assert_eq!(w[1], 17);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn col_weights_bit_sums_across_layers() {
|
|
||||||
// layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2]
|
|
||||||
// layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2]
|
|
||||||
// combined: [3, 4]
|
|
||||||
let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]);
|
|
||||||
let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
let w = store.col_weights();
|
|
||||||
assert_eq!(w[0], 3);
|
|
||||||
assert_eq!(w[1], 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── CountPartials — layered (one partition) ───────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_bray_matches_combined() {
|
|
||||||
// Split [1,2,3,4,5] across two layers; bray dist should equal direct computation
|
|
||||||
// on [1,2,3,4,5] for each column pair.
|
|
||||||
// col0=[1,2,3,4,5], col1=[5,4,3,2,1]
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
// direct on full data
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
|
||||||
let expected = CountPartials::bray_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::bray_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]");
|
|
||||||
assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_relfreq_bray_matches_combined() {
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]);
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
|
||||||
let expected = CountPartials::relfreq_bray_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::relfreq_bray_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_euclidean_matches_combined() {
|
|
||||||
let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]);
|
|
||||||
let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]);
|
|
||||||
let expected = CountPartials::euclidean_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::euclidean_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── CountPartials — partitioned (LayeredStore<LayeredStore<_>>) ───────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn partitioned_bray_matches_combined() {
|
|
||||||
// partition 0: slots [1,2,3,4,5] col0 vs col1
|
|
||||||
// partition 1: slots [10,20] col0 vs col1
|
|
||||||
let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
|
||||||
let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]);
|
|
||||||
|
|
||||||
let partitioned = LayeredStore::new(vec![
|
|
||||||
LayeredStore::new(vec![p0]),
|
|
||||||
LayeredStore::new(vec![p1]),
|
|
||||||
]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]);
|
|
||||||
let expected = CountPartials::bray_dist_matrix(&mf);
|
|
||||||
let got = CountPartials::bray_dist_matrix(&partitioned);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── BitPartials ───────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_jaccard_matches_combined() {
|
|
||||||
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
|
||||||
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[true, false]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_bit_matrix(&[
|
|
||||||
&[true, false, true, true],
|
|
||||||
&[false, true, true, false],
|
|
||||||
]);
|
|
||||||
let expected = BitPartials::jaccard_dist_matrix(&mf);
|
|
||||||
let got = BitPartials::jaccard_dist_matrix(&store);
|
|
||||||
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "jaccard [0,1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn layered_hamming_matches_combined() {
|
|
||||||
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
|
||||||
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[false, false]]);
|
|
||||||
let store = LayeredStore::new(vec![m0, m1]);
|
|
||||||
|
|
||||||
let (_df, mf) = make_bit_matrix(&[
|
|
||||||
&[true, false, true, true],
|
|
||||||
&[false, true, false, false],
|
|
||||||
]);
|
|
||||||
let expected = BitPartials::hamming_dist_matrix(&mf);
|
|
||||||
let got = BitPartials::hamming_dist_matrix(&store);
|
|
||||||
assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -0,0 +1,381 @@
|
|||||||
|
use super::*;
|
||||||
|
use obicompactvec::{
|
||||||
|
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||||
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||||
|
};
|
||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||||
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap();
|
||||||
|
for &col in cols {
|
||||||
|
let mut cb = b.add_col().unwrap();
|
||||||
|
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||||
|
cb.close().unwrap();
|
||||||
|
}
|
||||||
|
b.close().unwrap();
|
||||||
|
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
|
||||||
|
(dir, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||||
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap();
|
||||||
|
for &col in cols {
|
||||||
|
let mut cb = b.add_col().unwrap();
|
||||||
|
for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); }
|
||||||
|
cb.close().unwrap();
|
||||||
|
}
|
||||||
|
b.close().unwrap();
|
||||||
|
let m = PersistentBitMatrix::open(dir.path()).unwrap();
|
||||||
|
(dir, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── ColumnWeights ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn col_weights_sums_across_layers() {
|
||||||
|
// layer 0: col0=[1,2], col1=[3,4] → weights [3, 7]
|
||||||
|
// layer 1: col0=[10,0], col1=[0,10] → weights [10, 10]
|
||||||
|
// combined: [13, 17]
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]);
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
let w = store.col_weights();
|
||||||
|
assert_eq!(w[0], 13);
|
||||||
|
assert_eq!(w[1], 17);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn col_weights_bit_sums_across_layers() {
|
||||||
|
// layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2]
|
||||||
|
// layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2]
|
||||||
|
// combined: [3, 4]
|
||||||
|
let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]);
|
||||||
|
let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
let w = store.col_weights();
|
||||||
|
assert_eq!(w[0], 3);
|
||||||
|
assert_eq!(w[1], 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── CountPartials — layered (one partition) ───────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_bray_matches_combined() {
|
||||||
|
// Split [1,2,3,4,5] across two layers; bray dist should equal direct computation
|
||||||
|
// on [1,2,3,4,5] for each column pair.
|
||||||
|
// col0=[1,2,3,4,5], col1=[5,4,3,2,1]
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
// direct on full data
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
||||||
|
let expected = CountPartials::bray_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::bray_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]");
|
||||||
|
assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_relfreq_bray_matches_combined() {
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]);
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
||||||
|
let expected = CountPartials::relfreq_bray_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::relfreq_bray_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_euclidean_matches_combined() {
|
||||||
|
let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]);
|
||||||
|
let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]);
|
||||||
|
let expected = CountPartials::euclidean_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::euclidean_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── CountPartials — partitioned (LayeredStore<LayeredStore<_>>) ───────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_bray_matches_combined() {
|
||||||
|
// partition 0: slots [1,2,3,4,5] col0 vs col1
|
||||||
|
// partition 1: slots [10,20] col0 vs col1
|
||||||
|
let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
||||||
|
let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]);
|
||||||
|
let expected = CountPartials::bray_dist_matrix(&mf);
|
||||||
|
let got = CountPartials::bray_dist_matrix(&partitioned);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_threshold_jaccard_off_diagonal_is_pairwise() {
|
||||||
|
// 3 genomes, 2 partitions, 1 layer each — mirrors distance.rs's
|
||||||
|
// LayeredStore<LayeredStore<PersistentCompactIntMatrix>> shape.
|
||||||
|
// partition 0: col0=[3,0], col1=[0,3], col2=[3,3]
|
||||||
|
// partition 1: col0=[1,1], col1=[1,0], col2=[0,1]
|
||||||
|
let (_d0, p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
|
||||||
|
let (_d1, p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]);
|
||||||
|
let threshold = 1u32;
|
||||||
|
let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold);
|
||||||
|
let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold);
|
||||||
|
|
||||||
|
let n = 3;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_threshold_jaccard_packed_off_diagonal_is_pairwise() {
|
||||||
|
// Same as `partitioned_threshold_jaccard_off_diagonal_is_pairwise` but
|
||||||
|
// each partition matrix is packed into a single .pcmx file first —
|
||||||
|
// the on-disk format actually used in production after `pack_matrices`.
|
||||||
|
use obicompactvec::pack_compact_int_matrix;
|
||||||
|
|
||||||
|
let (d0, _p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
|
||||||
|
pack_compact_int_matrix(&d0.path().join("counts")).unwrap();
|
||||||
|
let p0 = PersistentCompactIntMatrix::open(d0.path()).unwrap();
|
||||||
|
|
||||||
|
let (d1, _p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
|
||||||
|
pack_compact_int_matrix(&d1.path().join("counts")).unwrap();
|
||||||
|
let p1 = PersistentCompactIntMatrix::open(d1.path()).unwrap();
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]);
|
||||||
|
let threshold = 1u32;
|
||||||
|
let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold);
|
||||||
|
let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold);
|
||||||
|
|
||||||
|
let n = 3;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_multilayer_threshold_jaccard_off_diagonal_is_pairwise() {
|
||||||
|
// 2 partitions, 2 layers each — the shape production indexes actually
|
||||||
|
// have (MPHF collision layers within a partition).
|
||||||
|
// partition 0, layer 0: col0=[3,0], col1=[0,3], col2=[3,3]
|
||||||
|
// partition 0, layer 1: col0=[2,0], col1=[0,0], col2=[2,0]
|
||||||
|
// partition 1, layer 0: col0=[1,1], col1=[1,0], col2=[0,1]
|
||||||
|
// partition 1, layer 1: col0=[0,5], col1=[5,5], col2=[0,0]
|
||||||
|
let (_d0a, p0a) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
|
||||||
|
let (_d0b, p0b) = make_int_matrix(&[&[2, 0], &[0, 0], &[2, 0]]);
|
||||||
|
let (_d1a, p1a) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
|
||||||
|
let (_d1b, p1b) = make_int_matrix(&[&[0, 5], &[5, 5], &[0, 0]]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0a, p0b]),
|
||||||
|
LayeredStore::new(vec![p1a, p1b]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Flattened equivalent: concatenate every layer's slots into one matrix.
|
||||||
|
let (_df, mf) = make_int_matrix(&[
|
||||||
|
&[3, 0, 2, 0, 1, 1, 0, 5],
|
||||||
|
&[0, 3, 0, 0, 1, 0, 5, 5],
|
||||||
|
&[3, 3, 2, 0, 0, 1, 0, 0],
|
||||||
|
]);
|
||||||
|
let threshold = 1u32;
|
||||||
|
let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold);
|
||||||
|
let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold);
|
||||||
|
|
||||||
|
let n = 3;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── BitPartials ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_jaccard_matches_combined() {
|
||||||
|
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
||||||
|
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[true, false]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true],
|
||||||
|
&[false, true, true, false],
|
||||||
|
]);
|
||||||
|
let expected = BitPartials::jaccard_dist_matrix(&mf);
|
||||||
|
let got = BitPartials::jaccard_dist_matrix(&store);
|
||||||
|
assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "jaccard [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn layered_hamming_matches_combined() {
|
||||||
|
let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]);
|
||||||
|
let (_d1, m1) = make_bit_matrix(&[&[true, true], &[false, false]]);
|
||||||
|
let store = LayeredStore::new(vec![m0, m1]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true],
|
||||||
|
&[false, true, false, false],
|
||||||
|
]);
|
||||||
|
let expected = BitPartials::hamming_dist_matrix(&mf);
|
||||||
|
let got = BitPartials::hamming_dist_matrix(&store);
|
||||||
|
assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_bit_jaccard_off_diagonal_is_pairwise() {
|
||||||
|
// Same shape as the count-based `partitioned_multilayer_threshold_jaccard_*`
|
||||||
|
// tests, but for the presence/bit path (`with_counts = false` — what
|
||||||
|
// `all_specifics` actually uses in production).
|
||||||
|
// 4 genomes, 3 partitions, 2 layers in the last one.
|
||||||
|
let (_d0, p0) = make_bit_matrix(&[
|
||||||
|
&[true, false, true],
|
||||||
|
&[false, true, true],
|
||||||
|
&[true, true, false],
|
||||||
|
&[false, false, true],
|
||||||
|
]);
|
||||||
|
let (_d1, p1) = make_bit_matrix(&[
|
||||||
|
&[true, true],
|
||||||
|
&[false, true],
|
||||||
|
&[true, false],
|
||||||
|
&[true, true],
|
||||||
|
]);
|
||||||
|
let (_d2a, p2a) = make_bit_matrix(&[
|
||||||
|
&[false, true],
|
||||||
|
&[true, true],
|
||||||
|
&[false, false],
|
||||||
|
&[true, false],
|
||||||
|
]);
|
||||||
|
let (_d2b, p2b) = make_bit_matrix(&[
|
||||||
|
&[true],
|
||||||
|
&[false],
|
||||||
|
&[true],
|
||||||
|
&[true],
|
||||||
|
]);
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
LayeredStore::new(vec![p2a, p2b]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Flattened equivalent: concatenate every partition/layer's slots.
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true, true, false, true, true],
|
||||||
|
&[false, true, true, false, true, true, true, false],
|
||||||
|
&[true, true, false, true, false, false, false, true],
|
||||||
|
&[false, false, true, true, true, true, false, true],
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (inter_p, union_p) = BitPartials::partial_jaccard(&partitioned);
|
||||||
|
let (inter_f, union_f) = BitPartials::partial_jaccard(&mf);
|
||||||
|
|
||||||
|
let n = 4;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partitioned_bit_jaccard_packed_off_diagonal_is_pairwise() {
|
||||||
|
// Same as `partitioned_bit_jaccard_off_diagonal_is_pairwise` but every
|
||||||
|
// partition's presence matrix is packed into a single .pbmx file —
|
||||||
|
// the on-disk format actually used in production after `pack_matrices`.
|
||||||
|
use obicompactvec::pack_bit_matrix;
|
||||||
|
|
||||||
|
let (d0, _p0) = make_bit_matrix(&[
|
||||||
|
&[true, false, true],
|
||||||
|
&[false, true, true],
|
||||||
|
&[true, true, false],
|
||||||
|
&[false, false, true],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d0.path().join("presence")).unwrap();
|
||||||
|
let p0 = PersistentBitMatrix::open(d0.path()).unwrap();
|
||||||
|
|
||||||
|
let (d1, _p1) = make_bit_matrix(&[
|
||||||
|
&[true, true],
|
||||||
|
&[false, true],
|
||||||
|
&[true, false],
|
||||||
|
&[true, true],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d1.path().join("presence")).unwrap();
|
||||||
|
let p1 = PersistentBitMatrix::open(d1.path()).unwrap();
|
||||||
|
|
||||||
|
let (d2a, _p2a) = make_bit_matrix(&[
|
||||||
|
&[false, true],
|
||||||
|
&[true, true],
|
||||||
|
&[false, false],
|
||||||
|
&[true, false],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d2a.path().join("presence")).unwrap();
|
||||||
|
let p2a = PersistentBitMatrix::open(d2a.path()).unwrap();
|
||||||
|
|
||||||
|
let (d2b, _p2b) = make_bit_matrix(&[
|
||||||
|
&[true],
|
||||||
|
&[false],
|
||||||
|
&[true],
|
||||||
|
&[true],
|
||||||
|
]);
|
||||||
|
pack_bit_matrix(&d2b.path().join("presence")).unwrap();
|
||||||
|
let p2b = PersistentBitMatrix::open(d2b.path()).unwrap();
|
||||||
|
|
||||||
|
let partitioned = LayeredStore::new(vec![
|
||||||
|
LayeredStore::new(vec![p0]),
|
||||||
|
LayeredStore::new(vec![p1]),
|
||||||
|
LayeredStore::new(vec![p2a, p2b]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (_df, mf) = make_bit_matrix(&[
|
||||||
|
&[true, false, true, true, true, false, true, true],
|
||||||
|
&[false, true, true, false, true, true, true, false],
|
||||||
|
&[true, true, false, true, false, false, false, true],
|
||||||
|
&[false, false, true, true, true, true, false, true],
|
||||||
|
]);
|
||||||
|
|
||||||
|
let (inter_p, union_p) = BitPartials::partial_jaccard(&partitioned);
|
||||||
|
let (inter_f, union_f) = BitPartials::partial_jaccard(&mf);
|
||||||
|
|
||||||
|
let n = 4;
|
||||||
|
for i in 0..n {
|
||||||
|
for j in 0..n {
|
||||||
|
assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
|
||||||
|
assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user