diff --git a/.gitignore b/.gitignore index b44f5fc..793b0a9 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ benchmark/simulated_data benchmark/specimen_index_presence benchmark/specimen_index_count benchmark/global_index_presence +benchmark/all_specific benchmark/global_index_count benchmark/stats benchmark/reference_index diff --git a/src/Cargo.lock b/src/Cargo.lock index 057ddea..0664684 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1704,7 +1704,7 @@ dependencies = [ [[package]] name = "obikmer" -version = "1.1.34" +version = "1.1.35" dependencies = [ "clap", "csv", diff --git a/src/Synthese.docx b/src/Synthese.docx deleted file mode 100644 index 79b1209..0000000 Binary files a/src/Synthese.docx and /dev/null differ diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index 89121da..feb7267 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -1,5 +1,5 @@ use std::fs::{self, File}; -use std::io::{self, BufWriter, Write as _}; +use std::io::{self, BufWriter, Read as _, Write as _}; use std::path::{Path, PathBuf}; use memmap2::Mmap; @@ -171,19 +171,43 @@ impl PackedBitMatrix { } } +/// Reads just the `n_cols` field from an existing packed matrix's header, +/// without mapping the file. Used by `pack_bit_matrix` to tell a genuinely +/// complete pack from a stale one that predates a later column-widening. +fn packed_bit_matrix_n_cols(path: &Path) -> io::Result { + let mut f = File::open(path)?; + let mut header = [0u8; PBMX_HEADER]; + f.read_exact(&mut header)?; + Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize) +} + /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files. pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> { let packed_path = dir.join("matrix.pbmx"); - if packed_path.exists() { - // Matrix complete; remove any leftover column files from a killed cleanup. - if let Ok(meta) = MatrixMeta::load(dir) { - for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } - let _ = fs::remove_file(dir.join("meta.json")); + + let meta = match MatrixMeta::load(dir) { + Ok(meta) => meta, + Err(e) => { + // No columnar data pending: either this layer was already + // packed and cleaned up (matrix.pbmx complete, nothing left to + // do), or genuinely nothing was ever written here. + return if packed_path.exists() { Ok(()) } else { Err(e) }; } + }; + + // A `matrix.pbmx` can already exist here even though columnar data is + // still pending — e.g. copied verbatim from a merge's base source + // before this layer was widened with more genome columns (see + // `obikpartitionner::merge_partition`). Only skip (re-)packing if the + // existing file already reflects the current column count; otherwise + // the columnar files are newer and must be (re-)packed, overwriting the + // stale one — never silently discarded as "leftover cleanup". + if packed_bit_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) { + for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } + let _ = fs::remove_file(dir.join("meta.json")); return Ok(()); } - let meta = MatrixMeta::load(dir)?; let n_cols = meta.n_cols; // Compute offsets from file sizes — no column data loaded into RAM. diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index b2fa97e..6b030e0 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -1,5 +1,5 @@ use std::fs::{self, File}; -use std::io::{self, BufWriter, Write as _}; +use std::io::{self, BufWriter, Read as _, Write as _}; use std::path::{Path, PathBuf}; use memmap2::Mmap; @@ -228,17 +228,44 @@ impl PackedCompactIntMatrix { } } +/// Reads just the `n_cols` field from an existing packed matrix's header, +/// without mapping the file. Used by `pack_compact_int_matrix` to tell a +/// genuinely complete pack from a stale one that predates a later +/// column-widening. +fn packed_int_matrix_n_cols(path: &Path) -> io::Result { + let mut f = File::open(path)?; + let mut header = [0u8; PCMX_HEADER]; + f.read_exact(&mut header)?; + Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize) +} + /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files. pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> { let packed_path = dir.join("matrix.pcmx"); - if packed_path.exists() { - if let Ok(meta) = MatrixMeta::load(dir) { - for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } - let _ = fs::remove_file(dir.join("meta.json")); + + let meta = match MatrixMeta::load(dir) { + Ok(meta) => meta, + Err(e) => { + // No columnar data pending: either this layer was already + // packed and cleaned up (matrix.pcmx complete, nothing left to + // do), or genuinely nothing was ever written here. + return if packed_path.exists() { Ok(()) } else { Err(e) }; } + }; + + // A `matrix.pcmx` can already exist here even though columnar data is + // still pending — e.g. copied verbatim from a merge's base source + // before this layer was widened with more genome columns (see + // `obikpartitionner::merge_partition`). Only skip (re-)packing if the + // existing file already reflects the current column count; otherwise + // the columnar files are newer and must be (re-)packed, overwriting the + // stale one — never silently discarded as "leftover cleanup". + if packed_int_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) { + for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } + let _ = fs::remove_file(dir.join("meta.json")); return Ok(()); } - let meta = MatrixMeta::load(dir)?; + let n_cols = meta.n_cols; let col_sizes: Vec = (0..n_cols) .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len())) diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index b164593..a38559e 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "obikmer" -version = "1.1.34" +version = "1.1.35" edition = "2024" [[bin]] diff --git a/src/obilayeredmap/src/layered_store.rs b/src/obilayeredmap/src/layered_store.rs index 433183e..4747af8 100644 --- a/src/obilayeredmap/src/layered_store.rs +++ b/src/obilayeredmap/src/layered_store.rs @@ -96,162 +96,5 @@ impl BitPartials for LayeredStore { // ── Tests ───────────────────────────────────────────────────────────────────── #[cfg(test)] -mod tests { - use super::*; - use obicompactvec::{ - PersistentBitMatrix, PersistentBitMatrixBuilder, - PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, - }; - use tempfile::tempdir; - - fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { - let n = cols.first().map_or(0, |c| c.len()); - let dir = tempdir().unwrap(); - let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap(); - for &col in cols { - let mut cb = b.add_col().unwrap(); - for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } - cb.close().unwrap(); - } - b.close().unwrap(); - let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); - (dir, m) - } - - fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { - let n = cols.first().map_or(0, |c| c.len()); - let dir = tempdir().unwrap(); - let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap(); - for &col in cols { - let mut cb = b.add_col().unwrap(); - for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } - cb.close().unwrap(); - } - b.close().unwrap(); - let m = PersistentBitMatrix::open(dir.path()).unwrap(); - (dir, m) - } - - // ── ColumnWeights ───────────────────────────────────────────────────────── - - #[test] - fn col_weights_sums_across_layers() { - // layer 0: col0=[1,2], col1=[3,4] → weights [3, 7] - // layer 1: col0=[10,0], col1=[0,10] → weights [10, 10] - // combined: [13, 17] - let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]); - let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]); - let store = LayeredStore::new(vec![m0, m1]); - let w = store.col_weights(); - assert_eq!(w[0], 13); - assert_eq!(w[1], 17); - } - - #[test] - fn col_weights_bit_sums_across_layers() { - // layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2] - // layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2] - // combined: [3, 4] - let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]); - let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]); - let store = LayeredStore::new(vec![m0, m1]); - let w = store.col_weights(); - assert_eq!(w[0], 3); - assert_eq!(w[1], 4); - } - - // ── CountPartials — layered (one partition) ─────────────────────────────── - - #[test] - fn layered_bray_matches_combined() { - // Split [1,2,3,4,5] across two layers; bray dist should equal direct computation - // on [1,2,3,4,5] for each column pair. - // col0=[1,2,3,4,5], col1=[5,4,3,2,1] - let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1 - let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4 - let store = LayeredStore::new(vec![m0, m1]); - - // direct on full data - let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); - let expected = CountPartials::bray_dist_matrix(&mf); - let got = CountPartials::bray_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]"); - assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]"); - } - - #[test] - fn layered_relfreq_bray_matches_combined() { - let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); - let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); - let expected = CountPartials::relfreq_bray_dist_matrix(&mf); - let got = CountPartials::relfreq_bray_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]"); - } - - #[test] - fn layered_euclidean_matches_combined() { - let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]); - let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]); - let expected = CountPartials::euclidean_dist_matrix(&mf); - let got = CountPartials::euclidean_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]"); - } - - // ── CountPartials — partitioned (LayeredStore>) ─────────── - - #[test] - fn partitioned_bray_matches_combined() { - // partition 0: slots [1,2,3,4,5] col0 vs col1 - // partition 1: slots [10,20] col0 vs col1 - let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); - let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]); - - let partitioned = LayeredStore::new(vec![ - LayeredStore::new(vec![p0]), - LayeredStore::new(vec![p1]), - ]); - - let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]); - let expected = CountPartials::bray_dist_matrix(&mf); - let got = CountPartials::bray_dist_matrix(&partitioned); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]"); - } - - // ── BitPartials ─────────────────────────────────────────────────────────── - - #[test] - fn layered_jaccard_matches_combined() { - let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]); - let (_d1, m1) = make_bit_matrix(&[&[true, true], &[true, false]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_bit_matrix(&[ - &[true, false, true, true], - &[false, true, true, false], - ]); - let expected = BitPartials::jaccard_dist_matrix(&mf); - let got = BitPartials::jaccard_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "jaccard [0,1]"); - } - - #[test] - fn layered_hamming_matches_combined() { - let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]); - let (_d1, m1) = make_bit_matrix(&[&[true, true], &[false, false]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_bit_matrix(&[ - &[true, false, true, true], - &[false, true, false, false], - ]); - let expected = BitPartials::hamming_dist_matrix(&mf); - let got = BitPartials::hamming_dist_matrix(&store); - assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]"); - } -} +#[path = "tests/layered_store.rs"] +mod tests; diff --git a/src/obilayeredmap/src/tests/layered_store.rs b/src/obilayeredmap/src/tests/layered_store.rs new file mode 100644 index 0000000..1387013 --- /dev/null +++ b/src/obilayeredmap/src/tests/layered_store.rs @@ -0,0 +1,381 @@ +use super::*; +use obicompactvec::{ + PersistentBitMatrix, PersistentBitMatrixBuilder, + PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, +}; +use tempfile::tempdir; + +fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + (dir, m) +} + +fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + (dir, m) +} + +// ── ColumnWeights ───────────────────────────────────────────────────────── + +#[test] +fn col_weights_sums_across_layers() { + // layer 0: col0=[1,2], col1=[3,4] → weights [3, 7] + // layer 1: col0=[10,0], col1=[0,10] → weights [10, 10] + // combined: [13, 17] + let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]); + let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]); + let store = LayeredStore::new(vec![m0, m1]); + let w = store.col_weights(); + assert_eq!(w[0], 13); + assert_eq!(w[1], 17); +} + +#[test] +fn col_weights_bit_sums_across_layers() { + // layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2] + // layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2] + // combined: [3, 4] + let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]); + let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]); + let store = LayeredStore::new(vec![m0, m1]); + let w = store.col_weights(); + assert_eq!(w[0], 3); + assert_eq!(w[1], 4); +} + +// ── CountPartials — layered (one partition) ─────────────────────────────── + +#[test] +fn layered_bray_matches_combined() { + // Split [1,2,3,4,5] across two layers; bray dist should equal direct computation + // on [1,2,3,4,5] for each column pair. + // col0=[1,2,3,4,5], col1=[5,4,3,2,1] + let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1 + let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4 + let store = LayeredStore::new(vec![m0, m1]); + + // direct on full data + let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); + let expected = CountPartials::bray_dist_matrix(&mf); + let got = CountPartials::bray_dist_matrix(&store); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]"); + assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]"); +} + +#[test] +fn layered_relfreq_bray_matches_combined() { + let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); + let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); + let store = LayeredStore::new(vec![m0, m1]); + + let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); + let expected = CountPartials::relfreq_bray_dist_matrix(&mf); + let got = CountPartials::relfreq_bray_dist_matrix(&store); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]"); +} + +#[test] +fn layered_euclidean_matches_combined() { + let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]); + let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]); + let store = LayeredStore::new(vec![m0, m1]); + + let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]); + let expected = CountPartials::euclidean_dist_matrix(&mf); + let got = CountPartials::euclidean_dist_matrix(&store); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]"); +} + +// ── CountPartials — partitioned (LayeredStore>) ─────────── + +#[test] +fn partitioned_bray_matches_combined() { + // partition 0: slots [1,2,3,4,5] col0 vs col1 + // partition 1: slots [10,20] col0 vs col1 + let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); + let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0]), + LayeredStore::new(vec![p1]), + ]); + + let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]); + let expected = CountPartials::bray_dist_matrix(&mf); + let got = CountPartials::bray_dist_matrix(&partitioned); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]"); +} + +#[test] +fn partitioned_threshold_jaccard_off_diagonal_is_pairwise() { + // 3 genomes, 2 partitions, 1 layer each — mirrors distance.rs's + // LayeredStore> shape. + // partition 0: col0=[3,0], col1=[0,3], col2=[3,3] + // partition 1: col0=[1,1], col1=[1,0], col2=[0,1] + let (_d0, p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]); + let (_d1, p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0]), + LayeredStore::new(vec![p1]), + ]); + + let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]); + let threshold = 1u32; + let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold); + let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold); + + let n = 3; + for i in 0..n { + for j in 0..n { + assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]"); + assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]"); + } + } +} + +#[test] +fn partitioned_threshold_jaccard_packed_off_diagonal_is_pairwise() { + // Same as `partitioned_threshold_jaccard_off_diagonal_is_pairwise` but + // each partition matrix is packed into a single .pcmx file first — + // the on-disk format actually used in production after `pack_matrices`. + use obicompactvec::pack_compact_int_matrix; + + let (d0, _p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]); + pack_compact_int_matrix(&d0.path().join("counts")).unwrap(); + let p0 = PersistentCompactIntMatrix::open(d0.path()).unwrap(); + + let (d1, _p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]); + pack_compact_int_matrix(&d1.path().join("counts")).unwrap(); + let p1 = PersistentCompactIntMatrix::open(d1.path()).unwrap(); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0]), + LayeredStore::new(vec![p1]), + ]); + + let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]); + let threshold = 1u32; + let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold); + let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold); + + let n = 3; + for i in 0..n { + for j in 0..n { + assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]"); + assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]"); + } + } +} + +#[test] +fn partitioned_multilayer_threshold_jaccard_off_diagonal_is_pairwise() { + // 2 partitions, 2 layers each — the shape production indexes actually + // have (MPHF collision layers within a partition). + // partition 0, layer 0: col0=[3,0], col1=[0,3], col2=[3,3] + // partition 0, layer 1: col0=[2,0], col1=[0,0], col2=[2,0] + // partition 1, layer 0: col0=[1,1], col1=[1,0], col2=[0,1] + // partition 1, layer 1: col0=[0,5], col1=[5,5], col2=[0,0] + let (_d0a, p0a) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]); + let (_d0b, p0b) = make_int_matrix(&[&[2, 0], &[0, 0], &[2, 0]]); + let (_d1a, p1a) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]); + let (_d1b, p1b) = make_int_matrix(&[&[0, 5], &[5, 5], &[0, 0]]); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0a, p0b]), + LayeredStore::new(vec![p1a, p1b]), + ]); + + // Flattened equivalent: concatenate every layer's slots into one matrix. + let (_df, mf) = make_int_matrix(&[ + &[3, 0, 2, 0, 1, 1, 0, 5], + &[0, 3, 0, 0, 1, 0, 5, 5], + &[3, 3, 2, 0, 0, 1, 0, 0], + ]); + let threshold = 1u32; + let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold); + let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold); + + let n = 3; + for i in 0..n { + for j in 0..n { + assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]"); + assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]"); + } + } +} + +// ── BitPartials ─────────────────────────────────────────────────────────── + +#[test] +fn layered_jaccard_matches_combined() { + let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]); + let (_d1, m1) = make_bit_matrix(&[&[true, true], &[true, false]]); + let store = LayeredStore::new(vec![m0, m1]); + + let (_df, mf) = make_bit_matrix(&[ + &[true, false, true, true], + &[false, true, true, false], + ]); + let expected = BitPartials::jaccard_dist_matrix(&mf); + let got = BitPartials::jaccard_dist_matrix(&store); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "jaccard [0,1]"); +} + +#[test] +fn layered_hamming_matches_combined() { + let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]); + let (_d1, m1) = make_bit_matrix(&[&[true, true], &[false, false]]); + let store = LayeredStore::new(vec![m0, m1]); + + let (_df, mf) = make_bit_matrix(&[ + &[true, false, true, true], + &[false, true, false, false], + ]); + let expected = BitPartials::hamming_dist_matrix(&mf); + let got = BitPartials::hamming_dist_matrix(&store); + assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]"); +} + +#[test] +fn partitioned_bit_jaccard_off_diagonal_is_pairwise() { + // Same shape as the count-based `partitioned_multilayer_threshold_jaccard_*` + // tests, but for the presence/bit path (`with_counts = false` — what + // `all_specifics` actually uses in production). + // 4 genomes, 3 partitions, 2 layers in the last one. + let (_d0, p0) = make_bit_matrix(&[ + &[true, false, true], + &[false, true, true], + &[true, true, false], + &[false, false, true], + ]); + let (_d1, p1) = make_bit_matrix(&[ + &[true, true], + &[false, true], + &[true, false], + &[true, true], + ]); + let (_d2a, p2a) = make_bit_matrix(&[ + &[false, true], + &[true, true], + &[false, false], + &[true, false], + ]); + let (_d2b, p2b) = make_bit_matrix(&[ + &[true], + &[false], + &[true], + &[true], + ]); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0]), + LayeredStore::new(vec![p1]), + LayeredStore::new(vec![p2a, p2b]), + ]); + + // Flattened equivalent: concatenate every partition/layer's slots. + let (_df, mf) = make_bit_matrix(&[ + &[true, false, true, true, true, false, true, true], + &[false, true, true, false, true, true, true, false], + &[true, true, false, true, false, false, false, true], + &[false, false, true, true, true, true, false, true], + ]); + + let (inter_p, union_p) = BitPartials::partial_jaccard(&partitioned); + let (inter_f, union_f) = BitPartials::partial_jaccard(&mf); + + let n = 4; + for i in 0..n { + for j in 0..n { + assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]"); + assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]"); + } + } +} + +#[test] +fn partitioned_bit_jaccard_packed_off_diagonal_is_pairwise() { + // Same as `partitioned_bit_jaccard_off_diagonal_is_pairwise` but every + // partition's presence matrix is packed into a single .pbmx file — + // the on-disk format actually used in production after `pack_matrices`. + use obicompactvec::pack_bit_matrix; + + let (d0, _p0) = make_bit_matrix(&[ + &[true, false, true], + &[false, true, true], + &[true, true, false], + &[false, false, true], + ]); + pack_bit_matrix(&d0.path().join("presence")).unwrap(); + let p0 = PersistentBitMatrix::open(d0.path()).unwrap(); + + let (d1, _p1) = make_bit_matrix(&[ + &[true, true], + &[false, true], + &[true, false], + &[true, true], + ]); + pack_bit_matrix(&d1.path().join("presence")).unwrap(); + let p1 = PersistentBitMatrix::open(d1.path()).unwrap(); + + let (d2a, _p2a) = make_bit_matrix(&[ + &[false, true], + &[true, true], + &[false, false], + &[true, false], + ]); + pack_bit_matrix(&d2a.path().join("presence")).unwrap(); + let p2a = PersistentBitMatrix::open(d2a.path()).unwrap(); + + let (d2b, _p2b) = make_bit_matrix(&[ + &[true], + &[false], + &[true], + &[true], + ]); + pack_bit_matrix(&d2b.path().join("presence")).unwrap(); + let p2b = PersistentBitMatrix::open(d2b.path()).unwrap(); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0]), + LayeredStore::new(vec![p1]), + LayeredStore::new(vec![p2a, p2b]), + ]); + + let (_df, mf) = make_bit_matrix(&[ + &[true, false, true, true, true, false, true, true], + &[false, true, true, false, true, true, true, false], + &[true, true, false, true, false, false, false, true], + &[false, false, true, true, true, true, false, true], + ]); + + let (inter_p, union_p) = BitPartials::partial_jaccard(&partitioned); + let (inter_f, union_f) = BitPartials::partial_jaccard(&mf); + + let n = 4; + for i in 0..n { + for j in 0..n { + assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]"); + assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]"); + } + } +}