feat: introduce packed matrix storage and layer metadata

Unifies bit and integer matrix storage into `PersistentBitMatrix` and `PersistentCompactIntMatrix` enums, supporting both columnar and memory-mapped single-file layouts. Introduces `LayerMeta` to persist layer dimensions as `layer_meta.json`, enabling correct initialization of implicit presence matrices. Adds CLI commands (`pack` and `--upgrade-index`) to convert existing columnar indices to the compact format and backfill missing metadata. Updates partitioner and layered map logic to use the new persistent builders, optimized memory allocation, and auto-detected storage backends.
This commit is contained in:
Eric Coissac
2026-06-03 11:50:39 +02:00
parent de1a41810a
commit 173ac9fb42
20 changed files with 799 additions and 271 deletions
+6 -4
View File
@@ -22,8 +22,9 @@ impl KmerPartition {
}
let matrices = (0..probe_n_layers(&index_dir))
.filter_map(|l| {
let dir = index_dir.join(format!("layer_{l}")).join("counts");
dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io))
let layer_dir = index_dir.join(format!("layer_{l}"));
layer_dir.join("counts").exists()
.then(|| PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io))
})
.collect::<SKResult<Vec<_>>>()?;
Ok(LayeredStore::new(matrices))
@@ -38,8 +39,9 @@ impl KmerPartition {
}
let matrices = (0..probe_n_layers(&index_dir))
.filter_map(|l| {
let dir = index_dir.join(format!("layer_{l}")).join("presence");
dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io))
let layer_dir = index_dir.join(format!("layer_{l}"));
layer_dir.join("presence").exists()
.then(|| PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io))
})
.collect::<SKResult<Vec<_>>>()?;
Ok(LayeredStore::new(matrices))
+4 -4
View File
@@ -51,14 +51,14 @@ impl KmerPartition {
let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
cb(kmer, mat.row(slot));
}
}
} else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
@@ -108,14 +108,14 @@ impl KmerPartition {
let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
cb(part, layer, kmer, mat.row(slot));
}
}
} else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
+16 -21
View File
@@ -45,37 +45,35 @@ impl ColBuilder {
// ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
pub(crate) enum SrcLayerData {
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
SetMembership,
Presence(MphfOnly, PersistentBitMatrix),
Count(MphfOnly, PersistentCompactIntMatrix),
}
impl SrcLayerData {
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode) -> SKResult<Self> {
let presence_dir = layer_dir.join("presence");
let counts_dir = layer_dir.join("counts");
let counts_dir = layer_dir.join("counts");
match merge_mode {
MergeMode::Presence => {
if presence_dir.exists() {
if counts_dir.exists() && !layer_dir.join("presence").exists() {
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Presence(mphf, mat))
} else if counts_dir.exists() {
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat))
} else {
Ok(SrcLayerData::SetMembership)
// presence dir exists, or neither exists → Implicit handled by open()
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Presence(mphf, mat))
}
}
MergeMode::Count => {
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
if counts_dir.exists() {
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat))
} else {
Ok(SrcLayerData::SetMembership)
// No counts → treat as implicit presence (all 1s)
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Presence(mphf, mat))
}
}
}
@@ -85,15 +83,12 @@ impl SrcLayerData {
/// The caller guarantees `kmer` is in the source MPHF domain.
#[inline]
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
let mut buf = vec![0u32; n_genomes];
match self {
SrcLayerData::SetMembership => vec![1u32; n_genomes],
SrcLayerData::Presence(mphf, mat) => {
mat.row(mphf.index(kmer)).iter().map(|&b| b as u32).collect()
}
SrcLayerData::Count(mphf, mat) => {
mat.row(mphf.index(kmer)).iter().copied().collect()
}
SrcLayerData::Presence(mphf, mat) => mat.fill_row(mphf.index(kmer), &mut buf),
SrcLayerData::Count(mphf, mat) => mat.fill_row(mphf.index(kmer), &mut buf),
}
buf
}
}
+14 -32
View File
@@ -20,48 +20,33 @@ fn olm_to_sk(e: OLMError) -> SKError {
// ── per-layer query handle ────────────────────────────────────────────────────
enum QueryLayer {
/// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome.
SetOnly(MphfLayer),
Presence(MphfLayer, PersistentBitMatrix),
Count(MphfLayer, PersistentCompactIntMatrix),
}
impl QueryLayer {
fn open(layer_dir: &Path, with_counts: bool, mode: &IndexMode) -> SKResult<Self> {
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
let counts_dir = layer_dir.join("counts");
let presence_dir = layer_dir.join("presence");
let counts_dir = layer_dir.join("counts");
if with_counts && counts_dir.exists() {
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
Ok(QueryLayer::Count(mphf, mat))
} else if presence_dir.exists() {
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
} else if presence_dir.exists() || !counts_dir.exists() {
// presence mode, or no matrix at all → Implicit handled inside open()
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
Ok(QueryLayer::Presence(mphf, mat))
} else if counts_dir.exists() {
// presence query on a count index — return counts as-is
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
Ok(QueryLayer::Count(mphf, mat))
} else {
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
Ok(QueryLayer::SetOnly(mphf))
// counts exist but not presence — count layer, no presence requested
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
Ok(QueryLayer::Count(mphf, mat))
}
}
/// Write per-genome values into `buf` if `kmer` is indexed in this layer.
/// Returns `true` on hit; `buf` is untouched on miss.
/// Write per-genome values into `buf` if `kmer` is indexed; returns true on hit.
fn find_into(&self, kmer: CanonicalKmer, n_genomes: usize, buf: &mut [u32]) -> bool {
match self {
QueryLayer::SetOnly(mphf) => {
if mphf.find(kmer).is_some() {
buf[..n_genomes].fill(1);
true
} else {
false
}
}
QueryLayer::Presence(mphf, mat) => {
if let Some(slot) = mphf.find(kmer) {
mat.fill_row(slot, &mut buf[..n_genomes]);
@@ -87,14 +72,11 @@ impl QueryLayer {
impl KmerPartition {
/// Query a single partition, calling `on_hit(sk_idx, kmer_idx, row)` for
/// every found k-mer without allocating intermediate result vectors.
///
/// `row` is a shared scratch buffer valid only for the duration of the call;
/// the callback must copy what it needs before returning.
pub fn query_partition_with<F>(
&self,
part_idx: usize,
superkmers: &[&RoutableSuperKmer],
k: usize,
_k: usize,
n_genomes: usize,
with_counts: bool,
mut on_hit: F,
@@ -133,13 +115,13 @@ impl KmerPartition {
Ok(())
}
/// Query a single partition for a slice of (already-routed) super-kmers.
/// Query a single partition for a slice of super-kmers, returning per-kmer rows.
/// Prefer [`query_partition_with`] to avoid per-kmer heap allocations.
pub fn query_partition(
&self,
part_idx: usize,
superkmers: &[&RoutableSuperKmer],
k: usize,
_k: usize,
n_genomes: usize,
with_counts: bool,
) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
@@ -152,7 +134,7 @@ impl KmerPartition {
if !index_dir.exists() {
return Ok(superkmers
.iter()
.map(|rsk| vec![None; rsk.seql() - k + 1])
.map(|rsk| vec![None; rsk.seql()])
.collect());
}