feat: introduce packed matrix storage and layer metadata

Unifies bit and integer matrix storage into `PersistentBitMatrix` and `PersistentCompactIntMatrix` enums, supporting both columnar and memory-mapped single-file layouts. Introduces `LayerMeta` to persist layer dimensions as `layer_meta.json`, enabling correct initialization of implicit presence matrices. Adds CLI commands (`pack` and `--upgrade-index`) to convert existing columnar indices to the compact format and backfill missing metadata. Updates partitionner and layered map logic to use the new persistent builders, optimized memory allocation, and auto-detected storage backends.
This commit is contained in:
Eric Coissac
2026-06-03 11:50:39 +02:00
parent de1a41810a
commit 173ac9fb42
20 changed files with 799 additions and 271 deletions
+82
View File
@@ -6,6 +6,7 @@ use std::sync::{Arc, Mutex};
use indicatif::{ProgressBar, ProgressStyle};
use obikpartitionner::{KmerPartition, KmerSpectrum};
use obilayeredmap;
use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
@@ -199,6 +200,87 @@ impl KmerIndex {
.join(format!("layer_{layer}"))
.join("unitigs.bin")
}
/// Pack all partition matrices into single-file format (presence → .pbmx, counts → .pcmx).
///
/// Reduces per-query file-open overhead from O(n_genomes) to O(1) per partition.
/// Column files are kept in place; packed files take priority when opening.
pub fn pack_matrices(&self) -> OKIResult<()> {
use obicompactvec::{pack_bit_matrix, pack_compact_int_matrix};
use obilayeredmap::meta::PartitionMeta;
let n = self.n_partitions();
let errors: Vec<_> = (0..n)
.into_par_iter()
.filter_map(|i| {
let index_dir = self.partition.part_dir(i).join("index");
if !index_dir.exists() { return None; }
let meta = match PartitionMeta::load(&index_dir) {
Ok(m) => m,
Err(e) => return Some(OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))),
};
for l in 0..meta.n_layers {
let layer_dir = index_dir.join(format!("layer_{l}"));
let presence_dir = layer_dir.join("presence");
let counts_dir = layer_dir.join("counts");
if presence_dir.exists() {
if let Err(e) = pack_bit_matrix(&presence_dir) {
return Some(OKIError::Io(e));
}
}
if counts_dir.exists() {
if let Err(e) = pack_compact_int_matrix(&counts_dir) {
return Some(OKIError::Io(e));
}
}
}
None
})
.collect();
if let Some(e) = errors.into_iter().next() { return Err(e); }
Ok(())
}
/// Write a `layer_meta.json` in any layer directory that is missing one.
///
/// Old indexes were built before this file was required. The number of
/// kmers is recovered from `unitigs.bin`, which is always present.
pub fn upgrade_layer_meta(&self) -> OKIResult<()> {
use obicompactvec::LayerMeta;
use obiskio::UnitigFileReader;
use obilayeredmap::meta::PartitionMeta;
let n = self.n_partitions();
let errors: Vec<_> = (0..n)
.into_par_iter()
.filter_map(|i| {
let index_dir = self.partition.part_dir(i).join("index");
if !index_dir.exists() { return None; }
let meta = match PartitionMeta::load(&index_dir) {
Ok(m) => m,
Err(e) => return Some(OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))),
};
for l in 0..meta.n_layers {
let layer_dir = index_dir.join(format!("layer_{l}"));
let meta_path = layer_dir.join(LayerMeta::FILENAME);
if meta_path.exists() { continue; }
let unitigs_path = layer_dir.join("unitigs.bin");
let n_kmers = match UnitigFileReader::open_sequential(&unitigs_path) {
Ok(r) => r.n_kmers(),
Err(e) => return Some(OKIError::Partition(e)),
};
if let Err(e) = LayerMeta::save(&layer_dir, n_kmers) {
return Some(OKIError::Io(e));
}
}
None
})
.collect();
if let Some(e) = errors.into_iter().next() { return Err(e); }
Ok(())
}
}
fn label_from_path(path: &Path) -> String {
+10
View File
@@ -196,6 +196,16 @@ impl KmerIndex {
rep.push(t.stop());
}
// ── Pack matrices after merge ─────────────────────────────────────────
{
let t = Stage::start("pack");
let pb = spinner("pack — consolidating column files …");
let dst2 = KmerIndex::open(output)?;
dst2.pack_matrices()?;
pb.finish_and_clear();
rep.push(t.stop());
}
// Re-open to get the updated state.
KmerIndex::open(output)
}