feat: introduce packed matrix storage and layer metadata
Unifies bit and integer matrix storage into `PersistentBitMatrix` and `PersistentCompactIntMatrix` enums, supporting both columnar and memory-mapped single-file layouts. Introduces `LayerMeta` to persist layer dimensions as `layer_meta.json`, enabling correct initialization of implicit presence matrices. Adds CLI commands (`pack` and `--upgrade-index`) to convert existing columnar indices to the compact format and backfill missing metadata. Updates partitionner and layered map logic to use the new persistent builders, optimized memory allocation, and auto-detected storage backends.
This commit is contained in:
@@ -6,6 +6,7 @@ use std::sync::{Arc, Mutex};
|
||||
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use obikpartitionner::{KmerPartition, KmerSpectrum};
|
||||
use obilayeredmap;
|
||||
use obisys::{Reporter, Stage};
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
@@ -199,6 +200,87 @@ impl KmerIndex {
|
||||
.join(format!("layer_{layer}"))
|
||||
.join("unitigs.bin")
|
||||
}
|
||||
|
||||
/// Pack all partition matrices into single-file format (presence → .pbmx, counts → .pcmx).
|
||||
///
|
||||
/// Reduces per-query file-open overhead from O(n_genomes) to O(1) per partition.
|
||||
/// Column files are kept in place; packed files take priority when opening.
|
||||
pub fn pack_matrices(&self) -> OKIResult<()> {
|
||||
use obicompactvec::{pack_bit_matrix, pack_compact_int_matrix};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
let n = self.n_partitions();
|
||||
let errors: Vec<_> = (0..n)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let index_dir = self.partition.part_dir(i).join("index");
|
||||
if !index_dir.exists() { return None; }
|
||||
let meta = match PartitionMeta::load(&index_dir) {
|
||||
Ok(m) => m,
|
||||
Err(e) => return Some(OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))),
|
||||
};
|
||||
for l in 0..meta.n_layers {
|
||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
if presence_dir.exists() {
|
||||
if let Err(e) = pack_bit_matrix(&presence_dir) {
|
||||
return Some(OKIError::Io(e));
|
||||
}
|
||||
}
|
||||
if counts_dir.exists() {
|
||||
if let Err(e) = pack_compact_int_matrix(&counts_dir) {
|
||||
return Some(OKIError::Io(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(e) = errors.into_iter().next() { return Err(e); }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write a `layer_meta.json` in any layer directory that is missing one.
|
||||
///
|
||||
/// Old indexes were built before this file was required. The number of
|
||||
/// kmers is recovered from `unitigs.bin`, which is always present.
|
||||
pub fn upgrade_layer_meta(&self) -> OKIResult<()> {
|
||||
use obicompactvec::LayerMeta;
|
||||
use obiskio::UnitigFileReader;
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
let n = self.n_partitions();
|
||||
let errors: Vec<_> = (0..n)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let index_dir = self.partition.part_dir(i).join("index");
|
||||
if !index_dir.exists() { return None; }
|
||||
let meta = match PartitionMeta::load(&index_dir) {
|
||||
Ok(m) => m,
|
||||
Err(e) => return Some(OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))),
|
||||
};
|
||||
for l in 0..meta.n_layers {
|
||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||
let meta_path = layer_dir.join(LayerMeta::FILENAME);
|
||||
if meta_path.exists() { continue; }
|
||||
let unitigs_path = layer_dir.join("unitigs.bin");
|
||||
let n_kmers = match UnitigFileReader::open_sequential(&unitigs_path) {
|
||||
Ok(r) => r.n_kmers(),
|
||||
Err(e) => return Some(OKIError::Partition(e)),
|
||||
};
|
||||
if let Err(e) = LayerMeta::save(&layer_dir, n_kmers) {
|
||||
return Some(OKIError::Io(e));
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(e) = errors.into_iter().next() { return Err(e); }
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn label_from_path(path: &Path) -> String {
|
||||
|
||||
@@ -196,6 +196,16 @@ impl KmerIndex {
|
||||
rep.push(t.stop());
|
||||
}
|
||||
|
||||
// ── Pack matrices after merge ─────────────────────────────────────────
|
||||
{
|
||||
let t = Stage::start("pack");
|
||||
let pb = spinner("pack — consolidating column files …");
|
||||
let dst2 = KmerIndex::open(output)?;
|
||||
dst2.pack_matrices()?;
|
||||
pb.finish_and_clear();
|
||||
rep.push(t.stop());
|
||||
}
|
||||
|
||||
// Re-open to get the updated state.
|
||||
KmerIndex::open(output)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user