feat: introduce packed matrix storage and layer metadata
Unifies bit and integer matrix storage into `PersistentBitMatrix` and `PersistentCompactIntMatrix` enums, supporting both columnar and memory-mapped single-file layouts. Introduces `LayerMeta` to persist layer dimensions as `layer_meta.json`, enabling correct initialization of implicit presence matrices. Adds CLI commands (`pack` and `--upgrade-index`) to convert existing columnar indices to the compact format and backfill missing metadata. Updates partitionner and layered map logic to use the new persistent builders, optimized memory allocation, and auto-detected storage backends.
This commit is contained in:
@@ -22,8 +22,9 @@ impl KmerPartition {
|
||||
}
|
||||
let matrices = (0..probe_n_layers(&index_dir))
|
||||
.filter_map(|l| {
|
||||
let dir = index_dir.join(format!("layer_{l}")).join("counts");
|
||||
dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io))
|
||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||
layer_dir.join("counts").exists()
|
||||
.then(|| PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io))
|
||||
})
|
||||
.collect::<SKResult<Vec<_>>>()?;
|
||||
Ok(LayeredStore::new(matrices))
|
||||
@@ -38,8 +39,9 @@ impl KmerPartition {
|
||||
}
|
||||
let matrices = (0..probe_n_layers(&index_dir))
|
||||
.filter_map(|l| {
|
||||
let dir = index_dir.join(format!("layer_{l}")).join("presence");
|
||||
dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io))
|
||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||
layer_dir.join("presence").exists()
|
||||
.then(|| PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io))
|
||||
})
|
||||
.collect::<SKResult<Vec<_>>>()?;
|
||||
Ok(LayeredStore::new(matrices))
|
||||
|
||||
@@ -51,14 +51,14 @@ impl KmerPartition {
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
|
||||
if use_counts && counts_dir.exists() {
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
cb(kmer, mat.row(slot));
|
||||
}
|
||||
}
|
||||
} else if !use_counts && presence_dir.exists() {
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
|
||||
@@ -108,14 +108,14 @@ impl KmerPartition {
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
|
||||
if use_counts && counts_dir.exists() {
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
cb(part, layer, kmer, mat.row(slot));
|
||||
}
|
||||
}
|
||||
} else if !use_counts && presence_dir.exists() {
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
|
||||
|
||||
@@ -45,37 +45,35 @@ impl ColBuilder {
|
||||
// ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
|
||||
|
||||
pub(crate) enum SrcLayerData {
|
||||
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
|
||||
SetMembership,
|
||||
Presence(MphfOnly, PersistentBitMatrix),
|
||||
Count(MphfOnly, PersistentCompactIntMatrix),
|
||||
}
|
||||
|
||||
impl SrcLayerData {
|
||||
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode) -> SKResult<Self> {
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
match merge_mode {
|
||||
MergeMode::Presence => {
|
||||
if presence_dir.exists() {
|
||||
if counts_dir.exists() && !layer_dir.join("presence").exists() {
|
||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Presence(mphf, mat))
|
||||
} else if counts_dir.exists() {
|
||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Count(mphf, mat))
|
||||
} else {
|
||||
Ok(SrcLayerData::SetMembership)
|
||||
// presence dir exists, or neither exists → Implicit handled by open()
|
||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Presence(mphf, mat))
|
||||
}
|
||||
}
|
||||
MergeMode::Count => {
|
||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||
if counts_dir.exists() {
|
||||
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Count(mphf, mat))
|
||||
} else {
|
||||
Ok(SrcLayerData::SetMembership)
|
||||
// No counts → treat as implicit presence (all 1s)
|
||||
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Presence(mphf, mat))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -85,15 +83,12 @@ impl SrcLayerData {
|
||||
/// The caller guarantees `kmer` is in the source MPHF domain.
|
||||
#[inline]
|
||||
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
|
||||
let mut buf = vec![0u32; n_genomes];
|
||||
match self {
|
||||
SrcLayerData::SetMembership => vec![1u32; n_genomes],
|
||||
SrcLayerData::Presence(mphf, mat) => {
|
||||
mat.row(mphf.index(kmer)).iter().map(|&b| b as u32).collect()
|
||||
}
|
||||
SrcLayerData::Count(mphf, mat) => {
|
||||
mat.row(mphf.index(kmer)).iter().copied().collect()
|
||||
}
|
||||
SrcLayerData::Presence(mphf, mat) => mat.fill_row(mphf.index(kmer), &mut buf),
|
||||
SrcLayerData::Count(mphf, mat) => mat.fill_row(mphf.index(kmer), &mut buf),
|
||||
}
|
||||
buf
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -20,48 +20,33 @@ fn olm_to_sk(e: OLMError) -> SKError {
|
||||
// ── per-layer query handle ────────────────────────────────────────────────────
|
||||
|
||||
enum QueryLayer {
|
||||
/// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome.
|
||||
SetOnly(MphfLayer),
|
||||
Presence(MphfLayer, PersistentBitMatrix),
|
||||
Count(MphfLayer, PersistentCompactIntMatrix),
|
||||
}
|
||||
|
||||
impl QueryLayer {
|
||||
fn open(layer_dir: &Path, with_counts: bool, mode: &IndexMode) -> SKResult<Self> {
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
|
||||
if with_counts && counts_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Count(mphf, mat))
|
||||
} else if presence_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
} else if presence_dir.exists() || !counts_dir.exists() {
|
||||
// presence mode, or no matrix at all → Implicit handled inside open()
|
||||
let mat = PersistentBitMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Presence(mphf, mat))
|
||||
} else if counts_dir.exists() {
|
||||
// presence query on a count index — return counts as-is
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Count(mphf, mat))
|
||||
} else {
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
Ok(QueryLayer::SetOnly(mphf))
|
||||
// counts exist but no presence matrix, and counts were not requested — presence query on a count-only layer, served from the count matrix
|
||||
let mat = PersistentCompactIntMatrix::open(layer_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Count(mphf, mat))
|
||||
}
|
||||
}
|
||||
|
||||
/// Write per-genome values into `buf` if `kmer` is indexed in this layer.
|
||||
/// Returns `true` on hit; `buf` is untouched on miss.
|
||||
/// Write per-genome values into `buf` if `kmer` is indexed; returns true on hit.
|
||||
fn find_into(&self, kmer: CanonicalKmer, n_genomes: usize, buf: &mut [u32]) -> bool {
|
||||
match self {
|
||||
QueryLayer::SetOnly(mphf) => {
|
||||
if mphf.find(kmer).is_some() {
|
||||
buf[..n_genomes].fill(1);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
QueryLayer::Presence(mphf, mat) => {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
mat.fill_row(slot, &mut buf[..n_genomes]);
|
||||
@@ -87,14 +72,11 @@ impl QueryLayer {
|
||||
impl KmerPartition {
|
||||
/// Query a single partition, calling `on_hit(sk_idx, kmer_idx, row)` for
|
||||
/// every found k-mer without allocating intermediate result vectors.
|
||||
///
|
||||
/// `row` is a shared scratch buffer valid only for the duration of the call;
|
||||
/// the callback must copy what it needs before returning.
|
||||
pub fn query_partition_with<F>(
|
||||
&self,
|
||||
part_idx: usize,
|
||||
superkmers: &[&RoutableSuperKmer],
|
||||
k: usize,
|
||||
_k: usize,
|
||||
n_genomes: usize,
|
||||
with_counts: bool,
|
||||
mut on_hit: F,
|
||||
@@ -133,13 +115,13 @@ impl KmerPartition {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Query a single partition for a slice of (already-routed) super-kmers.
|
||||
/// Query a single partition for a slice of super-kmers, returning per-kmer rows.
|
||||
/// Prefer [`query_partition_with`] to avoid per-kmer heap allocations.
|
||||
pub fn query_partition(
|
||||
&self,
|
||||
part_idx: usize,
|
||||
superkmers: &[&RoutableSuperKmer],
|
||||
k: usize,
|
||||
_k: usize,
|
||||
n_genomes: usize,
|
||||
with_counts: bool,
|
||||
) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
|
||||
@@ -152,7 +134,7 @@ impl KmerPartition {
|
||||
if !index_dir.exists() {
|
||||
return Ok(superkmers
|
||||
.iter()
|
||||
.map(|rsk| vec![None; rsk.seql() - k + 1])
|
||||
.map(|rsk| vec![None; rsk.seql()])
|
||||
.collect());
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user