refactor: streamline merge pipeline and MPHF indexing

Replace mphf.find() with direct mphf.index() calls to eliminate absence checks and fallback vectors. Introduce a lightweight MphfOnly wrapper for faster index loading, and standardize k-mer iteration across merge and rebuild layers. Update the IndexMeta configuration so the evidence mode is passed through to merge_partition (new layers are built with it instead of a hard-coded IndexMode::Exact), and compute n_new directly from g.len() rather than from the MPHF's n(), streamlining the overall merge pipeline.
This commit is contained in:
Eric Coissac
2026-06-01 13:56:48 +02:00
parent 1e2115a1b0
commit 0350ca855b
5 changed files with 51 additions and 31 deletions
+2 -2
View File
@@ -96,7 +96,7 @@ impl KmerIndex {
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?; let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
meta.genomes = all_genomes; meta.genomes = all_genomes;
meta.config.with_counts = mode == MergeMode::Count; meta.config.with_counts = mode == MergeMode::Count;
meta.config.evidence = evidence; meta.config.evidence = evidence.clone();
meta.write(output)?; meta.write(output)?;
// In presence/absence mode, purge counts/ directories inherited from // In presence/absence mode, purge counts/ directories inherited from
@@ -147,7 +147,7 @@ impl KmerIndex {
.filter_map(|i| { .filter_map(|i| {
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> = let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect(); remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err(); let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits, &evidence).err();
pb.inc(1); pb.inc(1);
result result
}) })
+22 -26
View File
@@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
PersistentCompactIntVecBuilder}; PersistentCompactIntVecBuilder};
use obikseq::CanonicalKmer; use obikseq::CanonicalKmer;
use obiskio::{SKError, SKResult, UnitigFileReader}; use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError}; use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfOnly, OLMError};
use obilayeredmap::meta::PartitionMeta; use obilayeredmap::meta::PartitionMeta;
use crate::partition::KmerPartition; use crate::partition::KmerPartition;
@@ -47,22 +47,22 @@ impl ColBuilder {
pub(crate) enum SrcLayerData { pub(crate) enum SrcLayerData {
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes. /// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
SetMembership, SetMembership,
Presence(MphfLayer, PersistentBitMatrix), Presence(MphfOnly, PersistentBitMatrix),
Count(MphfLayer, PersistentCompactIntMatrix), Count(MphfOnly, PersistentCompactIntMatrix),
} }
impl SrcLayerData { impl SrcLayerData {
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult<Self> { pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode) -> SKResult<Self> {
let presence_dir = layer_dir.join("presence"); let presence_dir = layer_dir.join("presence");
let counts_dir = layer_dir.join("counts"); let counts_dir = layer_dir.join("counts");
match merge_mode { match merge_mode {
MergeMode::Presence => { MergeMode::Presence => {
if presence_dir.exists() { if presence_dir.exists() {
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Presence(mphf, mat)) Ok(SrcLayerData::Presence(mphf, mat))
} else if counts_dir.exists() { } else if counts_dir.exists() {
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat)) Ok(SrcLayerData::Count(mphf, mat))
} else { } else {
@@ -71,7 +71,7 @@ impl SrcLayerData {
} }
MergeMode::Count => { MergeMode::Count => {
if counts_dir.exists() { if counts_dir.exists() {
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat)) Ok(SrcLayerData::Count(mphf, mat))
} else { } else {
@@ -82,22 +82,16 @@ impl SrcLayerData {
} }
/// Return one value per source genome for `kmer`. /// Return one value per source genome for `kmer`.
/// The caller guarantees `kmer` is in the source MPHF domain.
#[inline]
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> { pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
match self { match self {
SrcLayerData::SetMembership => vec![1u32; n_genomes], SrcLayerData::SetMembership => vec![1u32; n_genomes],
SrcLayerData::Presence(mphf, mat) => { SrcLayerData::Presence(mphf, mat) => {
if let Some(slot) = mphf.find(kmer) { mat.row(mphf.index(kmer)).iter().map(|&b| b as u32).collect()
mat.row(slot).iter().map(|&b| b as u32).collect()
} else {
vec![0u32; n_genomes]
}
} }
SrcLayerData::Count(mphf, mat) => { SrcLayerData::Count(mphf, mat) => {
if let Some(slot) = mphf.find(kmer) { mat.row(mphf.index(kmer)).iter().copied().collect()
mat.row(slot).iter().copied().collect()
} else {
vec![0u32; n_genomes]
}
} }
} }
} }
@@ -161,6 +155,7 @@ impl KmerPartition {
mode: MergeMode, mode: MergeMode,
n_dst_genomes: usize, n_dst_genomes: usize,
block_bits: u8, block_bits: u8,
evidence: &IndexMode,
) -> SKResult<()> { ) -> SKResult<()> {
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR); let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
if !dst_index_dir.exists() { if !dst_index_dir.exists() {
@@ -208,7 +203,7 @@ impl KmerPartition {
let new_layer_idx = n_dst_layers; let new_layer_idx = n_dst_layers;
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}")); let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
if any_new { let n_new = if any_new {
g.compute_degrees(); g.compute_degrees();
fs::create_dir_all(&new_layer_dir)?; fs::create_dir_all(&new_layer_dir)?;
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?; let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
@@ -216,16 +211,18 @@ impl KmerPartition {
uw.write(&unitig)?; uw.write(&unitig)?;
} }
uw.close()?; uw.close()?;
Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?; Layer::<()>::build(&new_layer_dir, block_bits, evidence).map_err(olm_to_sk)?;
} g.len()
} else {
0
};
drop(g); drop(g);
let new_mphf = if any_new { let new_mphf = if any_new {
Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?) Some(MphfOnly::open(&new_layer_dir).map_err(olm_to_sk)?)
} else { } else {
None None
}; };
let n_new = new_mphf.as_ref().map_or(0, |m| m.n());
// ── Prepare matrix directories for the new layer ────────────────────── // ── Prepare matrix directories for the new layer ──────────────────────
// Absent columns (dst genomes) are written via append_column (all-zero/false). // Absent columns (dst genomes) are written via append_column (all-zero/false).
@@ -303,7 +300,7 @@ impl KmerPartition {
for l in 0..src_meta.n_layers { for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?; let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let values = src_data.lookup(kmer, *src_n); let values = src_data.lookup(kmer, *src_n);
@@ -312,9 +309,8 @@ impl KmerPartition {
if let Some((dst_layer, hit)) = dst_map.query(kmer) { if let Some((dst_layer, hit)) = dst_map.query(kmer) {
exist_builders[dst_layer][builder_idx].set_val(hit.slot, value); exist_builders[dst_layer][builder_idx].set_val(hit.slot, value);
} else if let Some(ref mphf) = new_mphf { } else if let Some(ref mphf) = new_mphf {
if let Some(slot) = mphf.find(kmer) { let slot = mphf.index(kmer);
new_src_builders[builder_idx].set_val(slot, value); new_src_builders[builder_idx].set_val(slot, value);
}
} }
} }
} }
+2 -2
View File
@@ -117,7 +117,7 @@ impl KmerPartition {
if !unitigs_path.exists() { continue; } if !unitigs_path.exists() { continue; }
let reader = UnitigFileReader::open_sequential(&unitigs_path)?; let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let row = src_data.lookup(kmer, n_genomes); let row = src_data.lookup(kmer, n_genomes);
@@ -182,7 +182,7 @@ impl KmerPartition {
if !unitigs_path.exists() { continue; } if !unitigs_path.exists() { continue; }
let reader = UnitigFileReader::open_sequential(&unitigs_path)?; let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let row = src_data.lookup(kmer, n_genomes); let row = src_data.lookup(kmer, n_genomes);
+1 -1
View File
@@ -12,4 +12,4 @@ pub use layer::{Hit, Layer, LayerData};
pub use layered_store::LayeredStore; pub use layered_store::LayeredStore;
pub use map::LayeredMap; pub use map::LayeredMap;
pub use meta::{IndexMode, PartitionMeta}; pub use meta::{IndexMode, PartitionMeta};
pub use mphf_layer::MphfLayer; pub use mphf_layer::{MphfLayer, MphfOnly};
+24
View File
@@ -129,7 +129,31 @@ impl MphfLayer {
} }
pub fn n(&self) -> usize { self.n } pub fn n(&self) -> usize { self.n }
}
// ── MphfOnly ──────────────────────────────────────────────────────────────────
/// Thin newtype around an [`Mphf`] that is loaded on its own, without the
/// evidence or unitig data of a full layer.
///
/// No membership test is offered: this type is meant for callers that already
/// know every queried kmer belongs to the MPHF domain (e.g. a merge that walks
/// the source's own unitigs).
pub struct MphfOnly(Mphf);

impl MphfOnly {
    /// Load the MPHF stored as `MPHF_FILE` inside `dir`.
    ///
    /// # Errors
    ///
    /// Any failure reported by the loader is converted to
    /// [`OLMError::InvalidLayer`] carrying the underlying error's message.
    pub fn open(dir: &Path) -> OLMResult<Self> {
        let mphf_path = dir.join(MPHF_FILE);
        match Mphf::load_full(&mphf_path) {
            Ok(inner) => Ok(MphfOnly(inner)),
            Err(err) => Err(OLMError::InvalidLayer(err.to_string())),
        }
    }

    /// Slot assigned to `kmer` by the wrapped MPHF.
    ///
    /// The result is only meaningful when `kmer` is in the MPHF domain.
    // NOTE(review): for a kmer outside the domain this presumably yields an
    // arbitrary slot rather than an error — confirm against `Mphf::index`.
    #[inline]
    pub fn index(&self, kmer: CanonicalKmer) -> usize {
        let MphfOnly(inner) = self;
        inner.index(&kmer.raw())
    }
}
impl MphfLayer {
// ── Build helpers ───────────────────────────────────────────────────────── // ── Build helpers ─────────────────────────────────────────────────────────
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> { pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {