refactor: streamline merge pipeline and MPHF indexing
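Replace mphf.find() with direct mphf.index() calls to eliminate absence checks and fallback vectors. Introduce a lightweight MphfOnly wrapper that loads only the MPHF file (no evidence or unitigs) for faster index loading, and standardize k-mer iteration across merge and rebuild layers (SrcLayerData::open no longer takes an index mode). Update IndexMeta configuration so the evidence mode is passed through merge_partition and used to build new layers instead of a hard-coded IndexMode::Exact, and compute n_new from g.len() rather than from the MPHF cardinality (MphfOnly does not expose n()), streamlining the overall merge pipeline.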
This commit is contained in:
@@ -96,7 +96,7 @@ impl KmerIndex {
|
|||||||
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
||||||
meta.genomes = all_genomes;
|
meta.genomes = all_genomes;
|
||||||
meta.config.with_counts = mode == MergeMode::Count;
|
meta.config.with_counts = mode == MergeMode::Count;
|
||||||
meta.config.evidence = evidence;
|
meta.config.evidence = evidence.clone();
|
||||||
meta.write(output)?;
|
meta.write(output)?;
|
||||||
|
|
||||||
// In presence/absence mode, purge counts/ directories inherited from
|
// In presence/absence mode, purge counts/ directories inherited from
|
||||||
@@ -147,7 +147,7 @@ impl KmerIndex {
|
|||||||
.filter_map(|i| {
|
.filter_map(|i| {
|
||||||
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
||||||
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
||||||
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err();
|
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits, &evidence).err();
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
result
|
result
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
|
|||||||
PersistentCompactIntVecBuilder};
|
PersistentCompactIntVecBuilder};
|
||||||
use obikseq::CanonicalKmer;
|
use obikseq::CanonicalKmer;
|
||||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||||
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError};
|
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfOnly, OLMError};
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
|
|
||||||
use crate::partition::KmerPartition;
|
use crate::partition::KmerPartition;
|
||||||
@@ -47,22 +47,22 @@ impl ColBuilder {
|
|||||||
pub(crate) enum SrcLayerData {
|
pub(crate) enum SrcLayerData {
|
||||||
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
|
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
|
||||||
SetMembership,
|
SetMembership,
|
||||||
Presence(MphfLayer, PersistentBitMatrix),
|
Presence(MphfOnly, PersistentBitMatrix),
|
||||||
Count(MphfLayer, PersistentCompactIntMatrix),
|
Count(MphfOnly, PersistentCompactIntMatrix),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SrcLayerData {
|
impl SrcLayerData {
|
||||||
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult<Self> {
|
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode) -> SKResult<Self> {
|
||||||
let presence_dir = layer_dir.join("presence");
|
let presence_dir = layer_dir.join("presence");
|
||||||
let counts_dir = layer_dir.join("counts");
|
let counts_dir = layer_dir.join("counts");
|
||||||
match merge_mode {
|
match merge_mode {
|
||||||
MergeMode::Presence => {
|
MergeMode::Presence => {
|
||||||
if presence_dir.exists() {
|
if presence_dir.exists() {
|
||||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Presence(mphf, mat))
|
Ok(SrcLayerData::Presence(mphf, mat))
|
||||||
} else if counts_dir.exists() {
|
} else if counts_dir.exists() {
|
||||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Count(mphf, mat))
|
Ok(SrcLayerData::Count(mphf, mat))
|
||||||
} else {
|
} else {
|
||||||
@@ -71,7 +71,7 @@ impl SrcLayerData {
|
|||||||
}
|
}
|
||||||
MergeMode::Count => {
|
MergeMode::Count => {
|
||||||
if counts_dir.exists() {
|
if counts_dir.exists() {
|
||||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Count(mphf, mat))
|
Ok(SrcLayerData::Count(mphf, mat))
|
||||||
} else {
|
} else {
|
||||||
@@ -82,22 +82,16 @@ impl SrcLayerData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Return one value per source genome for `kmer`.
|
/// Return one value per source genome for `kmer`.
|
||||||
|
/// The caller guarantees `kmer` is in the source MPHF domain.
|
||||||
|
#[inline]
|
||||||
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
|
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
|
||||||
match self {
|
match self {
|
||||||
SrcLayerData::SetMembership => vec![1u32; n_genomes],
|
SrcLayerData::SetMembership => vec![1u32; n_genomes],
|
||||||
SrcLayerData::Presence(mphf, mat) => {
|
SrcLayerData::Presence(mphf, mat) => {
|
||||||
if let Some(slot) = mphf.find(kmer) {
|
mat.row(mphf.index(kmer)).iter().map(|&b| b as u32).collect()
|
||||||
mat.row(slot).iter().map(|&b| b as u32).collect()
|
|
||||||
} else {
|
|
||||||
vec![0u32; n_genomes]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
SrcLayerData::Count(mphf, mat) => {
|
SrcLayerData::Count(mphf, mat) => {
|
||||||
if let Some(slot) = mphf.find(kmer) {
|
mat.row(mphf.index(kmer)).iter().copied().collect()
|
||||||
mat.row(slot).iter().copied().collect()
|
|
||||||
} else {
|
|
||||||
vec![0u32; n_genomes]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -161,6 +155,7 @@ impl KmerPartition {
|
|||||||
mode: MergeMode,
|
mode: MergeMode,
|
||||||
n_dst_genomes: usize,
|
n_dst_genomes: usize,
|
||||||
block_bits: u8,
|
block_bits: u8,
|
||||||
|
evidence: &IndexMode,
|
||||||
) -> SKResult<()> {
|
) -> SKResult<()> {
|
||||||
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
||||||
if !dst_index_dir.exists() {
|
if !dst_index_dir.exists() {
|
||||||
@@ -208,7 +203,7 @@ impl KmerPartition {
|
|||||||
let new_layer_idx = n_dst_layers;
|
let new_layer_idx = n_dst_layers;
|
||||||
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
|
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
|
||||||
|
|
||||||
if any_new {
|
let n_new = if any_new {
|
||||||
g.compute_degrees();
|
g.compute_degrees();
|
||||||
fs::create_dir_all(&new_layer_dir)?;
|
fs::create_dir_all(&new_layer_dir)?;
|
||||||
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
|
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
|
||||||
@@ -216,16 +211,18 @@ impl KmerPartition {
|
|||||||
uw.write(&unitig)?;
|
uw.write(&unitig)?;
|
||||||
}
|
}
|
||||||
uw.close()?;
|
uw.close()?;
|
||||||
Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?;
|
Layer::<()>::build(&new_layer_dir, block_bits, evidence).map_err(olm_to_sk)?;
|
||||||
}
|
g.len()
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
drop(g);
|
drop(g);
|
||||||
|
|
||||||
let new_mphf = if any_new {
|
let new_mphf = if any_new {
|
||||||
Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?)
|
Some(MphfOnly::open(&new_layer_dir).map_err(olm_to_sk)?)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
let n_new = new_mphf.as_ref().map_or(0, |m| m.n());
|
|
||||||
|
|
||||||
// ── Prepare matrix directories for the new layer ──────────────────────
|
// ── Prepare matrix directories for the new layer ──────────────────────
|
||||||
// Absent columns (dst genomes) are written via append_column (all-zero/false).
|
// Absent columns (dst genomes) are written via append_column (all-zero/false).
|
||||||
@@ -303,7 +300,7 @@ impl KmerPartition {
|
|||||||
for l in 0..src_meta.n_layers {
|
for l in 0..src_meta.n_layers {
|
||||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||||
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
|
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
|
||||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
let values = src_data.lookup(kmer, *src_n);
|
let values = src_data.lookup(kmer, *src_n);
|
||||||
@@ -312,9 +309,8 @@ impl KmerPartition {
|
|||||||
if let Some((dst_layer, hit)) = dst_map.query(kmer) {
|
if let Some((dst_layer, hit)) = dst_map.query(kmer) {
|
||||||
exist_builders[dst_layer][builder_idx].set_val(hit.slot, value);
|
exist_builders[dst_layer][builder_idx].set_val(hit.slot, value);
|
||||||
} else if let Some(ref mphf) = new_mphf {
|
} else if let Some(ref mphf) = new_mphf {
|
||||||
if let Some(slot) = mphf.find(kmer) {
|
let slot = mphf.index(kmer);
|
||||||
new_src_builders[builder_idx].set_val(slot, value);
|
new_src_builders[builder_idx].set_val(slot, value);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -117,7 +117,7 @@ impl KmerPartition {
|
|||||||
if !unitigs_path.exists() { continue; }
|
if !unitigs_path.exists() { continue; }
|
||||||
|
|
||||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
let row = src_data.lookup(kmer, n_genomes);
|
let row = src_data.lookup(kmer, n_genomes);
|
||||||
@@ -182,7 +182,7 @@ impl KmerPartition {
|
|||||||
if !unitigs_path.exists() { continue; }
|
if !unitigs_path.exists() { continue; }
|
||||||
|
|
||||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
let row = src_data.lookup(kmer, n_genomes);
|
let row = src_data.lookup(kmer, n_genomes);
|
||||||
|
|||||||
@@ -12,4 +12,4 @@ pub use layer::{Hit, Layer, LayerData};
|
|||||||
pub use layered_store::LayeredStore;
|
pub use layered_store::LayeredStore;
|
||||||
pub use map::LayeredMap;
|
pub use map::LayeredMap;
|
||||||
pub use meta::{IndexMode, PartitionMeta};
|
pub use meta::{IndexMode, PartitionMeta};
|
||||||
pub use mphf_layer::MphfLayer;
|
pub use mphf_layer::{MphfLayer, MphfOnly};
|
||||||
|
|||||||
@@ -129,7 +129,31 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn n(&self) -> usize { self.n }
|
pub fn n(&self) -> usize { self.n }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── MphfOnly ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Lightweight wrapper that loads only the MPHF file, without evidence or unitigs.
|
||||||
|
///
|
||||||
|
/// Use this when the caller guarantees that all queried kmers are in the MPHF
|
||||||
|
/// domain (e.g. when iterating the source's own unitigs during merge).
|
||||||
|
pub struct MphfOnly(Mphf);
|
||||||
|
|
||||||
|
impl MphfOnly {
|
||||||
|
pub fn open(dir: &Path) -> OLMResult<Self> {
|
||||||
|
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
||||||
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
|
Ok(Self(mphf))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the slot for `kmer`. Only valid when `kmer` is in the MPHF domain.
|
||||||
|
#[inline]
|
||||||
|
pub fn index(&self, kmer: CanonicalKmer) -> usize {
|
||||||
|
self.0.index(&kmer.raw())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MphfLayer {
|
||||||
// ── Build helpers ─────────────────────────────────────────────────────────
|
// ── Build helpers ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
||||||
|
|||||||
Reference in New Issue
Block a user