feat: centralize index configuration and add hybrid mode
Centralizes index configuration by storing a single `IndexMode` (`Exact`, `Approx`, or `Hybrid`) in `PartitionMeta`, eliminating per-layer metadata files. Introduces the `Hybrid` index mode and an `--approx` CLI flag to toggle between exact and probabilistic indexing. Refactors the build and query pipelines to dispatch dynamically on the configured mode, deferring `.idx` generation to Pass 2 and requiring it only for the `Exact` and `Hybrid` modes. Updates layer opening to load the data structures appropriate to the configured mode, enforces strict parameter validation during merges, and clarifies performance trade-offs in the documentation.
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||
use obilayeredmap::OLMError;
|
||||
use obilayeredmap::MphfLayer;
|
||||
use obilayeredmap::{IndexMode, MphfLayer, OLMError};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
@@ -35,15 +35,16 @@ impl KmerPartition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Discover layers by probing layer_0, layer_1, … until one is absent.
|
||||
// PartitionMeta (meta.json) is only created by the merge path, not by
|
||||
// the initial single-genome build, so we cannot rely on it here.
|
||||
let index_mode = PartitionMeta::load(&index_dir)
|
||||
.map(|m| m.mode)
|
||||
.unwrap_or(IndexMode::Exact);
|
||||
|
||||
let mut l = 0;
|
||||
loop {
|
||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||
if !layer_dir.exists() { break; }
|
||||
l += 1;
|
||||
let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(&layer_dir, &index_mode).map_err(olm_to_sk)?;
|
||||
let reader = UnitigFileReader::open_sequential(&layer_dir.join("unitigs.bin"))?;
|
||||
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
@@ -92,11 +93,15 @@ impl KmerPartition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let index_mode = PartitionMeta::load(&index_dir)
|
||||
.map(|m| m.mode)
|
||||
.unwrap_or(IndexMode::Exact);
|
||||
|
||||
let mut layer = 0;
|
||||
loop {
|
||||
let layer_dir = index_dir.join(format!("layer_{layer}"));
|
||||
if !layer_dir.exists() { break; }
|
||||
let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(&layer_dir, &index_mode).map_err(olm_to_sk)?;
|
||||
let reader = UnitigFileReader::open_sequential(&layer_dir.join("unitigs.bin"))?;
|
||||
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
|
||||
@@ -5,7 +5,7 @@ use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obilayeredmap::{EvidenceKind, OLMError, layer::Layer};
|
||||
use obilayeredmap::{IndexMode, OLMError, layer::Layer};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
use obiskio::{SKError, SKFileMeta, SKFileReader};
|
||||
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
||||
@@ -44,7 +44,7 @@ impl KmerPartition {
|
||||
min_ab: u32,
|
||||
max_ab: Option<u32>,
|
||||
with_counts: bool,
|
||||
evidence: &EvidenceKind,
|
||||
mode: &IndexMode,
|
||||
block_bits: u8,
|
||||
) -> Result<usize, SKError> {
|
||||
let part_dir = self.part_dir(i);
|
||||
@@ -110,7 +110,7 @@ impl KmerPartition {
|
||||
uw.close()?;
|
||||
|
||||
if with_counts {
|
||||
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, block_bits, evidence, |kmer| {
|
||||
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, block_bits, mode, |kmer| {
|
||||
match (&mphf1_opt, &counts1_opt) {
|
||||
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
|
||||
_ => 1,
|
||||
@@ -118,13 +118,11 @@ impl KmerPartition {
|
||||
})
|
||||
.map_err(olm_to_sk)?;
|
||||
} else {
|
||||
Layer::<()>::build(&layer_dir, block_bits, evidence).map_err(olm_to_sk)?;
|
||||
Layer::<()>::build(&layer_dir, block_bits, mode).map_err(olm_to_sk)?;
|
||||
}
|
||||
|
||||
// Write meta.json in the index/ directory so LayeredMap::open works
|
||||
// (e.g. for subsequent merge operations).
|
||||
let index_dir = layer_dir.parent().expect("layer_dir has a parent");
|
||||
PartitionMeta { n_layers: 1 }.save(index_dir).map_err(olm_to_sk)?;
|
||||
PartitionMeta { n_layers: 1, mode: mode.clone() }.save(index_dir).map_err(olm_to_sk)?;
|
||||
|
||||
Ok(n_kmers)
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntVecBuilder};
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||
use obilayeredmap::{EvidenceKind, Layer, LayeredMap, MphfLayer, OLMError};
|
||||
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
@@ -52,18 +52,17 @@ pub(crate) enum SrcLayerData {
|
||||
}
|
||||
|
||||
impl SrcLayerData {
|
||||
pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> {
|
||||
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult<Self> {
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
match mode {
|
||||
match merge_mode {
|
||||
MergeMode::Presence => {
|
||||
if presence_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Presence(mphf, mat))
|
||||
} else if counts_dir.exists() {
|
||||
// Source is a count index; treat count > 0 as present via ColBuilder::Bit.
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Count(mphf, mat))
|
||||
} else {
|
||||
@@ -72,7 +71,7 @@ impl SrcLayerData {
|
||||
}
|
||||
MergeMode::Count => {
|
||||
if counts_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
Ok(SrcLayerData::Count(mphf, mat))
|
||||
} else {
|
||||
@@ -116,7 +115,7 @@ fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
|
||||
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
|
||||
let mut n = 0usize;
|
||||
while dir.join(format!("layer_{n}")).exists() { n += 1; }
|
||||
let m = PartitionMeta { n_layers: n };
|
||||
let m = PartitionMeta { n_layers: n, mode: IndexMode::default() };
|
||||
m.save(dir).map_err(olm_to_sk)?;
|
||||
Ok(m)
|
||||
}
|
||||
@@ -217,12 +216,12 @@ impl KmerPartition {
|
||||
uw.write(&unitig)?;
|
||||
}
|
||||
uw.close()?;
|
||||
Layer::<()>::build(&new_layer_dir, block_bits, &EvidenceKind::Exact).map_err(olm_to_sk)?;
|
||||
Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?;
|
||||
}
|
||||
drop(g);
|
||||
|
||||
let new_mphf = if any_new {
|
||||
Some(MphfLayer::open(&new_layer_dir).map_err(olm_to_sk)?)
|
||||
Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -304,7 +303,7 @@ impl KmerPartition {
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let values = src_data.lookup(kmer, *src_n);
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::path::Path;
|
||||
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||
use obikseq::{CanonicalKmer, RoutableSuperKmer};
|
||||
use obiskio::{SKError, SKResult};
|
||||
use obilayeredmap::{MphfLayer, OLMError};
|
||||
use obilayeredmap::{IndexMode, MphfLayer, OLMError};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
@@ -27,25 +27,25 @@ enum QueryLayer {
|
||||
}
|
||||
|
||||
impl QueryLayer {
|
||||
fn open(layer_dir: &Path, with_counts: bool) -> SKResult<Self> {
|
||||
fn open(layer_dir: &Path, with_counts: bool, mode: &IndexMode) -> SKResult<Self> {
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
|
||||
if with_counts && counts_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Count(mphf, mat))
|
||||
} else if presence_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Presence(mphf, mat))
|
||||
} else if counts_dir.exists() {
|
||||
// presence query on a count index — return counts as-is
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Count(mphf, mat))
|
||||
} else {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
||||
Ok(QueryLayer::SetOnly(mphf))
|
||||
}
|
||||
}
|
||||
@@ -102,7 +102,7 @@ impl KmerPartition {
|
||||
|
||||
let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
|
||||
let layers: Vec<QueryLayer> = (0..meta.n_layers)
|
||||
.map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts))
|
||||
.map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts, &meta.mode))
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
Ok(superkmers
|
||||
|
||||
@@ -8,7 +8,7 @@ use obicompactvec::{PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntVecBuilder};
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||
use obilayeredmap::{EvidenceKind, Layer, MphfLayer, OLMError};
|
||||
use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
use crate::filter::KmerFilter;
|
||||
@@ -67,7 +67,7 @@ fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
|
||||
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
|
||||
let mut n = 0usize;
|
||||
while dir.join(format!("layer_{n}")).exists() { n += 1; }
|
||||
let m = PartitionMeta { n_layers: n };
|
||||
let m = PartitionMeta { n_layers: n, mode: IndexMode::default() };
|
||||
m.save(dir).map_err(olm_to_sk)?;
|
||||
Ok(m)
|
||||
}
|
||||
@@ -117,7 +117,7 @@ impl KmerPartition {
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
@@ -146,8 +146,8 @@ impl KmerPartition {
|
||||
uw.close()?;
|
||||
drop(g);
|
||||
|
||||
Layer::<()>::build(&dst_layer_dir, block_bits, &EvidenceKind::Exact).map_err(olm_to_sk)?;
|
||||
let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?;
|
||||
Layer::<()>::build(&dst_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?;
|
||||
let dst_mphf = MphfLayer::open(&dst_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?;
|
||||
|
||||
// ── Prepare matrix builders (one column per genome) ───────────────────
|
||||
let data_dir = match mode {
|
||||
@@ -182,7 +182,7 @@ impl KmerPartition {
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
@@ -199,7 +199,7 @@ impl KmerPartition {
|
||||
for b in builders { b.close()?; }
|
||||
write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
|
||||
|
||||
PartitionMeta { n_layers: 1 }.save(&dst_index_dir).map_err(olm_to_sk)?;
|
||||
PartitionMeta { n_layers: 1, mode: IndexMode::Exact }.save(&dst_index_dir).map_err(olm_to_sk)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user