Files
obikmer/src/obikpartitionner/src/query_layer.rs
T
Eric Coissac 98c14aade9 feat: centralize index configuration and add hybrid mode
Centralizes index configuration by storing a single `IndexMode` (`Exact`, `Approx`, or `Hybrid`) in `PartitionMeta`, eliminating per-layer metadata files. Introduces a `Hybrid` evidence mode and an `--approx` CLI flag to toggle between exact and probabilistic indexing. Refactors the build and query pipelines to dynamically dispatch based on the configured mode, deferring `.idx` generation to Pass 2 and only requiring it for Exact/Hybrid modes. Updates layer opening to load appropriate data structures, enforces strict parameter validation during merges, and clarifies performance trade-offs in documentation.
2026-05-26 15:08:29 +02:00

121 lines
4.6 KiB
Rust

use std::path::Path;
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
use obikseq::{CanonicalKmer, RoutableSuperKmer};
use obiskio::{SKError, SKResult};
use obilayeredmap::{IndexMode, MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta;
use crate::partition::KmerPartition;
/// Name of the sub-directory, joined onto a partition directory, where the
/// partition's index (metadata and `layer_{i}` directories) is looked up.
const INDEX_SUBDIR: &str = "index";
/// Convert a layered-map error into the super-kmer I/O error type.
///
/// An `OLMError::Io` keeps its underlying I/O error and becomes `SKError::Io`.
/// Every other variant is rendered with `to_string()` and reported as
/// `SKError::InvalidData` with the fixed context `"query"`.
fn olm_to_sk(e: OLMError) -> SKError {
    match e {
        OLMError::Io(source) => SKError::Io(source),
        non_io => {
            let detail = non_io.to_string();
            SKError::InvalidData {
                context: "query",
                detail,
            }
        }
    }
}
// ── per-layer query handle ────────────────────────────────────────────────────
/// Query handle on a single index layer.
///
/// Each variant pairs the layer's `MphfLayer` (used to look a k-mer up and get
/// its slot) with the kind of per-slot data, if any, opened next to it.
enum QueryLayer {
/// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome.
SetOnly(MphfLayer),
/// MPHF plus a bit matrix; on a hit, each entry of the slot's row is cast to `u32`.
Presence(MphfLayer, PersistentBitMatrix),
/// MPHF plus a compact integer matrix; on a hit, the slot's row is returned as-is.
Count(MphfLayer, PersistentCompactIntMatrix),
}
impl QueryLayer {
    /// Open the layer stored in `layer_dir` and pick the variant matching the
    /// data found on disk.
    ///
    /// Selection rules, based on the presence of the `counts` and `presence`
    /// sub-directories:
    /// - `counts` exists and either `with_counts` is set or there is no
    ///   `presence` directory → [`QueryLayer::Count`] (a presence query on a
    ///   count-only index returns the counts as-is);
    /// - otherwise, `presence` exists → [`QueryLayer::Presence`];
    /// - neither exists → [`QueryLayer::SetOnly`].
    ///
    /// # Errors
    /// Fails if `MphfLayer::open` fails (converted through `olm_to_sk`) or if
    /// the selected matrix cannot be opened (wrapped in `SKError::Io`).
    fn open(layer_dir: &Path, with_counts: bool, mode: &IndexMode) -> SKResult<Self> {
        let counts_dir = layer_dir.join("counts");
        let presence_dir = layer_dir.join("presence");
        let has_counts = counts_dir.exists();
        let has_presence = presence_dir.exists();

        // The MPHF is needed by every variant and is always opened before any matrix.
        let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;

        let layer = if has_counts && (with_counts || !has_presence) {
            let counts = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
            Self::Count(mphf, counts)
        } else if has_presence {
            let bits = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
            Self::Presence(mphf, bits)
        } else {
            Self::SetOnly(mphf)
        };
        Ok(layer)
    }

    /// Look `kmer` up in this layer.
    ///
    /// Returns `None` when the layer's MPHF does not report the k-mer, and
    /// otherwise `Some(row)` where `row` is:
    /// - `n_genomes` entries all equal to 1 for a set-only layer;
    /// - the slot's bit-matrix row with each entry cast to `u32` for a presence layer;
    /// - the slot's row exactly as returned by the count matrix for a count layer.
    fn find(&self, kmer: CanonicalKmer, n_genomes: usize) -> Option<Box<[u32]>> {
        match self {
            Self::SetOnly(mphf) => {
                // Only membership matters here; the slot value itself is unused.
                mphf.find(kmer)?;
                Some(vec![1u32; n_genomes].into_boxed_slice())
            }
            Self::Presence(mphf, mat) => {
                let slot = mphf.find(kmer)?;
                let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
                Some(row)
            }
            Self::Count(mphf, mat) => {
                let slot = mphf.find(kmer)?;
                Some(mat.row(slot))
            }
        }
    }
}
// ── KmerPartition::query_partition ───────────────────────────────────────────
impl KmerPartition {
    /// Query a single partition for a slice of (already-routed) super-kmers.
    ///
    /// Returns one entry per input super-kmer; each entry is a `Vec` with one
    /// `Option<Box<[u32]>>` per k-mer inside that super-kmer:
    /// - `None`      — k-mer absent from the index
    /// - `Some(row)` — per-genome count (count index) or 0/1 (presence index)
    ///
    /// Layers `layer_0 .. layer_{n_layers-1}` are probed in order and the row
    /// from the first layer that reports the k-mer is returned.
    ///
    /// If the partition has no index directory, every k-mer is reported as
    /// `None`; the number of k-mers per super-kmer is then derived from `k` as
    /// `seql - k + 1`, clamped to 0 for a super-kmer shorter than `k`.
    /// An empty `superkmers` slice yields an empty result without touching disk.
    ///
    /// All `superkmers` must belong to this partition (same minimizer bucket).
    ///
    /// # Errors
    /// Fails if the partition metadata cannot be loaded or if any layer cannot
    /// be opened.
    pub fn query_partition(
        &self,
        part_idx: usize,
        superkmers: &[&RoutableSuperKmer],
        k: usize,
        n_genomes: usize,
        with_counts: bool,
    ) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
        if superkmers.is_empty() {
            return Ok(Vec::new());
        }

        let index_dir = self.part_dir(part_idx).join(INDEX_SUBDIR);
        if !index_dir.exists() {
            return Ok(superkmers
                .iter()
                .map(|rsk| {
                    // `seql - k + 1` would underflow `usize` when `seql < k`
                    // (panic in debug, huge allocation in release); a
                    // super-kmer that short simply contains no k-mer.
                    let n_kmers = (rsk.seql() + 1).saturating_sub(k);
                    vec![None; n_kmers]
                })
                .collect());
        }

        let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
        let layers: Vec<QueryLayer> = (0..meta.n_layers)
            .map(|i| {
                let layer_dir = index_dir.join(format!("layer_{i}"));
                QueryLayer::open(&layer_dir, with_counts, &meta.mode)
            })
            .collect::<SKResult<_>>()?;

        Ok(superkmers
            .iter()
            .map(|rsk| {
                rsk.superkmer()
                    .iter_canonical_kmers()
                    .map(|kmer| layers.iter().find_map(|layer| layer.find(kmer, n_genomes)))
                    .collect()
            })
            .collect())
    }
}