98c14aade9
Centralizes index configuration by storing a single `IndexMode` (`Exact`, `Approx`, or `Hybrid`) in `PartitionMeta`, eliminating per-layer metadata files. Introduces a `Hybrid` evidence mode and an `--approx` CLI flag to toggle between exact and probabilistic indexing. Refactors the build and query pipelines to dynamically dispatch based on the configured mode, deferring `.idx` generation to Pass 2 and only requiring it for Exact/Hybrid modes. Updates layer opening to load appropriate data structures, enforces strict parameter validation during merges, and clarifies performance trade-offs in documentation.
121 lines
4.6 KiB
Rust
121 lines
4.6 KiB
Rust
use std::path::Path;
|
|
|
|
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
|
use obikseq::{CanonicalKmer, RoutableSuperKmer};
|
|
use obiskio::{SKError, SKResult};
|
|
use obilayeredmap::{IndexMode, MphfLayer, OLMError};
|
|
use obilayeredmap::meta::PartitionMeta;
|
|
|
|
use crate::partition::KmerPartition;
|
|
|
|
const INDEX_SUBDIR: &str = "index";
|
|
|
|
fn olm_to_sk(e: OLMError) -> SKError {
|
|
match e {
|
|
OLMError::Io(io_err) => SKError::Io(io_err),
|
|
other => SKError::InvalidData { context: "query", detail: other.to_string() },
|
|
}
|
|
}
|
|
|
|
// ── per-layer query handle ────────────────────────────────────────────────────
|
|
|
|
enum QueryLayer {
|
|
/// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome.
|
|
SetOnly(MphfLayer),
|
|
Presence(MphfLayer, PersistentBitMatrix),
|
|
Count(MphfLayer, PersistentCompactIntMatrix),
|
|
}
|
|
|
|
impl QueryLayer {
|
|
fn open(layer_dir: &Path, with_counts: bool, mode: &IndexMode) -> SKResult<Self> {
|
|
let presence_dir = layer_dir.join("presence");
|
|
let counts_dir = layer_dir.join("counts");
|
|
|
|
if with_counts && counts_dir.exists() {
|
|
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
|
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
|
Ok(QueryLayer::Count(mphf, mat))
|
|
} else if presence_dir.exists() {
|
|
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
|
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
|
Ok(QueryLayer::Presence(mphf, mat))
|
|
} else if counts_dir.exists() {
|
|
// presence query on a count index — return counts as-is
|
|
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
|
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
|
Ok(QueryLayer::Count(mphf, mat))
|
|
} else {
|
|
let mphf = MphfLayer::open(layer_dir, mode).map_err(olm_to_sk)?;
|
|
Ok(QueryLayer::SetOnly(mphf))
|
|
}
|
|
}
|
|
|
|
/// Return `Some(per-genome row)` if `kmer` is indexed in this layer, else `None`.
|
|
fn find(&self, kmer: CanonicalKmer, n_genomes: usize) -> Option<Box<[u32]>> {
|
|
match self {
|
|
QueryLayer::SetOnly(mphf) => {
|
|
mphf.find(kmer)
|
|
.map(|_| vec![1u32; n_genomes].into_boxed_slice())
|
|
}
|
|
QueryLayer::Presence(mphf, mat) => {
|
|
mphf.find(kmer)
|
|
.map(|slot| mat.row(slot).iter().map(|&b| b as u32).collect())
|
|
}
|
|
QueryLayer::Count(mphf, mat) => {
|
|
mphf.find(kmer).map(|slot| mat.row(slot))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ── KmerPartition::query_partition ───────────────────────────────────────────
|
|
|
|
impl KmerPartition {
|
|
/// Query a single partition for a slice of (already-routed) super-kmers.
|
|
///
|
|
/// Returns one entry per input super-kmer; each entry is a `Vec` with one
|
|
/// `Option<Box<[u32]>>` per k-mer inside that super-kmer:
|
|
/// - `None` — k-mer absent from the index
|
|
/// - `Some(row)` — per-genome count (count index) or 0/1 (presence index)
|
|
///
|
|
/// All `superkmers` must belong to this partition (same minimizer bucket).
|
|
pub fn query_partition(
|
|
&self,
|
|
part_idx: usize,
|
|
superkmers: &[&RoutableSuperKmer],
|
|
k: usize,
|
|
n_genomes: usize,
|
|
with_counts: bool,
|
|
) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
|
|
if superkmers.is_empty() {
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
let index_dir = self.part_dir(part_idx).join(INDEX_SUBDIR);
|
|
|
|
if !index_dir.exists() {
|
|
return Ok(superkmers
|
|
.iter()
|
|
.map(|rsk| vec![None; rsk.seql() - k + 1])
|
|
.collect());
|
|
}
|
|
|
|
let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
|
|
let layers: Vec<QueryLayer> = (0..meta.n_layers)
|
|
.map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts, &meta.mode))
|
|
.collect::<SKResult<_>>()?;
|
|
|
|
Ok(superkmers
|
|
.iter()
|
|
.map(|rsk| {
|
|
rsk.superkmer()
|
|
.iter_canonical_kmers()
|
|
.map(|kmer| {
|
|
layers.iter().find_map(|layer| layer.find(kmer, n_genomes))
|
|
})
|
|
.collect()
|
|
})
|
|
.collect())
|
|
}
|
|
}
|