feat: Implement query subcommand for sequence-to-genome mapping

This change introduces the `query` CLI command and its supporting infrastructure for sequence-to-genome mapping and k-mer matching. It adds a `QueryLayer` abstraction backed by MPHF and persistent matrices, exposes the index partition for direct querying, and implements `Hash`/`Eq` for `RoutableSuperKmer`. The command ingests sequence batches, deduplicates superkmers, routes them to index partitions for parallel exact or 1-mismatch matching, and outputs results as FASTA records annotated with JSON metadata. Includes `serde_json` dependency addition, module exports, and documentation updates.
This commit is contained in:
Eric Coissac
2026-05-21 13:23:05 +02:00
parent c8e591fc78
commit 13599dd444
13 changed files with 762 additions and 19 deletions
+1
View File
@@ -5,6 +5,7 @@ mod index_layer;
mod kmer_sort;
mod merge_layer;
mod partition;
mod query_layer;
mod rebuild_layer;
pub use filter::KmerFilter;
+120
View File
@@ -0,0 +1,120 @@
use std::path::Path;
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
use obikseq::{CanonicalKmer, RoutableSuperKmer};
use obiskio::{SKError, SKResult};
use obilayeredmap::{MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta;
use crate::partition::KmerPartition;
/// Name of the sub-directory, inside a partition directory, that holds the partition's index.
const INDEX_SUBDIR: &str = "index";
/// Convert an [`OLMError`] into the [`SKError`] type returned by this module.
///
/// An I/O failure keeps its underlying error through `SKError::Io`. Every other
/// variant is flattened into `SKError::InvalidData` with the context `"query"`
/// and the error's `to_string()` output as detail.
fn olm_to_sk(e: OLMError) -> SKError {
    let detail = match e {
        // I/O errors are forwarded untouched so callers can still inspect them.
        OLMError::Io(source) => return SKError::Io(source),
        non_io => non_io.to_string(),
    };
    SKError::InvalidData { context: "query", detail }
}
// ── per-layer query handle ────────────────────────────────────────────────────
/// Query handle on one index layer, in one of the three shapes that
/// `QueryLayer::open` distinguishes from the layer directory's contents.
enum QueryLayer {
    /// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome.
    SetOnly(MphfLayer),
    /// MPHF plus the bit matrix opened from the layer's `presence` sub-directory;
    /// each matrix entry is reported as a `u32` by `find`.
    Presence(MphfLayer, PersistentBitMatrix),
    /// MPHF plus the compact integer matrix opened from the layer's `counts`
    /// sub-directory; rows are returned unchanged by `find`.
    Count(MphfLayer, PersistentCompactIntMatrix),
}
impl QueryLayer {
    /// Open the layer stored in `layer_dir`, choosing the variant from the
    /// sub-directories that exist on disk.
    ///
    /// Decision table (first matching row wins):
    /// - `counts` exists and `with_counts` is set → [`QueryLayer::Count`]
    /// - `presence` exists → [`QueryLayer::Presence`]
    /// - `counts` exists → [`QueryLayer::Count`] (presence query on a count
    ///   index — counts are returned as-is)
    /// - neither exists → [`QueryLayer::SetOnly`]
    ///
    /// # Errors
    /// Returns the converted error if the MPHF or the selected matrix cannot be opened.
    fn open(layer_dir: &Path, with_counts: bool) -> SKResult<Self> {
        let counts_dir = layer_dir.join("counts");
        let presence_dir = layer_dir.join("presence");
        let has_counts = counts_dir.exists();
        let has_presence = presence_dir.exists();

        // Every variant needs the MPHF, so open it once up front.
        let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;

        let layer = match (with_counts, has_counts, has_presence) {
            // Counts were requested and are available, or counts are the only data present.
            (true, true, _) | (_, true, false) => {
                let counts = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
                QueryLayer::Count(mphf, counts)
            }
            (_, _, true) => {
                let bits = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
                QueryLayer::Presence(mphf, bits)
            }
            (_, false, false) => QueryLayer::SetOnly(mphf),
        };
        Ok(layer)
    }

    /// Return `Some(per-genome row)` if `kmer` is indexed in this layer, else `None`.
    ///
    /// `n_genomes` is only used by the `SetOnly` variant, which has no matrix
    /// and therefore builds a row of `n_genomes` ones.
    fn find(&self, kmer: CanonicalKmer, n_genomes: usize) -> Option<Box<[u32]>> {
        match self {
            QueryLayer::SetOnly(mphf) => {
                // Only membership matters here; the slot itself is unused.
                let _slot = mphf.find(kmer)?;
                Some(vec![1u32; n_genomes].into_boxed_slice())
            }
            QueryLayer::Presence(mphf, mat) => {
                let slot = mphf.find(kmer)?;
                Some(mat.row(slot).iter().map(|&bit| bit as u32).collect())
            }
            QueryLayer::Count(mphf, mat) => {
                let slot = mphf.find(kmer)?;
                Some(mat.row(slot))
            }
        }
    }
}
// ── KmerPartition::query_partition ───────────────────────────────────────────
impl KmerPartition {
    /// Query a single partition for a slice of (already-routed) super-kmers.
    ///
    /// Returns one entry per input super-kmer, in input order; each entry is a
    /// `Vec` with one `Option<Box<[u32]>>` per k-mer inside that super-kmer:
    /// - `None`      — k-mer absent from every layer of the index
    /// - `Some(row)` — per-genome row from the first layer that contains the k-mer
    ///
    /// The row holds counts when the layer is opened as a count layer and 0/1
    /// values when it is opened as a presence layer. A layer that only has a
    /// `counts` directory is opened as a count layer even when `with_counts`
    /// is `false`, so its rows contain counts rather than 0/1.
    ///
    /// All `superkmers` must belong to this partition (same minimizer bucket).
    ///
    /// # Arguments
    /// - `part_idx` — partition whose directory is searched for the index.
    /// - `superkmers` — super-kmers to look up; an empty slice yields an empty result.
    /// - `k` — k-mer length, used to size the all-`None` answer when the
    ///   partition has no index directory.
    /// - `n_genomes` — row width used for layers that carry no data matrix.
    /// - `with_counts` — prefer count matrices over presence matrices when both exist.
    ///
    /// # Errors
    /// Returns an error if the partition metadata or any of its layers cannot be opened.
    pub fn query_partition(
        &self,
        part_idx: usize,
        superkmers: &[&RoutableSuperKmer],
        k: usize,
        n_genomes: usize,
        with_counts: bool,
    ) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
        if superkmers.is_empty() {
            return Ok(Vec::new());
        }

        let index_dir = self.part_dir(part_idx).join(INDEX_SUBDIR);
        if !index_dir.exists() {
            // No index for this partition: every k-mer is absent.
            // A super-kmer of length `seql` holds `seql - k + 1` k-mers; the
            // saturating form yields 0 instead of underflowing (panic in debug,
            // wrap-around and a huge allocation in release) if `seql < k`.
            return Ok(superkmers
                .iter()
                .map(|rsk| vec![None; (rsk.seql() + 1).saturating_sub(k)])
                .collect());
        }

        let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
        // Open every layer up front; the first failure aborts the whole query.
        let layers: Vec<QueryLayer> = (0..meta.n_layers)
            .map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts))
            .collect::<SKResult<_>>()?;

        Ok(superkmers
            .iter()
            .map(|rsk| {
                rsk.superkmer()
                    .iter_canonical_kmers()
                    .map(|kmer| {
                        // Layers are probed in order; the first hit wins.
                        layers.iter().find_map(|layer| layer.find(kmer, n_genomes))
                    })
                    .collect()
            })
            .collect())
    }
}