feat: Implement query subcommand for sequence-to-genome mapping
This change introduces the `query` CLI command and its supporting infrastructure for sequence-to-genome mapping and k-mer matching. It adds a `QueryLayer` abstraction backed by MPHF and persistent matrices, exposes the index partition for direct querying, and implements `Hash`/`Eq` for `RoutableSuperKmer`. The command ingests sequence batches, deduplicates superkmers, routes them to index partitions for parallel exact or 1-mismatch matching, and outputs results as FASTA records annotated with JSON metadata. Includes `serde_json` dependency addition, module exports, and documentation updates.
This commit is contained in:
@@ -5,6 +5,7 @@ mod index_layer;
|
||||
mod kmer_sort;
|
||||
mod merge_layer;
|
||||
mod partition;
|
||||
mod query_layer;
|
||||
mod rebuild_layer;
|
||||
|
||||
pub use filter::KmerFilter;
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
use std::path::Path;
|
||||
|
||||
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||
use obikseq::{CanonicalKmer, RoutableSuperKmer};
|
||||
use obiskio::{SKError, SKResult};
|
||||
use obilayeredmap::{MphfLayer, OLMError};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
/// Name of the sub-directory of a partition directory that holds the index;
/// `query_partition` joins it onto `part_dir(..)` and loads the partition
/// metadata and the `layer_{i}` directories from there.
const INDEX_SUBDIR: &str = "index";
|
||||
|
||||
fn olm_to_sk(e: OLMError) -> SKError {
|
||||
match e {
|
||||
OLMError::Io(io_err) => SKError::Io(io_err),
|
||||
other => SKError::InvalidData { context: "query", detail: other.to_string() },
|
||||
}
|
||||
}
|
||||
|
||||
// ── per-layer query handle ────────────────────────────────────────────────────
|
||||
|
||||
enum QueryLayer {
|
||||
/// Layer<()> — MPHF-only, no data matrix; all indexed kmers map to 1 per genome.
|
||||
SetOnly(MphfLayer),
|
||||
Presence(MphfLayer, PersistentBitMatrix),
|
||||
Count(MphfLayer, PersistentCompactIntMatrix),
|
||||
}
|
||||
|
||||
impl QueryLayer {
|
||||
fn open(layer_dir: &Path, with_counts: bool) -> SKResult<Self> {
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
|
||||
if with_counts && counts_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Count(mphf, mat))
|
||||
} else if presence_dir.exists() {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Presence(mphf, mat))
|
||||
} else if counts_dir.exists() {
|
||||
// presence query on a count index — return counts as-is
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
Ok(QueryLayer::Count(mphf, mat))
|
||||
} else {
|
||||
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
|
||||
Ok(QueryLayer::SetOnly(mphf))
|
||||
}
|
||||
}
|
||||
|
||||
/// Return `Some(per-genome row)` if `kmer` is indexed in this layer, else `None`.
|
||||
fn find(&self, kmer: CanonicalKmer, n_genomes: usize) -> Option<Box<[u32]>> {
|
||||
match self {
|
||||
QueryLayer::SetOnly(mphf) => {
|
||||
mphf.find(kmer)
|
||||
.map(|_| vec![1u32; n_genomes].into_boxed_slice())
|
||||
}
|
||||
QueryLayer::Presence(mphf, mat) => {
|
||||
mphf.find(kmer)
|
||||
.map(|slot| mat.row(slot).iter().map(|&b| b as u32).collect())
|
||||
}
|
||||
QueryLayer::Count(mphf, mat) => {
|
||||
mphf.find(kmer).map(|slot| mat.row(slot))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── KmerPartition::query_partition ───────────────────────────────────────────
|
||||
|
||||
impl KmerPartition {
|
||||
/// Query a single partition for a slice of (already-routed) super-kmers.
|
||||
///
|
||||
/// Returns one entry per input super-kmer; each entry is a `Vec` with one
|
||||
/// `Option<Box<[u32]>>` per k-mer inside that super-kmer:
|
||||
/// - `None` — k-mer absent from the index
|
||||
/// - `Some(row)` — per-genome count (count index) or 0/1 (presence index)
|
||||
///
|
||||
/// All `superkmers` must belong to this partition (same minimizer bucket).
|
||||
pub fn query_partition(
|
||||
&self,
|
||||
part_idx: usize,
|
||||
superkmers: &[&RoutableSuperKmer],
|
||||
k: usize,
|
||||
n_genomes: usize,
|
||||
with_counts: bool,
|
||||
) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
|
||||
if superkmers.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let index_dir = self.part_dir(part_idx).join(INDEX_SUBDIR);
|
||||
|
||||
if !index_dir.exists() {
|
||||
return Ok(superkmers
|
||||
.iter()
|
||||
.map(|rsk| vec![None; rsk.seql() - k + 1])
|
||||
.collect());
|
||||
}
|
||||
|
||||
let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
|
||||
let layers: Vec<QueryLayer> = (0..meta.n_layers)
|
||||
.map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts))
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
Ok(superkmers
|
||||
.iter()
|
||||
.map(|rsk| {
|
||||
rsk.superkmer()
|
||||
.iter_canonical_kmers()
|
||||
.map(|kmer| {
|
||||
layers.iter().find_map(|layer| layer.find(kmer, n_genomes))
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user