From 476c7a6394f53246b5293069d10100447bf29e17 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 4 Jun 2026 20:26:53 +0200 Subject: [PATCH] feat: add metadata-driven k-mer filtering for rebuild command Introduces a metadata-driven filtering system for the rebuild command, classifying genomes into ingroup and outgroup categories using exact, inequality, and hierarchical path predicates. Implements a GroupQuorumFilter to enforce configurable presence thresholds and fraction constraints per group. Refactors the command to replace global quorum filters with this unified approach, converts the presence flag to a threshold parameter, and adds corresponding documentation and MkDocs navigation. --- docmd/implementation/rebuild_filter.md | 154 ++++++++++++++++++++ mkdocs.yml | 1 + src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/predicate.rs | 191 +++++++++++++++++++++++++ src/obikmer/src/cmd/rebuild.rs | 105 +++++++++----- src/obikpartitionner/src/filter.rs | 49 +++++++ src/obikpartitionner/src/lib.rs | 2 +- 7 files changed, 470 insertions(+), 33 deletions(-) create mode 100644 docmd/implementation/rebuild_filter.md create mode 100644 src/obikmer/src/cmd/predicate.rs diff --git a/docmd/implementation/rebuild_filter.md b/docmd/implementation/rebuild_filter.md new file mode 100644 index 0000000..1904e94 --- /dev/null +++ b/docmd/implementation/rebuild_filter.md @@ -0,0 +1,154 @@ +# Rebuild filters and ingroup/outgroup predicates + +The `rebuild` command compacts an existing index into a new single-layer index, +optionally keeping only k-mers that satisfy a set of filters. +Filters can operate on raw quorum counts over all genomes, or on pre-defined +**ingroup** and **outgroup** genome sets derived from genome metadata. + +## Predicate syntax + +Each `--ingroup` and `--outgroup` flag takes a predicate of the form: + +``` +key OP value1|value2|… +``` + +| Operator | Meaning | +|----------|---------| +| `*` or `all` | wildcard — every genome matches unconditionally | +| `key=v1\|v2` | exact match — genome's `key` equals `v1` or `v2` | +| `key!=v` | negation — genome's `key` equals none of the values | +| `key~path` | path ancestry — genome's `key` is `path` or a descendant | +| `key!~path` | not a descendant | + +Multiple values separated by `|` are always OR-ed within the predicate. + +### Path matching (`~` and `!~`) + +Metadata values can represent hierarchical taxonomic paths such as +`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`. + +- **Absolute pattern** (starts with `/`): the value must start with the pattern + at a segment boundary. + `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and + `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`. +- **Bare segment** (no leading `/`): the value must contain the pattern as an + exact path component anywhere. + `taxon~Betula` matches any path that has `Betula` as one of its segments. + +### Missing metadata key → NA + +If a genome does not carry the queried metadata key, the predicate returns **NA**. +NA propagates through the group evaluation logic (see below), and genomes that +cannot be classified are **ignored** in all quorum counts. + +## Group semantics + +### Multiple predicates + +| Flag | Combination rule | +|------|-----------------| +| `--ingroup` (repeated) | **AND** — genome must satisfy all predicates | +| `--outgroup` (repeated) | **OR** — genome satisfies any predicate | + +### Three-value logic + +Each predicate returns `true`, `false`, or `NA` (absent key). + +- AND: `false` absorbs everything; `NA` propagates unless already `false`. +- OR: `true` absorbs everything; `NA` propagates unless already `true`. + +### Classification and priority + +For each genome: + +1. Evaluate `AND(ingroup predicates)` → `in_result` +2. Evaluate `OR(outgroup predicates)` → `out_result` +3. If `in_result = true` → **Ingroup** (ingroup wins over outgroup) +4. Else if `out_result = true` → **Outgroup** +5. Otherwise → **Uncategorized** (ignored in all quorum counts) + +### Implicit groups + +| `--ingroup` | `--outgroup` | Effective behaviour | +|-------------|--------------|---------------------| +| not set | not set | all genomes form the ingroup | +| set | not set | only `--min-count`/`--min-frac` apply to matched genomes | +| not set | set | only `--max-count`/`--max-frac` apply to matched genomes | +| set | set | both constraints apply simultaneously | + +## Quorum flags + +| Flag | Applies to | Meaning | +|------|-----------|---------| +| `--min-count N` | ingroup | k-mer present in at least N ingroup genomes | +| `--max-count N` | ingroup | k-mer present in at most N ingroup genomes | +| `--min-frac F` | ingroup | k-mer present in at least fraction F of ingroup genomes | +| `--max-frac F` | ingroup | k-mer present in at most fraction F of ingroup genomes | +| `--min-outgroup-count N` | outgroup | k-mer present in at least N outgroup genomes | +| `--max-outgroup-count N` | outgroup | k-mer present in at most N outgroup genomes | +| `--min-outgroup-frac F` | outgroup | k-mer present in at least fraction F of outgroup genomes | +| `--max-outgroup-frac F` | outgroup | k-mer present in at most fraction F of outgroup genomes | +| `--min-total-count N` | all genomes | sum of per-genome counts ≥ N | +| `--max-total-count N` | all genomes | sum of per-genome counts ≤ N | +| `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) | + +Fractions are computed over the size of the classified group, not over total +genome count. An empty group (no genome classified as ingroup/outgroup) never +triggers a filter failure. + +## Examples + +Keep k-mers specific to *Betula nana* — present in at least 2 *B. nana* genomes +and absent from every other genome in the index: + +```sh +obikmer rebuild src --output dst \ + --ingroup "species=Betula_nana" \ + --outgroup "*" \ + --min-count 2 \ + --max-count 0 +``` + +Keep k-mers found in at least 2 *Betula nana* genomes and absent from all *Betula*: + +```sh +obikmer rebuild src --output dst \ + --ingroup "species=Betula_nana" \ + --outgroup "genus=Betula" \ + --min-count 2 \ + --max-count 0 +``` + +Use taxonomic paths — keep k-mers present in ≥ 50 % of the *Betula* clade +and in fewer than 10 % of everything outside *Betulaceae*: + +```sh +obikmer rebuild src --output dst \ + --ingroup "taxon~/Betulaceae/Betula" \ + --outgroup "taxon!~/Betulaceae" \ + --min-frac 0.5 \ + --max-frac 0.1 +``` + +Multiple outgroup predicates (OR): exclude k-mers present in *Alnus* or *Carpinus*: + +```sh +obikmer rebuild src --output dst \ + --ingroup "genus=Betula" \ + --outgroup "genus=Alnus" \ + --outgroup "genus=Carpinus" \ + --max-count 0 +``` + +## Implementation + +- **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter` + using pre-computed ingroup and outgroup index vectors. The heavy logic + (predicate parsing, three-value evaluation, genome classification) happens + once before the rebuild loop; each k-mer row evaluation is a simple index + lookup and counter. + +- **`obikmer::cmd::predicate`** — predicate parsing (`MetaPred`), path matching + (`path_matches`), three-value AND/OR evaluation, and `build_group_filter` + which returns a ready-to-use `GroupQuorumFilter`. diff --git a/mkdocs.yml b/mkdocs.yml index 6d9bc3a..bee4ebb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -49,6 +49,7 @@ nav: - PersistentCompactIntVec: implementation/persistent_compact_int_vec.md - PersistentBitVec: implementation/persistent_bit_vec.md - Merge command: implementation/merge.md + - Rebuild filters: implementation/rebuild_filter.md - Architecture: - Sequences: architecture/sequences/invariant.md - Kmer index: architecture/index_architecture.md diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 569b543..27048f8 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,5 +1,6 @@ pub mod annotate; pub mod pack; +pub(crate) mod predicate; pub mod utils; pub mod distance; pub mod dump; diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs new file mode 100644 index 0000000..3faee82 --- /dev/null +++ b/src/obikmer/src/cmd/predicate.rs @@ -0,0 +1,191 @@ +use std::collections::HashMap; + +use obikindex::GenomeInfo; +use obikpartitionner::GroupQuorumFilter; + +// ── Operator ────────────────────────────────────────────────────────────────── + +enum PredOp { Wildcard, Eq, Ne, Matches, NotMatches } + +// ── MetaPred ────────────────────────────────────────────────────────────────── + +/// A single predicate on genome metadata: `key OP val1|val2|…` +/// +/// Operators: `=` (exact), `!=` (not equal), `~` (path ancestor), `!~` (not ancestor). +/// Multiple values separated by `|` are OR'd. +pub struct MetaPred { + key: String, + op: PredOp, + values: Vec, +} + +impl MetaPred { + /// Parse a predicate string of the form `key=v1|v2`, `key!=v`, `key~path`, `key!~path`. + /// The special values `*` and `all` (case-insensitive) match every genome. + pub fn parse(s: &str) -> Result { + let t = s.trim(); + if t == "*" || t.eq_ignore_ascii_case("all") { + return Ok(Self { key: String::new(), op: PredOp::Wildcard, values: vec![] }); + } + + let (op, key, rhs) = + if let Some(pos) = s.find("!=") { + (PredOp::Ne, &s[..pos], &s[pos+2..]) + } else if let Some(pos) = s.find("!~") { + (PredOp::NotMatches, &s[..pos], &s[pos+2..]) + } else if let Some(pos) = s.find('=') { + (PredOp::Eq, &s[..pos], &s[pos+1..]) + } else if let Some(pos) = s.find('~') { + (PredOp::Matches, &s[..pos], &s[pos+1..]) + } else { + return Err(format!("no operator found in predicate: {s}")); + }; + + let key = key.trim().to_string(); + if key.is_empty() { return Err(format!("empty key in predicate: {s}")); } + + let values: Vec = rhs.split('|').map(|v| v.trim().to_string()).collect(); + if values.iter().any(|v| v.is_empty()) { + return Err(format!("empty value in predicate: {s}")); + } + + Ok(Self { key, op, values }) + } + + /// Evaluate against one genome's metadata. + /// Returns `None` when the key is absent (NA propagation). + fn eval(&self, meta: &HashMap) -> Option { + if matches!(self.op, PredOp::Wildcard) { return Some(true); } + let value = meta.get(&self.key)?; + Some(match self.op { + PredOp::Wildcard => unreachable!(), + PredOp::Eq => self.values.iter().any(|v| v == value), + PredOp::Ne => self.values.iter().all(|v| v != value), + PredOp::Matches => self.values.iter().any(|v| path_matches(value, v)), + PredOp::NotMatches => self.values.iter().all(|v| !path_matches(value, v)), + }) + } +} + +// ── Path matching ───────────────────────────────────────────────────────────── + +/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy. +/// +/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary. +/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere. +fn path_matches(value: &str, pattern: &str) -> bool { + if pattern.starts_with('/') { + value == pattern + || (value.starts_with(pattern) + && value[pattern.len()..].starts_with('/')) + } else { + value.split('/').any(|seg| seg == pattern) + } +} + +// ── Three-value group evaluation ────────────────────────────────────────────── + +/// AND of all predicates (ingroup semantics). +/// Short-circuits on `Some(false)`; propagates `None` if no predicate returns `false`. +fn eval_and(preds: &[MetaPred], meta: &HashMap) -> Option { + let mut has_na = false; + for pred in preds { + match pred.eval(meta) { + Some(false) => return Some(false), + Some(true) => {} + None => has_na = true, + } + } + if has_na { None } else { Some(true) } +} + +/// OR of all predicates (outgroup semantics). +/// Short-circuits on `Some(true)`; propagates `None` if no predicate returns `true`. +fn eval_or(preds: &[MetaPred], meta: &HashMap) -> Option { + let mut has_na = false; + for pred in preds { + match pred.eval(meta) { + Some(true) => return Some(true), + Some(false) => {} + None => has_na = true, + } + } + if has_na { None } else { Some(false) } +} + +// ── Genome classification ───────────────────────────────────────────────────── + +enum Membership { Ingroup, Outgroup, Uncategorized } + +fn classify( + genomes: &[GenomeInfo], + ingroup: &[MetaPred], + outgroup: &[MetaPred], +) -> Vec { + genomes.iter().map(|g| { + let in_r = if ingroup.is_empty() { None } else { eval_and(ingroup, &g.meta) }; + let out_r = if outgroup.is_empty() { None } else { eval_or(outgroup, &g.meta) }; + + // Ingroup wins over outgroup. + if in_r == Some(true) { return Membership::Ingroup; } + if out_r == Some(true) { return Membership::Outgroup; } + Membership::Uncategorized + }).collect() +} + +// ── Public constructor ──────────────────────────────────────────────────────── + +/// Build a `GroupQuorumFilter` from parsed predicates and genome metadata. +/// +/// - No groups defined: `ingroup_idx` = all genomes (implicit ingroup). +/// - `ingroup` predicates only: outgroup indices are empty. +/// - `outgroup` predicates only: ingroup indices are empty. +/// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored. +pub struct GroupFilterParams { + pub threshold: u32, + pub min_count: Option, + pub max_count: Option, + pub min_frac: Option, + pub max_frac: Option, + pub min_outgroup_count: Option, + pub max_outgroup_count: Option, + pub min_outgroup_frac: Option, + pub max_outgroup_frac: Option, +} + +pub fn build_group_filter( + genomes: &[GenomeInfo], + ingroup_preds: &[MetaPred], + outgroup_preds: &[MetaPred], + p: GroupFilterParams, +) -> GroupQuorumFilter { + let (ingroup_idx, outgroup_idx) = if ingroup_preds.is_empty() && outgroup_preds.is_empty() { + ((0..genomes.len()).collect(), vec![]) + } else { + let members = classify(genomes, ingroup_preds, outgroup_preds); + let in_idx: Vec = members.iter().enumerate() + .filter(|(_, m)| matches!(m, Membership::Ingroup)) + .map(|(i, _)| i).collect(); + let out_idx: Vec = members.iter().enumerate() + .filter(|(_, m)| matches!(m, Membership::Outgroup)) + .map(|(i, _)| i).collect(); + (in_idx, out_idx) + }; + + let in_size = ingroup_idx.len(); + let out_size = outgroup_idx.len(); + + GroupQuorumFilter { + ingroup_idx, + outgroup_idx, + threshold: p.threshold, + min_count: p.min_count.unwrap_or(0), + max_count: p.max_count.unwrap_or(in_size), + min_frac: p.min_frac.unwrap_or(0.0), + max_frac: p.max_frac.unwrap_or(1.0), + min_outgroup_count: p.min_outgroup_count.unwrap_or(0), + max_outgroup_count: p.max_outgroup_count.unwrap_or(out_size), + min_outgroup_frac: p.min_outgroup_frac.unwrap_or(0.0), + max_outgroup_frac: p.max_outgroup_frac.unwrap_or(1.0), + } +} diff --git a/src/obikmer/src/cmd/rebuild.rs b/src/obikmer/src/cmd/rebuild.rs index 8d24d93..ca78694 100644 --- a/src/obikmer/src/cmd/rebuild.rs +++ b/src/obikmer/src/cmd/rebuild.rs @@ -2,13 +2,12 @@ use std::path::PathBuf; use clap::Args; use obikindex::{KmerIndex, MergeMode}; -use obikpartitionner::filter::{ - KmerFilter, MaxGenomeCount, MaxGenomeFraction, MaxTotalCount, - MinGenomeCount, MinGenomeFraction, MinTotalCount, -}; +use obikpartitionner::filter::{MaxTotalCount, MinTotalCount}; use obisys::Reporter; use tracing::info; +use super::predicate::{GroupFilterParams, MetaPred, build_group_filter}; + #[derive(Args)] pub struct RebuildArgs { /// Source index directory @@ -18,21 +17,47 @@ pub struct RebuildArgs { #[arg(short, long)] pub output: PathBuf, - /// Minimum fraction of genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub min_quorum_frac: Option, + /// Ingroup predicate (repeatable; AND between flags). + /// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all` + #[arg(long, value_name = "PRED")] + pub ingroup: Vec, - /// Maximum fraction of genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub max_quorum_frac: Option, + /// Outgroup predicate (repeatable; OR between flags). + /// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all` + #[arg(long, value_name = "PRED")] + pub outgroup: Vec, - /// Minimum number of genomes containing the k-mer + /// Minimum number of ingroup genomes containing the k-mer #[arg(long)] - pub min_quorum_count: Option, + pub min_count: Option, - /// Maximum number of genomes containing the k-mer + /// Maximum number of ingroup genomes containing the k-mer #[arg(long)] - pub max_quorum_count: Option, + pub max_count: Option, + + /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub min_frac: Option, + + /// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub max_frac: Option, + + /// Minimum number of outgroup genomes containing the k-mer + #[arg(long)] + pub min_outgroup_count: Option, + + /// Maximum number of outgroup genomes containing the k-mer + #[arg(long)] + pub max_outgroup_count: Option, + + /// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub min_outgroup_frac: Option, + + /// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub max_outgroup_frac: Option, /// Minimum total count across all genomes (count index only) #[arg(long)] @@ -42,7 +67,7 @@ pub struct RebuildArgs { #[arg(long)] pub max_total_count: Option, - /// Per-genome count threshold for presence in quorum filters (default 0) + /// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0) #[arg(long, default_value = "0")] pub presence_threshold: u32, @@ -61,34 +86,50 @@ pub fn run(args: RebuildArgs) { std::process::exit(1); }); - - let n_genomes = src.meta().genomes.len(); let mode = if args.presence || !src.meta().config.with_counts { MergeMode::Presence } else { MergeMode::Count }; + let ingroup_preds: Vec = args.ingroup.iter() + .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { + eprintln!("error in --ingroup: {e}"); + std::process::exit(1); + })) + .collect(); + + let outgroup_preds: Vec = args.outgroup.iter() + .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { + eprintln!("error in --outgroup: {e}"); + std::process::exit(1); + })) + .collect(); + info!( "rebuild: {} genome(s), mode={:?}, source={}", - n_genomes, mode, args.source.display() + src.meta().genomes.len(), mode, args.source.display() ); - let pt = args.presence_threshold; - let mut filters: Vec> = Vec::new(); + let mut filters: Vec> = Vec::new(); + + filters.push(Box::new(build_group_filter( + &src.meta().genomes, + &ingroup_preds, + &outgroup_preds, + GroupFilterParams { + threshold: args.presence_threshold, + min_count: args.min_count, + max_count: args.max_count, + min_frac: args.min_frac, + max_frac: args.max_frac, + min_outgroup_count: args.min_outgroup_count, + max_outgroup_count: args.max_outgroup_count, + min_outgroup_frac: args.min_outgroup_frac, + max_outgroup_frac: args.max_outgroup_frac, + }, + ))); - if let Some(v) = args.min_quorum_frac { - filters.push(Box::new(MinGenomeFraction { frac: v, threshold: pt })); - } - if let Some(v) = args.max_quorum_frac { - filters.push(Box::new(MaxGenomeFraction { frac: v, threshold: pt })); - } - if let Some(v) = args.min_quorum_count { - filters.push(Box::new(MinGenomeCount { count: v, threshold: pt })); - } - if let Some(v) = args.max_quorum_count { - filters.push(Box::new(MaxGenomeCount { count: v, threshold: pt })); - } if let Some(v) = args.min_total_count { filters.push(Box::new(MinTotalCount { total: v })); } diff --git a/src/obikpartitionner/src/filter.rs b/src/obikpartitionner/src/filter.rs index f35a3e8..6f3e0f0 100644 --- a/src/obikpartitionner/src/filter.rs +++ b/src/obikpartitionner/src/filter.rs @@ -85,3 +85,52 @@ impl KmerFilter for MaxTotalCount { row.iter().sum::() <= self.total } } + +// ── Group-based quorum filter ───────────────────────────────────────────────── + +/// Quorum filter operating on pre-classified genome groups. +/// +/// `ingroup_idx` / `outgroup_idx` are column indices into the per-genome row. +/// When `ingroup_idx` is empty, no ingroup quorum is checked. +/// When `outgroup_idx` is empty, no outgroup quorum is checked. +pub struct GroupQuorumFilter { + pub ingroup_idx: Vec, + pub outgroup_idx: Vec, + pub threshold: u32, + pub min_count: usize, + pub max_count: usize, + pub min_frac: f64, + pub max_frac: f64, + pub min_outgroup_count: usize, + pub max_outgroup_count: usize, + pub min_outgroup_frac: f64, + pub max_outgroup_frac: f64, +} + +impl KmerFilter for GroupQuorumFilter { + fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { + if !self.ingroup_idx.is_empty() { + let n = self.ingroup_idx.iter() + .filter(|&&i| row.get(i).copied().unwrap_or(0) > self.threshold) + .count(); + let denom = self.ingroup_idx.len(); + if n < self.min_count { return false; } + if n > self.max_count { return false; } + let frac = n as f64 / denom as f64; + if frac < self.min_frac { return false; } + if frac > self.max_frac { return false; } + } + if !self.outgroup_idx.is_empty() { + let n = self.outgroup_idx.iter() + .filter(|&&i| row.get(i).copied().unwrap_or(0) > self.threshold) + .count(); + let denom = self.outgroup_idx.len(); + if n < self.min_outgroup_count { return false; } + if n > self.max_outgroup_count { return false; } + let frac = n as f64 / denom as f64; + if frac < self.min_outgroup_frac { return false; } + if frac > self.max_outgroup_frac { return false; } + } + true + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 49a81fc..a6a010b 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -8,6 +8,6 @@ mod partition; mod query_layer; mod rebuild_layer; -pub use filter::KmerFilter; +pub use filter::{GroupQuorumFilter, KmerFilter}; pub use merge_layer::MergeMode; pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};