From 476c7a6394f53246b5293069d10100447bf29e17 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 4 Jun 2026 20:26:53 +0200
Subject: [PATCH] feat: add metadata-driven k-mer filtering for rebuild command

Introduces a metadata-driven filtering system for the rebuild command, classifying genomes into ingroup and outgroup categories using exact, inequality, and hierarchical path predicates. Implements a GroupQuorumFilter to enforce configurable presence thresholds and fraction constraints per group. Refactors the command to replace global quorum filters with this unified approach, converts the presence flag to a threshold parameter, and adds corresponding documentation and MkDocs navigation.
---
 docmd/implementation/rebuild_filter.md | 154 ++++++++++++++++++++
 mkdocs.yml                             |   1 +
 src/obikmer/src/cmd/mod.rs             |   1 +
 src/obikmer/src/cmd/predicate.rs       | 191 +++++++++++++++++++++++++
 src/obikmer/src/cmd/rebuild.rs         | 105 +++++++++-----
 src/obikpartitionner/src/filter.rs     |  49 +++++++
 src/obikpartitionner/src/lib.rs        |   2 +-
 7 files changed, 470 insertions(+), 33 deletions(-)
 create mode 100644 docmd/implementation/rebuild_filter.md
 create mode 100644 src/obikmer/src/cmd/predicate.rs

diff --git a/docmd/implementation/rebuild_filter.md b/docmd/implementation/rebuild_filter.md
new file mode 100644
index 0000000..1904e94
--- /dev/null
+++ b/docmd/implementation/rebuild_filter.md
@@ -0,0 +1,154 @@
+# Rebuild filters and ingroup/outgroup predicates
+
+The `rebuild` command compacts an existing index into a new single-layer index,
+optionally keeping only k-mers that satisfy a set of filters.
+Filters can operate on raw quorum counts over all genomes, or on pre-defined
+**ingroup** and **outgroup** genome sets derived from genome metadata.
+
+## Predicate syntax
+
+Each `--ingroup` and `--outgroup` flag takes a predicate of the form:
+
+```
+key OP value1|value2|…
+```
+
+| Operator | Meaning |
+|----------|---------|
+| `*` or `all` | wildcard — every genome matches unconditionally |
+| `key=v1\|v2` | exact match — genome's `key` equals `v1` or `v2` |
+| `key!=v` | negation — genome's `key` equals none of the values |
+| `key~path` | path ancestry — genome's `key` is `path` or a descendant |
+| `key!~path` | not a descendant |
+
+Multiple values separated by `|` are always OR-ed within the predicate.
+
+### Path matching (`~` and `!~`)
+
+Metadata values can represent hierarchical taxonomic paths such as
+`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
+
+- **Absolute pattern** (starts with `/`): the value must start with the pattern
+  at a segment boundary.
+  `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
+  `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
+- **Bare segment** (no leading `/`): the value must contain the pattern as an
+  exact path component anywhere.
+  `taxon~Betula` matches any path that has `Betula` as one of its segments.
+
+### Missing metadata key → NA
+
+If a genome does not carry the queried metadata key, the predicate returns **NA**.
+NA propagates through the group evaluation logic (see below), and genomes that
+cannot be classified are **ignored** in all quorum counts.
+
+## Group semantics
+
+### Multiple predicates
+
+| Flag | Combination rule |
+|------|-----------------|
+| `--ingroup` (repeated) | **AND** — genome must satisfy all predicates |
+| `--outgroup` (repeated) | **OR** — genome satisfies any predicate |
+
+### Three-value logic
+
+Each predicate returns `true`, `false`, or `NA` (absent key).
+
+- AND: `false` absorbs everything; `NA` propagates unless already `false`.
+- OR: `true` absorbs everything; `NA` propagates unless already `true`.
+
+### Classification and priority
+
+For each genome:
+
+1. Evaluate `AND(ingroup predicates)` → `in_result`
+2. Evaluate `OR(outgroup predicates)` → `out_result`
+3. If `in_result = true` → **Ingroup** (ingroup wins over outgroup)
+4. Else if `out_result = true` → **Outgroup**
+5. Otherwise → **Uncategorized** (ignored in all quorum counts)
+
+### Implicit groups
+
+| `--ingroup` | `--outgroup` | Effective behaviour |
+|-------------|--------------|---------------------|
+| not set | not set | all genomes form the ingroup |
+| set | not set | only `--min-count`/`--min-frac` apply to matched genomes |
+| not set | set | only `--max-count`/`--max-frac` apply to matched genomes |
+| set | set | both constraints apply simultaneously |
+
+## Quorum flags
+
+| Flag | Applies to | Meaning |
+|------|-----------|---------|
+| `--min-count N` | ingroup | k-mer present in at least N ingroup genomes |
+| `--max-count N` | ingroup | k-mer present in at most N ingroup genomes |
+| `--min-frac F` | ingroup | k-mer present in at least fraction F of ingroup genomes |
+| `--max-frac F` | ingroup | k-mer present in at most fraction F of ingroup genomes |
+| `--min-outgroup-count N` | outgroup | k-mer present in at least N outgroup genomes |
+| `--max-outgroup-count N` | outgroup | k-mer present in at most N outgroup genomes |
+| `--min-outgroup-frac F` | outgroup | k-mer present in at least fraction F of outgroup genomes |
+| `--max-outgroup-frac F` | outgroup | k-mer present in at most fraction F of outgroup genomes |
+| `--min-total-count N` | all genomes | sum of per-genome counts ≥ N |
+| `--max-total-count N` | all genomes | sum of per-genome counts ≤ N |
+| `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) |
+
+Fractions are computed over the size of the classified group, not over total
+genome count. An empty group (no genome classified as ingroup/outgroup) never
+triggers a filter failure.
+
+## Examples
+
+Keep k-mers specific to *Betula nana* — present in at least 2 *B. nana* genomes
+and absent from every other genome in the index:
+
+```sh
+obikmer rebuild src --output dst \
+  --ingroup  "species=Betula_nana" \
+  --outgroup "*" \
+  --min-count 2 \
+  --max-count 0
+```
+
+Keep k-mers found in at least 2 *Betula nana* genomes and absent from all *Betula*:
+
+```sh
+obikmer rebuild src --output dst \
+  --ingroup  "species=Betula_nana" \
+  --outgroup "genus=Betula" \
+  --min-count 2 \
+  --max-count 0
+```
+
+Use taxonomic paths — keep k-mers present in ≥ 50 % of the *Betula* clade
+and in fewer than 10 % of everything outside *Betulaceae*:
+
+```sh
+obikmer rebuild src --output dst \
+  --ingroup  "taxon~/Betulaceae/Betula" \
+  --outgroup "taxon!~/Betulaceae" \
+  --min-frac 0.5 \
+  --max-frac 0.1
+```
+
+Multiple outgroup predicates (OR): exclude k-mers present in *Alnus* or *Carpinus*:
+
+```sh
+obikmer rebuild src --output dst \
+  --ingroup  "genus=Betula" \
+  --outgroup "genus=Alnus" \
+  --outgroup "genus=Carpinus" \
+  --max-count 0
+```
+
+## Implementation
+
+- **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter`
+  using pre-computed ingroup and outgroup index vectors. The heavy logic
+  (predicate parsing, three-value evaluation, genome classification) happens
+  once before the rebuild loop; each k-mer row evaluation is a simple index
+  lookup and counter.
+
+- **`obikmer::cmd::predicate`** — predicate parsing (`MetaPred`), path matching
+  (`path_matches`), three-value AND/OR evaluation, and `build_group_filter`
+  which returns a ready-to-use `GroupQuorumFilter`.
diff --git a/mkdocs.yml b/mkdocs.yml
index 6d9bc3a..bee4ebb 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -49,6 +49,7 @@ nav:
       - PersistentCompactIntVec: implementation/persistent_compact_int_vec.md
       - PersistentBitVec: implementation/persistent_bit_vec.md
       - Merge command: implementation/merge.md
+      - Rebuild filters: implementation/rebuild_filter.md
   - Architecture:
       - Sequences: architecture/sequences/invariant.md
       - Kmer index: architecture/index_architecture.md
diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs
index 569b543..27048f8 100644
--- a/src/obikmer/src/cmd/mod.rs
+++ b/src/obikmer/src/cmd/mod.rs
@@ -1,5 +1,6 @@
 pub mod annotate;
 pub mod pack;
+pub(crate) mod predicate;
 pub mod utils;
 pub mod distance;
 pub mod dump;
diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs
new file mode 100644
index 0000000..3faee82
--- /dev/null
+++ b/src/obikmer/src/cmd/predicate.rs
@@ -0,0 +1,191 @@
+use std::collections::HashMap;
+
+use obikindex::GenomeInfo;
+use obikpartitionner::GroupQuorumFilter;
+
+// ── Operator ──────────────────────────────────────────────────────────────────
+
+enum PredOp { Wildcard, Eq, Ne, Matches, NotMatches }
+
+// ── MetaPred ──────────────────────────────────────────────────────────────────
+
+/// A single predicate on genome metadata: `key OP val1|val2|…`
+///
+/// Operators: `=` (exact), `!=` (not equal), `~` (path ancestor), `!~` (not ancestor).
+/// Multiple values separated by `|` are OR'd.
+pub struct MetaPred {
+    key:    String,
+    op:     PredOp,
+    values: Vec<String>,
+}
+
+impl MetaPred {
+    /// Parse a predicate string of the form `key=v1|v2`, `key!=v`, `key~path`, `key!~path`.
+    /// The special values `*` and `all` (case-insensitive) match every genome.
+    pub fn parse(s: &str) -> Result<Self, String> {
+        let t = s.trim();
+        if t == "*" || t.eq_ignore_ascii_case("all") {
+            return Ok(Self { key: String::new(), op: PredOp::Wildcard, values: vec![] });
+        }
+
+        let (op, key, rhs) =
+            if let Some(pos) = s.find("!=") {
+                (PredOp::Ne,         &s[..pos], &s[pos+2..])
+            } else if let Some(pos) = s.find("!~") {
+                (PredOp::NotMatches, &s[..pos], &s[pos+2..])
+            } else if let Some(pos) = s.find('=') {
+                (PredOp::Eq,         &s[..pos], &s[pos+1..])
+            } else if let Some(pos) = s.find('~') {
+                (PredOp::Matches,    &s[..pos], &s[pos+1..])
+            } else {
+                return Err(format!("no operator found in predicate: {s}"));
+            };
+
+        let key = key.trim().to_string();
+        if key.is_empty() { return Err(format!("empty key in predicate: {s}")); }
+
+        let values: Vec<String> = rhs.split('|').map(|v| v.trim().to_string()).collect();
+        if values.iter().any(|v| v.is_empty()) {
+            return Err(format!("empty value in predicate: {s}"));
+        }
+
+        Ok(Self { key, op, values })
+    }
+
+    /// Evaluate against one genome's metadata.
+    /// Returns `None` when the key is absent (NA propagation).
+    fn eval(&self, meta: &HashMap<String, String>) -> Option<bool> {
+        if matches!(self.op, PredOp::Wildcard) { return Some(true); }
+        let value = meta.get(&self.key)?;
+        Some(match self.op {
+            PredOp::Wildcard   => unreachable!(),
+            PredOp::Eq         => self.values.iter().any(|v| v == value),
+            PredOp::Ne         => self.values.iter().all(|v| v != value),
+            PredOp::Matches    => self.values.iter().any(|v| path_matches(value, v)),
+            PredOp::NotMatches => self.values.iter().all(|v| !path_matches(value, v)),
+        })
+    }
+}
+
+// ── Path matching ─────────────────────────────────────────────────────────────
+
+/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
+///
+/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
+/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
+fn path_matches(value: &str, pattern: &str) -> bool {
+    if pattern.starts_with('/') {
+        value == pattern
+            || (value.starts_with(pattern)
+                && value[pattern.len()..].starts_with('/'))
+    } else {
+        value.split('/').any(|seg| seg == pattern)
+    }
+}
+
+// ── Three-value group evaluation ──────────────────────────────────────────────
+
+/// AND of all predicates (ingroup semantics).
+/// Short-circuits on `Some(false)`; propagates `None` if no predicate returns `false`.
+fn eval_and(preds: &[MetaPred], meta: &HashMap<String, String>) -> Option<bool> {
+    let mut has_na = false;
+    for pred in preds {
+        match pred.eval(meta) {
+            Some(false) => return Some(false),
+            Some(true)  => {}
+            None        => has_na = true,
+        }
+    }
+    if has_na { None } else { Some(true) }
+}
+
+/// OR of all predicates (outgroup semantics).
+/// Short-circuits on `Some(true)`; propagates `None` if no predicate returns `true`.
+fn eval_or(preds: &[MetaPred], meta: &HashMap<String, String>) -> Option<bool> {
+    let mut has_na = false;
+    for pred in preds {
+        match pred.eval(meta) {
+            Some(true)  => return Some(true),
+            Some(false) => {}
+            None        => has_na = true,
+        }
+    }
+    if has_na { None } else { Some(false) }
+}
+
+// ── Genome classification ─────────────────────────────────────────────────────
+
+enum Membership { Ingroup, Outgroup, Uncategorized }
+
+fn classify(
+    genomes: &[GenomeInfo],
+    ingroup:  &[MetaPred],
+    outgroup: &[MetaPred],
+) -> Vec<Membership> {
+    genomes.iter().map(|g| {
+        let in_r  = if ingroup.is_empty()  { None } else { eval_and(ingroup,  &g.meta) };
+        let out_r = if outgroup.is_empty() { None } else { eval_or(outgroup,  &g.meta) };
+
+        // Ingroup wins over outgroup.
+        if in_r == Some(true)  { return Membership::Ingroup;  }
+        if out_r == Some(true) { return Membership::Outgroup; }
+        Membership::Uncategorized
+    }).collect()
+}
+
+// ── Public constructor ────────────────────────────────────────────────────────
+
+/// Build a `GroupQuorumFilter` from parsed predicates and genome metadata.
+///
+/// - No groups defined: `ingroup_idx` = all genomes (implicit ingroup).
+/// - `ingroup` predicates only: outgroup indices are empty.
+/// - `outgroup` predicates only: ingroup indices are empty.
+/// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored.
+pub struct GroupFilterParams {
+    pub threshold:          u32,
+    pub min_count:          Option<usize>,
+    pub max_count:          Option<usize>,
+    pub min_frac:           Option<f64>,
+    pub max_frac:           Option<f64>,
+    pub min_outgroup_count: Option<usize>,
+    pub max_outgroup_count: Option<usize>,
+    pub min_outgroup_frac:  Option<f64>,
+    pub max_outgroup_frac:  Option<f64>,
+}
+
+pub fn build_group_filter(
+    genomes:        &[GenomeInfo],
+    ingroup_preds:  &[MetaPred],
+    outgroup_preds: &[MetaPred],
+    p:              GroupFilterParams,
+) -> GroupQuorumFilter {
+    let (ingroup_idx, outgroup_idx) = if ingroup_preds.is_empty() && outgroup_preds.is_empty() {
+        ((0..genomes.len()).collect(), vec![])
+    } else {
+        let members = classify(genomes, ingroup_preds, outgroup_preds);
+        let in_idx: Vec<usize>  = members.iter().enumerate()
+            .filter(|(_, m)| matches!(m, Membership::Ingroup))
+            .map(|(i, _)| i).collect();
+        let out_idx: Vec<usize> = members.iter().enumerate()
+            .filter(|(_, m)| matches!(m, Membership::Outgroup))
+            .map(|(i, _)| i).collect();
+        (in_idx, out_idx)
+    };
+
+    let in_size  = ingroup_idx.len();
+    let out_size = outgroup_idx.len();
+
+    GroupQuorumFilter {
+        ingroup_idx,
+        outgroup_idx,
+        threshold:          p.threshold,
+        min_count:          p.min_count.unwrap_or(0),
+        max_count:          p.max_count.unwrap_or(in_size),
+        min_frac:           p.min_frac.unwrap_or(0.0),
+        max_frac:           p.max_frac.unwrap_or(1.0),
+        min_outgroup_count: p.min_outgroup_count.unwrap_or(0),
+        max_outgroup_count: p.max_outgroup_count.unwrap_or(out_size),
+        min_outgroup_frac:  p.min_outgroup_frac.unwrap_or(0.0),
+        max_outgroup_frac:  p.max_outgroup_frac.unwrap_or(1.0),
+    }
+}
diff --git a/src/obikmer/src/cmd/rebuild.rs b/src/obikmer/src/cmd/rebuild.rs
index 8d24d93..ca78694 100644
--- a/src/obikmer/src/cmd/rebuild.rs
+++ b/src/obikmer/src/cmd/rebuild.rs
@@ -2,13 +2,12 @@ use std::path::PathBuf;
 
 use clap::Args;
 use obikindex::{KmerIndex, MergeMode};
-use obikpartitionner::filter::{
-    KmerFilter, MaxGenomeCount, MaxGenomeFraction, MaxTotalCount,
-    MinGenomeCount, MinGenomeFraction, MinTotalCount,
-};
+use obikpartitionner::filter::{MaxTotalCount, MinTotalCount};
 use obisys::Reporter;
 use tracing::info;
 
+use super::predicate::{GroupFilterParams, MetaPred, build_group_filter};
+
 #[derive(Args)]
 pub struct RebuildArgs {
     /// Source index directory
@@ -18,21 +17,47 @@ pub struct RebuildArgs {
     #[arg(short, long)]
     pub output: PathBuf,
 
-    /// Minimum fraction of genomes containing the k-mer [0.0–1.0]
-    #[arg(long)]
-    pub min_quorum_frac: Option<f64>,
+    /// Ingroup predicate (repeatable; AND between flags).
+    /// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all`
+    #[arg(long, value_name = "PRED")]
+    pub ingroup: Vec<String>,
 
-    /// Maximum fraction of genomes containing the k-mer [0.0–1.0]
-    #[arg(long)]
-    pub max_quorum_frac: Option<f64>,
+    /// Outgroup predicate (repeatable; OR between flags).
+    /// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all`
+    #[arg(long, value_name = "PRED")]
+    pub outgroup: Vec<String>,
 
-    /// Minimum number of genomes containing the k-mer
+    /// Minimum number of ingroup genomes containing the k-mer
     #[arg(long)]
-    pub min_quorum_count: Option<usize>,
+    pub min_count: Option<usize>,
 
-    /// Maximum number of genomes containing the k-mer
+    /// Maximum number of ingroup genomes containing the k-mer
     #[arg(long)]
-    pub max_quorum_count: Option<usize>,
+    pub max_count: Option<usize>,
+
+    /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0]
+    #[arg(long)]
+    pub min_frac: Option<f64>,
+
+    /// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0]
+    #[arg(long)]
+    pub max_frac: Option<f64>,
+
+    /// Minimum number of outgroup genomes containing the k-mer
+    #[arg(long)]
+    pub min_outgroup_count: Option<usize>,
+
+    /// Maximum number of outgroup genomes containing the k-mer
+    #[arg(long)]
+    pub max_outgroup_count: Option<usize>,
+
+    /// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0]
+    #[arg(long)]
+    pub min_outgroup_frac: Option<f64>,
+
+    /// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0]
+    #[arg(long)]
+    pub max_outgroup_frac: Option<f64>,
 
     /// Minimum total count across all genomes (count index only)
     #[arg(long)]
@@ -42,7 +67,7 @@ pub struct RebuildArgs {
     #[arg(long)]
     pub max_total_count: Option<u32>,
 
-    /// Per-genome count threshold for presence in quorum filters (default 0)
+    /// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0)
     #[arg(long, default_value = "0")]
     pub presence_threshold: u32,
 
@@ -61,34 +86,50 @@ pub fn run(args: RebuildArgs) {
         std::process::exit(1);
     });
 
-
-    let n_genomes = src.meta().genomes.len();
     let mode = if args.presence || !src.meta().config.with_counts {
         MergeMode::Presence
     } else {
         MergeMode::Count
     };
 
+    let ingroup_preds: Vec<MetaPred> = args.ingroup.iter()
+        .map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
+            eprintln!("error in --ingroup: {e}");
+            std::process::exit(1);
+        }))
+        .collect();
+
+    let outgroup_preds: Vec<MetaPred> = args.outgroup.iter()
+        .map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
+            eprintln!("error in --outgroup: {e}");
+            std::process::exit(1);
+        }))
+        .collect();
+
     info!(
         "rebuild: {} genome(s), mode={:?}, source={}",
-        n_genomes, mode, args.source.display()
+        src.meta().genomes.len(), mode, args.source.display()
     );
 
-    let pt = args.presence_threshold;
-    let mut filters: Vec<Box<dyn KmerFilter>> = Vec::new();
+    let mut filters: Vec<Box<dyn obikpartitionner::KmerFilter>> = Vec::new();
+
+    filters.push(Box::new(build_group_filter(
+        &src.meta().genomes,
+        &ingroup_preds,
+        &outgroup_preds,
+        GroupFilterParams {
+            threshold:          args.presence_threshold,
+            min_count:          args.min_count,
+            max_count:          args.max_count,
+            min_frac:           args.min_frac,
+            max_frac:           args.max_frac,
+            min_outgroup_count: args.min_outgroup_count,
+            max_outgroup_count: args.max_outgroup_count,
+            min_outgroup_frac:  args.min_outgroup_frac,
+            max_outgroup_frac:  args.max_outgroup_frac,
+        },
+    )));
 
-    if let Some(v) = args.min_quorum_frac {
-        filters.push(Box::new(MinGenomeFraction { frac: v, threshold: pt }));
-    }
-    if let Some(v) = args.max_quorum_frac {
-        filters.push(Box::new(MaxGenomeFraction { frac: v, threshold: pt }));
-    }
-    if let Some(v) = args.min_quorum_count {
-        filters.push(Box::new(MinGenomeCount { count: v, threshold: pt }));
-    }
-    if let Some(v) = args.max_quorum_count {
-        filters.push(Box::new(MaxGenomeCount { count: v, threshold: pt }));
-    }
     if let Some(v) = args.min_total_count {
         filters.push(Box::new(MinTotalCount { total: v }));
     }
diff --git a/src/obikpartitionner/src/filter.rs b/src/obikpartitionner/src/filter.rs
index f35a3e8..6f3e0f0 100644
--- a/src/obikpartitionner/src/filter.rs
+++ b/src/obikpartitionner/src/filter.rs
@@ -85,3 +85,52 @@ impl KmerFilter for MaxTotalCount {
         row.iter().sum::<u32>() <= self.total
     }
 }
+
+// ── Group-based quorum filter ─────────────────────────────────────────────────
+
+/// Quorum filter operating on pre-classified genome groups.
+///
+/// `ingroup_idx` / `outgroup_idx` are column indices into the per-genome row.
+/// When `ingroup_idx` is empty, no ingroup quorum is checked.
+/// When `outgroup_idx` is empty, no outgroup quorum is checked.
+pub struct GroupQuorumFilter {
+    pub ingroup_idx:        Vec<usize>,
+    pub outgroup_idx:       Vec<usize>,
+    pub threshold:          u32,
+    pub min_count:          usize,
+    pub max_count:          usize,
+    pub min_frac:           f64,
+    pub max_frac:           f64,
+    pub min_outgroup_count: usize,
+    pub max_outgroup_count: usize,
+    pub min_outgroup_frac:  f64,
+    pub max_outgroup_frac:  f64,
+}
+
+impl KmerFilter for GroupQuorumFilter {
+    fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
+        if !self.ingroup_idx.is_empty() {
+            let n = self.ingroup_idx.iter()
+                .filter(|&&i| row.get(i).copied().unwrap_or(0) > self.threshold)
+                .count();
+            let denom = self.ingroup_idx.len();
+            if n < self.min_count { return false; }
+            if n > self.max_count { return false; }
+            let frac = n as f64 / denom as f64;
+            if frac < self.min_frac { return false; }
+            if frac > self.max_frac { return false; }
+        }
+        if !self.outgroup_idx.is_empty() {
+            let n = self.outgroup_idx.iter()
+                .filter(|&&i| row.get(i).copied().unwrap_or(0) > self.threshold)
+                .count();
+            let denom = self.outgroup_idx.len();
+            if n < self.min_outgroup_count { return false; }
+            if n > self.max_outgroup_count { return false; }
+            let frac = n as f64 / denom as f64;
+            if frac < self.min_outgroup_frac { return false; }
+            if frac > self.max_outgroup_frac { return false; }
+        }
+        true
+    }
+}
diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs
index 49a81fc..a6a010b 100644
--- a/src/obikpartitionner/src/lib.rs
+++ b/src/obikpartitionner/src/lib.rs
@@ -8,6 +8,6 @@ mod partition;
 mod query_layer;
 mod rebuild_layer;
 
-pub use filter::KmerFilter;
+pub use filter::{GroupQuorumFilter, KmerFilter};
 pub use merge_layer::MergeMode;
 pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};