feat: add metadata-driven k-mer filtering for rebuild command

Introduces a metadata-driven filtering system for the rebuild command, classifying genomes into ingroup and outgroup categories using exact, inequality, and hierarchical path predicates. Implements a GroupQuorumFilter to enforce configurable presence thresholds and fraction constraints per group. Refactors the command to replace global quorum filters with this unified approach, converts the presence flag to a threshold parameter, and adds corresponding documentation and MkDocs navigation.
This commit is contained in:
Eric Coissac
2026-06-04 20:26:53 +02:00
parent edc18b4908
commit 476c7a6394
7 changed files with 470 additions and 33 deletions
+49
View File
@@ -85,3 +85,52 @@ impl KmerFilter for MaxTotalCount {
row.iter().sum::<u32>() <= self.total
}
}
// ── Group-based quorum filter ─────────────────────────────────────────────────
/// Quorum filter operating on pre-classified genome groups.
///
/// `ingroup_idx` / `outgroup_idx` are column indices into the per-genome row.
/// When `ingroup_idx` is empty, no ingroup quorum is checked.
/// When `outgroup_idx` is empty, no outgroup quorum is checked.
///
/// A genome counts as *present* for a k-mer when its value in the row is
/// strictly greater than `threshold`. A column index that lies beyond the end
/// of the row is treated as a count of 0.
#[derive(Debug, Clone, PartialEq)]
pub struct GroupQuorumFilter {
    /// Column indices (into the per-genome row) of the ingroup genomes.
    pub ingroup_idx: Vec<usize>,
    /// Column indices (into the per-genome row) of the outgroup genomes.
    pub outgroup_idx: Vec<usize>,
    /// Presence threshold: a genome is present only if its count is `> threshold`.
    pub threshold: u32,
    /// Minimum number of ingroup genomes in which the k-mer must be present.
    pub min_count: usize,
    /// Maximum number of ingroup genomes in which the k-mer may be present.
    pub max_count: usize,
    /// Minimum fraction of ingroup genomes (present / `ingroup_idx.len()`).
    pub min_frac: f64,
    /// Maximum fraction of ingroup genomes (present / `ingroup_idx.len()`).
    pub max_frac: f64,
    /// Minimum number of outgroup genomes in which the k-mer must be present.
    pub min_outgroup_count: usize,
    /// Maximum number of outgroup genomes in which the k-mer may be present.
    pub max_outgroup_count: usize,
    /// Minimum fraction of outgroup genomes (present / `outgroup_idx.len()`).
    pub min_outgroup_frac: f64,
    /// Maximum fraction of outgroup genomes (present / `outgroup_idx.len()`).
    pub max_outgroup_frac: f64,
}
impl KmerFilter for GroupQuorumFilter {
    /// Decides whether a k-mer row satisfies the ingroup and outgroup quorums.
    ///
    /// For each non-empty group, the number of member columns whose count is
    /// strictly greater than `threshold` is computed. The row is rejected when
    /// that number is below the group's minimum count or above its maximum
    /// count, or when the number divided by the group size is below the
    /// group's minimum fraction or above its maximum fraction. An empty group
    /// is not checked at all. The `_n_genomes` argument is not used.
    fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
        // Number of listed columns with a count strictly above the threshold.
        // A column index past the end of `row` is counted as 0.
        let present_in = |columns: &[usize]| -> usize {
            let mut hits = 0usize;
            for &col in columns {
                let count = match row.get(col) {
                    Some(&value) => value,
                    None => 0,
                };
                if count > self.threshold {
                    hits += 1;
                }
            }
            hits
        };

        // True unless the hit count or the hit fraction violates a bound.
        // `size` is never 0 here: callers skip empty groups beforehand.
        let within = |hits: usize, size: usize, counts: (usize, usize), fracs: (f64, f64)| -> bool {
            let share = hits as f64 / size as f64;
            !(hits < counts.0 || hits > counts.1 || share < fracs.0 || share > fracs.1)
        };

        let ingroup_ok = self.ingroup_idx.is_empty()
            || within(
                present_in(&self.ingroup_idx),
                self.ingroup_idx.len(),
                (self.min_count, self.max_count),
                (self.min_frac, self.max_frac),
            );
        if !ingroup_ok {
            return false;
        }

        self.outgroup_idx.is_empty()
            || within(
                present_in(&self.outgroup_idx),
                self.outgroup_idx.len(),
                (self.min_outgroup_count, self.max_outgroup_count),
                (self.min_outgroup_frac, self.max_outgroup_frac),
            )
    }
}
+1 -1
View File
@@ -8,6 +8,6 @@ mod partition;
mod query_layer;
mod rebuild_layer;
pub use filter::KmerFilter;
pub use filter::{GroupQuorumFilter, KmerFilter};
pub use merge_layer::MergeMode;
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};