feat: add metadata-driven k-mer filtering for rebuild command
Introduces a metadata-driven filtering system for the rebuild command, classifying genomes into ingroup and outgroup categories using exact, inequality, and hierarchical path predicates. Implements a GroupQuorumFilter to enforce configurable presence thresholds and fraction constraints per group. Refactors the command to replace global quorum filters with this unified approach, converts the presence flag to a threshold parameter, and adds corresponding documentation and MkDocs navigation.
This commit is contained in:
@@ -85,3 +85,52 @@ impl KmerFilter for MaxTotalCount {
|
||||
row.iter().sum::<u32>() <= self.total
|
||||
}
|
||||
}
|
||||
|
||||
// ── Group-based quorum filter ─────────────────────────────────────────────────
|
||||
|
||||
/// Quorum filter operating on pre-classified genome groups.
///
/// `ingroup_idx` / `outgroup_idx` are column indices into the per-genome row.
/// When `ingroup_idx` is empty, no ingroup quorum is checked.
/// When `outgroup_idx` is empty, no outgroup quorum is checked.
///
/// A genome counts as "present" for a row when its value in that row is
/// strictly greater than `threshold`. An index that falls outside the row is
/// treated as a value of 0.
#[derive(Debug, Clone, PartialEq)]
pub struct GroupQuorumFilter {
    /// Column indices of the ingroup genomes.
    pub ingroup_idx: Vec<usize>,
    /// Column indices of the outgroup genomes.
    pub outgroup_idx: Vec<usize>,
    /// A row value must be strictly greater than this to count as present.
    pub threshold: u32,
    /// Minimum number of present ingroup genomes (inclusive).
    pub min_count: usize,
    /// Maximum number of present ingroup genomes (inclusive).
    pub max_count: usize,
    /// Minimum present fraction of the ingroup, `present / ingroup_idx.len()` (inclusive).
    pub min_frac: f64,
    /// Maximum present fraction of the ingroup, `present / ingroup_idx.len()` (inclusive).
    pub max_frac: f64,
    /// Minimum number of present outgroup genomes (inclusive).
    pub min_outgroup_count: usize,
    /// Maximum number of present outgroup genomes (inclusive).
    pub max_outgroup_count: usize,
    /// Minimum present fraction of the outgroup, `present / outgroup_idx.len()` (inclusive).
    pub min_outgroup_frac: f64,
    /// Maximum present fraction of the outgroup, `present / outgroup_idx.len()` (inclusive).
    pub max_outgroup_frac: f64,
}
|
||||
|
||||
impl GroupQuorumFilter {
    /// Number of genomes in `idx` whose value in `row` is strictly greater
    /// than `self.threshold`. An index outside `row` is read as 0.
    fn count_present(&self, row: &[u32], idx: &[usize]) -> usize {
        idx.iter()
            .filter(|&&i| row.get(i).copied().unwrap_or(0) > self.threshold)
            .count()
    }

    /// Checks one genome group against its inclusive count and fraction bounds.
    ///
    /// An empty `idx` means the group is not constrained and always passes;
    /// this also guarantees the fraction below never divides by zero.
    fn group_passes(
        &self,
        row: &[u32],
        idx: &[usize],
        min_count: usize,
        max_count: usize,
        min_frac: f64,
        max_frac: f64,
    ) -> bool {
        if idx.is_empty() {
            return true;
        }
        let present = self.count_present(row, idx);
        if present < min_count || present > max_count {
            return false;
        }
        let frac = present as f64 / idx.len() as f64;
        // Kept as explicit rejections: a NaN bound compares false and therefore
        // never rejects, which a `min <= frac && frac <= max` form would change.
        if frac < min_frac || frac > max_frac {
            return false;
        }
        true
    }
}

impl KmerFilter for GroupQuorumFilter {
    /// Returns `true` when `row` satisfies both the ingroup and the outgroup
    /// quorum (count bounds and fraction bounds). A group with no indices is
    /// skipped. `_n_genomes` is not used: each fraction is taken over the size
    /// of its own group.
    fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
        self.group_passes(
            row,
            &self.ingroup_idx,
            self.min_count,
            self.max_count,
            self.min_frac,
            self.max_frac,
        ) && self.group_passes(
            row,
            &self.outgroup_idx,
            self.min_outgroup_count,
            self.max_outgroup_count,
            self.min_outgroup_frac,
            self.max_outgroup_frac,
        )
    }
}
|
||||
|
||||
@@ -8,6 +8,6 @@ mod partition;
|
||||
mod query_layer;
|
||||
mod rebuild_layer;
|
||||
|
||||
-pub use filter::KmerFilter;
+pub use filter::{GroupQuorumFilter, KmerFilter};
|
||||
pub use merge_layer::MergeMode;
|
||||
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
|
||||
|
||||
Reference in New Issue
Block a user