refactor: centralize k-mer filtering logic and add validation

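Refactor the shared `FilterArgs` code path so that `build_group_filter` returns a `Result<GroupQuorumFilter, String>` instead of a bare filter, with explicit validation: fraction values must lie in [0.0, 1.0], and each min value (ingroup or outgroup, fraction or count) must not exceed its corresponding max. `FilterArgs` now reports such an error on stderr and exits with status 1. The conditional defaults for `--min-frac` (1.0 when ingroup predicates are given) and `--max-outgroup-count` (0 when outgroup predicates are given) now apply only when no explicit quorum flag is set for that group, preventing silent configuration conflicts. Update documentation and MkDocs navigation to reflect the new centralized k-mer filtering system across `rebuild`, `dump`, and `unitig` commands.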
This commit is contained in:
Eric Coissac
2026-06-09 09:57:38 +02:00
parent 2465cfbc4b
commit ce45e2fbe1
4 changed files with 98 additions and 34 deletions
+52 -16
View File
@@ -207,7 +207,7 @@ impl FilterArgs {
std::process::exit(1);
}))
.collect();
vec![Box::new(build_group_filter(
let filter = build_group_filter(
genomes,
&ingroup_preds,
&outgroup_preds,
@@ -222,7 +222,11 @@ impl FilterArgs {
min_outgroup_frac: self.min_outgroup_frac,
max_outgroup_frac: self.max_outgroup_frac,
},
))]
).unwrap_or_else(|e| {
eprintln!("error in filter parameters: {e}");
std::process::exit(1);
});
vec![Box::new(filter)]
}
}
@@ -243,7 +247,7 @@ pub fn build_group_filter(
ingroup_preds: &[MetaPred],
outgroup_preds: &[MetaPred],
p: GroupFilterParams,
) -> GroupQuorumFilter {
) -> Result<GroupQuorumFilter, String> {
let (ingroup_idx, outgroup_idx) = if ingroup_preds.is_empty() && outgroup_preds.is_empty() {
((0..genomes.len()).collect(), vec![])
} else {
@@ -260,20 +264,52 @@ pub fn build_group_filter(
let in_size = ingroup_idx.len();
let out_size = outgroup_idx.len();
let default_min_frac = if !ingroup_preds.is_empty() { 1.0 } else { 0.0 };
let default_max_outgroup_count = if !outgroup_preds.is_empty() { 0 } else { out_size };
let ingroup_quorum_explicit = p.min_count.is_some() || p.max_count.is_some()
|| p.min_frac.is_some() || p.max_frac.is_some();
let outgroup_quorum_explicit = p.min_outgroup_count.is_some() || p.max_outgroup_count.is_some()
|| p.min_outgroup_frac.is_some() || p.max_outgroup_frac.is_some();
GroupQuorumFilter {
let default_min_frac = if !ingroup_preds.is_empty() && !ingroup_quorum_explicit { 1.0 } else { 0.0 };
let default_max_outgroup_count = if !outgroup_preds.is_empty() && !outgroup_quorum_explicit { 0 } else { out_size };
let min_count = p.min_count.unwrap_or(0);
let max_count = p.max_count.unwrap_or(in_size);
let min_frac = p.min_frac.unwrap_or(default_min_frac);
let max_frac = p.max_frac.unwrap_or(1.0);
let min_outgroup_count = p.min_outgroup_count.unwrap_or(0);
let max_outgroup_count = p.max_outgroup_count.unwrap_or(default_max_outgroup_count);
let min_outgroup_frac = p.min_outgroup_frac.unwrap_or(0.0);
let max_outgroup_frac = p.max_outgroup_frac.unwrap_or(1.0);
for (v, lo, hi) in [
("--min-frac/--max-frac", min_frac, max_frac),
("--min-outgroup-frac/--max-outgroup-frac", min_outgroup_frac, max_outgroup_frac),
] {
if !(0.0..=1.0).contains(&lo) || !(0.0..=1.0).contains(&hi) {
return Err(format!("{v}: fraction values must be in [0.0, 1.0]"));
}
if lo > hi {
return Err(format!("{v}: min ({lo}) is greater than max ({hi})"));
}
}
if min_count > max_count {
return Err(format!("--min-count/--max-count: min ({min_count}) is greater than max ({max_count})"));
}
if min_outgroup_count > max_outgroup_count {
return Err(format!("--min-outgroup-count/--max-outgroup-count: min ({min_outgroup_count}) is greater than max ({max_outgroup_count})"));
}
Ok(GroupQuorumFilter {
ingroup_idx,
outgroup_idx,
threshold: p.threshold,
min_count: p.min_count.unwrap_or(0),
max_count: p.max_count.unwrap_or(in_size),
min_frac: p.min_frac.unwrap_or(default_min_frac),
max_frac: p.max_frac.unwrap_or(1.0),
min_outgroup_count: p.min_outgroup_count.unwrap_or(0),
max_outgroup_count: p.max_outgroup_count.unwrap_or(default_max_outgroup_count),
min_outgroup_frac: p.min_outgroup_frac.unwrap_or(0.0),
max_outgroup_frac: p.max_outgroup_frac.unwrap_or(1.0),
}
threshold: p.threshold,
min_count,
max_count,
min_frac,
max_frac,
min_outgroup_count,
max_outgroup_count,
min_outgroup_frac,
max_outgroup_frac,
})
}