feat: add configurable presence threshold to kmer distance

Introduce a `--presence-threshold` CLI argument (default: 1) and update `KmerIndex::distance` to accept a `presence_threshold` parameter. This replaces hardcoded zero thresholds, enabling configurable filtering of low-abundance kmers during Jaccard distance calculations.
This commit is contained in:
Eric Coissac
2026-06-08 20:13:07 +02:00
parent 1ec65922df
commit eb7805c747
2 changed files with 8 additions and 5 deletions
+3 -4
View File
@@ -52,7 +52,7 @@ impl DistanceMetric {
// ── KmerIndex::distance ─────────────────────────────────────────────────────── // ── KmerIndex::distance ───────────────────────────────────────────────────────
impl KmerIndex { impl KmerIndex {
pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult<DistanceOutput> { pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool, presence_threshold: u32) -> OKIResult<DistanceOutput> {
let n_genomes = self.meta.genomes.len(); let n_genomes = self.meta.genomes.len();
if n_genomes < 2 { if n_genomes < 2 {
return Err(OKIError::InvalidInput( return Err(OKIError::InvalidInput(
@@ -83,8 +83,7 @@ impl KmerIndex {
DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global), DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global),
DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global), DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global),
DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global), DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global),
// Jaccard on count data: threshold at 0 (present if count > 0) DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, presence_threshold),
DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0),
DistanceMetric::Hamming => { DistanceMetric::Hamming => {
return Err(OKIError::InvalidInput( return Err(OKIError::InvalidInput(
"Hamming is only available for presence/absence indexes".into(), "Hamming is only available for presence/absence indexes".into(),
@@ -93,7 +92,7 @@ impl KmerIndex {
}; };
let shared = if shared_kmers { let shared = if shared_kmers {
let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0); let (inter, _) = CountPartials::partial_threshold_jaccard(&global, presence_threshold);
Some(inter) Some(inter)
} else { } else {
None None
+5 -1
View File
@@ -46,6 +46,10 @@ pub struct DistanceArgs {
#[arg(long, value_enum, default_value = "jaccard")] #[arg(long, value_enum, default_value = "jaccard")]
pub metric: MetricArg, pub metric: MetricArg,
/// Minimum count to consider a kmer present when computing Jaccard on count indexes
#[arg(long, default_value = "1")]
pub presence_threshold: u32,
/// Also output the shared-kmer count matrix (CSV) /// Also output the shared-kmer count matrix (CSV)
#[arg(long)] #[arg(long)]
pub shared_kmers: bool, pub shared_kmers: bool,
@@ -81,7 +85,7 @@ pub fn run(args: DistanceArgs) {
let need_shared = args.shared_kmers || args.nj || args.upgma; let need_shared = args.shared_kmers || args.nj || args.upgma;
let result = idx let result = idx
.distance(args.metric.into(), need_shared) .distance(args.metric.into(), need_shared, args.presence_threshold)
.unwrap_or_else(|e| { .unwrap_or_else(|e| {
eprintln!("error computing distances: {e}"); eprintln!("error computing distances: {e}");
std::process::exit(1); std::process::exit(1);