From eb7805c7470e39812af2d00049d26dd850ea2c99 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 8 Jun 2026 20:13:07 +0200 Subject: [PATCH] feat: add configurable presence threshold to kmer distance Introduce a `--presence-threshold` CLI argument (default: 1) and update `KmerIndex::distance` to accept a `presence_threshold` parameter. This replaces hardcoded zero thresholds, enabling configurable filtering of low-abundance kmers during Jaccard distance calculations. --- src/obikindex/src/distance.rs | 7 +++---- src/obikmer/src/cmd/distance.rs | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/obikindex/src/distance.rs b/src/obikindex/src/distance.rs index a1c9cb0..867cff0 100644 --- a/src/obikindex/src/distance.rs +++ b/src/obikindex/src/distance.rs @@ -52,7 +52,7 @@ impl DistanceMetric { // ── KmerIndex::distance ─────────────────────────────────────────────────────── impl KmerIndex { - pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult { + pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool, presence_threshold: u32) -> OKIResult { let n_genomes = self.meta.genomes.len(); if n_genomes < 2 { return Err(OKIError::InvalidInput( @@ -83,8 +83,7 @@ impl KmerIndex { DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global), DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global), DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global), - // Jaccard on count data: threshold at 0 (present if count > 0) - DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0), + DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, presence_threshold), DistanceMetric::Hamming => { return Err(OKIError::InvalidInput( "Hamming is only available for presence/absence indexes".into(), @@ -93,7 +92,7 @@ impl KmerIndex { }; let shared = if shared_kmers { - let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0); + let (inter, _) = CountPartials::partial_threshold_jaccard(&global, presence_threshold); Some(inter) } else { None diff --git a/src/obikmer/src/cmd/distance.rs b/src/obikmer/src/cmd/distance.rs index d6d4599..f66d78f 100644 --- a/src/obikmer/src/cmd/distance.rs +++ b/src/obikmer/src/cmd/distance.rs @@ -46,6 +46,10 @@ pub struct DistanceArgs { #[arg(long, value_enum, default_value = "jaccard")] pub metric: MetricArg, + /// Minimum count to consider a kmer present when computing Jaccard on count indexes + #[arg(long, default_value = "1")] + pub presence_threshold: u32, + /// Also output the shared-kmer count matrix (CSV) #[arg(long)] pub shared_kmers: bool, @@ -81,7 +85,7 @@ pub fn run(args: DistanceArgs) { let need_shared = args.shared_kmers || args.nj || args.upgma; let result = idx - .distance(args.metric.into(), need_shared) + .distance(args.metric.into(), need_shared, args.presence_threshold) .unwrap_or_else(|e| { eprintln!("error computing distances: {e}"); std::process::exit(1);