feat: add configurable presence threshold to kmer distance
Introduce a `--presence-threshold` CLI argument (default: 1) and update `KmerIndex::distance` to accept a `presence_threshold` parameter. This replaces hardcoded zero thresholds, enabling configurable filtering of low-abundance kmers during Jaccard distance calculations.
This commit is contained in:
@@ -52,7 +52,7 @@ impl DistanceMetric {
|
|||||||
// ── KmerIndex::distance ───────────────────────────────────────────────────────
|
// ── KmerIndex::distance ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
impl KmerIndex {
|
impl KmerIndex {
|
||||||
pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult<DistanceOutput> {
|
pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool, presence_threshold: u32) -> OKIResult<DistanceOutput> {
|
||||||
let n_genomes = self.meta.genomes.len();
|
let n_genomes = self.meta.genomes.len();
|
||||||
if n_genomes < 2 {
|
if n_genomes < 2 {
|
||||||
return Err(OKIError::InvalidInput(
|
return Err(OKIError::InvalidInput(
|
||||||
@@ -83,8 +83,7 @@ impl KmerIndex {
|
|||||||
DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global),
|
DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global),
|
||||||
DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global),
|
DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global),
|
||||||
DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global),
|
DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global),
|
||||||
// Jaccard on count data: threshold at 0 (present if count > 0)
|
DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, presence_threshold),
|
||||||
DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0),
|
|
||||||
DistanceMetric::Hamming => {
|
DistanceMetric::Hamming => {
|
||||||
return Err(OKIError::InvalidInput(
|
return Err(OKIError::InvalidInput(
|
||||||
"Hamming is only available for presence/absence indexes".into(),
|
"Hamming is only available for presence/absence indexes".into(),
|
||||||
@@ -93,7 +92,7 @@ impl KmerIndex {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let shared = if shared_kmers {
|
let shared = if shared_kmers {
|
||||||
let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0);
|
let (inter, _) = CountPartials::partial_threshold_jaccard(&global, presence_threshold);
|
||||||
Some(inter)
|
Some(inter)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
|
|||||||
@@ -46,6 +46,10 @@ pub struct DistanceArgs {
|
|||||||
#[arg(long, value_enum, default_value = "jaccard")]
|
#[arg(long, value_enum, default_value = "jaccard")]
|
||||||
pub metric: MetricArg,
|
pub metric: MetricArg,
|
||||||
|
|
||||||
|
/// Minimum count to consider a kmer present when computing Jaccard on count indexes
|
||||||
|
#[arg(long, default_value = "1")]
|
||||||
|
pub presence_threshold: u32,
|
||||||
|
|
||||||
/// Also output the shared-kmer count matrix (CSV)
|
/// Also output the shared-kmer count matrix (CSV)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub shared_kmers: bool,
|
pub shared_kmers: bool,
|
||||||
@@ -81,7 +85,7 @@ pub fn run(args: DistanceArgs) {
|
|||||||
|
|
||||||
let need_shared = args.shared_kmers || args.nj || args.upgma;
|
let need_shared = args.shared_kmers || args.nj || args.upgma;
|
||||||
let result = idx
|
let result = idx
|
||||||
.distance(args.metric.into(), need_shared)
|
.distance(args.metric.into(), need_shared, args.presence_threshold)
|
||||||
.unwrap_or_else(|e| {
|
.unwrap_or_else(|e| {
|
||||||
eprintln!("error computing distances: {e}");
|
eprintln!("error computing distances: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
|
|||||||
Reference in New Issue
Block a user