feat: add pairwise distance computation and phylogenetic trees

This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using configurable metrics (Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger). It optionally builds a phylogenetic tree (NJ or UPGMA) in Newick format and writes the results as CSV. The distance computation backend selects the count-based or the presence/absence implementation according to the index configuration, loads partitions in parallel, and gracefully handles missing data. The commit also adds a `dump` task that exports k-mer-to-genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations.
This commit is contained in:
Eric Coissac
2026-05-21 11:47:35 +02:00
parent 9e1d6f2f25
commit 3fa1dbf8cc
13 changed files with 512 additions and 7 deletions
+3
View File
@@ -7,6 +7,9 @@ edition = "2024"
obikpartitionner = { path = "../obikpartitionner" }
obiskio = { path = "../obiskio" }
obisys = { path = "../obisys" }
obicompactvec = { path = "../obicompactvec" }
obilayeredmap = { path = "../obilayeredmap" }
ndarray = "0.16"
rayon = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
+132
View File
@@ -0,0 +1,132 @@
use ndarray::Array2;
use obicompactvec::traits::{BitPartials, CountPartials};
use obilayeredmap::LayeredStore;
use rayon::prelude::*;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
// ── Public API ────────────────────────────────────────────────────────────────
/// Distance metric selectable in `KmerIndex::distance`.
///
/// `Jaccard` and `Hamming` are defined on presence/absence data; every other
/// variant needs k-mer counts (see `DistanceMetric::requires_counts`).
/// `KmerIndex::distance` also accepts `Jaccard` on a count index, but rejects
/// `Hamming` there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DistanceMetric {
/// Jaccard distance on presence/absence data.
Jaccard,
/// Hamming distance (number of differing k-mer positions) on presence/absence data.
Hamming,
/// Bray-Curtis dissimilarity on raw counts.
BrayCurtis,
/// Bray-Curtis dissimilarity normalised by per-genome total counts.
RelfreqBrayCurtis,
/// Euclidean distance on raw counts.
Euclidean,
/// Euclidean distance on relative frequencies.
RelfreqEuclidean,
/// Hellinger distance on counts.
Hellinger,
/// Euclidean distance in the Hellinger (√relative-frequency) space (unnormalised variant).
HellingerEuclidean,
}
/// Result of `KmerIndex::distance`.
///
/// Derives `Debug` and `Clone` so that callers can log the result or keep a
/// copy of it without reaching into the individual fields.
#[derive(Debug, Clone)]
pub struct DistanceOutput {
    /// n×n pairwise distance matrix (genomes in index order).
    pub matrix: Array2<f64>,
    /// n×n shared-kmer count matrix (intersection); `Some` only when it was
    /// requested through the `shared_kmers` argument of `KmerIndex::distance`.
    pub shared_kmers: Option<Array2<u64>>,
}
impl DistanceMetric {
pub fn requires_counts(self) -> bool {
matches!(
self,
DistanceMetric::BrayCurtis
| DistanceMetric::RelfreqBrayCurtis
| DistanceMetric::Euclidean
| DistanceMetric::RelfreqEuclidean
| DistanceMetric::Hellinger
| DistanceMetric::HellingerEuclidean
)
}
}
// ── KmerIndex::distance ───────────────────────────────────────────────────────
impl KmerIndex {
    /// Computes the pairwise distance matrix between the genomes of this index.
    ///
    /// The per-partition stores are collected in parallel and combined into a
    /// single [`LayeredStore`], from which the matrix for `metric` is computed.
    /// A count index (`with_counts = true`) uses the [`CountPartials`] routines,
    /// a presence/absence index uses the [`BitPartials`] routines. When
    /// `shared_kmers` is `true`, the intersection matrix returned by the partial
    /// Jaccard computation is attached to the output as well.
    ///
    /// All metric/index compatibility checks run before any partition store is
    /// requested, so an invalid combination fails without touching the stores.
    ///
    /// # Errors
    ///
    /// Returns [`OKIError::InvalidInput`] when the index holds fewer than two
    /// genomes, when `metric` requires counts but the index has none, or when
    /// [`DistanceMetric::Hamming`] is requested on a count index. Returns
    /// [`OKIError::Partition`] when a partition store cannot be obtained.
    pub fn distance(
        &self,
        metric: DistanceMetric,
        shared_kmers: bool,
    ) -> OKIResult<DistanceOutput> {
        if self.meta.genomes.len() < 2 {
            return Err(OKIError::InvalidInput(
                "distance requires at least 2 genomes in the index".into(),
            ));
        }

        let use_counts = self.meta.config.with_counts;
        if metric.requires_counts() && !use_counts {
            return Err(OKIError::InvalidInput(format!(
                "{metric:?} requires a count index (with_counts = true)"
            )));
        }
        // Hamming is the only metric refused on a count index. Reject it here,
        // before the (potentially expensive) parallel store loading below.
        if use_counts && matches!(metric, DistanceMetric::Hamming) {
            return Err(OKIError::InvalidInput(
                "Hamming is only available for presence/absence indexes".into(),
            ));
        }

        let n_parts = self.n_partitions();
        if use_counts {
            let stores: Vec<_> = (0..n_parts)
                .into_par_iter()
                .map(|i| self.partition.count_store(i).map_err(OKIError::Partition))
                .collect::<OKIResult<_>>()?;
            let global = LayeredStore::new(stores);

            let matrix = match metric {
                DistanceMetric::BrayCurtis => CountPartials::bray_dist_matrix(&global),
                DistanceMetric::RelfreqBrayCurtis => {
                    CountPartials::relfreq_bray_dist_matrix(&global)
                }
                DistanceMetric::Euclidean => CountPartials::euclidean_dist_matrix(&global),
                DistanceMetric::RelfreqEuclidean => {
                    CountPartials::relfreq_euclidean_dist_matrix(&global)
                }
                DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global),
                DistanceMetric::HellingerEuclidean => {
                    CountPartials::hellinger_euclidean_dist_matrix(&global)
                }
                // Jaccard on count data: threshold at 0 (present if count > 0)
                DistanceMetric::Jaccard => {
                    CountPartials::threshold_jaccard_dist_matrix(&global, 0)
                }
                DistanceMetric::Hamming => {
                    unreachable!("Hamming on a count index is rejected before loading stores")
                }
            };

            // Same threshold as the Jaccard arm above; only the intersection
            // half of the returned pair is kept.
            let shared =
                shared_kmers.then(|| CountPartials::partial_threshold_jaccard(&global, 0).0);

            Ok(DistanceOutput { matrix, shared_kmers: shared })
        } else {
            let stores: Vec<_> = (0..n_parts)
                .into_par_iter()
                .map(|i| self.partition.presence_store(i).map_err(OKIError::Partition))
                .collect::<OKIResult<_>>()?;
            let global = LayeredStore::new(stores);

            let matrix = match metric {
                DistanceMetric::Jaccard => BitPartials::jaccard_dist_matrix(&global),
                DistanceMetric::Hamming => {
                    // The Hamming matrix is converted element-wise to `f64` to
                    // match the type of `DistanceOutput::matrix`.
                    BitPartials::hamming_dist_matrix(&global).mapv(|v| v as f64)
                }
                // Count-only metrics are already refused by the `requires_counts`
                // guard above. The variants are listed explicitly (no catch-all)
                // so that a newly added metric must be classified at compile
                // time; the error is kept as a defensive fallback.
                other @ (DistanceMetric::BrayCurtis
                | DistanceMetric::RelfreqBrayCurtis
                | DistanceMetric::Euclidean
                | DistanceMetric::RelfreqEuclidean
                | DistanceMetric::Hellinger
                | DistanceMetric::HellingerEuclidean) => {
                    return Err(OKIError::InvalidInput(format!(
                        "{other:?} requires a count index; use --metric jaccard or --metric hamming"
                    )));
                }
            };

            // Only the intersection half of the returned pair is kept.
            let shared = shared_kmers.then(|| BitPartials::partial_jaccard(&global).0);

            Ok(DistanceOutput { matrix, shared_kmers: shared })
        }
    }
}
+3
View File
@@ -16,6 +16,8 @@ pub enum OKIError {
MismatchedMode,
/// Two or more sources share the same genome label.
DuplicateGenomeLabel(String),
/// Operation not valid for this index configuration.
InvalidInput(String),
}
pub type OKIResult<T> = Result<T, OKIError>;
@@ -30,6 +32,7 @@ impl fmt::Display for OKIError {
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
}
}
}
+2
View File
@@ -1,11 +1,13 @@
pub mod error;
pub mod meta;
pub mod state;
mod distance;
mod dump;
mod index;
mod merge;
pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput};
pub use index::KmerIndex;
pub use merge::MergeMode;
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};