feat: add pairwise distance computation and phylogenetic trees
This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using configurable metrics (Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger, including relative-frequency variants). It optionally generates phylogenetic trees (NJ or UPGMA) in Newick format and outputs results as CSV. The implementation adds a distance computation layer that dispatches to the count-based or the presence/absence backend depending on whether the index was built with counts, opens partition stores in parallel, and gracefully handles missing data. Additionally, it adds a `dump` task for exporting k-mer to genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations.
This commit is contained in:
@@ -7,6 +7,9 @@ edition = "2024"
|
||||
obikpartitionner = { path = "../obikpartitionner" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obisys = { path = "../obisys" }
|
||||
obicompactvec = { path = "../obicompactvec" }
|
||||
obilayeredmap = { path = "../obilayeredmap" }
|
||||
ndarray = "0.16"
|
||||
rayon = "1"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
use ndarray::Array2;
|
||||
use obicompactvec::traits::{BitPartials, CountPartials};
|
||||
use obilayeredmap::LayeredStore;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
|
||||
// ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Pairwise dissimilarity measure that can be requested from `KmerIndex::distance`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DistanceMetric {
    /// Jaccard distance computed from k-mer presence/absence.
    ///
    /// On a count index, `KmerIndex::distance` computes it via the
    /// threshold-based Jaccard routine with a threshold of 0.
    Jaccard,
    /// Hamming distance on presence/absence data: the number of k-mer
    /// positions at which two genomes differ.
    ///
    /// `KmerIndex::distance` rejects this metric on count indexes.
    Hamming,
    /// Bray-Curtis dissimilarity computed from raw counts.
    BrayCurtis,
    /// Bray-Curtis dissimilarity after normalising by each genome's total count.
    RelfreqBrayCurtis,
    /// Euclidean distance computed from raw counts.
    Euclidean,
    /// Euclidean distance computed from relative frequencies.
    RelfreqEuclidean,
    /// Hellinger distance computed from counts.
    Hellinger,
    /// Unnormalised variant: Euclidean distance in the Hellinger
    /// (√relative-frequency) space.
    HellingerEuclidean,
}
|
||||
|
||||
pub struct DistanceOutput {
|
||||
/// n×n pairwise distance matrix (genomes in index order).
|
||||
pub matrix: Array2<f64>,
|
||||
/// n×n shared-kmer count matrix (intersection), if requested.
|
||||
pub shared_kmers: Option<Array2<u64>>,
|
||||
}
|
||||
|
||||
impl DistanceMetric {
|
||||
pub fn requires_counts(self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
DistanceMetric::BrayCurtis
|
||||
| DistanceMetric::RelfreqBrayCurtis
|
||||
| DistanceMetric::Euclidean
|
||||
| DistanceMetric::RelfreqEuclidean
|
||||
| DistanceMetric::Hellinger
|
||||
| DistanceMetric::HellingerEuclidean
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// ── KmerIndex::distance ───────────────────────────────────────────────────────
|
||||
|
||||
impl KmerIndex {
|
||||
pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult<DistanceOutput> {
|
||||
let n_genomes = self.meta.genomes.len();
|
||||
if n_genomes < 2 {
|
||||
return Err(OKIError::InvalidInput(
|
||||
"distance requires at least 2 genomes in the index".into(),
|
||||
));
|
||||
}
|
||||
|
||||
let use_counts = self.meta.config.with_counts;
|
||||
if metric.requires_counts() && !use_counts {
|
||||
return Err(OKIError::InvalidInput(format!(
|
||||
"{metric:?} requires a count index (with_counts = true)"
|
||||
)));
|
||||
}
|
||||
|
||||
let n_parts = self.n_partitions();
|
||||
|
||||
if use_counts {
|
||||
let stores: Vec<_> = (0..n_parts)
|
||||
.into_par_iter()
|
||||
.map(|i| self.partition.count_store(i).map_err(OKIError::Partition))
|
||||
.collect::<OKIResult<_>>()?;
|
||||
let global = LayeredStore::new(stores);
|
||||
|
||||
let matrix = match metric {
|
||||
DistanceMetric::BrayCurtis => CountPartials::bray_dist_matrix(&global),
|
||||
DistanceMetric::RelfreqBrayCurtis => CountPartials::relfreq_bray_dist_matrix(&global),
|
||||
DistanceMetric::Euclidean => CountPartials::euclidean_dist_matrix(&global),
|
||||
DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global),
|
||||
DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global),
|
||||
DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global),
|
||||
// Jaccard on count data: threshold at 0 (present if count > 0)
|
||||
DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0),
|
||||
DistanceMetric::Hamming => {
|
||||
return Err(OKIError::InvalidInput(
|
||||
"Hamming is only available for presence/absence indexes".into(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let shared = if shared_kmers {
|
||||
let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0);
|
||||
Some(inter)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(DistanceOutput { matrix, shared_kmers: shared })
|
||||
} else {
|
||||
let stores: Vec<_> = (0..n_parts)
|
||||
.into_par_iter()
|
||||
.map(|i| self.partition.presence_store(i).map_err(OKIError::Partition))
|
||||
.collect::<OKIResult<_>>()?;
|
||||
let global = LayeredStore::new(stores);
|
||||
|
||||
let matrix = match metric {
|
||||
DistanceMetric::Jaccard => BitPartials::jaccard_dist_matrix(&global),
|
||||
DistanceMetric::Hamming => {
|
||||
BitPartials::hamming_dist_matrix(&global).mapv(|v| v as f64)
|
||||
}
|
||||
other => {
|
||||
return Err(OKIError::InvalidInput(format!(
|
||||
"{other:?} requires a count index; use --metric jaccard or --metric hamming"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
let shared = if shared_kmers {
|
||||
let (inter, _) = BitPartials::partial_jaccard(&global);
|
||||
Some(inter)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(DistanceOutput { matrix, shared_kmers: shared })
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,8 @@ pub enum OKIError {
|
||||
MismatchedMode,
|
||||
/// Two or more sources share the same genome label.
|
||||
DuplicateGenomeLabel(String),
|
||||
/// Operation not valid for this index configuration.
|
||||
InvalidInput(String),
|
||||
}
|
||||
|
||||
pub type OKIResult<T> = Result<T, OKIError>;
|
||||
@@ -30,6 +32,7 @@ impl fmt::Display for OKIError {
|
||||
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
|
||||
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
|
||||
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
|
||||
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
pub mod error;
|
||||
pub mod meta;
|
||||
pub mod state;
|
||||
mod distance;
|
||||
mod dump;
|
||||
mod index;
|
||||
mod merge;
|
||||
|
||||
pub use error::{OKIError, OKIResult};
|
||||
pub use distance::{DistanceMetric, DistanceOutput};
|
||||
pub use index::KmerIndex;
|
||||
pub use merge::MergeMode;
|
||||
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
|
||||
|
||||
Reference in New Issue
Block a user