feat: add pairwise distance computation and phylogenetic trees

This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using configurable metrics (Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger). It optionally generates phylogenetic trees (NJ or UPGMA) in Newick format and outputs results as CSV. The implementation adds a robust distance computation layer that dynamically dispatches to optimized backends based on the index configuration, supports parallel iteration, and gracefully handles missing data. Additionally, the commit adds a `dump` task for exporting k-mer-to-genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations.
This commit is contained in:
Eric Coissac
2026-05-21 11:47:35 +02:00
parent 9e1d6f2f25
commit 3fa1dbf8cc
13 changed files with 512 additions and 7 deletions
+47
View File
@@ -0,0 +1,47 @@
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
use obilayeredmap::LayeredStore;
use obiskio::{SKError, SKResult};
use crate::partition::KmerPartition;
/// Name of the sub-directory that is joined onto a partition directory
/// (`part_dir(part)`) to locate that partition's `layer_<n>` directories.
const INDEX_SUBDIR: &str = "index";
/// Counts the contiguous `layer_<n>` entries present in `index_dir`.
///
/// Probing starts at `layer_0` and stops at the first index whose entry does
/// not exist, so any entry located after a gap is not counted. Returns `0`
/// when `layer_0` is absent (including when `index_dir` itself is missing).
fn probe_n_layers(index_dir: &std::path::Path) -> usize {
    (0usize..)
        .take_while(|layer| index_dir.join(format!("layer_{layer}")).exists())
        .count()
}
impl KmerPartition {
    /// Builds a layered store holding the count matrices of partition `part`.
    ///
    /// Layers are probed as `layer_<n>` directories under the partition's
    /// `INDEX_SUBDIR` directory. A layer that has no `counts/` directory
    /// contributes nothing to the store. If the index directory does not
    /// exist, the returned store is built from an empty list.
    ///
    /// # Errors
    ///
    /// Returns the first matrix-open failure, wrapped in `SKError::Io`.
    pub fn count_store(&self, part: usize) -> SKResult<LayeredStore<PersistentCompactIntMatrix>> {
        let index_root = self.part_dir(part).join(INDEX_SUBDIR);
        let mut opened = Vec::new();
        if index_root.exists() {
            for layer in 0..probe_n_layers(&index_root) {
                let counts_dir = index_root.join(format!("layer_{layer}")).join("counts");
                if !counts_dir.exists() {
                    continue;
                }
                // `?` stops at the first layer whose matrix cannot be opened.
                let matrix = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
                opened.push(matrix);
            }
        }
        Ok(LayeredStore::new(opened))
    }

    /// Builds a layered store holding the presence matrices of partition `part`.
    ///
    /// Layers are probed as `layer_<n>` directories under the partition's
    /// `INDEX_SUBDIR` directory. A layer that has no `presence/` directory
    /// contributes nothing to the store. If the index directory does not
    /// exist, the returned store is built from an empty list.
    ///
    /// # Errors
    ///
    /// Returns the first matrix-open failure, wrapped in `SKError::Io`.
    pub fn presence_store(&self, part: usize) -> SKResult<LayeredStore<PersistentBitMatrix>> {
        let index_root = self.part_dir(part).join(INDEX_SUBDIR);
        let mut opened = Vec::new();
        if index_root.exists() {
            for layer in 0..probe_n_layers(&index_root) {
                let presence_dir = index_root.join(format!("layer_{layer}")).join("presence");
                if !presence_dir.exists() {
                    continue;
                }
                // `?` stops at the first layer whose matrix cannot be opened.
                let matrix = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
                opened.push(matrix);
            }
        }
        Ok(LayeredStore::new(opened))
    }
}
+1
View File
@@ -1,3 +1,4 @@
mod distance;
mod dump_layer;
mod index_layer;
mod kmer_sort;