feat: add pairwise distance computation and phylogenetic trees
This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using a configurable metric (Jaccard, Hamming, Bray-Curtis, Euclidean, or Hellinger). It can optionally generate a phylogenetic tree (neighbor-joining (NJ) or UPGMA) in Newick format, and it outputs results as CSV. The implementation adds a robust distance computation layer that dynamically dispatches to an optimized backend based on the index configuration, supports parallel iteration, and gracefully handles missing data. Additionally, the commit adds a `dump` task for exporting k-mer-to-genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations.
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||
use obilayeredmap::LayeredStore;
|
||||
use obiskio::{SKError, SKResult};
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
/// Name of the subdirectory, inside a partition directory, under which the
/// numbered `layer_<i>` directories are looked up.
const INDEX_SUBDIR: &str = "index";
|
||||
|
||||
/// Counts the consecutively numbered `layer_<i>` entries found under `index_dir`.
///
/// Probing starts at `layer_0` and stops at the first index for which no entry
/// exists, so entries located after a gap in the numbering are not counted.
fn probe_n_layers(index_dir: &std::path::Path) -> usize {
    (0usize..)
        .take_while(|layer| index_dir.join(format!("layer_{layer}")).exists())
        .count()
}
|
||||
|
||||
impl KmerPartition {
|
||||
/// Open all count matrices for partition `part`, one per layer.
|
||||
/// Layers without a `counts/` directory are skipped.
|
||||
pub fn count_store(&self, part: usize) -> SKResult<LayeredStore<PersistentCompactIntMatrix>> {
|
||||
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||
if !index_dir.exists() {
|
||||
return Ok(LayeredStore::new(vec![]));
|
||||
}
|
||||
let matrices = (0..probe_n_layers(&index_dir))
|
||||
.filter_map(|l| {
|
||||
let dir = index_dir.join(format!("layer_{l}")).join("counts");
|
||||
dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io))
|
||||
})
|
||||
.collect::<SKResult<Vec<_>>>()?;
|
||||
Ok(LayeredStore::new(matrices))
|
||||
}
|
||||
|
||||
/// Open all presence matrices for partition `part`, one per layer.
|
||||
/// Layers without a `presence/` directory are skipped.
|
||||
pub fn presence_store(&self, part: usize) -> SKResult<LayeredStore<PersistentBitMatrix>> {
|
||||
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||
if !index_dir.exists() {
|
||||
return Ok(LayeredStore::new(vec![]));
|
||||
}
|
||||
let matrices = (0..probe_n_layers(&index_dir))
|
||||
.filter_map(|l| {
|
||||
let dir = index_dir.join(format!("layer_{l}")).join("presence");
|
||||
dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io))
|
||||
})
|
||||
.collect::<SKResult<Vec<_>>>()?;
|
||||
Ok(LayeredStore::new(matrices))
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
mod distance;
|
||||
mod dump_layer;
|
||||
mod index_layer;
|
||||
mod kmer_sort;
|
||||
|
||||
Reference in New Issue
Block a user