feat: add pairwise distance computation and phylogenetic trees
This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using configurable metrics (Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger). It optionally generates phylogenetic trees (NJ or UPGMA) in Newick format and outputs results as CSV. The implementation adds a robust distance computation backend that dynamically routes to optimized backends based on index configuration, supports parallel iteration, and gracefully handles missing data. Additionally, it adds a `dump` task for exporting k-mer to genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations.
This commit is contained in:
Generated
+95
-5
@@ -176,6 +176,21 @@ dependencies = [
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-set"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
|
||||
dependencies = [
|
||||
"bit-vec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit-vec"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
@@ -625,6 +640,12 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtoa"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.15.0"
|
||||
@@ -1102,6 +1123,15 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kodama"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a44f3a71a44fbf49ce38152db7dc9adf959d4fe5c29344cd1858bdbda8d9091"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
@@ -1453,7 +1483,10 @@ name = "obikindex"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"indicatif",
|
||||
"ndarray",
|
||||
"obicompactvec",
|
||||
"obikpartitionner",
|
||||
"obilayeredmap",
|
||||
"obiskio",
|
||||
"obisys",
|
||||
"rayon",
|
||||
@@ -1468,6 +1501,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"indicatif",
|
||||
"kodama",
|
||||
"obifastwrite",
|
||||
"obikindex",
|
||||
"obikpartitionner",
|
||||
@@ -1480,6 +1514,7 @@ dependencies = [
|
||||
"obisys",
|
||||
"pprof",
|
||||
"rayon",
|
||||
"speedytree",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
@@ -1886,8 +1921,8 @@ dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"mem_dbg",
|
||||
"rand",
|
||||
"rand_chacha",
|
||||
"rand 0.9.4",
|
||||
"rand_chacha 0.9.0",
|
||||
"rayon",
|
||||
"rdst",
|
||||
"rustc-hash",
|
||||
@@ -1918,14 +1953,35 @@ version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha 0.3.1",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
|
||||
dependencies = [
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1935,7 +1991,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
"rand_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1973,6 +2038,12 @@ dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rb_tree"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6724c78e033e1c4155b4d3f76593d09ad384739c6986c1addc2bd2f55b1aefe"
|
||||
|
||||
[[package]]
|
||||
name = "rdst"
|
||||
version = "0.20.14"
|
||||
@@ -2227,6 +2298,25 @@ version = "1.15.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
||||
|
||||
[[package]]
|
||||
name = "speedytree"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f4522052445ce1b002c095d93e9dc545c326e9e2321c1315f1ab2a381c11666"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"bit-vec",
|
||||
"bitvec",
|
||||
"clap",
|
||||
"dtoa",
|
||||
"fixedbitset",
|
||||
"parking_lot",
|
||||
"petgraph",
|
||||
"rand 0.8.6",
|
||||
"rayon",
|
||||
"rb_tree",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.1"
|
||||
|
||||
@@ -80,6 +80,13 @@ pub trait CountPartials: ColumnWeights {
|
||||
let sq2 = std::f64::consts::SQRT_2;
|
||||
self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2)
|
||||
}
|
||||
|
||||
/// Euclidean distance in the Hellinger (√relative-frequency) space.
|
||||
/// Equal to √2 × hellinger_dist — unnormalised variant.
|
||||
fn hellinger_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
self.partial_hellinger(&global).mapv(|v| v.sqrt())
|
||||
}
|
||||
}
|
||||
|
||||
/// Partial distance matrices for bit-based data (`PersistentBitMatrix`).
|
||||
|
||||
@@ -7,6 +7,9 @@ edition = "2024"
|
||||
obikpartitionner = { path = "../obikpartitionner" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obisys = { path = "../obisys" }
|
||||
obicompactvec = { path = "../obicompactvec" }
|
||||
obilayeredmap = { path = "../obilayeredmap" }
|
||||
ndarray = "0.16"
|
||||
rayon = "1"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
use ndarray::Array2;
|
||||
use obicompactvec::traits::{BitPartials, CountPartials};
|
||||
use obilayeredmap::LayeredStore;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
|
||||
// ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum DistanceMetric {
|
||||
/// Jaccard distance on presence/absence data.
|
||||
Jaccard,
|
||||
/// Hamming distance (number of differing kmer positions) on presence/absence data.
|
||||
Hamming,
|
||||
/// Bray-Curtis dissimilarity on raw counts.
|
||||
BrayCurtis,
|
||||
/// Bray-Curtis dissimilarity normalised by per-genome total counts.
|
||||
RelfreqBrayCurtis,
|
||||
/// Euclidean distance on raw counts.
|
||||
Euclidean,
|
||||
/// Euclidean distance on relative frequencies.
|
||||
RelfreqEuclidean,
|
||||
/// Hellinger distance on counts.
|
||||
Hellinger,
|
||||
/// Euclidean distance in the Hellinger (√relative-frequency) space (unnormalised variant).
|
||||
HellingerEuclidean,
|
||||
}
|
||||
|
||||
pub struct DistanceOutput {
|
||||
/// n×n pairwise distance matrix (genomes in index order).
|
||||
pub matrix: Array2<f64>,
|
||||
/// n×n shared-kmer count matrix (intersection), if requested.
|
||||
pub shared_kmers: Option<Array2<u64>>,
|
||||
}
|
||||
|
||||
impl DistanceMetric {
|
||||
pub fn requires_counts(self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
DistanceMetric::BrayCurtis
|
||||
| DistanceMetric::RelfreqBrayCurtis
|
||||
| DistanceMetric::Euclidean
|
||||
| DistanceMetric::RelfreqEuclidean
|
||||
| DistanceMetric::Hellinger
|
||||
| DistanceMetric::HellingerEuclidean
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// ── KmerIndex::distance ───────────────────────────────────────────────────────
|
||||
|
||||
impl KmerIndex {
|
||||
pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult<DistanceOutput> {
|
||||
let n_genomes = self.meta.genomes.len();
|
||||
if n_genomes < 2 {
|
||||
return Err(OKIError::InvalidInput(
|
||||
"distance requires at least 2 genomes in the index".into(),
|
||||
));
|
||||
}
|
||||
|
||||
let use_counts = self.meta.config.with_counts;
|
||||
if metric.requires_counts() && !use_counts {
|
||||
return Err(OKIError::InvalidInput(format!(
|
||||
"{metric:?} requires a count index (with_counts = true)"
|
||||
)));
|
||||
}
|
||||
|
||||
let n_parts = self.n_partitions();
|
||||
|
||||
if use_counts {
|
||||
let stores: Vec<_> = (0..n_parts)
|
||||
.into_par_iter()
|
||||
.map(|i| self.partition.count_store(i).map_err(OKIError::Partition))
|
||||
.collect::<OKIResult<_>>()?;
|
||||
let global = LayeredStore::new(stores);
|
||||
|
||||
let matrix = match metric {
|
||||
DistanceMetric::BrayCurtis => CountPartials::bray_dist_matrix(&global),
|
||||
DistanceMetric::RelfreqBrayCurtis => CountPartials::relfreq_bray_dist_matrix(&global),
|
||||
DistanceMetric::Euclidean => CountPartials::euclidean_dist_matrix(&global),
|
||||
DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global),
|
||||
DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global),
|
||||
DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global),
|
||||
// Jaccard on count data: threshold at 0 (present if count > 0)
|
||||
DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0),
|
||||
DistanceMetric::Hamming => {
|
||||
return Err(OKIError::InvalidInput(
|
||||
"Hamming is only available for presence/absence indexes".into(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let shared = if shared_kmers {
|
||||
let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0);
|
||||
Some(inter)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(DistanceOutput { matrix, shared_kmers: shared })
|
||||
} else {
|
||||
let stores: Vec<_> = (0..n_parts)
|
||||
.into_par_iter()
|
||||
.map(|i| self.partition.presence_store(i).map_err(OKIError::Partition))
|
||||
.collect::<OKIResult<_>>()?;
|
||||
let global = LayeredStore::new(stores);
|
||||
|
||||
let matrix = match metric {
|
||||
DistanceMetric::Jaccard => BitPartials::jaccard_dist_matrix(&global),
|
||||
DistanceMetric::Hamming => {
|
||||
BitPartials::hamming_dist_matrix(&global).mapv(|v| v as f64)
|
||||
}
|
||||
other => {
|
||||
return Err(OKIError::InvalidInput(format!(
|
||||
"{other:?} requires a count index; use --metric jaccard or --metric hamming"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
let shared = if shared_kmers {
|
||||
let (inter, _) = BitPartials::partial_jaccard(&global);
|
||||
Some(inter)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(DistanceOutput { matrix, shared_kmers: shared })
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,8 @@ pub enum OKIError {
|
||||
MismatchedMode,
|
||||
/// Two or more sources share the same genome label.
|
||||
DuplicateGenomeLabel(String),
|
||||
/// Operation not valid for this index configuration.
|
||||
InvalidInput(String),
|
||||
}
|
||||
|
||||
pub type OKIResult<T> = Result<T, OKIError>;
|
||||
@@ -30,6 +32,7 @@ impl fmt::Display for OKIError {
|
||||
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
|
||||
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
|
||||
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
|
||||
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
pub mod error;
|
||||
pub mod meta;
|
||||
pub mod state;
|
||||
mod distance;
|
||||
mod dump;
|
||||
mod index;
|
||||
mod merge;
|
||||
|
||||
pub use error::{OKIError, OKIResult};
|
||||
pub use distance::{DistanceMetric, DistanceOutput};
|
||||
pub use index::KmerIndex;
|
||||
pub use merge::MergeMode;
|
||||
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
|
||||
|
||||
@@ -19,6 +19,8 @@ obisys = { path = "../obisys" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
obikindex = { path = "../obikindex" }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
kodama = "0.2"
|
||||
speedytree = "0.1"
|
||||
rayon = "1"
|
||||
indicatif = "0.17"
|
||||
tracing = "0.1.44"
|
||||
|
||||
@@ -0,0 +1,216 @@
|
||||
use std::io::{self, BufWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use kodama::{Method, linkage};
|
||||
use obikindex::{DistanceMetric, KmerIndex};
|
||||
use obikseq::{set_k, set_m};
|
||||
use speedytree::{DistanceMatrix, Hybrid, NeighborJoiningSolver, to_newick};
|
||||
use tracing::info;
|
||||
|
||||
#[derive(clap::ValueEnum, Clone, Copy, Debug)]
|
||||
pub enum MetricArg {
|
||||
Jaccard,
|
||||
Hamming,
|
||||
BrayCurtis,
|
||||
#[value(name = "relfreq-bray-curtis")]
|
||||
RelfreqBrayCurtis,
|
||||
Euclidean,
|
||||
#[value(name = "relfreq-euclidean")]
|
||||
RelfreqEuclidean,
|
||||
Hellinger,
|
||||
#[value(name = "hellinger-euclidean")]
|
||||
HellingerEuclidean,
|
||||
}
|
||||
|
||||
impl From<MetricArg> for DistanceMetric {
|
||||
fn from(m: MetricArg) -> Self {
|
||||
match m {
|
||||
MetricArg::Jaccard => DistanceMetric::Jaccard,
|
||||
MetricArg::Hamming => DistanceMetric::Hamming,
|
||||
MetricArg::BrayCurtis => DistanceMetric::BrayCurtis,
|
||||
MetricArg::RelfreqBrayCurtis => DistanceMetric::RelfreqBrayCurtis,
|
||||
MetricArg::Euclidean => DistanceMetric::Euclidean,
|
||||
MetricArg::RelfreqEuclidean => DistanceMetric::RelfreqEuclidean,
|
||||
MetricArg::Hellinger => DistanceMetric::Hellinger,
|
||||
MetricArg::HellingerEuclidean => DistanceMetric::HellingerEuclidean,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct DistanceArgs {
|
||||
/// Index directory
|
||||
pub index: PathBuf,
|
||||
|
||||
/// Distance metric to compute
|
||||
#[arg(long, value_enum, default_value = "jaccard")]
|
||||
pub metric: MetricArg,
|
||||
|
||||
/// Also output the shared-kmer count matrix (CSV)
|
||||
#[arg(long)]
|
||||
pub shared_kmers: bool,
|
||||
|
||||
/// Compute and write a Neighbor-Joining tree (Newick)
|
||||
#[arg(long)]
|
||||
pub nj: bool,
|
||||
|
||||
/// Compute and write a UPGMA tree (Newick)
|
||||
#[arg(long)]
|
||||
pub upgma: bool,
|
||||
|
||||
/// Output prefix: <prefix>_dist.csv, <prefix>_shared.csv,
|
||||
/// <prefix>_nj.nwk, <prefix>_upgma.nwk.
|
||||
/// If omitted, the distance matrix is written to stdout.
|
||||
#[arg(short, long)]
|
||||
pub output: Option<PathBuf>,
|
||||
}
|
||||
|
||||
pub fn run(args: DistanceArgs) {
|
||||
let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
|
||||
eprintln!("error opening index: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
|
||||
set_k(idx.kmer_size());
|
||||
set_m(idx.minimizer_size());
|
||||
|
||||
let genomes = idx.meta().genomes.clone();
|
||||
let n = genomes.len();
|
||||
info!(
|
||||
"computing {:?} distances for {} genome(s)",
|
||||
args.metric, n
|
||||
);
|
||||
|
||||
let need_shared = args.shared_kmers || args.nj || args.upgma;
|
||||
let result = idx
|
||||
.distance(args.metric.into(), need_shared)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error computing distances: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
|
||||
// ── Distance matrix → CSV ─────────────────────────────────────────────────
|
||||
let write_dist_csv = |w: &mut dyn Write| {
|
||||
write!(w, "genome").unwrap();
|
||||
for g in &genomes { write!(w, ",{g}").unwrap(); }
|
||||
writeln!(w).unwrap();
|
||||
for (i, g) in genomes.iter().enumerate() {
|
||||
write!(w, "{g}").unwrap();
|
||||
for j in 0..n {
|
||||
write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap();
|
||||
}
|
||||
writeln!(w).unwrap();
|
||||
}
|
||||
};
|
||||
|
||||
match &args.output {
|
||||
Some(prefix) => {
|
||||
let path = format!("{}_dist.csv", prefix.display());
|
||||
let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {path}: {e}");
|
||||
std::process::exit(1);
|
||||
}));
|
||||
write_dist_csv(&mut f);
|
||||
info!("distance matrix → {path}");
|
||||
}
|
||||
None => {
|
||||
let stdout = io::stdout();
|
||||
let mut out = BufWriter::new(stdout.lock());
|
||||
write_dist_csv(&mut out);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Shared-kmer matrix → CSV ──────────────────────────────────────────────
|
||||
if args.shared_kmers {
|
||||
if let Some(shared) = &result.shared_kmers {
|
||||
let path = args.output.as_ref()
|
||||
.map(|p| format!("{}_shared.csv", p.display()))
|
||||
.unwrap_or_else(|| "shared.csv".into());
|
||||
let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {path}: {e}");
|
||||
std::process::exit(1);
|
||||
}));
|
||||
write!(f, "genome").unwrap();
|
||||
for g in &genomes { write!(f, ",{g}").unwrap(); }
|
||||
writeln!(f).unwrap();
|
||||
for (i, g) in genomes.iter().enumerate() {
|
||||
write!(f, "{g}").unwrap();
|
||||
for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); }
|
||||
writeln!(f).unwrap();
|
||||
}
|
||||
info!("shared-kmer matrix → {path}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── NJ tree via speedytree ────────────────────────────────────────────────
|
||||
if args.nj {
|
||||
let rows: Vec<Vec<f64>> = (0..n)
|
||||
.map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect())
|
||||
.collect();
|
||||
let dm = DistanceMatrix::build(rows, genomes.clone()).unwrap_or_else(|e| {
|
||||
eprintln!("error building distance matrix for NJ: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
let tree = NeighborJoiningSolver::<Hybrid>::default(dm).solve().unwrap_or_else(|e| {
|
||||
eprintln!("error computing NJ tree: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
let newick = to_newick(&tree);
|
||||
let path = args.output.as_ref()
|
||||
.map(|p| format!("{}_nj.nwk", p.display()))
|
||||
.unwrap_or_else(|| "nj.nwk".into());
|
||||
std::fs::write(&path, &newick).unwrap_or_else(|e| {
|
||||
eprintln!("error writing {path}: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
info!("NJ tree → {path}");
|
||||
}
|
||||
|
||||
// ── UPGMA tree via kodama ─────────────────────────────────────────────────
|
||||
if args.upgma {
|
||||
let mut condensed: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
condensed.push(result.matrix[[i, j]]);
|
||||
}
|
||||
}
|
||||
let dendro = linkage(&mut condensed, n, Method::Average);
|
||||
let newick = upgma_to_newick(&dendro, &genomes);
|
||||
let path = args.output.as_ref()
|
||||
.map(|p| format!("{}_upgma.nwk", p.display()))
|
||||
.unwrap_or_else(|| "upgma.nwk".into());
|
||||
std::fs::write(&path, &newick).unwrap_or_else(|e| {
|
||||
eprintln!("error writing {path}: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
info!("UPGMA tree → {path}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── UPGMA Newick from kodama dendrogram ───────────────────────────────────────
|
||||
|
||||
fn upgma_to_newick(dendro: &kodama::Dendrogram<f64>, names: &[String]) -> String {
|
||||
let n = names.len();
|
||||
// node_labels[i]: Newick subtree string for node i (leaves 0..n, internals n..)
|
||||
let mut labels: Vec<String> = names.to_vec();
|
||||
// height of each node: leaves = 0, internal = dissimilarity/2
|
||||
let mut heights: Vec<f64> = vec![0.0; 2 * n - 1];
|
||||
|
||||
for (k, step) in dendro.steps().iter().enumerate() {
|
||||
let new_node = n + k;
|
||||
let h = step.dissimilarity / 2.0;
|
||||
heights[new_node] = h;
|
||||
let c1 = step.cluster1;
|
||||
let c2 = step.cluster2;
|
||||
let bl1 = (h - heights[c1]).max(0.0);
|
||||
let bl2 = (h - heights[c2]).max(0.0);
|
||||
labels.push(format!(
|
||||
"({label1}:{bl1:.6},{label2}:{bl2:.6})",
|
||||
label1 = labels[c1],
|
||||
label2 = labels[c2],
|
||||
));
|
||||
}
|
||||
|
||||
format!("{};", labels.last().unwrap())
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
pub mod distance;
|
||||
pub mod dump;
|
||||
pub mod index;
|
||||
pub mod merge;
|
||||
|
||||
@@ -22,6 +22,8 @@ enum Commands {
|
||||
Merge(cmd::merge::MergeArgs),
|
||||
/// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
|
||||
Dump(cmd::dump::DumpArgs),
|
||||
/// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
|
||||
Distance(cmd::distance::DistanceArgs),
|
||||
/// Dump unitigs from a built index to stdout (debug)
|
||||
Unitig(cmd::unitig::UnitigArgs),
|
||||
}
|
||||
@@ -49,6 +51,7 @@ fn main() {
|
||||
Commands::Index(args) => cmd::index::run(args),
|
||||
Commands::Merge(args) => cmd::merge::run(args),
|
||||
Commands::Dump(args) => cmd::dump::run(args),
|
||||
Commands::Distance(args) => cmd::distance::run(args),
|
||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||
use obilayeredmap::LayeredStore;
|
||||
use obiskio::{SKError, SKResult};
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
const INDEX_SUBDIR: &str = "index";
|
||||
|
||||
fn probe_n_layers(index_dir: &std::path::Path) -> usize {
|
||||
let mut n = 0;
|
||||
while index_dir.join(format!("layer_{n}")).exists() { n += 1; }
|
||||
n
|
||||
}
|
||||
|
||||
impl KmerPartition {
|
||||
/// Open all count matrices for partition `part`, one per layer.
|
||||
/// Layers without a `counts/` directory are skipped.
|
||||
pub fn count_store(&self, part: usize) -> SKResult<LayeredStore<PersistentCompactIntMatrix>> {
|
||||
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||
if !index_dir.exists() {
|
||||
return Ok(LayeredStore::new(vec![]));
|
||||
}
|
||||
let matrices = (0..probe_n_layers(&index_dir))
|
||||
.filter_map(|l| {
|
||||
let dir = index_dir.join(format!("layer_{l}")).join("counts");
|
||||
dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io))
|
||||
})
|
||||
.collect::<SKResult<Vec<_>>>()?;
|
||||
Ok(LayeredStore::new(matrices))
|
||||
}
|
||||
|
||||
/// Open all presence matrices for partition `part`, one per layer.
|
||||
/// Layers without a `presence/` directory are skipped.
|
||||
pub fn presence_store(&self, part: usize) -> SKResult<LayeredStore<PersistentBitMatrix>> {
|
||||
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||
if !index_dir.exists() {
|
||||
return Ok(LayeredStore::new(vec![]));
|
||||
}
|
||||
let matrices = (0..probe_n_layers(&index_dir))
|
||||
.filter_map(|l| {
|
||||
let dir = index_dir.join(format!("layer_{l}")).join("presence");
|
||||
dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io))
|
||||
})
|
||||
.collect::<SKResult<Vec<_>>>()?;
|
||||
Ok(LayeredStore::new(matrices))
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
mod distance;
|
||||
mod dump_layer;
|
||||
mod index_layer;
|
||||
mod kmer_sort;
|
||||
|
||||
Reference in New Issue
Block a user