feat: add pairwise distance computation and phylogenetic trees

This commit introduces a new `distance` CLI subcommand that computes pairwise genomic distance matrices using configurable metrics (Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger). It optionally generates phylogenetic trees (NJ or UPGMA) in Newick format and outputs results as CSV. The implementation adds a robust distance computation backend that dynamically routes to optimized backends based on index configuration, supports parallel iteration, and gracefully handles missing data. Additionally, it adds a `dump` task for exporting k-mer to genome mappings as CSV, introduces an `InvalidInput` error variant, updates dependencies to support numerical operations and tree construction, and performs minor module reorganizations.
This commit is contained in:
Eric Coissac
2026-05-21 11:47:35 +02:00
parent 9e1d6f2f25
commit 3fa1dbf8cc
13 changed files with 512 additions and 7 deletions
-2
View File
@@ -23,6 +23,4 @@
- les matrices sont sauvegardées en CSV - les matrices sont sauvegardées en CSV
- les arbres NJ sont sauvegardés en Newick avec les longeurs de branche - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche
- dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence.
- status : affiche le statut de l'index - status : affiche le statut de l'index
+95 -5
View File
@@ -176,6 +176,21 @@ dependencies = [
"thiserror 1.0.69", "thiserror 1.0.69",
] ]
[[package]]
name = "bit-set"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
dependencies = [
"bit-vec",
]
[[package]]
name = "bit-vec"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "1.3.2" version = "1.3.2"
@@ -625,6 +640,12 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "dtoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
[[package]] [[package]]
name = "either" name = "either"
version = "1.15.0" version = "1.15.0"
@@ -1102,6 +1123,15 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "kodama"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a44f3a71a44fbf49ce38152db7dc9adf959d4fe5c29344cd1858bdbda8d9091"
dependencies = [
"num-traits",
]
[[package]] [[package]]
name = "lazy_static" name = "lazy_static"
version = "1.5.0" version = "1.5.0"
@@ -1453,7 +1483,10 @@ name = "obikindex"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"indicatif", "indicatif",
"ndarray",
"obicompactvec",
"obikpartitionner", "obikpartitionner",
"obilayeredmap",
"obiskio", "obiskio",
"obisys", "obisys",
"rayon", "rayon",
@@ -1468,6 +1501,7 @@ version = "0.1.0"
dependencies = [ dependencies = [
"clap", "clap",
"indicatif", "indicatif",
"kodama",
"obifastwrite", "obifastwrite",
"obikindex", "obikindex",
"obikpartitionner", "obikpartitionner",
@@ -1480,6 +1514,7 @@ dependencies = [
"obisys", "obisys",
"pprof", "pprof",
"rayon", "rayon",
"speedytree",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
] ]
@@ -1886,8 +1921,8 @@ dependencies = [
"lazy_static", "lazy_static",
"log", "log",
"mem_dbg", "mem_dbg",
"rand", "rand 0.9.4",
"rand_chacha", "rand_chacha 0.9.0",
"rayon", "rayon",
"rdst", "rdst",
"rustc-hash", "rustc-hash",
@@ -1918,14 +1953,35 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
[[package]]
name = "rand"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
dependencies = [
"libc",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
]
[[package]] [[package]]
name = "rand" name = "rand"
version = "0.9.4" version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
dependencies = [ dependencies = [
"rand_chacha", "rand_chacha 0.9.0",
"rand_core", "rand_core 0.9.5",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core 0.6.4",
] ]
[[package]] [[package]]
@@ -1935,7 +1991,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [ dependencies = [
"ppv-lite86", "ppv-lite86",
"rand_core", "rand_core 0.9.5",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom 0.2.17",
] ]
[[package]] [[package]]
@@ -1973,6 +2038,12 @@ dependencies = [
"crossbeam-utils", "crossbeam-utils",
] ]
[[package]]
name = "rb_tree"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6724c78e033e1c4155b4d3f76593d09ad384739c6986c1addc2bd2f55b1aefe"
[[package]] [[package]]
name = "rdst" name = "rdst"
version = "0.20.14" version = "0.20.14"
@@ -2227,6 +2298,25 @@ version = "1.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
[[package]]
name = "speedytree"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f4522052445ce1b002c095d93e9dc545c326e9e2321c1315f1ab2a381c11666"
dependencies = [
"bit-set",
"bit-vec",
"bitvec",
"clap",
"dtoa",
"fixedbitset",
"parking_lot",
"petgraph",
"rand 0.8.6",
"rayon",
"rb_tree",
]
[[package]] [[package]]
name = "stable_deref_trait" name = "stable_deref_trait"
version = "1.2.1" version = "1.2.1"
+7
View File
@@ -80,6 +80,13 @@ pub trait CountPartials: ColumnWeights {
let sq2 = std::f64::consts::SQRT_2; let sq2 = std::f64::consts::SQRT_2;
self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2) self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2)
} }
/// Euclidean distance in the Hellinger (√relative-frequency) space.
/// Equal to √2 × hellinger_dist — unnormalised variant.
fn hellinger_euclidean_dist_matrix(&self) -> Array2<f64> {
let global = self.col_weights();
self.partial_hellinger(&global).mapv(|v| v.sqrt())
}
} }
/// Partial distance matrices for bit-based data (`PersistentBitMatrix`). /// Partial distance matrices for bit-based data (`PersistentBitMatrix`).
+3
View File
@@ -7,6 +7,9 @@ edition = "2024"
obikpartitionner = { path = "../obikpartitionner" } obikpartitionner = { path = "../obikpartitionner" }
obiskio = { path = "../obiskio" } obiskio = { path = "../obiskio" }
obisys = { path = "../obisys" } obisys = { path = "../obisys" }
obicompactvec = { path = "../obicompactvec" }
obilayeredmap = { path = "../obilayeredmap" }
ndarray = "0.16"
rayon = "1" rayon = "1"
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }
serde_json = "1" serde_json = "1"
+132
View File
@@ -0,0 +1,132 @@
use ndarray::Array2;
use obicompactvec::traits::{BitPartials, CountPartials};
use obilayeredmap::LayeredStore;
use rayon::prelude::*;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
// ── Public API ────────────────────────────────────────────────────────────────
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DistanceMetric {
/// Jaccard distance on presence/absence data.
Jaccard,
/// Hamming distance (number of differing kmer positions) on presence/absence data.
Hamming,
/// Bray-Curtis dissimilarity on raw counts.
BrayCurtis,
/// Bray-Curtis dissimilarity normalised by per-genome total counts.
RelfreqBrayCurtis,
/// Euclidean distance on raw counts.
Euclidean,
/// Euclidean distance on relative frequencies.
RelfreqEuclidean,
/// Hellinger distance on counts.
Hellinger,
/// Euclidean distance in the Hellinger (√relative-frequency) space (unnormalised variant).
HellingerEuclidean,
}
pub struct DistanceOutput {
/// n×n pairwise distance matrix (genomes in index order).
pub matrix: Array2<f64>,
/// n×n shared-kmer count matrix (intersection), if requested.
pub shared_kmers: Option<Array2<u64>>,
}
impl DistanceMetric {
pub fn requires_counts(self) -> bool {
matches!(
self,
DistanceMetric::BrayCurtis
| DistanceMetric::RelfreqBrayCurtis
| DistanceMetric::Euclidean
| DistanceMetric::RelfreqEuclidean
| DistanceMetric::Hellinger
| DistanceMetric::HellingerEuclidean
)
}
}
// ── KmerIndex::distance ───────────────────────────────────────────────────────
impl KmerIndex {
pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult<DistanceOutput> {
let n_genomes = self.meta.genomes.len();
if n_genomes < 2 {
return Err(OKIError::InvalidInput(
"distance requires at least 2 genomes in the index".into(),
));
}
let use_counts = self.meta.config.with_counts;
if metric.requires_counts() && !use_counts {
return Err(OKIError::InvalidInput(format!(
"{metric:?} requires a count index (with_counts = true)"
)));
}
let n_parts = self.n_partitions();
if use_counts {
let stores: Vec<_> = (0..n_parts)
.into_par_iter()
.map(|i| self.partition.count_store(i).map_err(OKIError::Partition))
.collect::<OKIResult<_>>()?;
let global = LayeredStore::new(stores);
let matrix = match metric {
DistanceMetric::BrayCurtis => CountPartials::bray_dist_matrix(&global),
DistanceMetric::RelfreqBrayCurtis => CountPartials::relfreq_bray_dist_matrix(&global),
DistanceMetric::Euclidean => CountPartials::euclidean_dist_matrix(&global),
DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global),
DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global),
DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global),
// Jaccard on count data: threshold at 0 (present if count > 0)
DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0),
DistanceMetric::Hamming => {
return Err(OKIError::InvalidInput(
"Hamming is only available for presence/absence indexes".into(),
));
}
};
let shared = if shared_kmers {
let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0);
Some(inter)
} else {
None
};
Ok(DistanceOutput { matrix, shared_kmers: shared })
} else {
let stores: Vec<_> = (0..n_parts)
.into_par_iter()
.map(|i| self.partition.presence_store(i).map_err(OKIError::Partition))
.collect::<OKIResult<_>>()?;
let global = LayeredStore::new(stores);
let matrix = match metric {
DistanceMetric::Jaccard => BitPartials::jaccard_dist_matrix(&global),
DistanceMetric::Hamming => {
BitPartials::hamming_dist_matrix(&global).mapv(|v| v as f64)
}
other => {
return Err(OKIError::InvalidInput(format!(
"{other:?} requires a count index; use --metric jaccard or --metric hamming"
)));
}
};
let shared = if shared_kmers {
let (inter, _) = BitPartials::partial_jaccard(&global);
Some(inter)
} else {
None
};
Ok(DistanceOutput { matrix, shared_kmers: shared })
}
}
}
+3
View File
@@ -16,6 +16,8 @@ pub enum OKIError {
MismatchedMode, MismatchedMode,
/// Two or more sources share the same genome label. /// Two or more sources share the same genome label.
DuplicateGenomeLabel(String), DuplicateGenomeLabel(String),
/// Operation not valid for this index configuration.
InvalidInput(String),
} }
pub type OKIResult<T> = Result<T, OKIError>; pub type OKIResult<T> = Result<T, OKIError>;
@@ -30,6 +32,7 @@ impl fmt::Display for OKIError {
OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"), OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"),
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"), OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"), OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
} }
} }
} }
+2
View File
@@ -1,11 +1,13 @@
pub mod error; pub mod error;
pub mod meta; pub mod meta;
pub mod state; pub mod state;
mod distance;
mod dump; mod dump;
mod index; mod index;
mod merge; mod merge;
pub use error::{OKIError, OKIResult}; pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput};
pub use index::KmerIndex; pub use index::KmerIndex;
pub use merge::MergeMode; pub use merge::MergeMode;
pub use meta::{IndexConfig, IndexMeta, META_FILENAME}; pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
+2
View File
@@ -19,6 +19,8 @@ obisys = { path = "../obisys" }
obiskio = { path = "../obiskio" } obiskio = { path = "../obiskio" }
obikindex = { path = "../obikindex" } obikindex = { path = "../obikindex" }
clap = { version = "4", features = ["derive"] } clap = { version = "4", features = ["derive"] }
kodama = "0.2"
speedytree = "0.1"
rayon = "1" rayon = "1"
indicatif = "0.17" indicatif = "0.17"
tracing = "0.1.44" tracing = "0.1.44"
+216
View File
@@ -0,0 +1,216 @@
use std::io::{self, BufWriter, Write};
use std::path::PathBuf;
use clap::Args;
use kodama::{Method, linkage};
use obikindex::{DistanceMetric, KmerIndex};
use obikseq::{set_k, set_m};
use speedytree::{DistanceMatrix, Hybrid, NeighborJoiningSolver, to_newick};
use tracing::info;
#[derive(clap::ValueEnum, Clone, Copy, Debug)]
pub enum MetricArg {
Jaccard,
Hamming,
BrayCurtis,
#[value(name = "relfreq-bray-curtis")]
RelfreqBrayCurtis,
Euclidean,
#[value(name = "relfreq-euclidean")]
RelfreqEuclidean,
Hellinger,
#[value(name = "hellinger-euclidean")]
HellingerEuclidean,
}
impl From<MetricArg> for DistanceMetric {
fn from(m: MetricArg) -> Self {
match m {
MetricArg::Jaccard => DistanceMetric::Jaccard,
MetricArg::Hamming => DistanceMetric::Hamming,
MetricArg::BrayCurtis => DistanceMetric::BrayCurtis,
MetricArg::RelfreqBrayCurtis => DistanceMetric::RelfreqBrayCurtis,
MetricArg::Euclidean => DistanceMetric::Euclidean,
MetricArg::RelfreqEuclidean => DistanceMetric::RelfreqEuclidean,
MetricArg::Hellinger => DistanceMetric::Hellinger,
MetricArg::HellingerEuclidean => DistanceMetric::HellingerEuclidean,
}
}
}
#[derive(Args)]
pub struct DistanceArgs {
/// Index directory
pub index: PathBuf,
/// Distance metric to compute
#[arg(long, value_enum, default_value = "jaccard")]
pub metric: MetricArg,
/// Also output the shared-kmer count matrix (CSV)
#[arg(long)]
pub shared_kmers: bool,
/// Compute and write a Neighbor-Joining tree (Newick)
#[arg(long)]
pub nj: bool,
/// Compute and write a UPGMA tree (Newick)
#[arg(long)]
pub upgma: bool,
/// Output prefix: <prefix>_dist.csv, <prefix>_shared.csv,
/// <prefix>_nj.nwk, <prefix>_upgma.nwk.
/// If omitted, the distance matrix is written to stdout.
#[arg(short, long)]
pub output: Option<PathBuf>,
}
pub fn run(args: DistanceArgs) {
let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
eprintln!("error opening index: {e}");
std::process::exit(1);
});
set_k(idx.kmer_size());
set_m(idx.minimizer_size());
let genomes = idx.meta().genomes.clone();
let n = genomes.len();
info!(
"computing {:?} distances for {} genome(s)",
args.metric, n
);
let need_shared = args.shared_kmers || args.nj || args.upgma;
let result = idx
.distance(args.metric.into(), need_shared)
.unwrap_or_else(|e| {
eprintln!("error computing distances: {e}");
std::process::exit(1);
});
// ── Distance matrix → CSV ─────────────────────────────────────────────────
let write_dist_csv = |w: &mut dyn Write| {
write!(w, "genome").unwrap();
for g in &genomes { write!(w, ",{g}").unwrap(); }
writeln!(w).unwrap();
for (i, g) in genomes.iter().enumerate() {
write!(w, "{g}").unwrap();
for j in 0..n {
write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap();
}
writeln!(w).unwrap();
}
};
match &args.output {
Some(prefix) => {
let path = format!("{}_dist.csv", prefix.display());
let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| {
eprintln!("error creating {path}: {e}");
std::process::exit(1);
}));
write_dist_csv(&mut f);
info!("distance matrix → {path}");
}
None => {
let stdout = io::stdout();
let mut out = BufWriter::new(stdout.lock());
write_dist_csv(&mut out);
}
}
// ── Shared-kmer matrix → CSV ──────────────────────────────────────────────
if args.shared_kmers {
if let Some(shared) = &result.shared_kmers {
let path = args.output.as_ref()
.map(|p| format!("{}_shared.csv", p.display()))
.unwrap_or_else(|| "shared.csv".into());
let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| {
eprintln!("error creating {path}: {e}");
std::process::exit(1);
}));
write!(f, "genome").unwrap();
for g in &genomes { write!(f, ",{g}").unwrap(); }
writeln!(f).unwrap();
for (i, g) in genomes.iter().enumerate() {
write!(f, "{g}").unwrap();
for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); }
writeln!(f).unwrap();
}
info!("shared-kmer matrix → {path}");
}
}
// ── NJ tree via speedytree ────────────────────────────────────────────────
if args.nj {
let rows: Vec<Vec<f64>> = (0..n)
.map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect())
.collect();
let dm = DistanceMatrix::build(rows, genomes.clone()).unwrap_or_else(|e| {
eprintln!("error building distance matrix for NJ: {e}");
std::process::exit(1);
});
let tree = NeighborJoiningSolver::<Hybrid>::default(dm).solve().unwrap_or_else(|e| {
eprintln!("error computing NJ tree: {e}");
std::process::exit(1);
});
let newick = to_newick(&tree);
let path = args.output.as_ref()
.map(|p| format!("{}_nj.nwk", p.display()))
.unwrap_or_else(|| "nj.nwk".into());
std::fs::write(&path, &newick).unwrap_or_else(|e| {
eprintln!("error writing {path}: {e}");
std::process::exit(1);
});
info!("NJ tree → {path}");
}
// ── UPGMA tree via kodama ─────────────────────────────────────────────────
if args.upgma {
let mut condensed: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
for i in 0..n {
for j in (i + 1)..n {
condensed.push(result.matrix[[i, j]]);
}
}
let dendro = linkage(&mut condensed, n, Method::Average);
let newick = upgma_to_newick(&dendro, &genomes);
let path = args.output.as_ref()
.map(|p| format!("{}_upgma.nwk", p.display()))
.unwrap_or_else(|| "upgma.nwk".into());
std::fs::write(&path, &newick).unwrap_or_else(|e| {
eprintln!("error writing {path}: {e}");
std::process::exit(1);
});
info!("UPGMA tree → {path}");
}
}
// ── UPGMA Newick from kodama dendrogram ───────────────────────────────────────
fn upgma_to_newick(dendro: &kodama::Dendrogram<f64>, names: &[String]) -> String {
let n = names.len();
// node_labels[i]: Newick subtree string for node i (leaves 0..n, internals n..)
let mut labels: Vec<String> = names.to_vec();
// height of each node: leaves = 0, internal = dissimilarity/2
let mut heights: Vec<f64> = vec![0.0; 2 * n - 1];
for (k, step) in dendro.steps().iter().enumerate() {
let new_node = n + k;
let h = step.dissimilarity / 2.0;
heights[new_node] = h;
let c1 = step.cluster1;
let c2 = step.cluster2;
let bl1 = (h - heights[c1]).max(0.0);
let bl2 = (h - heights[c2]).max(0.0);
labels.push(format!(
"({label1}:{bl1:.6},{label2}:{bl2:.6})",
label1 = labels[c1],
label2 = labels[c2],
));
}
format!("{};", labels.last().unwrap())
}
+1
View File
@@ -1,3 +1,4 @@
pub mod distance;
pub mod dump; pub mod dump;
pub mod index; pub mod index;
pub mod merge; pub mod merge;
+3
View File
@@ -22,6 +22,8 @@ enum Commands {
Merge(cmd::merge::MergeArgs), Merge(cmd::merge::MergeArgs),
/// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
Dump(cmd::dump::DumpArgs), Dump(cmd::dump::DumpArgs),
/// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
Distance(cmd::distance::DistanceArgs),
/// Dump unitigs from a built index to stdout (debug) /// Dump unitigs from a built index to stdout (debug)
Unitig(cmd::unitig::UnitigArgs), Unitig(cmd::unitig::UnitigArgs),
} }
@@ -49,6 +51,7 @@ fn main() {
Commands::Index(args) => cmd::index::run(args), Commands::Index(args) => cmd::index::run(args),
Commands::Merge(args) => cmd::merge::run(args), Commands::Merge(args) => cmd::merge::run(args),
Commands::Dump(args) => cmd::dump::run(args), Commands::Dump(args) => cmd::dump::run(args),
Commands::Distance(args) => cmd::distance::run(args),
Commands::Unitig(args) => cmd::unitig::run(args), Commands::Unitig(args) => cmd::unitig::run(args),
} }
+47
View File
@@ -0,0 +1,47 @@
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
use obilayeredmap::LayeredStore;
use obiskio::{SKError, SKResult};
use crate::partition::KmerPartition;
const INDEX_SUBDIR: &str = "index";
fn probe_n_layers(index_dir: &std::path::Path) -> usize {
let mut n = 0;
while index_dir.join(format!("layer_{n}")).exists() { n += 1; }
n
}
impl KmerPartition {
/// Open all count matrices for partition `part`, one per layer.
/// Layers without a `counts/` directory are skipped.
pub fn count_store(&self, part: usize) -> SKResult<LayeredStore<PersistentCompactIntMatrix>> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() {
return Ok(LayeredStore::new(vec![]));
}
let matrices = (0..probe_n_layers(&index_dir))
.filter_map(|l| {
let dir = index_dir.join(format!("layer_{l}")).join("counts");
dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io))
})
.collect::<SKResult<Vec<_>>>()?;
Ok(LayeredStore::new(matrices))
}
/// Open all presence matrices for partition `part`, one per layer.
/// Layers without a `presence/` directory are skipped.
pub fn presence_store(&self, part: usize) -> SKResult<LayeredStore<PersistentBitMatrix>> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() {
return Ok(LayeredStore::new(vec![]));
}
let matrices = (0..probe_n_layers(&index_dir))
.filter_map(|l| {
let dir = index_dir.join(format!("layer_{l}")).join("presence");
dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io))
})
.collect::<SKResult<Vec<_>>>()?;
Ok(LayeredStore::new(matrices))
}
}
+1
View File
@@ -1,3 +1,4 @@
mod distance;
mod dump_layer; mod dump_layer;
mod index_layer; mod index_layer;
mod kmer_sort; mod kmer_sort;