diff --git a/TODO.md b/TODO.md index 592fc74..e6ba5d0 100644 --- a/TODO.md +++ b/TODO.md @@ -23,6 +23,4 @@ - les matrices sont sauvegardées en CSV - les arbres NJ sont sauvegardés en Newick avec les longeurs de branche -- dump : une table csv de l'index avec les kmer et les genomes associés en mode count ou presence/absence avec une option pour forcer le mode presence/absence meme si l'index est en mode count. Par defaut, le mode count est utilisé pour les index en mode count et le mode presence/absence pour les index en mode presence/absence. - - status : affiche le statut de l'index diff --git a/src/Cargo.lock b/src/Cargo.lock index 4c1929b..1de3042 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -176,6 +176,21 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "1.3.2" @@ -625,6 +640,12 @@ dependencies = [ "syn", ] +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + [[package]] name = "either" version = "1.15.0" @@ -1102,6 +1123,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kodama" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a44f3a71a44fbf49ce38152db7dc9adf959d4fe5c29344cd1858bdbda8d9091" +dependencies = [ + "num-traits", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1453,7 +1483,10 @@ name = "obikindex" version = "0.1.0" dependencies = [ "indicatif", + "ndarray", + "obicompactvec", "obikpartitionner", + "obilayeredmap", "obiskio", "obisys", "rayon", @@ -1468,6 +1501,7 @@ version = "0.1.0" dependencies = [ "clap", "indicatif", + "kodama", "obifastwrite", "obikindex", "obikpartitionner", @@ -1480,6 +1514,7 @@ dependencies = [ "obisys", "pprof", "rayon", + "speedytree", "tracing", "tracing-subscriber", ] @@ -1886,8 +1921,8 @@ dependencies = [ "lazy_static", "log", "mem_dbg", - "rand", - "rand_chacha", + "rand 0.9.4", + "rand_chacha 0.9.0", "rayon", "rdst", "rustc-hash", @@ -1918,14 +1953,35 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ - "rand_chacha", - "rand_core", + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", ] [[package]] @@ -1935,7 +1991,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", ] [[package]] @@ -1973,6 +2038,12 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rb_tree" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6724c78e033e1c4155b4d3f76593d09ad384739c6986c1addc2bd2f55b1aefe" + [[package]] name = "rdst" version = "0.20.14" @@ -2227,6 +2298,25 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "speedytree" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4522052445ce1b002c095d93e9dc545c326e9e2321c1315f1ab2a381c11666" +dependencies = [ + "bit-set", + "bit-vec", + "bitvec", + "clap", + "dtoa", + "fixedbitset", + "parking_lot", + "petgraph", + "rand 0.8.6", + "rayon", + "rb_tree", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index bc10097..7cccf48 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -80,6 +80,13 @@ pub trait CountPartials: ColumnWeights { let sq2 = std::f64::consts::SQRT_2; self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2) } + + /// Euclidean distance in the Hellinger (√relative-frequency) space. + /// Equal to √2 × hellinger_dist — unnormalised variant. + fn hellinger_euclidean_dist_matrix(&self) -> Array2 { + let global = self.col_weights(); + self.partial_hellinger(&global).mapv(|v| v.sqrt()) + } } /// Partial distance matrices for bit-based data (`PersistentBitMatrix`). diff --git a/src/obikindex/Cargo.toml b/src/obikindex/Cargo.toml index bf4ebd6..538cffb 100644 --- a/src/obikindex/Cargo.toml +++ b/src/obikindex/Cargo.toml @@ -7,6 +7,9 @@ edition = "2024" obikpartitionner = { path = "../obikpartitionner" } obiskio = { path = "../obiskio" } obisys = { path = "../obisys" } +obicompactvec = { path = "../obicompactvec" } +obilayeredmap = { path = "../obilayeredmap" } +ndarray = "0.16" rayon = "1" serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/src/obikindex/src/distance.rs b/src/obikindex/src/distance.rs new file mode 100644 index 0000000..a1c9cb0 --- /dev/null +++ b/src/obikindex/src/distance.rs @@ -0,0 +1,132 @@ +use ndarray::Array2; +use obicompactvec::traits::{BitPartials, CountPartials}; +use obilayeredmap::LayeredStore; +use rayon::prelude::*; + +use crate::error::{OKIError, OKIResult}; +use crate::index::KmerIndex; + +// ── Public API ──────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DistanceMetric { + /// Jaccard distance on presence/absence data. + Jaccard, + /// Hamming distance (number of differing kmer positions) on presence/absence data. + Hamming, + /// Bray-Curtis dissimilarity on raw counts. + BrayCurtis, + /// Bray-Curtis dissimilarity normalised by per-genome total counts. + RelfreqBrayCurtis, + /// Euclidean distance on raw counts. + Euclidean, + /// Euclidean distance on relative frequencies. + RelfreqEuclidean, + /// Hellinger distance on counts. + Hellinger, + /// Euclidean distance in the Hellinger (√relative-frequency) space (unnormalised variant). + HellingerEuclidean, +} + +pub struct DistanceOutput { + /// n×n pairwise distance matrix (genomes in index order). + pub matrix: Array2, + /// n×n shared-kmer count matrix (intersection), if requested. + pub shared_kmers: Option>, +} + +impl DistanceMetric { + pub fn requires_counts(self) -> bool { + matches!( + self, + DistanceMetric::BrayCurtis + | DistanceMetric::RelfreqBrayCurtis + | DistanceMetric::Euclidean + | DistanceMetric::RelfreqEuclidean + | DistanceMetric::Hellinger + | DistanceMetric::HellingerEuclidean + ) + } +} + +// ── KmerIndex::distance ─────────────────────────────────────────────────────── + +impl KmerIndex { + pub fn distance(&self, metric: DistanceMetric, shared_kmers: bool) -> OKIResult { + let n_genomes = self.meta.genomes.len(); + if n_genomes < 2 { + return Err(OKIError::InvalidInput( + "distance requires at least 2 genomes in the index".into(), + )); + } + + let use_counts = self.meta.config.with_counts; + if metric.requires_counts() && !use_counts { + return Err(OKIError::InvalidInput(format!( + "{metric:?} requires a count index (with_counts = true)" + ))); + } + + let n_parts = self.n_partitions(); + + if use_counts { + let stores: Vec<_> = (0..n_parts) + .into_par_iter() + .map(|i| self.partition.count_store(i).map_err(OKIError::Partition)) + .collect::>()?; + let global = LayeredStore::new(stores); + + let matrix = match metric { + DistanceMetric::BrayCurtis => CountPartials::bray_dist_matrix(&global), + DistanceMetric::RelfreqBrayCurtis => CountPartials::relfreq_bray_dist_matrix(&global), + DistanceMetric::Euclidean => CountPartials::euclidean_dist_matrix(&global), + DistanceMetric::RelfreqEuclidean => CountPartials::relfreq_euclidean_dist_matrix(&global), + DistanceMetric::Hellinger => CountPartials::hellinger_dist_matrix(&global), + DistanceMetric::HellingerEuclidean => CountPartials::hellinger_euclidean_dist_matrix(&global), + // Jaccard on count data: threshold at 0 (present if count > 0) + DistanceMetric::Jaccard => CountPartials::threshold_jaccard_dist_matrix(&global, 0), + DistanceMetric::Hamming => { + return Err(OKIError::InvalidInput( + "Hamming is only available for presence/absence indexes".into(), + )); + } + }; + + let shared = if shared_kmers { + let (inter, _) = CountPartials::partial_threshold_jaccard(&global, 0); + Some(inter) + } else { + None + }; + + Ok(DistanceOutput { matrix, shared_kmers: shared }) + } else { + let stores: Vec<_> = (0..n_parts) + .into_par_iter() + .map(|i| self.partition.presence_store(i).map_err(OKIError::Partition)) + .collect::>()?; + let global = LayeredStore::new(stores); + + let matrix = match metric { + DistanceMetric::Jaccard => BitPartials::jaccard_dist_matrix(&global), + DistanceMetric::Hamming => { + BitPartials::hamming_dist_matrix(&global).mapv(|v| v as f64) + } + other => { + return Err(OKIError::InvalidInput(format!( + "{other:?} requires a count index; use --metric jaccard or --metric hamming" + ))); + } + }; + + let shared = if shared_kmers { + let (inter, _) = BitPartials::partial_jaccard(&global); + Some(inter) + } else { + None + }; + + Ok(DistanceOutput { matrix, shared_kmers: shared }) + } + } +} diff --git a/src/obikindex/src/error.rs b/src/obikindex/src/error.rs index 66f1abc..96834cb 100644 --- a/src/obikindex/src/error.rs +++ b/src/obikindex/src/error.rs @@ -16,6 +16,8 @@ pub enum OKIError { MismatchedMode, /// Two or more sources share the same genome label. DuplicateGenomeLabel(String), + /// Operation not valid for this index configuration. + InvalidInput(String), } pub type OKIResult = Result; @@ -30,6 +32,7 @@ impl fmt::Display for OKIError { OKIError::IncompatibleConfig => write!(f, "incompatible index configurations"), OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"), OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"), + OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"), } } } diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index c4a66f5..d6a0908 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -1,11 +1,13 @@ pub mod error; pub mod meta; pub mod state; +mod distance; mod dump; mod index; mod merge; pub use error::{OKIError, OKIResult}; +pub use distance::{DistanceMetric, DistanceOutput}; pub use index::KmerIndex; pub use merge::MergeMode; pub use meta::{IndexConfig, IndexMeta, META_FILENAME}; diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index f61a524..fffb28b 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -19,6 +19,8 @@ obisys = { path = "../obisys" } obiskio = { path = "../obiskio" } obikindex = { path = "../obikindex" } clap = { version = "4", features = ["derive"] } +kodama = "0.2" +speedytree = "0.1" rayon = "1" indicatif = "0.17" tracing = "0.1.44" diff --git a/src/obikmer/src/cmd/distance.rs b/src/obikmer/src/cmd/distance.rs new file mode 100644 index 0000000..e8574ae --- /dev/null +++ b/src/obikmer/src/cmd/distance.rs @@ -0,0 +1,216 @@ +use std::io::{self, BufWriter, Write}; +use std::path::PathBuf; + +use clap::Args; +use kodama::{Method, linkage}; +use obikindex::{DistanceMetric, KmerIndex}; +use obikseq::{set_k, set_m}; +use speedytree::{DistanceMatrix, Hybrid, NeighborJoiningSolver, to_newick}; +use tracing::info; + +#[derive(clap::ValueEnum, Clone, Copy, Debug)] +pub enum MetricArg { + Jaccard, + Hamming, + BrayCurtis, + #[value(name = "relfreq-bray-curtis")] + RelfreqBrayCurtis, + Euclidean, + #[value(name = "relfreq-euclidean")] + RelfreqEuclidean, + Hellinger, + #[value(name = "hellinger-euclidean")] + HellingerEuclidean, +} + +impl From for DistanceMetric { + fn from(m: MetricArg) -> Self { + match m { + MetricArg::Jaccard => DistanceMetric::Jaccard, + MetricArg::Hamming => DistanceMetric::Hamming, + MetricArg::BrayCurtis => DistanceMetric::BrayCurtis, + MetricArg::RelfreqBrayCurtis => DistanceMetric::RelfreqBrayCurtis, + MetricArg::Euclidean => DistanceMetric::Euclidean, + MetricArg::RelfreqEuclidean => DistanceMetric::RelfreqEuclidean, + MetricArg::Hellinger => DistanceMetric::Hellinger, + MetricArg::HellingerEuclidean => DistanceMetric::HellingerEuclidean, + } + } +} + +#[derive(Args)] +pub struct DistanceArgs { + /// Index directory + pub index: PathBuf, + + /// Distance metric to compute + #[arg(long, value_enum, default_value = "jaccard")] + pub metric: MetricArg, + + /// Also output the shared-kmer count matrix (CSV) + #[arg(long)] + pub shared_kmers: bool, + + /// Compute and write a Neighbor-Joining tree (Newick) + #[arg(long)] + pub nj: bool, + + /// Compute and write a UPGMA tree (Newick) + #[arg(long)] + pub upgma: bool, + + /// Output prefix: _dist.csv, _shared.csv, + /// _nj.nwk, _upgma.nwk. + /// If omitted, the distance matrix is written to stdout. + #[arg(short, long)] + pub output: Option, +} + +pub fn run(args: DistanceArgs) { + let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + set_k(idx.kmer_size()); + set_m(idx.minimizer_size()); + + let genomes = idx.meta().genomes.clone(); + let n = genomes.len(); + info!( + "computing {:?} distances for {} genome(s)", + args.metric, n + ); + + let need_shared = args.shared_kmers || args.nj || args.upgma; + let result = idx + .distance(args.metric.into(), need_shared) + .unwrap_or_else(|e| { + eprintln!("error computing distances: {e}"); + std::process::exit(1); + }); + + // ── Distance matrix → CSV ───────────────────────────────────────────────── + let write_dist_csv = |w: &mut dyn Write| { + write!(w, "genome").unwrap(); + for g in &genomes { write!(w, ",{g}").unwrap(); } + writeln!(w).unwrap(); + for (i, g) in genomes.iter().enumerate() { + write!(w, "{g}").unwrap(); + for j in 0..n { + write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap(); + } + writeln!(w).unwrap(); + } + }; + + match &args.output { + Some(prefix) => { + let path = format!("{}_dist.csv", prefix.display()); + let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| { + eprintln!("error creating {path}: {e}"); + std::process::exit(1); + })); + write_dist_csv(&mut f); + info!("distance matrix → {path}"); + } + None => { + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout.lock()); + write_dist_csv(&mut out); + } + } + + // ── Shared-kmer matrix → CSV ────────────────────────────────────────────── + if args.shared_kmers { + if let Some(shared) = &result.shared_kmers { + let path = args.output.as_ref() + .map(|p| format!("{}_shared.csv", p.display())) + .unwrap_or_else(|| "shared.csv".into()); + let mut f = BufWriter::new(std::fs::File::create(&path).unwrap_or_else(|e| { + eprintln!("error creating {path}: {e}"); + std::process::exit(1); + })); + write!(f, "genome").unwrap(); + for g in &genomes { write!(f, ",{g}").unwrap(); } + writeln!(f).unwrap(); + for (i, g) in genomes.iter().enumerate() { + write!(f, "{g}").unwrap(); + for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); } + writeln!(f).unwrap(); + } + info!("shared-kmer matrix → {path}"); + } + } + + // ── NJ tree via speedytree ──────────────────────────────────────────────── + if args.nj { + let rows: Vec> = (0..n) + .map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect()) + .collect(); + let dm = DistanceMatrix::build(rows, genomes.clone()).unwrap_or_else(|e| { + eprintln!("error building distance matrix for NJ: {e}"); + std::process::exit(1); + }); + let tree = NeighborJoiningSolver::::default(dm).solve().unwrap_or_else(|e| { + eprintln!("error computing NJ tree: {e}"); + std::process::exit(1); + }); + let newick = to_newick(&tree); + let path = args.output.as_ref() + .map(|p| format!("{}_nj.nwk", p.display())) + .unwrap_or_else(|| "nj.nwk".into()); + std::fs::write(&path, &newick).unwrap_or_else(|e| { + eprintln!("error writing {path}: {e}"); + std::process::exit(1); + }); + info!("NJ tree → {path}"); + } + + // ── UPGMA tree via kodama ───────────────────────────────────────────────── + if args.upgma { + let mut condensed: Vec = Vec::with_capacity(n * (n - 1) / 2); + for i in 0..n { + for j in (i + 1)..n { + condensed.push(result.matrix[[i, j]]); + } + } + let dendro = linkage(&mut condensed, n, Method::Average); + let newick = upgma_to_newick(&dendro, &genomes); + let path = args.output.as_ref() + .map(|p| format!("{}_upgma.nwk", p.display())) + .unwrap_or_else(|| "upgma.nwk".into()); + std::fs::write(&path, &newick).unwrap_or_else(|e| { + eprintln!("error writing {path}: {e}"); + std::process::exit(1); + }); + info!("UPGMA tree → {path}"); + } +} + +// ── UPGMA Newick from kodama dendrogram ─────────────────────────────────────── + +fn upgma_to_newick(dendro: &kodama::Dendrogram, names: &[String]) -> String { + let n = names.len(); + // node_labels[i]: Newick subtree string for node i (leaves 0..n, internals n..) + let mut labels: Vec = names.to_vec(); + // height of each node: leaves = 0, internal = dissimilarity/2 + let mut heights: Vec = vec![0.0; 2 * n - 1]; + + for (k, step) in dendro.steps().iter().enumerate() { + let new_node = n + k; + let h = step.dissimilarity / 2.0; + heights[new_node] = h; + let c1 = step.cluster1; + let c2 = step.cluster2; + let bl1 = (h - heights[c1]).max(0.0); + let bl2 = (h - heights[c2]).max(0.0); + labels.push(format!( + "({label1}:{bl1:.6},{label2}:{bl2:.6})", + label1 = labels[c1], + label2 = labels[c2], + )); + } + + format!("{};", labels.last().unwrap()) +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 85d8603..ed66c65 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,3 +1,4 @@ +pub mod distance; pub mod dump; pub mod index; pub mod merge; diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 1a1f8bc..c88428b 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -22,6 +22,8 @@ enum Commands { Merge(cmd::merge::MergeArgs), /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) Dump(cmd::dump::DumpArgs), + /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees + Distance(cmd::distance::DistanceArgs), /// Dump unitigs from a built index to stdout (debug) Unitig(cmd::unitig::UnitigArgs), } @@ -49,6 +51,7 @@ fn main() { Commands::Index(args) => cmd::index::run(args), Commands::Merge(args) => cmd::merge::run(args), Commands::Dump(args) => cmd::dump::run(args), + Commands::Distance(args) => cmd::distance::run(args), Commands::Unitig(args) => cmd::unitig::run(args), } diff --git a/src/obikpartitionner/src/distance.rs b/src/obikpartitionner/src/distance.rs new file mode 100644 index 0000000..b5e6914 --- /dev/null +++ b/src/obikpartitionner/src/distance.rs @@ -0,0 +1,47 @@ +use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix}; +use obilayeredmap::LayeredStore; +use obiskio::{SKError, SKResult}; + +use crate::partition::KmerPartition; + +const INDEX_SUBDIR: &str = "index"; + +fn probe_n_layers(index_dir: &std::path::Path) -> usize { + let mut n = 0; + while index_dir.join(format!("layer_{n}")).exists() { n += 1; } + n +} + +impl KmerPartition { + /// Open all count matrices for partition `part`, one per layer. + /// Layers without a `counts/` directory are skipped. + pub fn count_store(&self, part: usize) -> SKResult> { + let index_dir = self.part_dir(part).join(INDEX_SUBDIR); + if !index_dir.exists() { + return Ok(LayeredStore::new(vec![])); + } + let matrices = (0..probe_n_layers(&index_dir)) + .filter_map(|l| { + let dir = index_dir.join(format!("layer_{l}")).join("counts"); + dir.exists().then(|| PersistentCompactIntMatrix::open(&dir).map_err(SKError::Io)) + }) + .collect::>>()?; + Ok(LayeredStore::new(matrices)) + } + + /// Open all presence matrices for partition `part`, one per layer. + /// Layers without a `presence/` directory are skipped. + pub fn presence_store(&self, part: usize) -> SKResult> { + let index_dir = self.part_dir(part).join(INDEX_SUBDIR); + if !index_dir.exists() { + return Ok(LayeredStore::new(vec![])); + } + let matrices = (0..probe_n_layers(&index_dir)) + .filter_map(|l| { + let dir = index_dir.join(format!("layer_{l}")).join("presence"); + dir.exists().then(|| PersistentBitMatrix::open(&dir).map_err(SKError::Io)) + }) + .collect::>>()?; + Ok(LayeredStore::new(matrices)) + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 3b098a2..4b03fce 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -1,3 +1,4 @@ +mod distance; mod dump_layer; mod index_layer; mod kmer_sort;