From 0f8f61d3dda0b895707c17a5a69be0161b18ecd9 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 22 May 2026 09:28:58 +0200 Subject: [PATCH] feat: introduce genome metadata tracking and CSV export This commit replaces raw string genome labels with a structured `GenomeInfo` type for better metadata tracking. It adds a `--meta` flag to the index command, and implements a new `annotate` CLI subcommand to import metadata from CSV files or export it via `--dump`. Distance and shared-count matrices are now serialized to CSV, with UPGMA clustering trees exported as Newick files. Query outputs now include per-genome k-mer match counts in JSON, while fixing syntax and variable naming issues in index merging and dump generation. --- .DS_Store | Bin 10244 -> 10244 bytes src/Cargo.lock | 28 +++++ src/obikindex/src/dump.rs | 2 +- src/obikindex/src/index.rs | 18 ++-- src/obikindex/src/lib.rs | 2 +- src/obikindex/src/merge.rs | 14 +-- src/obikindex/src/meta.rs | 25 ++++- src/obikmer/Cargo.toml | 1 + src/obikmer/src/cmd/annotate.rs | 174 ++++++++++++++++++++++++++++++++ src/obikmer/src/cmd/distance.rs | 16 +-- src/obikmer/src/cmd/index.rs | 20 +++- src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/query.rs | 4 +- src/obikmer/src/main.rs | 3 + 14 files changed, 276 insertions(+), 32 deletions(-) create mode 100644 src/obikmer/src/cmd/annotate.rs diff --git a/.DS_Store b/.DS_Store index e0acbe7fb0f20af03096adbd531a262406771d80..2b2a356960abc32c0146d68a62b511de3edb9e1e 100644 GIT binary patch delta 51 zcmZn(XbG6$&&atkU^hP_=Vl&(zr12>47m)cK%CD|#88=1oSc)CpP$3HvG4`sW_E?Y F>;RGu5HSD% delta 28 kcmZn(XbG6$&&aVcU^hP_$7UXZzq}hu{xNQ5SNO{g0Fh7%KmY&$ diff --git a/src/Cargo.lock b/src/Cargo.lock index a07c18c..1871a63 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -601,6 +601,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = "cvt" version = "0.1.2" @@ -1500,6 +1521,7 @@ name = "obikmer" version = "0.1.0" dependencies = [ "clap", + "csv", "indicatif", "kodama", "obifastwrite", @@ -2192,6 +2214,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + [[package]] name = "same-file" version = "1.0.6" diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs index d1fb8db..037bb58 100644 --- a/src/obikindex/src/dump.rs +++ b/src/obikindex/src/dump.rs @@ -27,7 +27,7 @@ impl KmerIndex { } write!(out, "kmer")?; for g in genomes { - write!(out, ",{g}")?; + write!(out, ",{}", g.label)?; } writeln!(out)?; diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index 07792a4..502822e 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -11,7 +11,7 @@ use rayon::prelude::*; use tracing::info; use crate::error::{OKIError, OKIResult}; -use crate::meta::{IndexConfig, IndexMeta}; +use crate::meta::{GenomeInfo, IndexConfig, IndexMeta}; use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; pub struct KmerIndex { @@ -23,13 +23,12 @@ pub struct KmerIndex { impl KmerIndex { /// Create a new index at `path`. /// - /// If `genome_label` is `Some`, it is stored immediately. - /// If `None`, the label will be derived from the first scatter input path - /// when `mark_scattered` is called. + /// If `genome_info` is `Some`, it is stored immediately. + /// If `None`, the genome entry will be added when `mark_scattered` is called. pub fn create>( path: P, config: IndexConfig, - genome_label: Option, + genome_info: Option, force: bool, ) -> OKIResult { let root_path = path.as_ref().to_owned(); @@ -41,8 +40,8 @@ impl KmerIndex { force, )?; let mut meta = IndexMeta::new(config); - if let Some(label) = genome_label { - meta.genomes.push(label); + if let Some(info) = genome_info { + meta.genomes.push(info); } meta.write(&root_path)?; Ok(Self { root_path, meta, partition }) @@ -71,6 +70,7 @@ impl KmerIndex { } pub fn meta(&self) -> &IndexMeta { &self.meta } + pub fn meta_mut(&mut self) -> &mut IndexMeta { &mut self.meta } pub fn kmer_size(&self) -> usize { self.meta.config.kmer_size } pub fn minimizer_size(&self) -> usize { self.meta.config.minimizer_size } pub fn n_partitions(&self) -> usize { self.partition.n_partitions() } @@ -88,7 +88,7 @@ impl KmerIndex { pub fn mark_scattered(&mut self) -> OKIResult<()> { if self.meta.genomes.is_empty() { let label = label_from_path(&self.root_path); - self.meta.genomes.push(label); + self.meta.genomes.push(GenomeInfo::new(label)); self.meta.write(&self.root_path)?; } touch(&self.root_path.join(SENTINEL_SCATTERED))?; @@ -114,7 +114,7 @@ impl KmerIndex { } fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> { - let label = self.meta.genomes.first().map(String::as_str).unwrap_or("unknown"); + let label = self.meta.genomes.first().map(|g| g.label.as_str()).unwrap_or("unknown"); let spectrums_dir = self.root_path.join("spectrums"); fs::create_dir_all(&spectrums_dir)?; let path = spectrums_dir.join(format!("{label}.json")); diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index 65df6c0..5c9bc85 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -11,5 +11,5 @@ pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; pub use index::KmerIndex; pub use merge::MergeMode; -pub use meta::{IndexConfig, IndexMeta, META_FILENAME}; +pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index c31e3e5..bb51629 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -11,7 +11,7 @@ use tracing::info; use crate::error::{OKIError, OKIResult}; use crate::index::KmerIndex; -use crate::meta::IndexMeta; +use crate::meta::{GenomeInfo, IndexMeta}; use crate::state::IndexState; pub use obikpartitionner::MergeMode; @@ -111,7 +111,8 @@ impl KmerIndex { fs::remove_dir_all(&spectrums_dir)?; } for (src, new_labels) in sources.iter().zip(&source_labels) { - copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?; + let old_labels: Vec = src.meta.genomes.iter().map(|g| g.label.clone()).collect(); + copy_spectrums(&src.root_path, output, &old_labels, new_labels)?; } pb.finish_and_clear(); rep.push(t.stop()); @@ -169,14 +170,15 @@ impl KmerIndex { fn compute_labels( sources: &[&KmerIndex], rename_duplicates: bool, -) -> OKIResult<(Vec>, Vec)> { +) -> OKIResult<(Vec>, Vec)> { let mut seen: HashMap = HashMap::new(); let mut source_labels: Vec> = Vec::with_capacity(sources.len()); - let mut all_genomes: Vec = Vec::new(); + let mut all_genomes: Vec = Vec::new(); for src in sources { let mut labels = Vec::with_capacity(src.meta.genomes.len()); - for label in &src.meta.genomes { + for genome in &src.meta.genomes { + let label = &genome.label; let count = seen.entry(label.clone()).or_insert(0); let new_label = if *count == 0 { label.clone() @@ -187,7 +189,7 @@ fn compute_labels( }; *count += 1; labels.push(new_label.clone()); - all_genomes.push(new_label); + all_genomes.push(GenomeInfo { label: new_label, meta: genome.meta.clone() }); } source_labels.push(labels); } diff --git a/src/obikindex/src/meta.rs b/src/obikindex/src/meta.rs index 8327e94..6832cee 100644 --- a/src/obikindex/src/meta.rs +++ b/src/obikindex/src/meta.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::fs; use std::io; use std::path::Path; @@ -7,6 +8,20 @@ use serde::{Deserialize, Serialize}; pub const META_FILENAME: &str = "index.meta"; const META_VERSION: u32 = 1; +/// Per-genome label + categorical metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GenomeInfo { + pub label: String, + #[serde(default)] + pub meta: HashMap, +} + +impl GenomeInfo { + pub fn new(label: impl Into) -> Self { + Self { label: label.into(), meta: HashMap::new() } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IndexConfig { pub kmer_size: usize, @@ -19,9 +34,8 @@ pub struct IndexConfig { pub struct IndexMeta { pub version: u32, pub config: IndexConfig, - /// Ordered list of genome labels indexed here. - /// Element 0 is the initial genome; subsequent entries come from merges. - pub genomes: Vec, + /// Ordered list of genomes indexed here (label + optional categorical metadata). + pub genomes: Vec, } impl IndexMeta { @@ -42,4 +56,9 @@ impl IndexMeta { pub fn exists(root: &Path) -> bool { root.join(META_FILENAME).exists() } + + /// Iterate over genome labels only. + pub fn genome_labels(&self) -> impl Iterator { + self.genomes.iter().map(|g| g.label.as_str()) + } } diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index 053bc97..8d44035 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -20,6 +20,7 @@ obiskio = { path = "../obiskio" } obikindex = { path = "../obikindex" } clap = { version = "4", features = ["derive"] } serde_json = "1" +csv = "1" kodama = "0.2" speedytree = "0.1" rayon = "1" diff --git a/src/obikmer/src/cmd/annotate.rs b/src/obikmer/src/cmd/annotate.rs new file mode 100644 index 0000000..ea0b36d --- /dev/null +++ b/src/obikmer/src/cmd/annotate.rs @@ -0,0 +1,174 @@ +use std::collections::HashSet; +use std::io::{self, BufWriter, Write}; +use std::path::PathBuf; + +use clap::Args; +use obikindex::KmerIndex; +use tracing::info; + +#[derive(Args)] +pub struct AnnotateArgs { + /// Index directory to annotate (modified in-place) + pub index: PathBuf, + + /// CSV file with genome metadata (must contain an id column) + #[arg(long)] + pub csv: Option, + + /// CSV field separator + #[arg(long, default_value = ",")] + pub sep: char, + + /// Name of the column that contains genome labels + #[arg(long, default_value = "id")] + pub id_col: String, + + /// Value that means "delete / absent" (removes existing key if present) + #[arg(long, default_value = "NA")] + pub na_value: String, + + /// Do not overwrite existing metadata keys + #[arg(long)] + pub no_overwrite: bool, + + /// Dump all genome metadata as CSV (stdout) instead of reading a CSV + #[arg(long)] + pub dump: bool, +} + +pub fn run(args: AnnotateArgs) { + if args.dump { + run_dump(&args); + } else { + run_annotate(&args); + } +} + +fn run_dump(args: &AnnotateArgs) { + let idx = open_index(&args.index); + + let genomes = &idx.meta().genomes; + + // Collect all keys in stable order (sorted for determinism) + let mut key_set: HashSet = HashSet::new(); + for g in genomes { + for k in g.meta.keys() { + key_set.insert(k.clone()); + } + } + let mut keys: Vec = key_set.into_iter().collect(); + keys.sort(); + + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout.lock()); + + // Header + write!(out, "id").unwrap(); + for k in &keys { + write!(out, "{}{k}", args.sep).unwrap(); + } + writeln!(out).unwrap(); + + // Rows + for g in genomes { + write!(out, "{}", g.label).unwrap(); + for k in &keys { + let v = g.meta.get(k).map(|s| s.as_str()).unwrap_or("NA"); + write!(out, "{}{v}", args.sep).unwrap(); + } + writeln!(out).unwrap(); + } +} + +fn run_annotate(args: &AnnotateArgs) { + let csv_path = match &args.csv { + Some(p) => p.clone(), + None => { + eprintln!("error: --csv is required unless --dump is used"); + std::process::exit(1); + } + }; + + let mut idx = open_index(&args.index); + + // Build a label → genome index position map + let label_to_pos: std::collections::HashMap = idx + .meta() + .genomes + .iter() + .enumerate() + .map(|(i, g)| (g.label.clone(), i)) + .collect(); + + let sep = args.sep as u8; + let mut rdr = csv::ReaderBuilder::new() + .delimiter(sep) + .from_path(&csv_path) + .unwrap_or_else(|e| { + eprintln!("error opening {}: {e}", csv_path.display()); + std::process::exit(1); + }); + + let headers = rdr.headers().unwrap_or_else(|e| { + eprintln!("error reading CSV headers: {e}"); + std::process::exit(1); + }).clone(); + + let id_col_idx = headers.iter().position(|h| h == args.id_col).unwrap_or_else(|| { + eprintln!("error: id column '{}' not found in CSV", args.id_col); + std::process::exit(1); + }); + + let meta_cols: Vec<(usize, String)> = headers + .iter() + .enumerate() + .filter(|(i, _)| *i != id_col_idx) + .map(|(i, h)| (i, h.to_string())) + .collect(); + + let mut updated = 0usize; + let mut skipped = 0usize; + + for result in rdr.records() { + let record = result.unwrap_or_else(|e| { + eprintln!("error reading CSV record: {e}"); + std::process::exit(1); + }); + + let label = record.get(id_col_idx).unwrap_or("").to_string(); + let pos = match label_to_pos.get(&label) { + Some(&p) => p, + None => { + skipped += 1; + continue; + } + }; + + let genome = &mut idx.meta_mut().genomes[pos]; + for (col_idx, key) in &meta_cols { + let val = record.get(*col_idx).unwrap_or(""); + if val == args.na_value { + genome.meta.remove(key); + } else if args.no_overwrite && genome.meta.contains_key(key) { + // skip + } else { + genome.meta.insert(key.clone(), val.to_string()); + } + } + updated += 1; + } + + idx.meta_mut().write(&args.index).unwrap_or_else(|e| { + eprintln!("error writing index metadata: {e}"); + std::process::exit(1); + }); + + info!("annotated {updated} genome(s), skipped {skipped} CSV row(s) with unknown label"); +} + +fn open_index(path: &PathBuf) -> KmerIndex { + KmerIndex::open(path).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }) +} diff --git a/src/obikmer/src/cmd/distance.rs b/src/obikmer/src/cmd/distance.rs index e8574ae..a1813f8 100644 --- a/src/obikmer/src/cmd/distance.rs +++ b/src/obikmer/src/cmd/distance.rs @@ -75,8 +75,8 @@ pub fn run(args: DistanceArgs) { set_k(idx.kmer_size()); set_m(idx.minimizer_size()); - let genomes = idx.meta().genomes.clone(); - let n = genomes.len(); + let labels: Vec = idx.meta().genomes.iter().map(|g| g.label.clone()).collect(); + let n = labels.len(); info!( "computing {:?} distances for {} genome(s)", args.metric, n @@ -93,9 +93,9 @@ pub fn run(args: DistanceArgs) { // ── Distance matrix → CSV ───────────────────────────────────────────────── let write_dist_csv = |w: &mut dyn Write| { write!(w, "genome").unwrap(); - for g in &genomes { write!(w, ",{g}").unwrap(); } + for g in &labels { write!(w, ",{g}").unwrap(); } writeln!(w).unwrap(); - for (i, g) in genomes.iter().enumerate() { + for (i, g) in labels.iter().enumerate() { write!(w, "{g}").unwrap(); for j in 0..n { write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap(); @@ -132,9 +132,9 @@ pub fn run(args: DistanceArgs) { std::process::exit(1); })); write!(f, "genome").unwrap(); - for g in &genomes { write!(f, ",{g}").unwrap(); } + for g in &labels { write!(f, ",{g}").unwrap(); } writeln!(f).unwrap(); - for (i, g) in genomes.iter().enumerate() { + for (i, g) in labels.iter().enumerate() { write!(f, "{g}").unwrap(); for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); } writeln!(f).unwrap(); @@ -148,7 +148,7 @@ pub fn run(args: DistanceArgs) { let rows: Vec> = (0..n) .map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect()) .collect(); - let dm = DistanceMatrix::build(rows, genomes.clone()).unwrap_or_else(|e| { + let dm = DistanceMatrix::build(rows, labels.clone()).unwrap_or_else(|e| { eprintln!("error building distance matrix for NJ: {e}"); std::process::exit(1); }); @@ -176,7 +176,7 @@ pub fn run(args: DistanceArgs) { } } let dendro = linkage(&mut condensed, n, Method::Average); - let newick = upgma_to_newick(&dendro, &genomes); + let newick = upgma_to_newick(&dendro, &labels); let path = args.output.as_ref() .map(|p| format!("{}_upgma.nwk", p.display())) .unwrap_or_else(|| "upgma.nwk".into()); diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 38bc722..9f14e43 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -1,7 +1,12 @@ use std::path::PathBuf; use clap::Args; -use obikindex::{IndexConfig, IndexState, KmerIndex}; +use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex}; + +fn parse_key_value(s: &str) -> Result<(String, String), String> { + let pos = s.find('=').ok_or_else(|| format!("invalid key=value: no '=' in '{s}'"))?; + Ok((s[..pos].to_string(), s[pos + 1..].to_string())) +} use obikseq::{set_k, set_m}; use obisys::Reporter; use tracing::info; @@ -23,6 +28,10 @@ pub struct IndexArgs { #[arg(long)] pub label: Option, + /// Genome categorical metadata as key=value pairs (repeatable) + #[arg(long = "meta", value_parser = parse_key_value)] + pub meta: Vec<(String, String)>, + /// Minimum kmer abundance (inclusive) #[arg(long, default_value_t = 1)] pub min_abundance: u32, @@ -73,7 +82,14 @@ pub fn run(args: IndexArgs) { n_bits, with_counts: args.with_counts, }; - KmerIndex::create(&output, config, args.label.clone(), false).unwrap_or_else(|e| { + let genome_info = args.label.as_ref().map(|label| { + let mut info = GenomeInfo::new(label.clone()); + for (k, v) in &args.meta { + info.meta.insert(k.clone(), v.clone()); + } + info + }); + KmerIndex::create(&output, config, genome_info, false).unwrap_or_else(|e| { eprintln!("error creating index: {e}"); std::process::exit(1); }) diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index b71a789..43fc24a 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,3 +1,4 @@ +pub mod annotate; pub mod distance; pub mod dump; pub mod index; diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index 2067067..3307d68 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -266,8 +266,8 @@ fn emit_batch( ann.insert("kmer_missing".into(), acc.kmer_missing.into()); } let mut match_map = serde_json::Map::new(); - for (g, label) in meta.genomes.iter().enumerate() { - match_map.insert(label.clone(), acc.genome_totals[g].into()); + for (g, genome) in meta.genomes.iter().enumerate() { + match_map.insert(genome.label.clone(), acc.genome_totals[g].into()); } ann.insert("kmer_strict_matches".into(), match_map.into()); diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 1b8f833..c9f4a12 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -26,6 +26,8 @@ enum Commands { Query(cmd::query::QueryArgs), /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) Dump(cmd::dump::DumpArgs), + /// Add or update genome metadata from a CSV file; or dump metadata as CSV + Annotate(cmd::annotate::AnnotateArgs), /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees Distance(cmd::distance::DistanceArgs), /// Dump unitigs from a built index to stdout (debug) @@ -57,6 +59,7 @@ fn main() { Commands::Dump(args) => cmd::dump::run(args), Commands::Rebuild(args) => cmd::rebuild::run(args), Commands::Query(args) => cmd::query::run(args), + Commands::Annotate(args) => cmd::annotate::run(args), Commands::Distance(args) => cmd::distance::run(args), Commands::Unitig(args) => cmd::unitig::run(args), }