feat: introduce genome metadata tracking and CSV export

This commit replaces raw string genome labels with a structured `GenomeInfo` type for better metadata tracking. It adds a `--meta` flag to the index command and implements a new `annotate` CLI subcommand that imports metadata from CSV files or exports it via `--dump`. Distance and shared-count matrices are now serialized to CSV, and UPGMA clustering trees are exported as Newick files. Query outputs now include per-genome k-mer match counts in JSON. The commit also fixes syntax and variable naming issues in index merging and dump generation.
This commit is contained in:
Eric Coissac
2026-05-22 09:28:58 +02:00
parent 77a0186fae
commit 0f8f61d3dd
14 changed files with 276 additions and 32 deletions
+8 -6
View File
@@ -11,7 +11,7 @@ use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::IndexMeta;
use crate::meta::{GenomeInfo, IndexMeta};
use crate::state::IndexState;
pub use obikpartitionner::MergeMode;
@@ -111,7 +111,8 @@ impl KmerIndex {
fs::remove_dir_all(&spectrums_dir)?;
}
for (src, new_labels) in sources.iter().zip(&source_labels) {
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
let old_labels: Vec<String> = src.meta.genomes.iter().map(|g| g.label.clone()).collect();
copy_spectrums(&src.root_path, output, &old_labels, new_labels)?;
}
pb.finish_and_clear();
rep.push(t.stop());
@@ -169,14 +170,15 @@ impl KmerIndex {
fn compute_labels(
sources: &[&KmerIndex],
rename_duplicates: bool,
) -> OKIResult<(Vec<Vec<String>>, Vec<String>)> {
) -> OKIResult<(Vec<Vec<String>>, Vec<GenomeInfo>)> {
let mut seen: HashMap<String, usize> = HashMap::new();
let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());
let mut all_genomes: Vec<String> = Vec::new();
let mut all_genomes: Vec<GenomeInfo> = Vec::new();
for src in sources {
let mut labels = Vec::with_capacity(src.meta.genomes.len());
for label in &src.meta.genomes {
for genome in &src.meta.genomes {
let label = &genome.label;
let count = seen.entry(label.clone()).or_insert(0);
let new_label = if *count == 0 {
label.clone()
@@ -187,7 +189,7 @@ fn compute_labels(
};
*count += 1;
labels.push(new_label.clone());
all_genomes.push(new_label);
all_genomes.push(GenomeInfo { label: new_label, meta: genome.meta.clone() });
}
source_labels.push(labels);
}