feat: introduce genome metadata tracking and CSV export

This commit replaces raw string genome labels with a structured `GenomeInfo` type for better metadata tracking. It adds a `--meta` flag to the index command and implements a new `annotate` CLI subcommand that imports metadata from CSV files or exports it via `--dump`. Distance and shared-count matrices are now serialized to CSV, and UPGMA clustering trees are exported as Newick files. Query outputs now include per-genome k-mer match counts in JSON. The commit also fixes syntax and variable-naming issues in index merging and dump generation.
This commit is contained in:
Eric Coissac
2026-05-22 09:28:58 +02:00
parent 77a0186fae
commit 0f8f61d3dd
14 changed files with 276 additions and 32 deletions
+1 -1
View File
@@ -27,7 +27,7 @@ impl KmerIndex {
}
write!(out, "kmer")?;
for g in genomes {
write!(out, ",{g}")?;
write!(out, ",{}", g.label)?;
}
writeln!(out)?;
+9 -9
View File
@@ -11,7 +11,7 @@ use rayon::prelude::*;
use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::meta::{IndexConfig, IndexMeta};
use crate::meta::{GenomeInfo, IndexConfig, IndexMeta};
use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
pub struct KmerIndex {
@@ -23,13 +23,12 @@ pub struct KmerIndex {
impl KmerIndex {
/// Create a new index at `path`.
///
/// If `genome_label` is `Some`, it is stored immediately.
/// If `None`, the label will be derived from the first scatter input path
/// when `mark_scattered` is called.
/// If `genome_info` is `Some`, it is stored immediately.
/// If `None`, the genome entry will be added when `mark_scattered` is called.
pub fn create<P: AsRef<Path>>(
path: P,
config: IndexConfig,
genome_label: Option<String>,
genome_info: Option<GenomeInfo>,
force: bool,
) -> OKIResult<Self> {
let root_path = path.as_ref().to_owned();
@@ -41,8 +40,8 @@ impl KmerIndex {
force,
)?;
let mut meta = IndexMeta::new(config);
if let Some(label) = genome_label {
meta.genomes.push(label);
if let Some(info) = genome_info {
meta.genomes.push(info);
}
meta.write(&root_path)?;
Ok(Self { root_path, meta, partition })
@@ -71,6 +70,7 @@ impl KmerIndex {
}
/// Shared, read-only view of the metadata held by this index.
pub fn meta(&self) -> &IndexMeta {
    &self.meta
}
/// Exclusive, mutable view of the metadata held by this index.
///
/// This accessor only hands out the reference; it performs no write itself.
pub fn meta_mut(&mut self) -> &mut IndexMeta {
    &mut self.meta
}
/// The `kmer_size` value recorded in this index's configuration.
pub fn kmer_size(&self) -> usize {
    let config = &self.meta.config;
    config.kmer_size
}
/// The `minimizer_size` value recorded in this index's configuration.
pub fn minimizer_size(&self) -> usize {
    let config = &self.meta.config;
    config.minimizer_size
}
/// Number of partitions, as reported by the underlying partition object.
pub fn n_partitions(&self) -> usize {
    let partition = &self.partition;
    partition.n_partitions()
}
@@ -88,7 +88,7 @@ impl KmerIndex {
pub fn mark_scattered(&mut self) -> OKIResult<()> {
if self.meta.genomes.is_empty() {
let label = label_from_path(&self.root_path);
self.meta.genomes.push(label);
self.meta.genomes.push(GenomeInfo::new(label));
self.meta.write(&self.root_path)?;
}
touch(&self.root_path.join(SENTINEL_SCATTERED))?;
@@ -114,7 +114,7 @@ impl KmerIndex {
}
fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> {
let label = self.meta.genomes.first().map(String::as_str).unwrap_or("unknown");
let label = self.meta.genomes.first().map(|g| g.label.as_str()).unwrap_or("unknown");
let spectrums_dir = self.root_path.join("spectrums");
fs::create_dir_all(&spectrums_dir)?;
let path = spectrums_dir.join(format!("{label}.json"));
+1 -1
View File
@@ -11,5 +11,5 @@ pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput};
pub use index::KmerIndex;
pub use merge::MergeMode;
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
+8 -6
View File
@@ -11,7 +11,7 @@ use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::IndexMeta;
use crate::meta::{GenomeInfo, IndexMeta};
use crate::state::IndexState;
pub use obikpartitionner::MergeMode;
@@ -111,7 +111,8 @@ impl KmerIndex {
fs::remove_dir_all(&spectrums_dir)?;
}
for (src, new_labels) in sources.iter().zip(&source_labels) {
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
let old_labels: Vec<String> = src.meta.genomes.iter().map(|g| g.label.clone()).collect();
copy_spectrums(&src.root_path, output, &old_labels, new_labels)?;
}
pb.finish_and_clear();
rep.push(t.stop());
@@ -169,14 +170,15 @@ impl KmerIndex {
fn compute_labels(
sources: &[&KmerIndex],
rename_duplicates: bool,
) -> OKIResult<(Vec<Vec<String>>, Vec<String>)> {
) -> OKIResult<(Vec<Vec<String>>, Vec<GenomeInfo>)> {
let mut seen: HashMap<String, usize> = HashMap::new();
let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());
let mut all_genomes: Vec<String> = Vec::new();
let mut all_genomes: Vec<GenomeInfo> = Vec::new();
for src in sources {
let mut labels = Vec::with_capacity(src.meta.genomes.len());
for label in &src.meta.genomes {
for genome in &src.meta.genomes {
let label = &genome.label;
let count = seen.entry(label.clone()).or_insert(0);
let new_label = if *count == 0 {
label.clone()
@@ -187,7 +189,7 @@ fn compute_labels(
};
*count += 1;
labels.push(new_label.clone());
all_genomes.push(new_label);
all_genomes.push(GenomeInfo { label: new_label, meta: genome.meta.clone() });
}
source_labels.push(labels);
}
+22 -3
View File
@@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::fs;
use std::io;
use std::path::Path;
@@ -7,6 +8,20 @@ use serde::{Deserialize, Serialize};
pub const META_FILENAME: &str = "index.meta";
const META_VERSION: u32 = 1;
/// Per-genome label + categorical metadata.
///
/// One `GenomeInfo` describes a single genome entry of an index: the label
/// that names it plus a free-form string-to-string map of metadata.
/// The type is serializable and deserializable through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenomeInfo {
/// Label identifying the genome.
pub label: String,
/// Metadata attached to the genome, stored as string key/value pairs.
/// NOTE(review): presumably keys are attribute names and values their
/// categories — confirm against the code that fills this map.
///
/// `#[serde(default)]` makes the field optional on input: when it is absent
/// from the serialized data, deserialization yields an empty map.
#[serde(default)]
pub meta: HashMap<String, String>,
}
impl GenomeInfo {
    /// Build a genome entry that carries only a label.
    ///
    /// `label` accepts anything convertible into a `String`; the metadata map
    /// of the returned value starts out empty.
    pub fn new(label: impl Into<String>) -> Self {
        let label = label.into();
        Self {
            label,
            meta: HashMap::new(),
        }
    }
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexConfig {
pub kmer_size: usize,
@@ -19,9 +34,8 @@ pub struct IndexConfig {
pub struct IndexMeta {
pub version: u32,
pub config: IndexConfig,
/// Ordered list of genome labels indexed here.
/// Element 0 is the initial genome; subsequent entries come from merges.
pub genomes: Vec<String>,
/// Ordered list of genomes indexed here (label + optional categorical metadata).
pub genomes: Vec<GenomeInfo>,
}
impl IndexMeta {
@@ -42,4 +56,9 @@ impl IndexMeta {
pub fn exists(root: &Path) -> bool {
    // The metadata file lives directly under the index root; report whether
    // that path is present on disk.
    let meta_path = root.join(META_FILENAME);
    meta_path.exists()
}
/// Yield the label of every genome, in the order they are stored in
/// `genomes`, leaving the per-genome metadata out.
pub fn genome_labels(&self) -> impl Iterator<Item = &str> {
    let infos = self.genomes.iter();
    infos.map(|info| info.label.as_str())
}
}