feat: introduce genome metadata tracking and CSV export
This commit replaces raw string genome labels with a structured `GenomeInfo` type for better metadata tracking. It adds a `--meta` flag to the index command and implements a new `annotate` CLI subcommand that imports metadata from CSV files or exports it via `--dump`. Distance and shared-count matrices are now serialized to CSV, and UPGMA clustering trees are exported as Newick files. Query outputs now include per-genome k-mer match counts in JSON. The commit also fixes syntax and variable-naming issues in index merging and dump generation.
This commit is contained in:
@@ -27,7 +27,7 @@ impl KmerIndex {
|
||||
}
|
||||
write!(out, "kmer")?;
|
||||
for g in genomes {
|
||||
write!(out, ",{g}")?;
|
||||
write!(out, ",{}", g.label)?;
|
||||
}
|
||||
writeln!(out)?;
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::meta::{IndexConfig, IndexMeta};
|
||||
use crate::meta::{GenomeInfo, IndexConfig, IndexMeta};
|
||||
use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||
|
||||
pub struct KmerIndex {
|
||||
@@ -23,13 +23,12 @@ pub struct KmerIndex {
|
||||
impl KmerIndex {
|
||||
/// Create a new index at `path`.
|
||||
///
|
||||
/// If `genome_label` is `Some`, it is stored immediately.
|
||||
/// If `None`, the label will be derived from the first scatter input path
|
||||
/// when `mark_scattered` is called.
|
||||
/// If `genome_info` is `Some`, it is stored immediately.
|
||||
/// If `None`, the genome entry will be added when `mark_scattered` is called.
|
||||
pub fn create<P: AsRef<Path>>(
|
||||
path: P,
|
||||
config: IndexConfig,
|
||||
genome_label: Option<String>,
|
||||
genome_info: Option<GenomeInfo>,
|
||||
force: bool,
|
||||
) -> OKIResult<Self> {
|
||||
let root_path = path.as_ref().to_owned();
|
||||
@@ -41,8 +40,8 @@ impl KmerIndex {
|
||||
force,
|
||||
)?;
|
||||
let mut meta = IndexMeta::new(config);
|
||||
if let Some(label) = genome_label {
|
||||
meta.genomes.push(label);
|
||||
if let Some(info) = genome_info {
|
||||
meta.genomes.push(info);
|
||||
}
|
||||
meta.write(&root_path)?;
|
||||
Ok(Self { root_path, meta, partition })
|
||||
@@ -71,6 +70,7 @@ impl KmerIndex {
|
||||
}
|
||||
|
||||
pub fn meta(&self) -> &IndexMeta { &self.meta }
|
||||
pub fn meta_mut(&mut self) -> &mut IndexMeta { &mut self.meta }
|
||||
pub fn kmer_size(&self) -> usize { self.meta.config.kmer_size }
|
||||
pub fn minimizer_size(&self) -> usize { self.meta.config.minimizer_size }
|
||||
pub fn n_partitions(&self) -> usize { self.partition.n_partitions() }
|
||||
@@ -88,7 +88,7 @@ impl KmerIndex {
|
||||
pub fn mark_scattered(&mut self) -> OKIResult<()> {
|
||||
if self.meta.genomes.is_empty() {
|
||||
let label = label_from_path(&self.root_path);
|
||||
self.meta.genomes.push(label);
|
||||
self.meta.genomes.push(GenomeInfo::new(label));
|
||||
self.meta.write(&self.root_path)?;
|
||||
}
|
||||
touch(&self.root_path.join(SENTINEL_SCATTERED))?;
|
||||
@@ -114,7 +114,7 @@ impl KmerIndex {
|
||||
}
|
||||
|
||||
fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> {
|
||||
let label = self.meta.genomes.first().map(String::as_str).unwrap_or("unknown");
|
||||
let label = self.meta.genomes.first().map(|g| g.label.as_str()).unwrap_or("unknown");
|
||||
let spectrums_dir = self.root_path.join("spectrums");
|
||||
fs::create_dir_all(&spectrums_dir)?;
|
||||
let path = spectrums_dir.join(format!("{label}.json"));
|
||||
|
||||
@@ -11,5 +11,5 @@ pub use error::{OKIError, OKIResult};
|
||||
pub use distance::{DistanceMetric, DistanceOutput};
|
||||
pub use index::KmerIndex;
|
||||
pub use merge::MergeMode;
|
||||
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
|
||||
pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
|
||||
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||
|
||||
@@ -11,7 +11,7 @@ use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
use crate::meta::IndexMeta;
|
||||
use crate::meta::{GenomeInfo, IndexMeta};
|
||||
use crate::state::IndexState;
|
||||
|
||||
pub use obikpartitionner::MergeMode;
|
||||
@@ -111,7 +111,8 @@ impl KmerIndex {
|
||||
fs::remove_dir_all(&spectrums_dir)?;
|
||||
}
|
||||
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
||||
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
|
||||
let old_labels: Vec<String> = src.meta.genomes.iter().map(|g| g.label.clone()).collect();
|
||||
copy_spectrums(&src.root_path, output, &old_labels, new_labels)?;
|
||||
}
|
||||
pb.finish_and_clear();
|
||||
rep.push(t.stop());
|
||||
@@ -169,14 +170,15 @@ impl KmerIndex {
|
||||
fn compute_labels(
|
||||
sources: &[&KmerIndex],
|
||||
rename_duplicates: bool,
|
||||
) -> OKIResult<(Vec<Vec<String>>, Vec<String>)> {
|
||||
) -> OKIResult<(Vec<Vec<String>>, Vec<GenomeInfo>)> {
|
||||
let mut seen: HashMap<String, usize> = HashMap::new();
|
||||
let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());
|
||||
let mut all_genomes: Vec<String> = Vec::new();
|
||||
let mut all_genomes: Vec<GenomeInfo> = Vec::new();
|
||||
|
||||
for src in sources {
|
||||
let mut labels = Vec::with_capacity(src.meta.genomes.len());
|
||||
for label in &src.meta.genomes {
|
||||
for genome in &src.meta.genomes {
|
||||
let label = &genome.label;
|
||||
let count = seen.entry(label.clone()).or_insert(0);
|
||||
let new_label = if *count == 0 {
|
||||
label.clone()
|
||||
@@ -187,7 +189,7 @@ fn compute_labels(
|
||||
};
|
||||
*count += 1;
|
||||
labels.push(new_label.clone());
|
||||
all_genomes.push(new_label);
|
||||
all_genomes.push(GenomeInfo { label: new_label, meta: genome.meta.clone() });
|
||||
}
|
||||
source_labels.push(labels);
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
@@ -7,6 +8,20 @@ use serde::{Deserialize, Serialize};
|
||||
pub const META_FILENAME: &str = "index.meta";
|
||||
const META_VERSION: u32 = 1;
|
||||
|
||||
/// Per-genome label + categorical metadata.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct GenomeInfo {
|
||||
pub label: String,
|
||||
#[serde(default)]
|
||||
pub meta: HashMap<String, String>,
|
||||
}
|
||||
|
||||
impl GenomeInfo {
|
||||
pub fn new(label: impl Into<String>) -> Self {
|
||||
Self { label: label.into(), meta: HashMap::new() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexConfig {
|
||||
pub kmer_size: usize,
|
||||
@@ -19,9 +34,8 @@ pub struct IndexConfig {
|
||||
pub struct IndexMeta {
|
||||
pub version: u32,
|
||||
pub config: IndexConfig,
|
||||
/// Ordered list of genome labels indexed here.
|
||||
/// Element 0 is the initial genome; subsequent entries come from merges.
|
||||
pub genomes: Vec<String>,
|
||||
/// Ordered list of genomes indexed here (label + optional categorical metadata).
|
||||
pub genomes: Vec<GenomeInfo>,
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
@@ -42,4 +56,9 @@ impl IndexMeta {
|
||||
pub fn exists(root: &Path) -> bool {
|
||||
root.join(META_FILENAME).exists()
|
||||
}
|
||||
|
||||
/// Iterate over genome labels only.
|
||||
pub fn genome_labels(&self) -> impl Iterator<Item = &str> {
|
||||
self.genomes.iter().map(|g| g.label.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user