From dfa0b2bac2118022e67361e3b49e58b177d35802 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 14:42:18 +0200 Subject: [PATCH] feat: add utils subcommand for renaming genome labels Introduces a `utils` CLI subcommand to enable in-place genome label renaming without full reindexing. Adds strict label validation to reject empty strings, filesystem separators, and control characters, ensuring safe CSV serialization. Updates index metadata, renames corresponding spectrum JSON files, and registers the command in the main dispatch logic. CLI reference documentation is also updated. --- docmd/index.md | 3 +- src/obikindex/src/lib.rs | 2 +- src/obikindex/src/meta.rs | 23 +++++++++ src/obikmer/src/cmd/index.rs | 6 ++- src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/utils.rs | 91 ++++++++++++++++++++++++++++++++++++ src/obikmer/src/main.rs | 3 ++ 7 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 src/obikmer/src/cmd/utils.rs diff --git a/docmd/index.md b/docmd/index.md index 9cc7ef2..bd3e968 100644 --- a/docmd/index.md +++ b/docmd/index.md @@ -17,6 +17,7 @@ | `unitig` | Dump unitigs from a built index to stdout (debug) | | `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing | | `reindex` | Convert an index's evidence in-place: exact ↔ approx | +| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place | ## Constraints @@ -24,7 +25,7 @@ - Maximum efficiency in computation, memory, and disk usage - k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base) - Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half -- Input formats: FASTA, FASTQ, gzip, streaming stdin +- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths) ## Priority operations diff --git a/src/obikindex/src/lib.rs 
b/src/obikindex/src/lib.rs index 9f84178..b5e881e 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -12,5 +12,5 @@ pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; pub use index::KmerIndex; pub use merge::MergeMode; -pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; +pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; diff --git a/src/obikindex/src/meta.rs b/src/obikindex/src/meta.rs index 8518f62..b0c5aa7 100644 --- a/src/obikindex/src/meta.rs +++ b/src/obikindex/src/meta.rs @@ -70,3 +70,26 @@ impl IndexMeta { self.genomes.iter().map(|g| g.label.as_str()) } } + +/// Validate a user-supplied genome label. +/// +/// Forbidden: `/` (filesystem separator), `=` (--new-label parser separator), +/// `\0` (null byte), `\n`, `\r`, `\t` (break CSV output). +/// Empty labels are also rejected. +pub fn validate_label(label: &str) -> Result<(), String> { + if label.is_empty() { + return Err("genome label must not be empty".into()); + } + const FORBIDDEN: &[char] = &['/', '=', '\0', '\n', '\r', '\t']; + if let Some(c) = label.chars().find(|c| FORBIDDEN.contains(c)) { + let display = match c { + '\0' => "\\0 (null)".to_string(), + '\n' => "\\n (newline)".to_string(), + '\r' => "\\r (carriage return)".to_string(), + '\t' => "\\t (tab)".to_string(), + c => format!("'{c}'"), + }; + return Err(format!("genome label contains forbidden character {display}")); + } + Ok(()) +} diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index fe57749..8a7d8ea 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use clap::Args; -use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex}; +use obikindex::{validate_label, GenomeInfo, IndexConfig, IndexState, KmerIndex}; use obilayeredmap::IndexMode; fn 
parse_key_value(s: &str) -> Result<(String, String), String> { @@ -194,6 +194,10 @@ pub fn run(args: IndexArgs) { block_bits, }; let genome_info = args.label.as_ref().map(|label| { + validate_label(label).unwrap_or_else(|e| { + eprintln!("error: --label: {e}"); + std::process::exit(1); + }); let mut info = GenomeInfo::new(label.clone()); for (k, v) in &args.meta { info.meta.insert(k.clone(), v.clone()); diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 39e9a89..2fc940a 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,4 +1,5 @@ pub mod annotate; +pub mod utils; pub mod distance; pub mod dump; pub mod estimate; diff --git a/src/obikmer/src/cmd/utils.rs b/src/obikmer/src/cmd/utils.rs new file mode 100644 index 0000000..0e7ec20 --- /dev/null +++ b/src/obikmer/src/cmd/utils.rs @@ -0,0 +1,91 @@ +use std::path::PathBuf; + +use clap::Args; +use obikindex::{validate_label, KmerIndex}; +use tracing::info; + +#[derive(Args)] +pub struct UtilsArgs { + /// Index directory to operate on + pub index: PathBuf, + + /// Set a new genome label: NEW_LABEL=OLD_LABEL + #[arg(long, value_name = "NEW=OLD")] + pub new_label: Option<String>, +} + +pub fn run(args: UtilsArgs) { + let mut any = false; + + if let Some(spec) = &args.new_label { + any = true; + run_rename(&args.index, spec); + } + + if !any { + eprintln!("utils: no operation specified. 
Available options: --new-label NEW=OLD"); + std::process::exit(1); + } +} + +fn run_rename(index_path: &PathBuf, spec: &str) { + let (old_label, new_label) = parse_rename_spec(spec); + + let mut idx = KmerIndex::open(index_path).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + let pos = idx + .meta() + .genomes + .iter() + .position(|g| g.label == old_label) + .unwrap_or_else(|| { + eprintln!("error: genome '{old_label}' not found in index"); + std::process::exit(1); + }); + + validate_label(&new_label).unwrap_or_else(|e| { + eprintln!("error: --new-label: {e}"); + std::process::exit(1); + }); + + // Check the new label is not already taken. + if idx.meta().genomes.iter().any(|g| g.label == new_label) { + eprintln!("error: label '{new_label}' already exists in index"); + std::process::exit(1); + } + + idx.meta_mut().genomes[pos].label = new_label.clone(); + idx.meta_mut().write(index_path).unwrap_or_else(|e| { + eprintln!("error writing index metadata: {e}"); + std::process::exit(1); + }); + + // Rename the spectrum file if it exists. 
+ let spectrums_dir = index_path.join("spectrums"); + let old_spectrum = spectrums_dir.join(format!("{old_label}.json")); + let new_spectrum = spectrums_dir.join(format!("{new_label}.json")); + if old_spectrum.exists() { + std::fs::rename(&old_spectrum, &new_spectrum).unwrap_or_else(|e| { + eprintln!("warning: could not rename spectrum file: {e}"); + }); + } + + info!("renamed genome '{old_label}' → '{new_label}'"); +} + +fn parse_rename_spec(spec: &str) -> (String, String) { + let eq = spec.find('=').unwrap_or_else(|| { + eprintln!("error: --new-label expects NEW_LABEL=OLD_LABEL, got '{spec}'"); + std::process::exit(1); + }); + let new = spec[..eq].trim().to_string(); + let old = spec[eq + 1..].trim().to_string(); + if old.is_empty() || new.is_empty() { + eprintln!("error: --new-label: both old and new labels must be non-empty"); + std::process::exit(1); + } + (old, new) +} diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index e6700e1..a872e46 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -36,6 +36,8 @@ enum Commands { Estimate(cmd::estimate::EstimateArgs), /// Convert an index's evidence in-place: exact ↔ approx Reindex(cmd::reindex::ReindexArgs), + /// Miscellaneous index utilities (--new-label, …) + Utils(cmd::utils::UtilsArgs), } fn main() { @@ -68,6 +70,7 @@ fn main() { Commands::Unitig(args) => cmd::unitig::run(args), Commands::Estimate(args) => cmd::estimate::run(args), Commands::Reindex(args) => cmd::reindex::run(args), + Commands::Utils(args) => cmd::utils::run(args), } #[cfg(feature = "profiling")]