diff --git a/docmd/index.md b/docmd/index.md index 9cc7ef2..bd3e968 100644 --- a/docmd/index.md +++ b/docmd/index.md @@ -17,6 +17,7 @@ | `unitig` | Dump unitigs from a built index to stdout (debug) | | `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing | | `reindex` | Convert an index's evidence in-place: exact ↔ approx | +| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place | ## Constraints @@ -24,7 +25,7 @@ - Maximum efficiency in computation, memory, and disk usage - k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base) - Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half -- Input formats: FASTA, FASTQ, gzip, streaming stdin +- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths) ## Priority operations diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index 9f84178..b5e881e 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -12,5 +12,5 @@ pub use error::{OKIError, OKIResult}; pub use distance::{DistanceMetric, DistanceOutput}; pub use index::KmerIndex; pub use merge::MergeMode; -pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; +pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME}; pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED}; diff --git a/src/obikindex/src/meta.rs b/src/obikindex/src/meta.rs index 8518f62..b0c5aa7 100644 --- a/src/obikindex/src/meta.rs +++ b/src/obikindex/src/meta.rs @@ -70,3 +70,26 @@ impl IndexMeta { self.genomes.iter().map(|g| g.label.as_str()) } } + +/// Validate a user-supplied genome label. +/// +/// Forbidden: `/` (filesystem separator), `=` (--new-label parser separator), +/// `\0` (null byte), `\n`, `\r`, `\t` (break CSV output). 
+/// Empty labels are also rejected. +pub fn validate_label(label: &str) -> Result<(), String> { + if label.is_empty() { + return Err("genome label must not be empty".into()); + } + const FORBIDDEN: &[char] = &['/', '=', '\0', '\n', '\r', '\t']; + if let Some(c) = label.chars().find(|c| FORBIDDEN.contains(c)) { + let display = match c { + '\0' => "\\0 (null)".to_string(), + '\n' => "\\n (newline)".to_string(), + '\r' => "\\r (carriage return)".to_string(), + '\t' => "\\t (tab)".to_string(), + c => format!("'{c}'"), + }; + return Err(format!("genome label contains forbidden character {display}")); + } + Ok(()) +} diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index fe57749..8a7d8ea 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use clap::Args; -use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex}; +use obikindex::{validate_label, GenomeInfo, IndexConfig, IndexState, KmerIndex}; use obilayeredmap::IndexMode; fn parse_key_value(s: &str) -> Result<(String, String), String> { @@ -194,6 +194,10 @@ pub fn run(args: IndexArgs) { block_bits, }; let genome_info = args.label.as_ref().map(|label| { + validate_label(label).unwrap_or_else(|e| { + eprintln!("error: --label: {e}"); + std::process::exit(1); + }); let mut info = GenomeInfo::new(label.clone()); for (k, v) in &args.meta { info.meta.insert(k.clone(), v.clone()); diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 39e9a89..2fc940a 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,4 +1,5 @@ pub mod annotate; +pub mod utils; pub mod distance; pub mod dump; pub mod estimate; diff --git a/src/obikmer/src/cmd/utils.rs b/src/obikmer/src/cmd/utils.rs new file mode 100644 index 0000000..0e7ec20 --- /dev/null +++ b/src/obikmer/src/cmd/utils.rs @@ -0,0 +1,91 @@ +use std::path::PathBuf; + +use clap::Args; +use obikindex::{validate_label, KmerIndex}; +use tracing::info; + 
+#[derive(Args)] +pub struct UtilsArgs { + /// Index directory to operate on + pub index: PathBuf, + + /// Set a new genome label: NEW_LABEL=OLD_LABEL + #[arg(long, value_name = "NEW=OLD")] + pub new_label: Option<String>, +} + +pub fn run(args: UtilsArgs) { + let mut any = false; + + if let Some(spec) = &args.new_label { + any = true; + run_rename(&args.index, spec); + } + + if !any { + eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD"); + std::process::exit(1); + } +} + +fn run_rename(index_path: &PathBuf, spec: &str) { + let (old_label, new_label) = parse_rename_spec(spec); + + let mut idx = KmerIndex::open(index_path).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + let pos = idx + .meta() + .genomes + .iter() + .position(|g| g.label == old_label) + .unwrap_or_else(|| { + eprintln!("error: genome '{old_label}' not found in index"); + std::process::exit(1); + }); + + validate_label(&new_label).unwrap_or_else(|e| { + eprintln!("error: --new-label: {e}"); + std::process::exit(1); + }); + + // Check the new label is not already taken. + if idx.meta().genomes.iter().any(|g| g.label == new_label) { + eprintln!("error: label '{new_label}' already exists in index"); + std::process::exit(1); + } + + idx.meta_mut().genomes[pos].label = new_label.clone(); + idx.meta_mut().write(index_path).unwrap_or_else(|e| { + eprintln!("error writing index metadata: {e}"); + std::process::exit(1); + }); + + // Rename the spectrum file if it exists. 
+ let spectrums_dir = index_path.join("spectrums"); + let old_spectrum = spectrums_dir.join(format!("{old_label}.json")); + let new_spectrum = spectrums_dir.join(format!("{new_label}.json")); + if old_spectrum.exists() { + std::fs::rename(&old_spectrum, &new_spectrum).unwrap_or_else(|e| { + eprintln!("warning: could not rename spectrum file: {e}"); + }); + } + + info!("renamed genome '{old_label}' → '{new_label}'"); +} + +fn parse_rename_spec(spec: &str) -> (String, String) { + let eq = spec.find('=').unwrap_or_else(|| { + eprintln!("error: --new-label expects NEW_LABEL=OLD_LABEL, got '{spec}'"); + std::process::exit(1); + }); + let new = spec[..eq].trim().to_string(); + let old = spec[eq + 1..].trim().to_string(); + if old.is_empty() || new.is_empty() { + eprintln!("error: --new-label: both old and new labels must be non-empty"); + std::process::exit(1); + } + (old, new) +} diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index e6700e1..a872e46 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -36,6 +36,8 @@ enum Commands { Estimate(cmd::estimate::EstimateArgs), /// Convert an index's evidence in-place: exact ↔ approx Reindex(cmd::reindex::ReindexArgs), + /// Miscellaneous index utilities (--new-label, …) + Utils(cmd::utils::UtilsArgs), } fn main() { @@ -68,6 +70,7 @@ fn main() { Commands::Unitig(args) => cmd::unitig::run(args), Commands::Estimate(args) => cmd::estimate::run(args), Commands::Reindex(args) => cmd::reindex::run(args), + Commands::Utils(args) => cmd::utils::run(args), } #[cfg(feature = "profiling")]