feat: add utils subcommand for renaming genome labels

Introduces a `utils` CLI subcommand to enable in-place genome label renaming without full reindexing. Adds strict label validation to reject empty strings, filesystem separators, and control characters, ensuring safe CSV serialization. Updates index metadata, renames corresponding spectrum JSON files, and registers the command in the main dispatch logic. CLI reference documentation is also updated.
This commit is contained in:
Eric Coissac
2026-05-26 14:42:18 +02:00
parent 9e60a711bc
commit dfa0b2bac2
7 changed files with 126 additions and 3 deletions
+2 -1
View File
@@ -17,6 +17,7 @@
| `unitig` | Dump unitigs from a built index to stdout (debug) |
| `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing |
| `reindex` | Convert an index's evidence in-place: exact ↔ approx |
| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place |
## Constraints
@@ -24,7 +25,7 @@
- Maximum efficiency in computation, memory, and disk usage
- k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base)
- Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half
- Input formats: FASTA, FASTQ, gzip, streaming stdin
- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths)
## Priority operations
+1 -1
View File
@@ -12,5 +12,5 @@ pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput};
pub use index::KmerIndex;
pub use merge::MergeMode;
pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
+23
View File
@@ -70,3 +70,26 @@ impl IndexMeta {
self.genomes.iter().map(|g| g.label.as_str())
}
}
/// Validate a user-supplied genome label.
///
/// A label is rejected when it is empty or contains any of:
/// - `/` (filesystem separator);
/// - `=` (separator of the `--new-label NEW=OLD` syntax);
/// - any control character as defined by [`char::is_control`]. This covers
///   `\0`, `\n`, `\r` and `\t` (which break CSV output) as well as the
///   remaining C0/C1 controls such as ESC or DEL.
///
/// # Errors
///
/// Returns a human-readable message describing the first offending
/// character. Control characters are shown in escaped form so the message
/// itself stays printable.
pub fn validate_label(label: &str) -> Result<(), String> {
    if label.is_empty() {
        return Err("genome label must not be empty".into());
    }

    let offending = label
        .chars()
        .find(|&c| matches!(c, '/' | '=') || c.is_control());

    match offending {
        None => Ok(()),
        Some(c) => {
            let display = match c {
                '\0' => "\\0 (null)".to_string(),
                '\n' => "\\n (newline)".to_string(),
                '\r' => "\\r (carriage return)".to_string(),
                '\t' => "\\t (tab)".to_string(),
                // Any other control character: print its escape, never the raw char.
                c if c.is_control() => format!("{} (control character)", c.escape_default()),
                c => format!("'{c}'"),
            };
            Err(format!("genome label contains forbidden character {display}"))
        }
    }
}
+5 -1
View File
@@ -1,7 +1,7 @@
use std::path::PathBuf;
use clap::Args;
use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex};
use obikindex::{validate_label, GenomeInfo, IndexConfig, IndexState, KmerIndex};
use obilayeredmap::IndexMode;
fn parse_key_value(s: &str) -> Result<(String, String), String> {
@@ -194,6 +194,10 @@ pub fn run(args: IndexArgs) {
block_bits,
};
let genome_info = args.label.as_ref().map(|label| {
validate_label(label).unwrap_or_else(|e| {
eprintln!("error: --label: {e}");
std::process::exit(1);
});
let mut info = GenomeInfo::new(label.clone());
for (k, v) in &args.meta {
info.meta.insert(k.clone(), v.clone());
+1
View File
@@ -1,4 +1,5 @@
pub mod annotate;
pub mod utils;
pub mod distance;
pub mod dump;
pub mod estimate;
+91
View File
@@ -0,0 +1,91 @@
use std::path::PathBuf;
use clap::Args;
use obikindex::{validate_label, KmerIndex};
use tracing::info;
// Command-line arguments of the `utils` subcommand.
//
// The `///` comments on the fields are consumed by clap's derive macro as
// `--help` text, so they are user-facing strings rather than internal notes.
#[derive(Args)]
pub struct UtilsArgs {
/// Index directory to operate on
pub index: PathBuf,
// Raw `NEW=OLD` rename request; it is kept as a single string here and split
// later by `parse_rename_spec`. `None` when `--new-label` was not given.
/// Set a new genome label: NEW_LABEL=OLD_LABEL
#[arg(long, value_name = "NEW=OLD")]
pub new_label: Option<String>,
}
/// Entry point of the `utils` subcommand.
///
/// Runs the operation selected on the command line. When no operation was
/// requested, prints a usage hint on stderr and exits with status 1.
pub fn run(args: UtilsArgs) {
    match args.new_label.as_ref() {
        Some(spec) => run_rename(&args.index, spec),
        None => {
            eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD");
            std::process::exit(1);
        }
    }
}
/// Rename one genome label of the index stored at `index_path`.
///
/// `spec` is the raw `--new-label` value (`NEW=OLD`). The new label must pass
/// `validate_label` and must not already be used by a genome of the index.
/// The file `spectrums/<OLD>.json`, when present, is renamed to
/// `spectrums/<NEW>.json`.
///
/// The spectrum file is moved *before* the metadata is rewritten, and an
/// attempt is made to move it back if that rewrite fails. A failure at either
/// step therefore leaves the index under its old label instead of
/// half-renamed.
///
/// Every error is reported on stderr and terminates the process with exit
/// status 1.
fn run_rename(index_path: &PathBuf, spec: &str) {
    let (old_label, new_label) = parse_rename_spec(spec);

    // Reject a malformed target label before doing any I/O on the index.
    if let Err(e) = validate_label(&new_label) {
        eprintln!("error: --new-label: {e}");
        std::process::exit(1);
    }

    let mut idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
        eprintln!("error opening index: {e}");
        std::process::exit(1);
    });

    let pos = idx
        .meta()
        .genomes
        .iter()
        .position(|g| g.label == old_label)
        .unwrap_or_else(|| {
            eprintln!("error: genome '{old_label}' not found in index");
            std::process::exit(1);
        });

    // Check the new label is not already taken.
    if idx.meta().genomes.iter().any(|g| g.label == new_label) {
        eprintln!("error: label '{new_label}' already exists in index");
        std::process::exit(1);
    }

    // Step 1: move the spectrum file. Nothing has been modified yet, so
    // aborting here leaves the index untouched.
    let spectrums_dir = index_path.join("spectrums");
    let old_spectrum = spectrums_dir.join(format!("{old_label}.json"));
    let new_spectrum = spectrums_dir.join(format!("{new_label}.json"));
    let spectrum_moved = match std::fs::rename(&old_spectrum, &new_spectrum) {
        Ok(()) => true,
        // No spectrum file for this genome: nothing to move.
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => false,
        Err(e) => {
            eprintln!("error: could not rename spectrum file: {e}");
            std::process::exit(1);
        }
    };

    // Step 2: commit the new label to the metadata; undo step 1 on failure.
    idx.meta_mut().genomes[pos].label = new_label.clone();
    if let Err(e) = idx.meta_mut().write(index_path) {
        eprintln!("error writing index metadata: {e}");
        if spectrum_moved {
            if let Err(undo) = std::fs::rename(&new_spectrum, &old_spectrum) {
                eprintln!("warning: could not restore spectrum file: {undo}");
            }
        }
        std::process::exit(1);
    }

    info!("renamed genome '{old_label}' → '{new_label}'");
}
/// Split a `--new-label` argument of the form `NEW_LABEL=OLD_LABEL`.
///
/// The split happens at the first `=`; both halves are trimmed of
/// surrounding whitespace. Returns the pair as `(old, new)` — note that this
/// is the reverse of the order in which they appear in `spec`.
///
/// Prints an error on stderr and exits with status 1 when `spec` contains no
/// `=` or when either half is empty after trimming.
fn parse_rename_spec(spec: &str) -> (String, String) {
    let (new_part, old_part) = match spec.split_once('=') {
        Some(halves) => halves,
        None => {
            eprintln!("error: --new-label expects NEW_LABEL=OLD_LABEL, got '{spec}'");
            std::process::exit(1);
        }
    };

    let new_label = new_part.trim();
    let old_label = old_part.trim();
    if new_label.is_empty() || old_label.is_empty() {
        eprintln!("error: --new-label: both old and new labels must be non-empty");
        std::process::exit(1);
    }

    (old_label.to_string(), new_label.to_string())
}
+3
View File
@@ -36,6 +36,8 @@ enum Commands {
Estimate(cmd::estimate::EstimateArgs),
/// Convert an index's evidence in-place: exact ↔ approx
Reindex(cmd::reindex::ReindexArgs),
/// Miscellaneous index utilities (--new-label NEW=OLD, …)
Utils(cmd::utils::UtilsArgs),
}
fn main() {
@@ -68,6 +70,7 @@ fn main() {
Commands::Unitig(args) => cmd::unitig::run(args),
Commands::Estimate(args) => cmd::estimate::run(args),
Commands::Reindex(args) => cmd::reindex::run(args),
Commands::Utils(args) => cmd::utils::run(args),
}
#[cfg(feature = "profiling")]