feat: add utils subcommand for renaming genome labels
Introduces a `utils` CLI subcommand to enable in-place genome label renaming without full reindexing. Adds strict label validation to reject empty strings, filesystem separators, and control characters, ensuring safe CSV serialization. Updates index metadata, renames corresponding spectrum JSON files, and registers the command in the main dispatch logic. CLI reference documentation is also updated.
This commit is contained in:
+2
-1
@@ -17,6 +17,7 @@
|
|||||||
| `unitig` | Dump unitigs from a built index to stdout (debug) |
|
| `unitig` | Dump unitigs from a built index to stdout (debug) |
|
||||||
| `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing |
|
| `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing |
|
||||||
| `reindex` | Convert an index's evidence in-place: exact ↔ approx |
|
| `reindex` | Convert an index's evidence in-place: exact ↔ approx |
|
||||||
|
| `utils` | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label in-place |
|
||||||
|
|
||||||
## Constraints
|
## Constraints
|
||||||
|
|
||||||
@@ -24,7 +25,7 @@
|
|||||||
- Maximum efficiency in computation, memory, and disk usage
|
- Maximum efficiency in computation, memory, and disk usage
|
||||||
- k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base)
|
- k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base)
|
||||||
- Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half
|
- Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half
|
||||||
- Input formats: FASTA, FASTQ, gzip, streaming stdin
|
- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths)
|
||||||
|
|
||||||
## Priority operations
|
## Priority operations
|
||||||
|
|
||||||
|
|||||||
@@ -12,5 +12,5 @@ pub use error::{OKIError, OKIResult};
|
|||||||
pub use distance::{DistanceMetric, DistanceOutput};
|
pub use distance::{DistanceMetric, DistanceOutput};
|
||||||
pub use index::KmerIndex;
|
pub use index::KmerIndex;
|
||||||
pub use merge::MergeMode;
|
pub use merge::MergeMode;
|
||||||
pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
|
pub use meta::{validate_label, GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
|
||||||
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||||
|
|||||||
@@ -70,3 +70,26 @@ impl IndexMeta {
|
|||||||
self.genomes.iter().map(|g| g.label.as_str())
|
self.genomes.iter().map(|g| g.label.as_str())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check that `label` is acceptable as a genome label.
///
/// A label is rejected when it is empty or when it contains any of:
/// `/` (filesystem separator), `=` (separator of the `--new-label` parser),
/// `\0` (null byte), `\n`, `\r` or `\t` (these break CSV output).
///
/// # Errors
///
/// Returns a human-readable message. For a forbidden character, the message
/// names the first offending character in label order.
pub fn validate_label(label: &str) -> Result<(), String> {
    if label.is_empty() {
        return Err(String::from("genome label must not be empty"));
    }

    // Classify each character once: forbidden ones map to their printable
    // description, everything else maps to `None` and the scan continues.
    let offending = label.chars().find_map(|ch| match ch {
        '\0' => Some(String::from("\\0 (null)")),
        '\n' => Some(String::from("\\n (newline)")),
        '\r' => Some(String::from("\\r (carriage return)")),
        '\t' => Some(String::from("\\t (tab)")),
        '/' | '=' => Some(format!("'{ch}'")),
        _ => None,
    });

    match offending {
        Some(shown) => Err(format!("genome label contains forbidden character {shown}")),
        None => Ok(()),
    }
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex};
|
use obikindex::{validate_label, GenomeInfo, IndexConfig, IndexState, KmerIndex};
|
||||||
use obilayeredmap::IndexMode;
|
use obilayeredmap::IndexMode;
|
||||||
|
|
||||||
fn parse_key_value(s: &str) -> Result<(String, String), String> {
|
fn parse_key_value(s: &str) -> Result<(String, String), String> {
|
||||||
@@ -194,6 +194,10 @@ pub fn run(args: IndexArgs) {
|
|||||||
block_bits,
|
block_bits,
|
||||||
};
|
};
|
||||||
let genome_info = args.label.as_ref().map(|label| {
|
let genome_info = args.label.as_ref().map(|label| {
|
||||||
|
validate_label(label).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error: --label: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
let mut info = GenomeInfo::new(label.clone());
|
let mut info = GenomeInfo::new(label.clone());
|
||||||
for (k, v) in &args.meta {
|
for (k, v) in &args.meta {
|
||||||
info.meta.insert(k.clone(), v.clone());
|
info.meta.insert(k.clone(), v.clone());
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
pub mod annotate;
|
pub mod annotate;
|
||||||
|
pub mod utils;
|
||||||
pub mod distance;
|
pub mod distance;
|
||||||
pub mod dump;
|
pub mod dump;
|
||||||
pub mod estimate;
|
pub mod estimate;
|
||||||
|
|||||||
@@ -0,0 +1,91 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use clap::Args;
|
||||||
|
use obikindex::{validate_label, KmerIndex};
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct UtilsArgs {
|
||||||
|
/// Index directory to operate on
|
||||||
|
pub index: PathBuf,
|
||||||
|
|
||||||
|
/// Set a new genome label: NEW_LABEL=OLD_LABEL
|
||||||
|
#[arg(long, value_name = "NEW=OLD")]
|
||||||
|
pub new_label: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(args: UtilsArgs) {
|
||||||
|
let mut any = false;
|
||||||
|
|
||||||
|
if let Some(spec) = &args.new_label {
|
||||||
|
any = true;
|
||||||
|
run_rename(&args.index, spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
if !any {
|
||||||
|
eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_rename(index_path: &PathBuf, spec: &str) {
|
||||||
|
let (old_label, new_label) = parse_rename_spec(spec);
|
||||||
|
|
||||||
|
let mut idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening index: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
let pos = idx
|
||||||
|
.meta()
|
||||||
|
.genomes
|
||||||
|
.iter()
|
||||||
|
.position(|g| g.label == old_label)
|
||||||
|
.unwrap_or_else(|| {
|
||||||
|
eprintln!("error: genome '{old_label}' not found in index");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
validate_label(&new_label).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error: --new-label: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Check the new label is not already taken.
|
||||||
|
if idx.meta().genomes.iter().any(|g| g.label == new_label) {
|
||||||
|
eprintln!("error: label '{new_label}' already exists in index");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
idx.meta_mut().genomes[pos].label = new_label.clone();
|
||||||
|
idx.meta_mut().write(index_path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error writing index metadata: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Rename the spectrum file if it exists.
|
||||||
|
let spectrums_dir = index_path.join("spectrums");
|
||||||
|
let old_spectrum = spectrums_dir.join(format!("{old_label}.json"));
|
||||||
|
let new_spectrum = spectrums_dir.join(format!("{new_label}.json"));
|
||||||
|
if old_spectrum.exists() {
|
||||||
|
std::fs::rename(&old_spectrum, &new_spectrum).unwrap_or_else(|e| {
|
||||||
|
eprintln!("warning: could not rename spectrum file: {e}");
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("renamed genome '{old_label}' → '{new_label}'");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Split a `--new-label` specification of the form `NEW_LABEL=OLD_LABEL`.
///
/// The split happens at the first `=`; both halves are trimmed of
/// surrounding whitespace.
///
/// Returns `(old_label, new_label)` — note the order is the reverse of the
/// order in which the two labels are written in `spec`.
///
/// Prints an error on stderr and exits with status 1 when `spec` has no `=`
/// or when either half is empty after trimming.
fn parse_rename_spec(spec: &str) -> (String, String) {
    let (new, old) = match spec.split_once('=') {
        Some((lhs, rhs)) => (lhs.trim(), rhs.trim()),
        None => {
            eprintln!("error: --new-label expects NEW_LABEL=OLD_LABEL, got '{spec}'");
            std::process::exit(1);
        }
    };

    if new.is_empty() || old.is_empty() {
        eprintln!("error: --new-label: both old and new labels must be non-empty");
        std::process::exit(1);
    }

    (old.to_owned(), new.to_owned())
}
|
||||||
@@ -36,6 +36,8 @@ enum Commands {
|
|||||||
Estimate(cmd::estimate::EstimateArgs),
|
Estimate(cmd::estimate::EstimateArgs),
|
||||||
/// Convert an index's evidence in-place: exact ↔ approx
|
/// Convert an index's evidence in-place: exact ↔ approx
|
||||||
Reindex(cmd::reindex::ReindexArgs),
|
Reindex(cmd::reindex::ReindexArgs),
|
||||||
|
/// Miscellaneous index utilities (--new-label, …)
|
||||||
|
Utils(cmd::utils::UtilsArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
@@ -68,6 +70,7 @@ fn main() {
|
|||||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||||
Commands::Estimate(args) => cmd::estimate::run(args),
|
Commands::Estimate(args) => cmd::estimate::run(args),
|
||||||
Commands::Reindex(args) => cmd::reindex::run(args),
|
Commands::Reindex(args) => cmd::reindex::run(args),
|
||||||
|
Commands::Utils(args) => cmd::utils::run(args),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "profiling")]
|
#[cfg(feature = "profiling")]
|
||||||
|
|||||||
Reference in New Issue
Block a user