From 9356be4ec0afd7e719ba6cf10e559b75eacad887 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 21 Jun 2026 10:37:50 +0200 Subject: [PATCH] feat: introduce obitaxonomy crate for hierarchical taxonomy parsing Adds the `obitaxonomy` crate to parse and validate hierarchical taxonomy paths using a strict `taxonomy:/name@rank/...` syntax. Replaces generic string-based path matching in predicates with structured `TaxPath` and `TaxPattern` types, enforcing explicit anchor constraints and rank-aware semantics. Updates filtering documentation to clarify optional leading slashes and segment-boundary matching rules. --- docmd/implementation/filtering.md | 18 ++-- docmd/implementation/obitaxonomy.md | 143 +++++++++++++++++++++++++ src/Cargo.lock | 1 + src/obikmer/Cargo.toml | 1 + src/obikmer/src/cmd/predicate.rs | 20 ++-- src/obitaxonomy/Cargo.toml | 6 ++ src/obitaxonomy/src/error.rs | 38 +++++++ src/obitaxonomy/src/lib.rs | 11 ++ src/obitaxonomy/src/path.rs | 82 ++++++++++++++ src/obitaxonomy/src/pattern.rs | 72 +++++++++++++ src/obitaxonomy/src/segment.rs | 49 +++++++++ src/obitaxonomy/src/segment_pattern.rs | 41 +++++++ 12 files changed, 464 insertions(+), 18 deletions(-) create mode 100644 docmd/implementation/obitaxonomy.md create mode 100644 src/obitaxonomy/Cargo.toml create mode 100644 src/obitaxonomy/src/error.rs create mode 100644 src/obitaxonomy/src/lib.rs create mode 100644 src/obitaxonomy/src/path.rs create mode 100644 src/obitaxonomy/src/pattern.rs create mode 100644 src/obitaxonomy/src/segment.rs create mode 100644 src/obitaxonomy/src/segment_pattern.rs diff --git a/docmd/implementation/filtering.md b/docmd/implementation/filtering.md index fe56bc9..ea6d4a2 100644 --- a/docmd/implementation/filtering.md +++ b/docmd/implementation/filtering.md @@ -32,14 +32,20 @@ Multiple values separated by `|` are always OR-ed within the predicate. 
Metadata values can represent hierarchical concept paths such as `/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`. -**Both the stored metadata value and the pattern must start with `/`.** -A pattern that does not start with `/` is rejected at parse time with an error. +Stored taxonomy values always start with the `taxonomy:/` prefix (its `/` is the root of the path). +Query patterns do **not** need to start with `/` — a leading `/` is an optional +start anchor, not a requirement. -The value matches the pattern if it equals it exactly or starts with the pattern -followed by `/` (segment-boundary prefix): +| Pattern form | Semantics | +|---|---| +| `A/B` | contiguous sub-path A then B, anywhere in the value | +| `/A/B` | value starts with A then B | +| `A/B$` | value ends with A then B | +| `/A/B$` | value is exactly A then B | +| `A@x/B` | A with rank `x` followed by B with any rank | -- `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and - `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`. +- `taxon~/Betulaceae/Betula` matches any path that starts with `Betulaceae` then `Betula`. +- `taxon~Betula` matches any path containing `Betula` as a segment, anywhere. ### Missing metadata key → NA diff --git a/docmd/implementation/obitaxonomy.md b/docmd/implementation/obitaxonomy.md new file mode 100644 index 0000000..d8ccd22 --- /dev/null +++ b/docmd/implementation/obitaxonomy.md @@ -0,0 +1,143 @@ +# `obitaxonomy` — taxonomy concept paths + +`obitaxonomy` is a dependency-free crate that defines a typed representation +of hierarchical concept paths (taxonomic or otherwise) stored in genome metadata. + +--- + +## Concept path syntax + +A concept path is stored as a metadata value with the prefix `taxonomy:/`: + +``` +taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species +``` + +Structure: + +- The `taxonomy:/` prefix is the type discriminator. Any metadata value starting + with it is parsed as a `TaxPath`; all others remain plain strings. 
+- The remainder is one or more `/`-separated segments. +- Each segment is `name` or `name@rank`, where `rank` is a label for the + taxonomic level (e.g. `family`, `genus`, `species`). +- Rank annotations are **optional per segment** and can be mixed freely. +- Spaces are allowed in both names and ranks. + +### Reserved character + +`@` is reserved throughout the taxonomy system and may **not** appear in: + +| Context | Constraint | +|---------|------------| +| Segment name | forbidden | +| Rank/class label | forbidden | +| Metadata key names | forbidden (used as `key@rank` in predicate syntax) | + +`@` is freely allowed in plain-text metadata values (non-taxonomy). + +### Parse errors + +| Condition | Error | +|-----------|-------| +| Value does not start with `taxonomy:/` | `MissingPrefix` | +| No segments after the prefix | `EmptyPath` | +| Segment with empty name (consecutive or trailing `/`) | `EmptySegmentName` | +| Segment with trailing `@` and no rank (`name@`) | `EmptyRankName` | +| Segment with more than one `@` | `AmbiguousRank` | + +--- + +## Public API + +### `TaxSegment` + +A single node: a name and an optional rank. + +```rust +seg.name() // &str +seg.rank() // Option<&str> +seg.to_string() // "name" or "name@rank" +TaxSegment::parse(s) // Result<TaxSegment, TaxError> +``` + +### `TaxPath` + +```rust +TaxPath::parse(s) // Result<TaxPath, TaxError> +path.segments() // &[TaxSegment] +path.depth() // usize — number of segments +path.is_ancestor_of(&other) // bool — prefix match by name, ranks ignored +path.name_at_rank("genus") // Option<&str> +path.to_string() // reconstructs "taxonomy:/…" +``` + +`is_ancestor_of` compares segment **names** only — rank annotations are +informational and do not affect the ancestry relation. 
+ +```rust +let a: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus".parse()?; +let b: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species".parse()?; + +assert!(a.is_ancestor_of(&b)); // a is a prefix of b +assert!(!b.is_ancestor_of(&a)); // b is deeper than a, so it is not an ancestor +assert!(a.is_ancestor_of(&a)); // true (equal ⇒ ancestor) + +assert_eq!(b.name_at_rank("species"), Some("Escherichia coli")); +assert_eq!(b.name_at_rank("genus"), Some("Escherichia")); +assert_eq!(b.name_at_rank("order"), None); +``` + +--- + +## Integration with `GenomeInfo` + +At index load time, every metadata value is inspected once: + +- Starts with `taxonomy:/` → parsed into `TaxPath`, stored in `genome.taxonomy`. +- Otherwise → kept as-is in `genome.meta`. + +```rust +struct GenomeInfo { + label: String, + meta: HashMap<String, String>, // plain text metadata + taxonomy: HashMap<String, TaxPath>, // parsed taxonomy metadata +} +``` + +The raw string is not duplicated. `TaxPath::to_string()` reconstructs the +original value losslessly for serialisation. + +--- + +## Predicate operators (in `filter` / `select`) + +Path predicates use the `~` / `!~` operators. The **stored value** always starts +with `taxonomy:/` (rooted path); the **query pattern** does not need a leading `/`. + +### Path pattern syntax + +| Pattern | Semantics | +|---------|-----------| +| `A/B` | contiguous sub-path A then B, anywhere in the value | +| `/A/B` | value starts with A then B (start-anchored) | +| `A/B$` | value ends with A then B (end-anchored) | +| `/A/B$` | value is exactly A then B (fully anchored) | +| `A@x/B` | A with rank `x` followed by B with any rank | +| `A@x/B@y` | A with rank `x` followed by B with rank `y` | + +A segment pattern without `@` matches the segment name regardless of its stored rank. 
+ +### Rank-aware queries + +``` +key@rank=value +``` + +| Predicate form | Semantics | +|----------------|-----------| +| `key@rank=value` | genome's `key` has `value` at rank `rank` | +| `key@rank!=value` | does not | +| `key@rank=v1\|v2` | value at `rank` is `v1` or `v2` | + +`~` combined with `@rank` on the key (e.g. `key@genus~pattern`) is not defined +and is rejected at parse time. diff --git a/src/Cargo.lock b/src/Cargo.lock index a48a7fd..bdb1caa 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1722,6 +1722,7 @@ dependencies = [ "obiskbuilder", "obiskio", "obisys", + "obitaxonomy", "pprof", "rayon", "serde_json", diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index 2dcfb91..4045476 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -19,6 +19,7 @@ obikpartitionner = { path = "../obikpartitionner" } obisys = { path = "../obisys" } obiskio = { path = "../obiskio" } obikindex = { path = "../obikindex" } +obitaxonomy = { path = "../obitaxonomy" } obilayeredmap = { path = "../obilayeredmap" } clap = { version = "4", features = ["derive"] } serde_json = "1" diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs index b1183d3..47baab9 100644 --- a/src/obikmer/src/cmd/predicate.rs +++ b/src/obikmer/src/cmd/predicate.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use clap::Args; use obikindex::GenomeInfo; use obikpartitionner::{GroupQuorumFilter, KmerFilter}; +use obitaxonomy::{TaxPath, TaxPattern}; // ── Operator ────────────────────────────────────────────────────────────────── @@ -49,12 +50,6 @@ impl MetaPred { if values.iter().any(|v| v.is_empty()) { return Err(format!("empty value in predicate: {s}")); } - if matches!(op, PredOp::Matches | PredOp::NotMatches) { - if let Some(v) = values.iter().find(|v| !v.starts_with('/')) { - return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}")); - } - } - Ok(Self { key, op, values }) } @@ -75,14 +70,15 @@ impl MetaPred { // ── 
Path matching ───────────────────────────────────────────────────────────── -/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy. +/// True if the stored taxonomy `value` matches `pattern`. /// -/// Both `value` and `pattern` must start with `/`. -/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`. +/// `value` must be a valid `TaxPath` (starts with `taxonomy:/`). +/// `pattern` is a `TaxPattern` query (see `obitaxonomy::TaxPattern` for syntax). +/// Returns `false` if either fails to parse. fn path_matches(value: &str, pattern: &str) -> bool { - value == pattern - || (value.starts_with(pattern) - && value[pattern.len()..].starts_with('/')) + let Ok(path) = TaxPath::parse(value) else { return false }; + let Ok(pat) = TaxPattern::parse(pattern) else { return false }; + pat.matches(&path) } // ── Three-value group evaluation ────────────────────────────────────────────── diff --git a/src/obitaxonomy/Cargo.toml b/src/obitaxonomy/Cargo.toml new file mode 100644 index 0000000..b391f4d --- /dev/null +++ b/src/obitaxonomy/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "obitaxonomy" +version = "0.1.0" +edition = "2024" + +[dependencies] diff --git a/src/obitaxonomy/src/error.rs b/src/obitaxonomy/src/error.rs new file mode 100644 index 0000000..5f4f24e --- /dev/null +++ b/src/obitaxonomy/src/error.rs @@ -0,0 +1,38 @@ +use std::fmt; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TaxError { + /// Stored value does not start with the `taxonomy:/` prefix. + MissingPrefix, + /// Stored path contains no segments after the prefix. + EmptyPath, + /// Query pattern contains no segments (after stripping anchors). + EmptyPattern, + /// A segment has an empty name (e.g. consecutive `/`). + EmptySegmentName, + /// A segment has a trailing `@` with no rank name. + EmptyRankName { segment: String }, + /// A segment contains more than one `@`. 
+ AmbiguousRank { segment: String }, +} + +impl fmt::Display for TaxError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + TaxError::MissingPrefix => + write!(f, "taxonomy path must start with \"taxonomy:/\""), + TaxError::EmptyPath => + write!(f, "taxonomy path has no segments"), + TaxError::EmptyPattern => + write!(f, "taxonomy query pattern has no segments"), + TaxError::EmptySegmentName => + write!(f, "segment has an empty name"), + TaxError::EmptyRankName { segment } => + write!(f, "segment has '@' with no rank name: {segment:?}"), + TaxError::AmbiguousRank { segment } => + write!(f, "segment contains more than one '@': {segment:?}"), + } + } +} + +impl std::error::Error for TaxError {} diff --git a/src/obitaxonomy/src/lib.rs b/src/obitaxonomy/src/lib.rs new file mode 100644 index 0000000..aea3cff --- /dev/null +++ b/src/obitaxonomy/src/lib.rs @@ -0,0 +1,11 @@ +mod error; +mod segment; +mod segment_pattern; +mod path; +mod pattern; + +pub use error::TaxError; +pub use segment::TaxSegment; +pub use segment_pattern::SegmentPattern; +pub use path::{TaxPath, PREFIX}; +pub use pattern::TaxPattern; diff --git a/src/obitaxonomy/src/path.rs b/src/obitaxonomy/src/path.rs new file mode 100644 index 0000000..096c09b --- /dev/null +++ b/src/obitaxonomy/src/path.rs @@ -0,0 +1,82 @@ +use std::fmt; +use std::str::FromStr; + +use crate::error::TaxError; +use crate::segment::TaxSegment; + +/// The prefix that marks a metadata value as a taxonomy path. +pub const PREFIX: &str = "taxonomy:/"; + +/// A rooted, `/`-separated taxonomy path with optional per-segment rank annotations. +/// +/// Stored form: `taxonomy:/seg1@rank1/seg2/seg3@rank3` +/// The leading `taxonomy:/` is the discriminator; the remainder is one or more +/// `/`-separated segments, each of the form `name` or `name@rank`. +/// +/// `@` is reserved and may not appear in segment names or rank names. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TaxPath { + segments: Vec, +} + +impl TaxPath { + pub fn parse(s: &str) -> Result { + let tail = s.strip_prefix(PREFIX).ok_or(TaxError::MissingPrefix)?; + if tail.is_empty() { + return Err(TaxError::EmptyPath); + } + let segments = tail.split('/') + .map(TaxSegment::parse) + .collect::, _>>()?; + Ok(Self { segments }) + } + + /// True if `self` is an ancestor of — or equal to — `other`. + /// + /// Comparison is by segment name only; rank annotations are ignored. + /// `self` must be a prefix of `other` at segment granularity. + pub fn is_ancestor_of(&self, other: &TaxPath) -> bool { + self.segments.len() <= other.segments.len() + && self.segments.iter().zip(other.segments.iter()) + .all(|(a, b)| a.name() == b.name()) + } + + /// Returns the name of the first segment whose rank equals `rank`, if any. + pub fn name_at_rank(&self, rank: &str) -> Option<&str> { + self.segments.iter() + .find(|s| s.rank() == Some(rank)) + .map(|s| s.name()) + } + + /// True if any segment has the given rank. + pub fn has_rank(&self, rank: &str) -> bool { + self.segments.iter().any(|s| s.rank() == Some(rank)) + } + + /// True if the path contains a segment with both the given rank and name. 
+ pub fn matches_rank(&self, rank: &str, name: &str) -> bool { + self.segments.iter().any(|s| s.rank() == Some(rank) && s.name() == name) + } + + pub fn segments(&self) -> &[TaxSegment] { &self.segments } + pub fn depth(&self) -> usize { self.segments.len() } + pub fn is_empty(&self) -> bool { self.segments.is_empty() } +} + +impl fmt::Display for TaxPath { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", PREFIX)?; + let mut first = true; + for seg in &self.segments { + if !first { write!(f, "/")?; } + write!(f, "{seg}")?; + first = false; + } + Ok(()) + } +} + +impl FromStr for TaxPath { + type Err = TaxError; + fn from_str(s: &str) -> Result { Self::parse(s) } +} diff --git a/src/obitaxonomy/src/pattern.rs b/src/obitaxonomy/src/pattern.rs new file mode 100644 index 0000000..c0474d8 --- /dev/null +++ b/src/obitaxonomy/src/pattern.rs @@ -0,0 +1,72 @@ +use crate::error::TaxError; +use crate::path::TaxPath; +use crate::segment::TaxSegment; +use crate::segment_pattern::SegmentPattern; + +/// A query pattern for matching against stored `TaxPath` values. +/// +/// Syntax: +/// +/// | Form | Semantics | +/// |----------|-----------| +/// | `A/B` | A then B as a contiguous sub-path, anywhere in the value | +/// | `/A/B` | value starts with A then B (start-anchored) | +/// | `A/B$` | value ends with A then B (end-anchored) | +/// | `/A/B$` | value is exactly A then B (fully anchored) | +/// | `A@x/B` | A with rank `x`, followed by B with any rank | +/// +/// A segment pattern without `@` matches any segment with that name regardless +/// of its stored rank. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TaxPattern { + start_anchored: bool, + end_anchored: bool, + segments: Vec, +} + +impl TaxPattern { + pub fn parse(s: &str) -> Result { + let s = s.trim(); + + let start_anchored = s.starts_with('/'); + let s = if start_anchored { &s[1..] 
} else { s }; + + let end_anchored = s.ends_with('$'); + let s = if end_anchored { &s[..s.len() - 1] } else { s }; + + if s.is_empty() { + return Err(TaxError::EmptyPattern); + } + + let segments = s.split('/') + .map(SegmentPattern::parse) + .collect::, _>>()?; + + Ok(Self { start_anchored, end_anchored, segments }) + } + + /// True if this pattern matches `path` according to the anchor flags. + /// + /// The pattern must match a contiguous run of segments in the path. + /// Start/end anchors restrict where that run may begin or end. + pub fn matches(&self, path: &TaxPath) -> bool { + let n = self.segments.len(); + let m = path.depth(); + + if n > m { return false; } + + let segs = path.segments(); + match (self.start_anchored, self.end_anchored) { + (true, true) => n == m && self.window_matches(segs, 0), + (true, false) => self.window_matches(segs, 0), + (false, true) => self.window_matches(segs, m - n), + (false, false) => (0..=(m - n)).any(|i| self.window_matches(segs, i)), + } + } + + fn window_matches(&self, segs: &[TaxSegment], start: usize) -> bool { + self.segments.iter() + .zip(segs[start..start + self.segments.len()].iter()) + .all(|(pat, seg)| pat.matches(seg)) + } +} diff --git a/src/obitaxonomy/src/segment.rs b/src/obitaxonomy/src/segment.rs new file mode 100644 index 0000000..b06436d --- /dev/null +++ b/src/obitaxonomy/src/segment.rs @@ -0,0 +1,49 @@ +use std::fmt; + +use crate::error::TaxError; + +/// A single node in a taxonomy path: a name and an optional rank. +/// +/// Neither `name` nor `rank` may contain `@` (reserved separator). +/// Serialised form: `name` or `name@rank`. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TaxSegment { + name: String, + rank: Option, +} + +impl TaxSegment { + pub fn parse(raw: &str) -> Result { + let parts: Vec<&str> = raw.splitn(3, '@').collect(); + + let (name_raw, rank_raw) = match parts.as_slice() { + [name] => (*name, None), + [name, rank] => (*name, Some(*rank)), + _ => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }), + }; + + if name_raw.is_empty() { + return Err(TaxError::EmptySegmentName); + } + + let rank = match rank_raw { + None => None, + Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }), + Some(r) => Some(r.to_string()), + }; + + Ok(Self { name: name_raw.to_string(), rank }) + } + + pub fn name(&self) -> &str { &self.name } + pub fn rank(&self) -> Option<&str> { self.rank.as_deref() } +} + +impl fmt::Display for TaxSegment { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match &self.rank { + None => write!(f, "{}", self.name), + Some(r) => write!(f, "{}@{}", self.name, r), + } + } +} diff --git a/src/obitaxonomy/src/segment_pattern.rs b/src/obitaxonomy/src/segment_pattern.rs new file mode 100644 index 0000000..13895ed --- /dev/null +++ b/src/obitaxonomy/src/segment_pattern.rs @@ -0,0 +1,41 @@ +use crate::error::TaxError; +use crate::segment::TaxSegment; + +/// A single segment in a query pattern: a required name and an optional rank filter. +/// +/// If `rank` is `None`, the pattern matches any segment with the given name, +/// regardless of its stored rank. If `rank` is `Some(r)`, both name and rank +/// must match exactly. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SegmentPattern { + name: String, + rank: Option, +} + +impl SegmentPattern { + pub fn parse(raw: &str) -> Result { + let parts: Vec<&str> = raw.splitn(3, '@').collect(); + let (name_raw, rank_raw) = match parts.as_slice() { + [name] => (*name, None), + [name, rank] => (*name, Some(*rank)), + _ => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }), + }; + if name_raw.is_empty() { + return Err(TaxError::EmptySegmentName); + } + let rank = match rank_raw { + None => None, + Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }), + Some(r) => Some(r.to_string()), + }; + Ok(Self { name: name_raw.to_string(), rank }) + } + + /// True if this pattern matches `seg`. + /// Name must match exactly. If a rank is specified in the pattern, the + /// segment's rank must match; otherwise any rank (or no rank) is accepted. + pub fn matches(&self, seg: &TaxSegment) -> bool { + self.name == seg.name() + && self.rank.as_deref().map_or(true, |r| seg.rank() == Some(r)) + } +}