feat: introduce obitaxonomy crate for hierarchical taxonomy parsing

Adds the `obitaxonomy` crate to parse and validate hierarchical taxonomy paths using a strict `taxonomy:/name@rank/...` syntax. Replaces generic string-based path matching in predicates with structured `TaxPath` and `TaxPattern` types, enforcing explicit anchor constraints and rank-aware semantics. Updates filtering documentation to clarify optional leading slashes and segment-boundary matching rules.
This commit is contained in:
Eric Coissac
2026-06-21 10:37:50 +02:00
parent c694e1f2b0
commit 9356be4ec0
12 changed files with 464 additions and 18 deletions
+6
View File
@@ -0,0 +1,6 @@
[package]
name = "obitaxonomy"
version = "0.1.0"
edition = "2024"
[dependencies]
+38
View File
@@ -0,0 +1,38 @@
use std::fmt;
/// Everything that can go wrong while parsing a stored taxonomy path or a
/// query pattern.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TaxError {
    /// Stored value does not start with the `taxonomy:/` prefix.
    MissingPrefix,
    /// Stored path contains no segments after the prefix.
    EmptyPath,
    /// Query pattern contains no segments (after stripping anchors).
    EmptyPattern,
    /// A segment has an empty name (e.g. consecutive `/`).
    EmptySegmentName,
    /// A segment has a trailing `@` with no rank name.
    EmptyRankName { segment: String },
    /// A segment contains more than one `@`.
    AmbiguousRank { segment: String },
}

impl fmt::Display for TaxError {
    /// Writes a one-line, human-readable description of the error.
    ///
    /// Variants that carry the offending segment append it in `Debug`
    /// (quoted, escaped) form so that whitespace and odd characters stay
    /// visible in the message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Fixed messages: no interpolation needed, write them directly.
            Self::MissingPrefix => {
                f.write_str("taxonomy path must start with \"taxonomy:/\"")
            }
            Self::EmptyPath => f.write_str("taxonomy path has no segments"),
            Self::EmptyPattern => f.write_str("taxonomy query pattern has no segments"),
            Self::EmptySegmentName => f.write_str("segment has an empty name"),
            // Messages that quote the raw segment text.
            Self::EmptyRankName { segment } => {
                write!(f, "segment has '@' with no rank name: {segment:?}")
            }
            Self::AmbiguousRank { segment } => {
                write!(f, "segment contains more than one '@': {segment:?}")
            }
        }
    }
}

impl std::error::Error for TaxError {}
+11
View File
@@ -0,0 +1,11 @@
mod error;
mod segment;
mod segment_pattern;
mod path;
mod pattern;
pub use error::TaxError;
pub use segment::TaxSegment;
pub use segment_pattern::SegmentPattern;
pub use path::{TaxPath, PREFIX};
pub use pattern::TaxPattern;
+82
View File
@@ -0,0 +1,82 @@
use std::fmt;
use std::str::FromStr;
use crate::error::TaxError;
use crate::segment::TaxSegment;
/// The prefix that marks a metadata value as a taxonomy path.
///
/// `TaxPath::parse` rejects any input that does not begin with this exact
/// string, and the `Display` impl for `TaxPath` writes it back first.
pub const PREFIX: &str = "taxonomy:/";
/// A rooted taxonomy lineage: an ordered list of `/`-separated segments, each
/// optionally annotated with a rank.
///
/// Stored form: `taxonomy:/seg1@rank1/seg2/seg3@rank3`
///
/// The leading `taxonomy:/` acts as the discriminator. What follows is one or
/// more segments of the form `name` or `name@rank`.
///
/// `@` is reserved and may not appear in segment names or rank names.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TaxPath {
    segments: Vec<TaxSegment>,
}

impl TaxPath {
    /// Parses the stored form of a taxonomy path.
    ///
    /// # Errors
    ///
    /// * [`TaxError::MissingPrefix`] if `s` does not start with [`PREFIX`].
    /// * [`TaxError::EmptyPath`] if nothing follows the prefix.
    /// * Whatever error `TaxSegment::parse` reports for the first segment it
    ///   rejects.
    pub fn parse(s: &str) -> Result<Self, TaxError> {
        let tail = match s.strip_prefix(PREFIX) {
            Some(rest) => rest,
            None => return Err(TaxError::MissingPrefix),
        };
        if tail.is_empty() {
            return Err(TaxError::EmptyPath);
        }

        let mut segments = Vec::new();
        for raw in tail.split('/') {
            // `?` stops at the first malformed segment.
            segments.push(TaxSegment::parse(raw)?);
        }
        Ok(Self { segments })
    }

    /// True if `self` is an ancestor of — or equal to — `other`.
    ///
    /// Only segment names are compared; rank annotations are ignored.
    /// `self` must be a prefix of `other` at segment granularity.
    pub fn is_ancestor_of(&self, other: &TaxPath) -> bool {
        // `get(..len)` is `None` exactly when `self` is deeper than `other`.
        match other.segments.get(..self.segments.len()) {
            None => false,
            Some(head) => head
                .iter()
                .zip(&self.segments)
                .all(|(theirs, ours)| theirs.name() == ours.name()),
        }
    }

    /// Returns the name of the first segment whose rank equals `rank`, if any.
    pub fn name_at_rank(&self, rank: &str) -> Option<&str> {
        self.segments
            .iter()
            .find_map(|seg| (seg.rank() == Some(rank)).then(|| seg.name()))
    }

    /// True if any segment has the given rank.
    pub fn has_rank(&self, rank: &str) -> bool {
        self.name_at_rank(rank).is_some()
    }

    /// True if the path contains a segment with both the given rank and name.
    pub fn matches_rank(&self, rank: &str, name: &str) -> bool {
        // Every segment carrying `rank` is considered, not just the first.
        self.segments
            .iter()
            .filter(|seg| seg.rank() == Some(rank))
            .any(|seg| seg.name() == name)
    }

    /// The segments of the path, root first.
    pub fn segments(&self) -> &[TaxSegment] {
        self.segments.as_slice()
    }

    /// Number of segments in the path.
    pub fn depth(&self) -> usize {
        self.segments.len()
    }

    /// True if the path holds no segments.
    pub fn is_empty(&self) -> bool {
        self.depth() == 0
    }
}
impl fmt::Display for TaxPath {
    /// Writes the stored form: [`PREFIX`] followed by the segments joined
    /// with `/`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(PREFIX)?;

        // Emit the first segment bare, then every later one behind a separator.
        let mut remaining = self.segments.iter();
        if let Some(head) = remaining.next() {
            write!(f, "{head}")?;
            for seg in remaining {
                f.write_str("/")?;
                write!(f, "{seg}")?;
            }
        }
        Ok(())
    }
}
impl FromStr for TaxPath {
type Err = TaxError;
fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) }
}
+72
View File
@@ -0,0 +1,72 @@
use crate::error::TaxError;
use crate::path::TaxPath;
use crate::segment::TaxSegment;
use crate::segment_pattern::SegmentPattern;
/// A query pattern for matching against stored `TaxPath` values.
///
/// Syntax:
///
/// | Form     | Semantics |
/// |----------|-----------|
/// | `A/B`    | A then B as a contiguous sub-path, anywhere in the value |
/// | `/A/B`   | value starts with A then B (start-anchored) |
/// | `A/B$`   | value ends with A then B (end-anchored) |
/// | `/A/B$`  | value is exactly A then B (fully anchored) |
/// | `A@x/B`  | A with rank `x`, followed by B with any rank |
///
/// A segment pattern without `@` matches any segment with that name regardless
/// of its stored rank.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TaxPattern {
    start_anchored: bool,
    end_anchored: bool,
    segments: Vec<SegmentPattern>,
}

impl TaxPattern {
    /// Parses a query pattern.
    ///
    /// Surrounding whitespace is trimmed first. A single leading `/` marks the
    /// pattern as start-anchored and a single trailing `$` marks it as
    /// end-anchored; both markers are removed before the remainder is split on
    /// `/` into segment patterns.
    ///
    /// # Errors
    ///
    /// * [`TaxError::EmptyPattern`] if nothing is left once whitespace and
    ///   anchors have been removed.
    /// * Whatever error `SegmentPattern::parse` reports for the first segment
    ///   it rejects.
    pub fn parse(s: &str) -> Result<Self, TaxError> {
        let body = s.trim();

        // `strip_prefix` / `strip_suffix` test for the anchor and remove it in
        // one step, so no manual index arithmetic is needed.
        let (start_anchored, body) = match body.strip_prefix('/') {
            Some(rest) => (true, rest),
            None => (false, body),
        };
        let (end_anchored, body) = match body.strip_suffix('$') {
            Some(rest) => (true, rest),
            None => (false, body),
        };

        if body.is_empty() {
            return Err(TaxError::EmptyPattern);
        }

        let segments = body
            .split('/')
            .map(SegmentPattern::parse)
            .collect::<Result<Vec<_>, _>>()?;

        Ok(Self { start_anchored, end_anchored, segments })
    }

    /// True if this pattern matches `path` according to the anchor flags.
    ///
    /// The pattern must match a contiguous run of segments in the path.
    /// Start/end anchors restrict where that run may begin or end. A pattern
    /// with more segments than the path never matches.
    pub fn matches(&self, path: &TaxPath) -> bool {
        let n = self.segments.len();
        let m = path.depth();

        // `slack` is the last admissible start offset; `checked_sub` also
        // rejects patterns longer than the path without risking underflow.
        let slack = match m.checked_sub(n) {
            Some(slack) => slack,
            None => return false,
        };

        let segs = path.segments();
        match (self.start_anchored, self.end_anchored) {
            (true, true) => slack == 0 && self.window_matches(segs, 0),
            (true, false) => self.window_matches(segs, 0),
            (false, true) => self.window_matches(segs, slack),
            (false, false) => (0..=slack).any(|offset| self.window_matches(segs, offset)),
        }
    }

    /// True if every segment pattern matches the corresponding path segment
    /// in the window of `segs` beginning at `start`.
    ///
    /// Panics if `start + self.segments.len()` exceeds `segs.len()`; `matches`
    /// only passes offsets that satisfy this bound.
    fn window_matches(&self, segs: &[TaxSegment], start: usize) -> bool {
        let window = &segs[start..start + self.segments.len()];
        self.segments
            .iter()
            .zip(window)
            .all(|(pat, seg)| pat.matches(seg))
    }
}

/// Enables `str::parse::<TaxPattern>()`, mirroring the `FromStr` impl that
/// `TaxPath` provides; behaves exactly like [`TaxPattern::parse`].
impl std::str::FromStr for TaxPattern {
    type Err = TaxError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::parse(s)
    }
}
+49
View File
@@ -0,0 +1,49 @@
use std::fmt;
use crate::error::TaxError;
/// One node of a taxonomy path: a mandatory name plus an optional rank.
///
/// `@` separates the two parts in the serialised form (`name` or `name@rank`)
/// and therefore may appear in neither the name nor the rank.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TaxSegment {
    name: String,
    rank: Option<String>,
}

impl TaxSegment {
    /// Parses a single `name` or `name@rank` segment.
    ///
    /// # Errors
    ///
    /// Checked in this order:
    ///
    /// 1. [`TaxError::AmbiguousRank`] if `raw` contains more than one `@`.
    /// 2. [`TaxError::EmptySegmentName`] if the name part is empty.
    /// 3. [`TaxError::EmptyRankName`] if an `@` is present but nothing
    ///    follows it.
    pub fn parse(raw: &str) -> Result<Self, TaxError> {
        let (name, rank) = match raw.split_once('@') {
            None => (raw, None),
            // A second `@` after the first one makes the segment ambiguous.
            Some((_, rest)) if rest.contains('@') => {
                return Err(TaxError::AmbiguousRank { segment: raw.to_string() });
            }
            Some((name, rank)) => (name, Some(rank)),
        };

        if name.is_empty() {
            return Err(TaxError::EmptySegmentName);
        }
        if rank == Some("") {
            return Err(TaxError::EmptyRankName { segment: raw.to_string() });
        }

        Ok(Self {
            name: name.to_string(),
            rank: rank.map(String::from),
        })
    }

    /// The segment's name.
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// The segment's rank, if one was given.
    pub fn rank(&self) -> Option<&str> {
        self.rank.as_deref()
    }
}
impl fmt::Display for TaxSegment {
    /// Writes the serialised form: `name`, or `name@rank` when a rank is set.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.name)?;
        if let Some(rank) = &self.rank {
            write!(f, "@{rank}")?;
        }
        Ok(())
    }
}
+41
View File
@@ -0,0 +1,41 @@
use crate::error::TaxError;
use crate::segment::TaxSegment;
/// One segment of a query pattern: a required name and an optional rank
/// filter.
///
/// With `rank == None` the pattern accepts any segment carrying the given
/// name, whatever rank (if any) is stored on it. With `rank == Some(r)` both
/// the name and the rank must match exactly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SegmentPattern {
    name: String,
    rank: Option<String>,
}

impl SegmentPattern {
    /// Parses a single `name` or `name@rank` segment pattern.
    ///
    /// # Errors
    ///
    /// Checked in this order:
    ///
    /// 1. [`TaxError::AmbiguousRank`] if `raw` contains more than one `@`.
    /// 2. [`TaxError::EmptySegmentName`] if the name part is empty.
    /// 3. [`TaxError::EmptyRankName`] if an `@` is present but nothing
    ///    follows it.
    pub fn parse(raw: &str) -> Result<Self, TaxError> {
        // `split` always yields at least one piece, so the default is never used.
        let mut pieces = raw.split('@');
        let name = pieces.next().unwrap_or_default();
        let rank = pieces.next();

        // Any third piece means a second `@` was present.
        if pieces.next().is_some() {
            return Err(TaxError::AmbiguousRank { segment: raw.to_string() });
        }
        if name.is_empty() {
            return Err(TaxError::EmptySegmentName);
        }

        let rank = match rank {
            Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }),
            other => other.map(String::from),
        };

        Ok(Self { name: name.to_string(), rank })
    }

    /// True if this pattern matches `seg`.
    ///
    /// The name must be identical. When the pattern specifies a rank, the
    /// segment's rank must equal it; otherwise any rank (or none) is accepted.
    pub fn matches(&self, seg: &TaxSegment) -> bool {
        if self.name != seg.name() {
            return false;
        }
        match self.rank.as_deref() {
            None => true,
            Some(wanted) => seg.rank() == Some(wanted),
        }
    }
}