Files
obitools4/pkg/obitax/taxonomy.go

255 lines
7.5 KiB
Go
Raw Normal View History

2022-01-13 23:27:39 +01:00
package obitax
import (
"fmt"
"regexp"
2022-01-13 23:27:39 +01:00
2024-11-08 09:48:16 +01:00
log "github.com/sirupsen/logrus"
)
2022-01-13 23:27:39 +01:00
2024-11-08 09:48:16 +01:00
// Taxonomy represents a hierarchical classification of taxa.
// It holds information about the taxonomy's name, code, ranks, nodes, root node, aliases, and an index.
// The generic type T is used to specify the type of taxon identifiers.
//
// Fields:
// - name: The name of the taxonomy.
// - code: A unique code representing the taxonomy.
// - ranks: A pointer to an InnerString instance that holds the ranks of the taxa.
// - nodes: A pointer to a TaxonSet containing all the nodes (taxa) in the taxonomy.
// - root: A pointer to the root TaxNode of the taxonomy.
// - index: A map that indexes taxa by their string representation for quick access.
2022-01-13 23:27:39 +01:00
type Taxonomy struct {
2024-11-08 09:48:16 +01:00
name string
code string
2024-11-14 19:10:23 +01:00
ids *InnerString
2024-11-08 09:48:16 +01:00
ranks *InnerString
nameclasses *InnerString
2024-11-14 19:10:23 +01:00
names *InnerString
2024-11-08 09:48:16 +01:00
nodes *TaxonSet
root *TaxNode
matcher *regexp.Regexp
2024-11-14 19:10:23 +01:00
index map[*string]*TaxonSet
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
// NewTaxonomy creates and initializes a new Taxonomy instance with the specified name and code.
// It sets up the necessary internal structures, including ranks, nodes, aliases, and an index.
//
// Parameters:
// - name: The name of the taxonomy to be created.
// - code: A unique code representing the taxonomy.
//
// Returns:
// - A pointer to the newly created Taxonomy instance.
func NewTaxonomy(name, code, codeCharacters string) *Taxonomy {
2024-11-14 19:10:23 +01:00
set := make(map[*string]*TaxNode)
2024-11-08 09:48:16 +01:00
// codeCharacters := "[[:alnum:]]" // [[:digit:]]
matcher := regexp.MustCompile(fmt.Sprintf("^[[:blank:]]*(%s:)?(%s+)", code, codeCharacters))
taxonomy := &Taxonomy{
name: name,
code: code,
2024-11-14 19:10:23 +01:00
ids: NewInnerString(),
2024-11-08 09:48:16 +01:00
ranks: NewInnerString(),
nameclasses: NewInnerString(),
2024-11-14 19:10:23 +01:00
names: NewInnerString(),
2024-11-08 09:48:16 +01:00
nodes: &TaxonSet{set: set},
root: nil,
matcher: matcher,
2024-11-14 19:10:23 +01:00
index: make(map[*string]*TaxonSet),
2024-11-08 09:48:16 +01:00
}
2022-01-13 23:27:39 +01:00
2024-11-08 09:48:16 +01:00
taxonomy.nodes.taxonomy = taxonomy
2022-01-13 23:27:39 +01:00
2024-11-08 09:48:16 +01:00
return taxonomy
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
// Id converts a given taxid string into the corresponding taxon identifier of type T.
// It uses a regular expression to validate and extract the taxid. If the taxid is invalid,
// the method returns an error along with a zero value of type T.
//
// Parameters:
// - taxid: A string representation of the taxon identifier to be converted.
//
// Returns:
2024-11-14 19:10:23 +01:00
// - The taxon identifier as a *string corresponding to the provided taxid.
2024-11-08 09:48:16 +01:00
// - An error if the taxid is not valid or cannot be converted.
2024-11-14 19:10:23 +01:00
func (taxonomy *Taxonomy) Id(taxid string) (*string, error) {
2024-11-08 09:48:16 +01:00
matches := taxonomy.matcher.FindStringSubmatch(taxid)
if matches == nil {
2024-11-14 19:10:23 +01:00
return nil, fmt.Errorf("taxid %s is not a valid taxid", taxid)
2024-11-08 09:48:16 +01:00
}
2024-11-14 19:10:23 +01:00
return taxonomy.ids.Innerize(matches[2]), nil
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
// TaxidSting retrieves the string representation of a taxon node identified by the given ID.
// It looks up the node in the taxonomy and returns its formatted string representation
// along with the taxonomy code. If the node does not exist, it returns an error.
//
// Parameters:
// - id: The identifier of the taxon node to retrieve.
//
// Returns:
// - A string representing the taxon node in the format "taxonomyCode:id [scientificName]",
// or an error if the taxon node with the specified ID does not exist in the taxonomy.
func (taxonomy *Taxonomy) TaxidSting(id string) (string, error) {
2024-11-14 19:10:23 +01:00
pid, err := taxonomy.Id(id)
if err != nil {
return "", err
2024-11-08 09:48:16 +01:00
}
2024-11-14 19:10:23 +01:00
taxon := taxonomy.nodes.Get(pid)
if taxon == nil {
return "", fmt.Errorf("taxid %s is not part of the taxonomy", id)
}
return taxon.String(), nil
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
// Taxon retrieves the Taxon associated with the given taxid string.
// It first converts the taxid to its corresponding identifier using the Id method.
// If the taxon is not found, it logs a fatal error and terminates the program.
//
// Parameters:
// - taxid: A string representation of the taxon identifier to be retrieved.
//
// Returns:
// - A pointer to the Taxon[T] instance associated with the provided taxid.
// - If the taxid is unknown, the method will log a fatal error.
func (taxonomy *Taxonomy) Taxon(taxid string) *Taxon {
id, err := taxonomy.Id(taxid)
if err != nil {
2024-11-14 19:10:23 +01:00
log.Fatalf("Taxid %s: %v", taxid, err)
2022-01-13 23:27:39 +01:00
}
2024-11-14 19:10:23 +01:00
taxon := taxonomy.nodes.Get(id)
2022-01-13 23:27:39 +01:00
2024-11-14 19:10:23 +01:00
if taxon == nil {
log.Fatalf("Taxid %s is not part of the taxonomy %s",
taxid,
taxonomy.name)
2024-11-08 09:48:16 +01:00
}
2024-11-14 19:10:23 +01:00
return taxon
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
// TaxonSet returns the set of taxon nodes contained within the Taxonomy.
// It provides access to the underlying collection of taxon nodes for further operations.
//
// Returns:
// - A pointer to the TaxonSet[T] representing the collection of taxon nodes in the taxonomy.
2024-11-14 19:10:23 +01:00
func (taxonomy *Taxonomy) AsTaxonSet() *TaxonSet {
2024-11-08 09:48:16 +01:00
return taxonomy.nodes
}
2024-11-08 09:48:16 +01:00
// Len reports the size of the Taxonomy.
//
// The value is the one returned by the Len method of the underlying node set.
//
// Returns:
//   - An integer representing the total count of taxa in the taxonomy.
func (taxonomy *Taxonomy) Len() int {
	count := taxonomy.nodes.Len()

	return count
}
2024-11-08 09:48:16 +01:00
// AddTaxon adds a new taxon to the taxonomy with the specified parameters.
// It checks if the taxon already exists and can replace it if specified.
//
// Parameters:
// - taxid: The identifier of the taxon to be added.
// - parent: The identifier of the parent taxon.
// - rank: The rank of the taxon (e.g., species, genus).
// - isRoot: A boolean indicating if this taxon is the root of the taxonomy.
// - replace: A boolean indicating whether to replace an existing taxon with the same taxid.
//
// Returns:
// - A pointer to the newly created Taxon[T] instance.
// - An error if the taxon cannot be added (e.g., it already exists and replace is false).
func (taxonomy *Taxonomy) AddTaxon(taxid, parent string, rank string, isRoot bool, replace bool) (*Taxon, error) {
2024-11-14 19:10:23 +01:00
parentid, perr := taxonomy.Id(parent)
id, err := taxonomy.Id(taxid)
if perr != nil {
return nil, fmt.Errorf("error in parsing parent taxid %s: %v", parent, perr)
}
2024-11-14 19:10:23 +01:00
if err != nil {
return nil, fmt.Errorf("error in parsing taxid %s: %v", taxid, err)
}
if !replace && taxonomy.nodes.Contains(id) {
return nil, fmt.Errorf("trying to add taxon %s already present in the taxonomy", taxid)
}
2024-11-08 09:48:16 +01:00
2024-11-14 19:10:23 +01:00
prank := taxonomy.ranks.Innerize(rank)
n := &TaxNode{id, parentid, prank, nil, nil}
2022-01-13 23:27:39 +01:00
2024-11-08 09:48:16 +01:00
taxonomy.nodes.Insert(n)
if isRoot {
n.parent = n.id
taxonomy.root = n
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
return &Taxon{
Taxonomy: taxonomy,
Node: n,
}, nil
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
func (taxonomy *Taxonomy) AddAlias(newtaxid, oldtaxid string, replace bool) (*Taxon, error) {
newid, err := taxonomy.Id(newtaxid)
if err != nil {
return nil, err
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
oldid, err := taxonomy.Id(oldtaxid)
2022-01-13 23:27:39 +01:00
2024-11-08 09:48:16 +01:00
if err != nil {
return nil, err
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
if !replace && taxonomy.nodes.Contains(newid) {
return nil, fmt.Errorf("trying to add alias %s already present in the taxonomy", newtaxid)
2022-01-13 23:27:39 +01:00
}
2024-11-14 19:10:23 +01:00
t := taxonomy.nodes.Get(oldid)
2022-01-13 23:27:39 +01:00
2024-11-14 19:10:23 +01:00
if t == nil {
2024-11-08 09:48:16 +01:00
return nil, fmt.Errorf("trying to add alias %s to a taxon that does not exist", oldtaxid)
2022-01-13 23:27:39 +01:00
}
2024-11-14 19:10:23 +01:00
taxonomy.nodes.Alias(newid, t)
2024-11-08 09:48:16 +01:00
2024-11-14 19:10:23 +01:00
return t, nil
2022-01-13 23:27:39 +01:00
}
2024-11-08 09:48:16 +01:00
// RankList returns a slice of strings representing the ranks of the taxa
// in the taxonomy. It retrieves the ranks from the InnerString instance
// associated with the taxonomy.
//
// Returns:
// - A slice of strings containing the ranks of the taxa.
func (taxonomy *Taxonomy) RankList() []string {
return taxonomy.ranks.Slice()
2022-01-13 23:27:39 +01:00
}
2024-11-14 19:10:23 +01:00
// Index returns a pointer to the index map of the taxonomy.
//
// The pointer refers to the taxonomy's own field, not to a copy: entries
// added or removed through it, and a map assigned through it, are seen by
// the taxonomy.
func (taxonomy *Taxonomy) Index() *map[*string]*TaxonSet {
	return &taxonomy.index
}
2022-01-13 23:27:39 +01:00
2024-11-14 19:10:23 +01:00
// Name returns the name of the taxonomy.
func (taxonomy *Taxonomy) Name() string {
	name := taxonomy.name

	return name
}
2022-01-13 23:27:39 +01:00
2024-11-14 19:10:23 +01:00
func (taxonomy *Taxonomy) Code() string {
return taxonomy.code
2022-01-13 23:27:39 +01:00
}