mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
First step in the obitax rewriting
This commit is contained in:
@ -3,57 +3,222 @@ package obitax
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// TaxName couples a taxon name with the class of that name
// (for instance "scientific name"). Both values are kept as the
// pointers handed in by the caller; nothing is copied.
type TaxName struct {
	name, nameclass *string
}
|
||||
|
||||
// Taxonomy represents a hierarchical classification of taxa.
|
||||
// It holds information about the taxonomy's name, code, ranks, nodes, root node, aliases, and an index.
|
||||
// The generic type T is used to specify the type of taxon identifiers.
|
||||
//
|
||||
// Fields:
|
||||
// - name: The name of the taxonomy.
|
||||
// - code: A unique code representing the taxonomy.
|
||||
// - ranks: A pointer to an InnerString instance that holds the ranks of the taxa.
|
||||
// - nodes: A pointer to a TaxonSet containing all the nodes (taxa) in the taxonomy.
|
||||
// - root: A pointer to the root TaxNode of the taxonomy.
|
||||
// - index: A map that indexes taxa by their string representation for quick access.
|
||||
type Taxonomy struct {
|
||||
nodes *TaxonSet
|
||||
alias map[int]*TaxNode
|
||||
index map[string]*TaxonSet
|
||||
name string
|
||||
code string
|
||||
ranks *InnerString
|
||||
nameclasses *InnerString
|
||||
nodes *TaxonSet
|
||||
root *TaxNode
|
||||
matcher *regexp.Regexp
|
||||
index map[string]*TaxonSet
|
||||
}
|
||||
|
||||
func NewTaxonomy() *Taxonomy {
|
||||
set := make(TaxonSet)
|
||||
taxonomy := Taxonomy{
|
||||
nodes: &set,
|
||||
alias: make(TaxonSet),
|
||||
index: make(map[string]*TaxonSet)}
|
||||
return &taxonomy
|
||||
// NewTaxonomy creates and initializes a new Taxonomy instance with the specified name and code.
|
||||
// It sets up the necessary internal structures, including ranks, nodes, aliases, and an index.
|
||||
//
|
||||
// Parameters:
|
||||
// - name: The name of the taxonomy to be created.
|
||||
// - code: A unique code representing the taxonomy.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to the newly created Taxonomy instance.
|
||||
func NewTaxonomy(name, code, codeCharacters string) *Taxonomy {
|
||||
set := make(map[string]*TaxNode)
|
||||
|
||||
// codeCharacters := "[[:alnum:]]" // [[:digit:]]
|
||||
|
||||
matcher := regexp.MustCompile(fmt.Sprintf("^[[:blank:]]*(%s:)?(%s+)", code, codeCharacters))
|
||||
|
||||
taxonomy := &Taxonomy{
|
||||
name: name,
|
||||
code: code,
|
||||
ranks: NewInnerString(),
|
||||
nameclasses: NewInnerString(),
|
||||
nodes: &TaxonSet{set: set},
|
||||
root: nil,
|
||||
matcher: matcher,
|
||||
index: make(map[string]*TaxonSet),
|
||||
}
|
||||
|
||||
taxonomy.nodes.taxonomy = taxonomy
|
||||
|
||||
return taxonomy
|
||||
}
|
||||
|
||||
// Id converts a given taxid string into the corresponding taxon identifier of type T.
|
||||
// It uses a regular expression to validate and extract the taxid. If the taxid is invalid,
|
||||
// the method returns an error along with a zero value of type T.
|
||||
//
|
||||
// Parameters:
|
||||
// - taxid: A string representation of the taxon identifier to be converted.
|
||||
//
|
||||
// Returns:
|
||||
// - The taxon identifier of type T corresponding to the provided taxid.
|
||||
// - An error if the taxid is not valid or cannot be converted.
|
||||
func (taxonomy *Taxonomy) Id(taxid string) (string, error) {
|
||||
matches := taxonomy.matcher.FindStringSubmatch(taxid)
|
||||
|
||||
if matches == nil {
|
||||
return "", fmt.Errorf("Taxid %s is not a valid taxid", taxid)
|
||||
}
|
||||
|
||||
return matches[2], nil
|
||||
}
|
||||
|
||||
// TaxidSting retrieves the string representation of a taxon node identified by the given ID.
|
||||
// It looks up the node in the taxonomy and returns its formatted string representation
|
||||
// along with the taxonomy code. If the node does not exist, it returns an error.
|
||||
//
|
||||
// Parameters:
|
||||
// - id: The identifier of the taxon node to retrieve.
|
||||
//
|
||||
// Returns:
|
||||
// - A string representing the taxon node in the format "taxonomyCode:id [scientificName]",
|
||||
// or an error if the taxon node with the specified ID does not exist in the taxonomy.
|
||||
func (taxonomy *Taxonomy) TaxidSting(id string) (string, error) {
|
||||
node := taxonomy.nodes.Get(id)
|
||||
if node == nil {
|
||||
return "", fmt.Errorf("Taxid %d is part of the taxonomy", id)
|
||||
}
|
||||
return node.String(taxonomy.code), nil
|
||||
}
|
||||
|
||||
// Taxon retrieves the Taxon associated with the given taxid string.
|
||||
// It first converts the taxid to its corresponding identifier using the Id method.
|
||||
// If the taxon is not found, it logs a fatal error and terminates the program.
|
||||
//
|
||||
// Parameters:
|
||||
// - taxid: A string representation of the taxon identifier to be retrieved.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to the Taxon[T] instance associated with the provided taxid.
|
||||
// - If the taxid is unknown, the method will log a fatal error.
|
||||
func (taxonomy *Taxonomy) Taxon(taxid string) *Taxon {
|
||||
id, err := taxonomy.Id(taxid)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Taxid %s is not a valid taxid", taxid)
|
||||
}
|
||||
|
||||
node := taxonomy.nodes.Get(id)
|
||||
|
||||
if node == nil {
|
||||
log.Fatalf("Taxid %s is an unknown taxid", taxid)
|
||||
}
|
||||
|
||||
return &Taxon{
|
||||
Taxonomy: taxonomy,
|
||||
Node: node,
|
||||
}
|
||||
}
|
||||
|
||||
// TaxonSet returns the set of taxon nodes contained within the Taxonomy.
|
||||
// It provides access to the underlying collection of taxon nodes for further operations.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to the TaxonSet[T] representing the collection of taxon nodes in the taxonomy.
|
||||
func (taxonomy *Taxonomy) TaxonSet() *TaxonSet {
|
||||
return taxonomy.nodes
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) Alias() *map[int]*TaxNode {
|
||||
return &(taxonomy.alias)
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) Index() *map[string]*TaxonSet {
|
||||
return &(taxonomy.index)
|
||||
}
|
||||
|
||||
// Len returns the number of taxa in the Taxonomy.
|
||||
// It delegates the call to the Len method of the underlying nodes set.
|
||||
//
|
||||
// Returns:
|
||||
// - An integer representing the total count of taxa in the taxonomy.
|
||||
func (taxonomy *Taxonomy) Len() int {
|
||||
return len(*taxonomy.nodes)
|
||||
return taxonomy.nodes.Len()
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) AddNewTaxa(taxid, parent int, rank string, replace bool, init bool) (*TaxNode, error) {
|
||||
if !replace {
|
||||
_, ok := (*taxonomy.nodes)[taxid]
|
||||
if ok {
|
||||
return nil, fmt.Errorf("trying to add taxoon %d already present in the taxonomy", taxid)
|
||||
}
|
||||
// AddTaxon adds a new taxon to the taxonomy with the specified parameters.
|
||||
// It checks if the taxon already exists and can replace it if specified.
|
||||
//
|
||||
// Parameters:
|
||||
// - taxid: The identifier of the taxon to be added.
|
||||
// - parent: The identifier of the parent taxon.
|
||||
// - rank: The rank of the taxon (e.g., species, genus).
|
||||
// - isRoot: A boolean indicating if this taxon is the root of the taxonomy.
|
||||
// - replace: A boolean indicating whether to replace an existing taxon with the same taxid.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to the newly created Taxon[T] instance.
|
||||
// - An error if the taxon cannot be added (e.g., it already exists and replace is false).
|
||||
func (taxonomy *Taxonomy) AddTaxon(taxid, parent string, rank string, isRoot bool, replace bool) (*Taxon, error) {
|
||||
if !replace && taxonomy.nodes.Contains(taxid) {
|
||||
return nil, fmt.Errorf("trying to add taxon %d already present in the taxonomy", taxid)
|
||||
}
|
||||
|
||||
n := NewTaxNode(taxid, parent, rank)
|
||||
(*taxonomy.nodes)[taxid] = n
|
||||
rank = taxonomy.ranks.Innerize(rank)
|
||||
|
||||
return n, nil
|
||||
n := &TaxNode{taxid, parent, rank, nil, nil}
|
||||
|
||||
taxonomy.nodes.Insert(n)
|
||||
|
||||
if isRoot {
|
||||
n.parent = n.id
|
||||
taxonomy.root = n
|
||||
}
|
||||
|
||||
return &Taxon{
|
||||
Taxonomy: taxonomy,
|
||||
Node: n,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) AddAlias(newtaxid, oldtaxid string, replace bool) (*Taxon, error) {
|
||||
newid, err := taxonomy.Id(newtaxid)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
oldid, err := taxonomy.Id(oldtaxid)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !replace && taxonomy.nodes.Contains(newid) {
|
||||
return nil, fmt.Errorf("trying to add alias %s already present in the taxonomy", newtaxid)
|
||||
}
|
||||
|
||||
n := taxonomy.nodes.Get(oldid)
|
||||
|
||||
if n == nil {
|
||||
return nil, fmt.Errorf("trying to add alias %s to a taxon that does not exist", oldtaxid)
|
||||
}
|
||||
|
||||
taxonomy.nodes.Alias(newid, n)
|
||||
|
||||
return &Taxon{
|
||||
Taxonomy: taxonomy,
|
||||
Node: n,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// RankList returns a slice of strings representing the ranks of the taxa
|
||||
// in the taxonomy. It retrieves the ranks from the InnerString instance
|
||||
// associated with the taxonomy.
|
||||
//
|
||||
// Returns:
|
||||
// - A slice of strings containing the ranks of the taxa.
|
||||
func (taxonomy *Taxonomy) RankList() []string {
|
||||
return taxonomy.ranks.Slice()
|
||||
}
|
||||
|
||||
// func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
|
||||
@ -69,93 +234,6 @@ func (taxonomy *Taxonomy) AddNewTaxa(taxid, parent int, rank string, replace boo
|
||||
// return t, nil
|
||||
// }
|
||||
|
||||
func (taxonomy *Taxonomy) Taxon(taxid interface{}) (*TaxNode, error) {
|
||||
var itaxid int
|
||||
var err error
|
||||
|
||||
switch v := taxid.(type) {
|
||||
case int:
|
||||
itaxid = v
|
||||
case string:
|
||||
itaxid, err = strconv.Atoi(v)
|
||||
|
||||
if err != nil {
|
||||
re := regexp.MustCompile(`TX:(\d+)`)
|
||||
parts := re.FindStringSubmatch(v)
|
||||
if len(parts) != 2 {
|
||||
return nil, fmt.Errorf("I cannot parse taxid from %s", v)
|
||||
}
|
||||
itaxid, _ = strconv.Atoi(parts[1])
|
||||
}
|
||||
}
|
||||
|
||||
t, ok := (*taxonomy.nodes)[itaxid]
|
||||
|
||||
if !ok {
|
||||
a, aok := taxonomy.alias[itaxid]
|
||||
if !aok {
|
||||
return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid)
|
||||
}
|
||||
t = a
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
func (taxonomy *Taxonomy) AddNewName(taxid int, name, nameclass *string) error {
|
||||
node, node_err := taxonomy.Taxon(taxid)
|
||||
if node_err != nil {
|
||||
return node_err
|
||||
}
|
||||
|
||||
if *nameclass == "scientific name" {
|
||||
node.scientificname = name
|
||||
} else {
|
||||
names := node.alternatenames
|
||||
if names == nil {
|
||||
n := make(map[string]*string)
|
||||
names = &n
|
||||
node.alternatenames = names
|
||||
} else {
|
||||
(*names)[*name] = nameclass
|
||||
}
|
||||
}
|
||||
|
||||
i, ok := taxonomy.index[*name]
|
||||
if !ok {
|
||||
tnm := make(TaxonSet)
|
||||
i = &tnm
|
||||
taxonomy.index[*name] = i
|
||||
}
|
||||
(*i)[taxid] = node
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) ReindexParent() error {
|
||||
var ok bool
|
||||
for _, taxon := range *taxonomy.nodes {
|
||||
taxon.pparent, ok = (*taxonomy.nodes)[taxon.parent]
|
||||
if !ok {
|
||||
return fmt.Errorf("Parent %d of taxon %d is not defined in taxonomy",
|
||||
taxon.taxid,
|
||||
taxon.parent)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func MakeTaxName(name, nameclass *string) *TaxName {
|
||||
tn := TaxName{name, nameclass}
|
||||
return &tn
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) AddNewAlias(newtaxid, oldtaxid int) error {
|
||||
n, node_err := taxonomy.Taxon(newtaxid)
|
||||
if node_err != nil {
|
||||
return node_err
|
||||
}
|
||||
|
||||
taxonomy.alias[oldtaxid] = n
|
||||
|
||||
return nil
|
||||
func (taxonomy *Taxonomy) Index() *map[string]*TaxonSet {
|
||||
return &(taxonomy.index)
|
||||
}
|
||||
|
Reference in New Issue
Block a user