Changes to be committed:

modified:   cmd/obitools/obitag/main.go
	modified:   cmd/obitools/obitag2/main.go
	modified:   go.mod
	modified:   go.sum
	modified:   pkg/obiformats/ncbitaxdump/read.go
	modified:   pkg/obioptions/version.go
	modified:   pkg/obiseq/attributes.go
	modified:   pkg/obiseq/taxonomy_lca.go
	modified:   pkg/obiseq/taxonomy_methods.go
	modified:   pkg/obiseq/taxonomy_predicate.go
	modified:   pkg/obitax/inner.go
	modified:   pkg/obitax/lca.go
	new file:   pkg/obitax/taxid.go
	modified:   pkg/obitax/taxon.go
	modified:   pkg/obitax/taxonomy.go
	modified:   pkg/obitax/taxonslice.go
	modified:   pkg/obitools/obicleandb/obicleandb.go
	modified:   pkg/obitools/obigrep/options.go
	modified:   pkg/obitools/obilandmark/obilandmark.go
	modified:   pkg/obitools/obilandmark/options.go
	modified:   pkg/obitools/obirefidx/famlilyindexing.go
	modified:   pkg/obitools/obirefidx/geomindexing.go
	modified:   pkg/obitools/obirefidx/obirefidx.go
	modified:   pkg/obitools/obirefidx/options.go
	modified:   pkg/obitools/obitag/obigeomtag.go
	modified:   pkg/obitools/obitag/obitag.go
	modified:   pkg/obitools/obitag/options.go
	modified:   pkg/obiutils/strings.go
This commit is contained in:
Eric Coissac
2024-12-19 13:36:59 +01:00
parent f41a6fbb60
commit 795df34d1a
28 changed files with 590 additions and 280 deletions

View File

@ -1,6 +1,9 @@
package obitax
import "sync"
import (
"strings"
"sync"
)
// InnerString is a struct that holds a map of strings and a read-write lock for concurrent access.
// The index map is used to store key-value pairs of strings.
@ -31,10 +34,10 @@ func (i *InnerString) Innerize(value string) *string {
defer i.lock.Unlock()
s, ok := i.index[value]
if !ok {
value = strings.Clone(value)
s = &value
i.index[value] = s
}
return s
}

View File

@ -16,11 +16,24 @@ import (
// if either of the taxa is nil, if they are not in the same taxonomy, or
// if the taxonomy is unrooted.
func (t1 *Taxon) LCA(t2 *Taxon) (*Taxon, error) {
if t1 == nil || t1.Node == nil {
return nil, fmt.Errorf("try to get LCA of nil taxon")
if t1 == nil && t2 != nil {
return t2, nil
}
if t2 == nil || t2.Node == nil {
if t2 == nil && t1 != nil {
return t1, nil
}
if t1 == nil && t2 == nil {
return nil, fmt.Errorf("try to get LCA of nil taxa")
}
if t1.Node == nil {
return nil, fmt.Errorf("try to get LCA of nil taxa")
}
if t2.Node == nil {
return nil, fmt.Errorf("try to get LCA of nil taxon")
}

60
pkg/obitax/taxid.go Normal file
View File

@ -0,0 +1,60 @@
package obitax
import (
"fmt"
"strconv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
// Taxid represents a taxonomic identifier as a pointer to a string.
// Values are produced by a TaxidFactory, which obtains the pointer from
// InnerString.Innerize.
type Taxid *string
// TaxidFactory is a factory for creating Taxid instances from strings and integers.
type TaxidFactory struct {
	// inner is the string store whose Innerize method yields the pointer
	// returned as a Taxid.
	inner *InnerString
	// code is the taxonomy code; NewTaxidFactory stores it with a trailing ':'.
	code string
	// alphabet is the character set whose FirstWord method is used by
	// FromString to extract the identifier.
	alphabet obiutils.AsciiSet
}
// NewTaxidFactory initializes and returns a new TaxidFactory.
//
// Parameters:
//   - code: the taxonomy code; it is stored with a ':' appended.
//   - alphabet: the set of characters used by FromString to extract the
//     identifier part of a taxid string.
//
// Returns:
//   - A pointer to the newly created TaxidFactory, backed by a fresh InnerString.
func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory {
	factory := new(TaxidFactory)
	factory.inner = NewInnerString()
	factory.code = code + ":"
	factory.alphabet = alphabet

	return factory
}
// FromString converts a string representation of a taxonomic identifier into a Taxid.
//
// Leading characters belonging to obiutils.AsciiSpaceSet are trimmed, then the
// string is split in two around ':' with obiutils.SplitInTwo. When the second
// part is empty, the first part is taken as the identifier. Otherwise the first
// part must match the taxonomy code of the factory and the second part is taken
// as the identifier. The identifier is finally reduced to its first word
// according to the factory alphabet.
//
// Parameters:
//   - taxid: the textual taxid, optionally prefixed by "<code>:".
//
// Returns:
//   - The Taxid obtained by innerizing the extracted identifier.
//   - An error if the prefix does not match the taxonomy code, or if
//     alphabet.FirstWord reports an error.
func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
	taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
	prefix, ident := obiutils.SplitInTwo(taxid, ':')

	switch {
	case len(ident) == 0:
		// No identifier after a separator: the first part is the identifier.
		ident = prefix
	case prefix != f.code && prefix+":" != f.code:
		// NewTaxidFactory stores the code with a trailing ':'.
		// NOTE(review): SplitInTwo presumably returns the prefix without the
		// separator — confirm. Both forms are accepted here so that a
		// "<code>:<id>" input is not rejected because of that trailing ':'.
		return nil, fmt.Errorf("taxid %s string does not start with taxonomy code %s", taxid, f.code)
	}

	// Keep only the first word of the identifier.
	word, err := f.alphabet.FirstWord(ident)
	if err != nil {
		return nil, err
	}

	// Return a new Taxid by innerizing the extracted taxid string.
	return Taxid(f.inner.Innerize(word)), nil
}
// FromInt converts an integer taxonomic identifier into a Taxid.
//
// The integer is rendered in base 10 with strconv.Itoa and the resulting
// string is innerized by the factory. The returned error is always nil.
func (f *TaxidFactory) FromInt(taxid int) (Taxid, error) {
	decimal := strconv.Itoa(taxid)
	id := Taxid(f.inner.Innerize(decimal))

	return id, nil
}

View File

@ -179,9 +179,9 @@ func (taxon *Taxon) IPath() iter.Seq[*Taxon] {
}
}
// Path returns a slice of TaxNode representing the path from the current Taxon
// to the root Taxon in the associated Taxonomy. It collects all the nodes in the path
// using the IPath method and returns them as a TaxonSlice.
// Path returns a slice of TaxNode representing the path from the current Taxon
// to the root Taxon in the associated Taxonomy. The first element of the slice
// is the current Taxon, and the last element is the root Taxon.
//
// Returns:
// - A pointer to a TaxonSlice containing the TaxNode instances in the path
@ -371,3 +371,11 @@ func (taxon *Taxon) MetadataStringValues() []string {
}
return values
}
// SameAs reports whether taxon and other designate the same taxon, that is
// whether they belong to the same Taxonomy and their nodes carry the same id.
//
// It returns false when either taxon is nil or has a nil Node, instead of
// dereferencing a nil pointer.
func (taxon *Taxon) SameAs(other *Taxon) bool {
	if taxon == nil || other == nil {
		return false
	}

	// A Taxon may exist without a Node; such a taxon is never the same as another.
	if taxon.Node == nil || other.Node == nil {
		return false
	}

	return taxon.Taxonomy == other.Taxonomy && taxon.Node.id == other.Node.id
}

View File

@ -8,9 +8,10 @@ and retrieving information about taxa.
package obitax
import (
"errors"
"fmt"
"regexp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
@ -32,13 +33,12 @@ import (
type Taxonomy struct {
name string
code string
ids *InnerString
ids *TaxidFactory
ranks *InnerString
nameclasses *InnerString
names *InnerString
nodes *TaxonSet
root *TaxNode
matcher *regexp.Regexp
index map[*string]*TaxonSet
}
@ -52,21 +52,18 @@ type Taxonomy struct {
//
// Returns:
// - A pointer to the newly created Taxonomy instance.
func NewTaxonomy(name, code, codeCharacters string) *Taxonomy {
func NewTaxonomy(name, code string, codeCharacters obiutils.AsciiSet) *Taxonomy {
set := make(map[*string]*TaxNode)
matcher := regexp.MustCompile(fmt.Sprintf("^[[:blank:]]*(%s:)?(%s+)", code, codeCharacters))
taxonomy := &Taxonomy{
name: name,
code: code,
ids: NewInnerString(),
ids: NewTaxidFactory(code, codeCharacters),
ranks: NewInnerString(),
nameclasses: NewInnerString(),
names: NewInnerString(),
nodes: &TaxonSet{set: set},
root: nil,
matcher: matcher,
index: make(map[*string]*TaxonSet),
}
@ -85,23 +82,17 @@ func NewTaxonomy(name, code, codeCharacters string) *Taxonomy {
// Returns:
// - The taxon identifier as a *string corresponding to the provided taxid.
// - An error if the taxid is not valid or cannot be converted.
func (taxonomy *Taxonomy) Id(taxid string) (*string, error) {
func (taxonomy *Taxonomy) Id(taxid string) (Taxid, error) {
taxonomy = taxonomy.OrDefault(false)
if taxonomy == nil {
return nil, fmt.Errorf("Cannot extract Id from nil Taxonomy")
return nil, errors.New("Cannot extract Id from nil Taxonomy")
}
matches := taxonomy.matcher.FindStringSubmatch(taxid)
if matches == nil {
return nil, fmt.Errorf("taxid %s is not a valid taxid", taxid)
}
return taxonomy.ids.Innerize(matches[2]), nil
return taxonomy.ids.FromString(taxid)
}
// TaxidSting retrieves the string representation of a taxon node identified by the given ID.
// TaxidString retrieves the string representation of a taxon node identified by the given ID.
// It looks up the node in the taxonomy and returns its formatted string representation
// along with the taxonomy code. If the node does not exist, it returns an error.
//
@ -111,7 +102,7 @@ func (taxonomy *Taxonomy) Id(taxid string) (*string, error) {
// Returns:
// - A string representing the taxon node in the format "taxonomyCode:id [scientificName]",
// or an error if the taxon node with the specified ID does not exist in the taxonomy.
func (taxonomy *Taxonomy) TaxidSting(id string) (string, error) {
func (taxonomy *Taxonomy) TaxidString(id string) (string, error) {
taxonomy = taxonomy.OrDefault(false)
pid, err := taxonomy.Id(id)

View File

@ -13,6 +13,7 @@ package obitax
import (
"bytes"
"fmt"
"log"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
@ -59,6 +60,16 @@ func (slice *TaxonSlice) Get(i int) *TaxNode {
return slice.slice[i]
}
// Taxon returns the element at index i of the TaxonSlice wrapped in a new
// Taxon bound to the taxonomy of the slice.
//
// It returns nil when the slice itself is nil. The index is not range
// checked: an out-of-range i panics.
func (slice *TaxonSlice) Taxon(i int) *Taxon {
	if slice != nil {
		node := slice.slice[i]
		return &Taxon{Node: node, Taxonomy: slice.taxonomy}
	}

	return nil
}
// Len returns the number of TaxNode instances in the TaxonSlice.
// It provides the count of taxon nodes contained within the slice.
//
@ -124,3 +135,13 @@ func (slice *TaxonSlice) Reverse(inplace bool) *TaxonSlice {
slice: rep,
}
}
// Set stores the node of taxon at position index of the TaxonSlice and
// returns the slice itself.
//
// The taxon must belong to the same taxonomy as the slice; otherwise the
// method calls log.Panic. The index is not range checked.
func (slice *TaxonSlice) Set(index int, taxon *Taxon) *TaxonSlice {
	sameTaxonomy := slice.taxonomy == taxon.Taxonomy
	if !sameTaxonomy {
		log.Panic("Cannot add taxon from a different taxonomy")
	}

	node := taxon.Node
	slice.slice[index] = node

	return slice
}