Add management of the taxonomy alias policy

This commit is contained in:
Eric Coissac
2025-02-10 14:05:47 +01:00
parent e2563cd8df
commit 6a8061cc4f
16 changed files with 114 additions and 48 deletions

View File

@ -3,6 +3,7 @@ package main
import (
"os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
@ -52,13 +53,19 @@ func main() {
case obitaxonomy.CLIRequestsPathForTaxid() != "NA":
taxon, err := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid())
taxon, isAlias, err := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid())
if err != nil {
log.Fatalf("Cannot identify the requested taxon: %s (%v)",
obitaxonomy.CLIRequestsPathForTaxid(), err)
}
if isAlias {
if obidefault.FailOnTaxonomy() {
log.Fatalf("Taxon %s is an alias for %s", taxon.String(), taxon.Parent().String())
}
}
s := taxon.Path()
if s == nil {

View File

@ -2,6 +2,8 @@ package obidefault
var __taxonomy__ = ""
var __alternative_name__ = false
var __fail_on_taxonomy__ = false
var __update_taxid__ = false
func SelectedTaxonomy() string {
return __taxonomy__
@ -30,3 +32,27 @@ func SetSelectedTaxonomy(taxonomy string) {
// SetAlternativeNamesSelected stores alt in the package-level
// __alternative_name__ flag.
func SetAlternativeNamesSelected(alt bool) {
__alternative_name__ = alt
}
// SetFailOnTaxonomy stores fail in the package-level flag that is reported
// by FailOnTaxonomy.
func SetFailOnTaxonomy(fail bool) {
__fail_on_taxonomy__ = fail
}
// SetUpdateTaxid stores update in the package-level flag that is reported
// by UpdateTaxid.
func SetUpdateTaxid(update bool) {
__update_taxid__ = update
}
// FailOnTaxonomyPtr returns the address of the package-level
// fail-on-taxonomy flag, so that a caller (for instance a command line
// option binding) can write the flag directly.
func FailOnTaxonomyPtr() *bool {
return &__fail_on_taxonomy__
}
// UpdateTaxidPtr returns the address of the package-level update-taxid
// flag, so that a caller (for instance a command line option binding) can
// write the flag directly.
func UpdateTaxidPtr() *bool {
return &__update_taxid__
}
// FailOnTaxonomy returns the current value of the package-level
// fail-on-taxonomy flag (false unless it has been set).
func FailOnTaxonomy() bool {
return __fail_on_taxonomy__
}
// UpdateTaxid returns the current value of the package-level update-taxid
// flag (false unless it has been set).
func UpdateTaxid() bool {
return __update_taxid__
}

View File

@ -9,7 +9,6 @@ import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/buger/jsonparser"
)
@ -201,8 +200,6 @@ func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]in
}
func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
taxonomy := obitax.DefaultTaxonomy()
annotations := sequence.Annotations()
start := -1
stop := -1
@ -291,13 +288,8 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
case skey == "taxid":
if dataType == jsonparser.Number || dataType == jsonparser.String {
taxid := obiutils.UnsafeString(value)
taxon, err := taxonomy.Taxon(taxid)
if err == nil {
sequence.SetTaxon(taxon)
} else {
sequence.SetTaxid(string(value))
}
taxid := string(value)
sequence.SetTaxid(taxid)
} else {
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))
}
@ -306,15 +298,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
if dataType == jsonparser.Number || dataType == jsonparser.String {
rank, _ := obiutils.SplitInTwo(skey, '_')
taxid := obiutils.UnsafeString(value)
taxon, err := taxonomy.Taxon(taxid)
if err == nil {
taxid = taxon.String()
} else {
taxid = string(value)
}
taxid := string(value)
sequence.SetTaxid(taxid, rank)
} else {
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))

View File

@ -1,6 +1,7 @@
package obilua
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
lua "github.com/yuin/gopher-lua"
@ -98,13 +99,18 @@ func taxonomyGetCode(luaState *lua.LState) int {
// taxonomyGetTaxon is the Lua-callable lookup of a taxon by its taxid.
//
// The taxonomy is obtained through checkTaxonomy and the taxid is read as a
// string from Lua argument 2. A Lua error is raised when the lookup fails,
// or when the taxid is reported as an alias while
// obidefault.FailOnTaxonomy() is true. Otherwise the taxon, converted by
// taxon2Lua, is pushed on the Lua stack and 1 (one result) is returned.
func taxonomyGetTaxon(luaState *lua.LState) int {
taxo := checkTaxonomy(luaState)
taxid := luaState.CheckString(2)
taxon, err := taxo.Taxon(taxid)
taxon, isAlias, err := taxo.Taxon(taxid)
if err != nil {
luaState.RaiseError("%s : Error on taxon taxon: %v", taxid, err)
return 0
}
if isAlias && obidefault.FailOnTaxonomy() {
luaState.RaiseError("%s : Taxon is an alias of %s", taxid, taxon.String())
return 0
}
luaState.Push(taxon2Lua(luaState, taxon))
return 1
}

View File

@ -177,6 +177,15 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo
options.Alias("a"),
options.Description("Enable the search on all alternative names and not only scientific names."))
}
options.BoolVar(obidefault.FailOnTaxonomyPtr(), "fail-on-taxonomy",
obidefault.FailOnTaxonomy(),
options.Description("Make obitools failing on error if a used taxid is not a currently valid one"),
)
options.BoolVar(obidefault.UpdateTaxidPtr(), "update-taxid", obidefault.UpdateTaxid(),
options.Description("Make obitools automatically updating the taxid that are declared merged to a newest one."),
)
}
// CLIIsDebugMode returns whether the CLI is in debug mode.

View File

@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "f2e81ad"
var _Commit = "e2563cd"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -4,6 +4,7 @@ import (
"math"
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
log "github.com/sirupsen/logrus"
)
@ -15,7 +16,7 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma
taxonomy = taxonomy.OrDefault(true)
for taxid, v := range taxids {
t, err := taxonomy.Taxon(taxid)
t, isAlias, err := taxonomy.Taxon(taxid)
if err != nil {
log.Fatalf(
"On sequence %s taxid %s is not defined in taxonomy: %s (%v)",
@ -25,6 +26,11 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma
err,
)
}
if isAlias && obidefault.FailOnTaxonomy() {
log.Fatalf("On sequence %s taxid %s is an alias on %s",
sequence.Id(), taxid, t.String())
}
taxons[t.Node] = v
}
return taxons

View File

@ -5,6 +5,7 @@ import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
@ -16,7 +17,7 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
return nil
}
taxon, _ := taxonomy.Taxon(taxid)
taxon, _, _ := taxonomy.Taxon(taxid)
return taxon
}
@ -28,6 +29,8 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
// taxid - the taxid to set.
func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
var err error
var isAlias bool
if taxid == "" {
taxid = "NA"
} else {
@ -35,16 +38,39 @@ func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
taxon := (*obitax.Taxon)(nil)
if taxonomy != nil {
taxon, err = taxonomy.Taxon(taxid)
taxon, isAlias, err = taxonomy.Taxon(taxid)
if err != nil {
log.Warnf("%s: Taxid: %v is unknown from taxonomy (%v)",
s.Id(), taxid, err)
if obidefault.FailOnTaxonomy() {
log.Fatalf("%s: Taxid: %v is unknown from taxonomy (%v)",
s.Id(), taxid, err)
} else {
log.Warnf("%s: Taxid: %v is unknown from taxonomy (%v)",
s.Id(), taxid, err)
}
}
if isAlias {
if obidefault.FailOnTaxonomy() {
log.Fatalf("%s: Taxid: %v is an alias from taxonomy (%v) to %s",
s.Id(), taxid, taxonomy.Name(), taxon.String())
} else {
if obidefault.UpdateTaxid() {
log.Warnf("%s: Taxid: %v is updated to %s",
s.Id(), taxid, taxon.String())
taxid = taxon.String()
} else {
log.Warnf("%s: Taxid %v has to be updated to %s",
s.Id(), taxid, taxon.String())
}
}
} else {
if taxon != nil {
taxid = taxon.String()
}
}
}
if taxon != nil {
taxid = taxon.String()
}
}

View File

@ -63,7 +63,7 @@ func IsSubCladeOfSlot(taxonomy *obitax.Taxonomy, key string) SequencePredicate {
val, ok := sequence.GetStringAttribute(key)
if ok {
parent, err := taxonomy.Taxon(val)
parent, _, err := taxonomy.Taxon(val)
if err != nil {
log.Warnf("%s: %s is unkown from the taxonomy (%v)", sequence.Id(), val, err)

View File

@ -216,7 +216,7 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
}
func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
taxon, err := taxonomy.Taxon(taxid)
taxon, _, err := taxonomy.Taxon(taxid)
if err != nil {
return nil

View File

@ -91,7 +91,7 @@ func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int {
if !onlysn || classname == "scientific name" {
n++
taxon, err := taxonomy.Taxon(taxid)
taxon, _, err := taxonomy.Taxon(taxid)
if err != nil {
log.Fatalf("%s: is unknown from the taxonomy", taxid)
@ -202,7 +202,7 @@ func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) {
n = loadMergedTable(buffered, taxonomy)
log.Printf("%d merged taxa read\n", n)
root, err := taxonomy.Taxon("1")
root, _, err := taxonomy.Taxon("1")
if err != nil {
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")

View File

@ -134,7 +134,7 @@ func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) {
n = loadMergedTable(buffered, taxonomy)
log.Printf("%d merged taxa read\n", n)
root, err := taxonomy.Taxon("1")
root, _, err := taxonomy.Taxon("1")
if err != nil {
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")

View File

@ -129,28 +129,30 @@ func (taxonomy *Taxonomy) TaxidString(id string) (string, error) {
// Returns:
// - A pointer to the Taxon instance associated with the provided taxid.
// - If the taxid is unknown, the method will log a fatal error.
func (taxonomy *Taxonomy) Taxon(taxid string) (*Taxon, error) {
func (taxonomy *Taxonomy) Taxon(taxid string) (*Taxon, bool, error) {
taxonomy = taxonomy.OrDefault(false)
if taxonomy == nil {
return nil, errors.New("cannot extract taxon from nil taxonomy")
return nil, false, errors.New("cannot extract taxon from nil taxonomy")
}
id, err := taxonomy.Id(taxid)
if err != nil {
return nil, fmt.Errorf("Taxid %s: %v", taxid, err)
return nil, false, fmt.Errorf("Taxid %s: %v", taxid, err)
}
taxon := taxonomy.nodes.Get(id)
isAlias := taxon.Node.id != id
if taxon == nil {
return nil,
false,
fmt.Errorf("Taxid %s is not part of the taxonomy %s",
taxid,
taxonomy.name)
}
return taxon, nil
return taxon, isAlias, nil
}
// AsTaxonSet returns the set of taxon nodes contained within the Taxonomy.
@ -385,7 +387,7 @@ func (taxonomy *Taxonomy) InsertPathString(path []string) (*Taxonomy, error) {
}
var current *Taxon
current, err = taxonomy.Taxon(taxid)
current, _, err = taxonomy.Taxon(taxid)
if err != nil {
return nil, err
@ -396,7 +398,7 @@ func (taxonomy *Taxonomy) InsertPathString(path []string) (*Taxonomy, error) {
}
for _, id := range path[1:] {
taxon, err := taxonomy.Taxon(id)
taxon, _, err := taxonomy.Taxon(id)
if err == nil {
if !current.SameAs(taxon.Parent()) {
return nil, errors.New("path is not consistent with the taxonomy, parent mismatch")

View File

@ -248,14 +248,14 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
if len(_BelongTaxa) > 0 {
taxonomy := obitax.DefaultTaxonomy()
taxon, err := taxonomy.Taxon(_BelongTaxa[0])
taxon, _, err := taxonomy.Taxon(_BelongTaxa[0])
if err != nil {
p = obiseq.IsSubCladeOfSlot(taxonomy, _BelongTaxa[0])
} else {
p = obiseq.IsSubCladeOf(taxonomy, taxon)
}
for _, staxid := range _BelongTaxa[1:] {
taxon, err := taxonomy.Taxon(staxid)
taxon, _, err := taxonomy.Taxon(staxid)
if err != nil {
p2 = obiseq.IsSubCladeOfSlot(taxonomy, staxid)
} else {
@ -278,7 +278,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
if len(_NotBelongTaxa) > 0 {
taxonomy := obitax.DefaultTaxonomy()
taxon, err := taxonomy.Taxon(_NotBelongTaxa[0])
taxon, _, err := taxonomy.Taxon(_NotBelongTaxa[0])
if err != nil {
p = obiseq.IsSubCladeOfSlot(taxonomy, _NotBelongTaxa[0])
} else {
@ -286,7 +286,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
}
for _, taxid := range _NotBelongTaxa[1:] {
taxon, err := taxonomy.Taxon(taxid)
taxon, _, err := taxonomy.Taxon(taxid)
if err != nil {
p2 = obiseq.IsSubCladeOfSlot(taxonomy, taxid)
} else {

View File

@ -43,7 +43,7 @@ func MatchDistanceIndex(taxonomy *obitax.Taxonomy, distance int, distanceIdx map
taxon = taxonomy.Root()
} else {
var err error
taxon, err = taxonomy.Taxon(distanceIdx[keys[i]])
taxon, _, err = taxonomy.Taxon(distanceIdx[keys[i]])
if err != nil {
log.Panicf("Cannot identify taxon %s in %s (%v)", distanceIdx[keys[i]], taxonomy.Name(), err)
}
@ -197,7 +197,7 @@ func Identify(sequence *obiseq.BioSequence,
log.Panic("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
}
match_taxon, err := taxo.Taxon(identification)
match_taxon, _, err := taxo.Taxon(identification)
if err == nil {
taxon, _ = taxon.LCA(match_taxon)

View File

@ -91,7 +91,7 @@ func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) {
ts := taxonomy.NewTaxonSet()
for _, taxid := range __taxonomical_restriction__ {
tx, err := taxonomy.Taxon(taxid)
tx, _, err := taxonomy.Taxon(taxid)
if err != nil {
return nil, fmt.Errorf(