From 6a8061cc4f0dc56446886bc3f22dfff95cc00d43 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 10 Feb 2025 14:05:47 +0100 Subject: [PATCH] Add managment of the taxonomy alias politic --- cmd/obitools/obitaxonomy/main.go | 9 +++++- pkg/obidefault/taxonomy.go | 26 +++++++++++++++++ pkg/obiformats/fastseq_json_header.go | 22 ++------------- pkg/obilua/obitaxonomy.go | 8 +++++- pkg/obioptions/options.go | 9 ++++++ pkg/obioptions/version.go | 2 +- pkg/obiseq/taxonomy_lca.go | 8 +++++- pkg/obiseq/taxonomy_methods.go | 40 ++++++++++++++++++++++----- pkg/obiseq/taxonomy_predicate.go | 2 +- pkg/obitax/iterator.go | 2 +- pkg/obitax/ncbitaxdump_read.go | 4 +-- pkg/obitax/ncbitaxdump_readtar.go | 2 +- pkg/obitax/taxonomy.go | 14 ++++++---- pkg/obitools/obigrep/options.go | 8 +++--- pkg/obitools/obitag/obitag.go | 4 +-- pkg/obitools/obitaxonomy/options.go | 2 +- 16 files changed, 114 insertions(+), 48 deletions(-) diff --git a/cmd/obitools/obitaxonomy/main.go b/cmd/obitools/obitaxonomy/main.go index f26d596..f1dcd4f 100644 --- a/cmd/obitools/obitaxonomy/main.go +++ b/cmd/obitools/obitaxonomy/main.go @@ -3,6 +3,7 @@ package main import ( "os" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -52,13 +53,19 @@ func main() { case obitaxonomy.CLIRequestsPathForTaxid() != "NA": - taxon, err := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid()) + taxon, isAlias, err := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid()) if err != nil { log.Fatalf("Cannot identify the requested taxon: %s (%v)", obitaxonomy.CLIRequestsPathForTaxid(), err) } + if isAlias { + if obidefault.FailOnTaxonomy() { + log.Fatalf("Taxon %s is an alias for %s", taxon.String(), taxon.Parent().String()) + } + } + s := taxon.Path() if s == nil { diff --git a/pkg/obidefault/taxonomy.go b/pkg/obidefault/taxonomy.go index eac3ec6..f21852b 100644 --- a/pkg/obidefault/taxonomy.go +++ b/pkg/obidefault/taxonomy.go @@ -2,6 +2,8 @@ package obidefault var __taxonomy__ = "" var __alternative_name__ = false +var __fail_on_taxonomy__ = false +var __update_taxid__ = false func SelectedTaxonomy() string { return __taxonomy__ @@ -30,3 +32,27 @@ func SetSelectedTaxonomy(taxonomy string) { func SetAlternativeNamesSelected(alt bool) { __alternative_name__ = alt } + +func SetFailOnTaxonomy(fail bool) { + __fail_on_taxonomy__ = fail +} + +func SetUpdateTaxid(update bool) { + __update_taxid__ = update +} + +func FailOnTaxonomyPtr() *bool { + return &__fail_on_taxonomy__ +} + +func UpdateTaxidPtr() *bool { + return &__update_taxid__ +} + +func FailOnTaxonomy() bool { + return __fail_on_taxonomy__ +} + +func UpdateTaxid() bool { + return __update_taxid__ +} diff --git a/pkg/obiformats/fastseq_json_header.go b/pkg/obiformats/fastseq_json_header.go index 0ed53e5..661092a 100644 --- a/pkg/obiformats/fastseq_json_header.go +++ b/pkg/obiformats/fastseq_json_header.go @@ -9,7 +9,6 @@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "github.com/buger/jsonparser" ) @@ -201,8 +200,6 @@ func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]in } func _parse_json_header_(header string, sequence *obiseq.BioSequence) string { - taxonomy := obitax.DefaultTaxonomy() - annotations := sequence.Annotations() start := -1 stop := -1 @@ -291,13 +288,8 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string { case skey == "taxid": if dataType == jsonparser.Number || dataType == jsonparser.String { - taxid := obiutils.UnsafeString(value) - taxon, err := taxonomy.Taxon(taxid) - if err == nil { - sequence.SetTaxon(taxon) - } else { - sequence.SetTaxid(string(value)) - } + taxid := string(value) + sequence.SetTaxid(taxid) } else { log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value)) } @@ -306,15 +298,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string { if dataType == jsonparser.Number || dataType == jsonparser.String { rank, _ := obiutils.SplitInTwo(skey, '_') - taxid := obiutils.UnsafeString(value) - taxon, err := taxonomy.Taxon(taxid) - - if err == nil { - taxid = taxon.String() - } else { - taxid = string(value) - } - + taxid := string(value) sequence.SetTaxid(taxid, rank) } else { log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value)) diff --git a/pkg/obilua/obitaxonomy.go b/pkg/obilua/obitaxonomy.go index a4f4b3a..f244ef4 100644 --- a/pkg/obilua/obitaxonomy.go +++ b/pkg/obilua/obitaxonomy.go @@ -1,6 +1,7 @@ package obilua import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" lua "github.com/yuin/gopher-lua" @@ -98,13 +99,18 @@ func taxonomyGetCode(luaState *lua.LState) int { func taxonomyGetTaxon(luaState *lua.LState) int { taxo := checkTaxonomy(luaState) taxid := luaState.CheckString(2) - taxon, err := taxo.Taxon(taxid) + taxon, isAlias, err := taxo.Taxon(taxid) if err != nil { luaState.RaiseError("%s : Error on taxon taxon: %v", taxid, err) return 0 } + if isAlias && obidefault.FailOnTaxonomy() { + luaState.RaiseError("%s : Taxon is an alias of %s", taxid, taxon.String()) + return 0 + } + luaState.Push(taxon2Lua(luaState, taxon)) return 1 } diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index 68daa57..a1250c5 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -177,6 +177,15 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo options.Alias("a"), options.Description("Enable the search on all alternative names and not only scientific names.")) } + + options.BoolVar(obidefault.FailOnTaxonomyPtr(), "fail-on-taxonomy", + obidefault.FailOnTaxonomy(), + options.Description("Make obitools failing on error if a used taxid is not a currently valid one"), + ) + + options.BoolVar(obidefault.UpdateTaxidPtr(), "update-taxid", obidefault.UpdateTaxid(), + options.Description("Make obitools automatically updating the taxid that are declared merged to a newest one."), + ) } // CLIIsDebugMode returns whether the CLI is in debug mode. diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 0073379..c017f7d 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "f2e81ad" +var _Commit = "e2563cd" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/taxonomy_lca.go b/pkg/obiseq/taxonomy_lca.go index 0758068..4e059c7 100644 --- a/pkg/obiseq/taxonomy_lca.go +++ b/pkg/obiseq/taxonomy_lca.go @@ -4,6 +4,7 @@ import ( "math" "strings" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" log "github.com/sirupsen/logrus" ) @@ -15,7 +16,7 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma taxonomy = taxonomy.OrDefault(true) for taxid, v := range taxids { - t, err := taxonomy.Taxon(taxid) + t, isAlias, err := taxonomy.Taxon(taxid) if err != nil { log.Fatalf( "On sequence %s taxid %s is not defined in taxonomy: %s (%v)", @@ -25,6 +26,11 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma err, ) } + + if isAlias && obidefault.FailOnTaxonomy() { + log.Fatalf("On sequence %s taxid %s is an alias on %s", + sequence.Id(), taxid, t.String()) + } taxons[t.Node] = v } return taxons diff --git a/pkg/obiseq/taxonomy_methods.go b/pkg/obiseq/taxonomy_methods.go index 37edb63..1518436 100644 --- a/pkg/obiseq/taxonomy_methods.go +++ b/pkg/obiseq/taxonomy_methods.go @@ -5,6 +5,7 @@ import ( log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) @@ -16,7 +17,7 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon { return nil } - taxon, _ := taxonomy.Taxon(taxid) + taxon, _, _ := taxonomy.Taxon(taxid) return taxon } @@ -28,6 +29,8 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon { // taxid - the taxid to set. func (s *BioSequence) SetTaxid(taxid string, rank ...string) { var err error + var isAlias bool + if taxid == "" { taxid = "NA" } else { @@ -35,16 +38,39 @@ func (s *BioSequence) SetTaxid(taxid string, rank ...string) { taxon := (*obitax.Taxon)(nil) if taxonomy != nil { - taxon, err = taxonomy.Taxon(taxid) + taxon, isAlias, err = taxonomy.Taxon(taxid) if err != nil { - log.Warnf("%s: Taxid: %v is unknown from taxonomy (%v)", - s.Id(), taxid, err) + if obidefault.FailOnTaxonomy() { + log.Fatalf("%s: Taxid: %v is unknown from taxonomy (%v)", + s.Id(), taxid, err) + } else { + log.Warnf("%s: Taxid: %v is unknown from taxonomy (%v)", + s.Id(), taxid, err) + } + } + + if isAlias { + if obidefault.FailOnTaxonomy() { + log.Fatalf("%s: Taxid: %v is an alias from taxonomy (%v) to %s", + s.Id(), taxid, taxonomy.Name(), taxon.String()) + } else { + if obidefault.UpdateTaxid() { + log.Warnf("%s: Taxid: %v is updated to %s", + s.Id(), taxid, taxon.String()) + taxid = taxon.String() + } else { + log.Warnf("%s: Taxid %v has to be updated to %s", + s.Id(), taxid, taxon.String()) + } + } + + } else { + if taxon != nil { + taxid = taxon.String() + } } - } - if taxon != nil { - taxid = taxon.String() } } diff --git a/pkg/obiseq/taxonomy_predicate.go b/pkg/obiseq/taxonomy_predicate.go index 25ac551..55540db 100644 --- a/pkg/obiseq/taxonomy_predicate.go +++ b/pkg/obiseq/taxonomy_predicate.go @@ -63,7 +63,7 @@ func IsSubCladeOfSlot(taxonomy *obitax.Taxonomy, key string) SequencePredicate { val, ok := sequence.GetStringAttribute(key) if ok { - parent, err := taxonomy.Taxon(val) + parent, _, err := taxonomy.Taxon(val) if err != nil { log.Warnf("%s: %s is unkown from the taxonomy (%v)", sequence.Id(), val, err) diff --git a/pkg/obitax/iterator.go b/pkg/obitax/iterator.go index d703d81..42a8058 100644 --- a/pkg/obitax/iterator.go +++ b/pkg/obitax/iterator.go @@ -216,7 +216,7 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon { } func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon { - taxon, err := taxonomy.Taxon(taxid) + taxon, _, err := taxonomy.Taxon(taxid) if err != nil { return nil diff --git a/pkg/obitax/ncbitaxdump_read.go b/pkg/obitax/ncbitaxdump_read.go index a70321c..dd97ab1 100644 --- a/pkg/obitax/ncbitaxdump_read.go +++ b/pkg/obitax/ncbitaxdump_read.go @@ -91,7 +91,7 @@ func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int { if !onlysn || classname == "scientific name" { n++ - taxon, err := taxonomy.Taxon(taxid) + taxon, _, err := taxonomy.Taxon(taxid) if err != nil { log.Fatalf("%s: is unknown from the taxonomy", taxid) @@ -202,7 +202,7 @@ func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) { n = loadMergedTable(buffered, taxonomy) log.Printf("%d merged taxa read\n", n) - root, err := taxonomy.Taxon("1") + root, _, err := taxonomy.Taxon("1") if err != nil { log.Fatal("cannot find the root taxon (1) in the NCBI tax dump") diff --git a/pkg/obitax/ncbitaxdump_readtar.go b/pkg/obitax/ncbitaxdump_readtar.go index 594d879..9e91a8d 100644 --- a/pkg/obitax/ncbitaxdump_readtar.go +++ b/pkg/obitax/ncbitaxdump_readtar.go @@ -134,7 +134,7 @@ func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) { n = loadMergedTable(buffered, taxonomy) log.Printf("%d merged taxa read\n", n) - root, err := taxonomy.Taxon("1") + root, _, err := taxonomy.Taxon("1") if err != nil { log.Fatal("cannot find the root taxon (1) in the NCBI tax dump") diff --git a/pkg/obitax/taxonomy.go b/pkg/obitax/taxonomy.go index 31ad345..9860236 100644 --- a/pkg/obitax/taxonomy.go +++ b/pkg/obitax/taxonomy.go @@ -129,28 +129,30 @@ func (taxonomy *Taxonomy) TaxidString(id string) (string, error) { // Returns: // - A pointer to the Taxon instance associated with the provided taxid. // - If the taxid is unknown, the method will log a fatal error. -func (taxonomy *Taxonomy) Taxon(taxid string) (*Taxon, error) { +func (taxonomy *Taxonomy) Taxon(taxid string) (*Taxon, bool, error) { taxonomy = taxonomy.OrDefault(false) if taxonomy == nil { - return nil, errors.New("cannot extract taxon from nil taxonomy") + return nil, false, errors.New("cannot extract taxon from nil taxonomy") } id, err := taxonomy.Id(taxid) if err != nil { - return nil, fmt.Errorf("Taxid %s: %v", taxid, err) + return nil, false, fmt.Errorf("Taxid %s: %v", taxid, err) } taxon := taxonomy.nodes.Get(id) + isAlias := taxon.Node.id != id if taxon == nil { return nil, + false, fmt.Errorf("Taxid %s is not part of the taxonomy %s", taxid, taxonomy.name) } - return taxon, nil + return taxon, isAlias, nil } // AsTaxonSet returns the set of taxon nodes contained within the Taxonomy. @@ -385,7 +387,7 @@ func (taxonomy *Taxonomy) InsertPathString(path []string) (*Taxonomy, error) { } var current *Taxon - current, err = taxonomy.Taxon(taxid) + current, _, err = taxonomy.Taxon(taxid) if err != nil { return nil, err @@ -396,7 +398,7 @@ func (taxonomy *Taxonomy) InsertPathString(path []string) (*Taxonomy, error) { } for _, id := range path[1:] { - taxon, err := taxonomy.Taxon(id) + taxon, _, err := taxonomy.Taxon(id) if err == nil { if !current.SameAs(taxon.Parent()) { return nil, errors.New("path is not consistent with the taxonomy, parent mismatch") diff --git a/pkg/obitools/obigrep/options.go b/pkg/obitools/obigrep/options.go index abaa812..f1769ea 100644 --- a/pkg/obitools/obigrep/options.go +++ b/pkg/obitools/obigrep/options.go @@ -248,14 +248,14 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate { if len(_BelongTaxa) > 0 { taxonomy := obitax.DefaultTaxonomy() - taxon, err := taxonomy.Taxon(_BelongTaxa[0]) + taxon, _, err := taxonomy.Taxon(_BelongTaxa[0]) if err != nil { p = obiseq.IsSubCladeOfSlot(taxonomy, _BelongTaxa[0]) } else { p = obiseq.IsSubCladeOf(taxonomy, taxon) } for _, staxid := range _BelongTaxa[1:] { - taxon, err := taxonomy.Taxon(staxid) + taxon, _, err := taxonomy.Taxon(staxid) if err != nil { p2 = obiseq.IsSubCladeOfSlot(taxonomy, staxid) } else { @@ -278,7 +278,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate { if len(_NotBelongTaxa) > 0 { taxonomy := obitax.DefaultTaxonomy() - taxon, err := taxonomy.Taxon(_NotBelongTaxa[0]) + taxon, _, err := taxonomy.Taxon(_NotBelongTaxa[0]) if err != nil { p = obiseq.IsSubCladeOfSlot(taxonomy, _NotBelongTaxa[0]) } else { @@ -286,7 +286,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate { } for _, taxid := range _NotBelongTaxa[1:] { - taxon, err := taxonomy.Taxon(taxid) + taxon, _, err := taxonomy.Taxon(taxid) if err != nil { p2 = obiseq.IsSubCladeOfSlot(taxonomy, taxid) } else { diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 1d1b791..ea72003 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -43,7 +43,7 @@ func MatchDistanceIndex(taxonomy *obitax.Taxonomy, distance int, distanceIdx map taxon = taxonomy.Root() } else { var err error - taxon, err = taxonomy.Taxon(distanceIdx[keys[i]]) + taxon, _, err = taxonomy.Taxon(distanceIdx[keys[i]]) if err != nil { log.Panicf("Cannot identify taxon %s in %s (%v)", distanceIdx[keys[i]], taxonomy.Name(), err) } @@ -197,7 +197,7 @@ func Identify(sequence *obiseq.BioSequence, log.Panic("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d) } - match_taxon, err := taxo.Taxon(identification) + match_taxon, _, err := taxo.Taxon(identification) if err == nil { taxon, _ = taxon.LCA(match_taxon) diff --git a/pkg/obitools/obitaxonomy/options.go b/pkg/obitools/obitaxonomy/options.go index 18dc552..27bf15c 100644 --- a/pkg/obitools/obitaxonomy/options.go +++ b/pkg/obitools/obitaxonomy/options.go @@ -91,7 +91,7 @@ func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) { ts := taxonomy.NewTaxonSet() for _, taxid := range __taxonomical_restriction__ { - tx, err := taxonomy.Taxon(taxid) + tx, _, err := taxonomy.Taxon(taxid) if err != nil { return nil, fmt.Errorf(