mirror of https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00

Patch a small bug on json write
@@ -2,18 +2,208 @@ package obiformats

import (
    "bytes"
    "math"
    "strconv"
    "strings"
    "unsafe"

    log "github.com/sirupsen/logrus"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
    "github.com/goccy/go-json"
    "github.com/buger/jsonparser"
)

func _parse_json_header_(header string, annotations obiseq.Annotation) string {
func _parse_json_map_string(str []byte, sequence *obiseq.BioSequence) (map[string]string, error) {
    values := make(map[string]string)
    jsonparser.ObjectEach(str,
        func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
            skey := string(key)
            values[skey] = string(value)
            return
        },
    )
    return values, nil
}

func _parse_json_map_int(str []byte, sequence *obiseq.BioSequence) (map[string]int, error) {
    values := make(map[string]int)
    jsonparser.ObjectEach(str,
        func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
            skey := string(key)
            intval, err := jsonparser.ParseInt(value)
            if err != nil {
                return err
            }
            values[skey] = int(intval)
            return nil
        },
    )
    return values, nil
}

func _parse_json_map_float(str []byte, sequence *obiseq.BioSequence) (map[string]float64, error) {
    values := make(map[string]float64)
    jsonparser.ObjectEach(str,
        func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
            skey := string(key)
            floatval, err := strconv.ParseFloat(obiutils.UnsafeString(value), 64)
            if err != nil {
                return err
            }
            values[skey] = float64(floatval)
            return nil
        },
    )
    return values, nil
}

func _parse_json_map_bool(str []byte, sequence *obiseq.BioSequence) (map[string]bool, error) {
    values := make(map[string]bool)
    jsonparser.ObjectEach(str,
        func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
            skey := string(key)
            boolval, err := jsonparser.ParseBoolean(value)
            if err != nil {
                return err
            }
            values[skey] = boolval
            return nil
        },
    )
    return values, nil
}

func _parse_json_map_interface(str []byte, sequence *obiseq.BioSequence) (map[string]interface{}, error) {
    values := make(map[string]interface{})
    jsonparser.ObjectEach(str,
        func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
            skey := string(key)
            switch dataType {
            case jsonparser.String:
                values[skey] = string(value)
            case jsonparser.Number:
                // Try to parse the number as an int at first then as float if that fails.
                values[skey], err = jsonparser.ParseInt(value)
                if err != nil {
                    values[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64)
                }
                if err != nil {
                    return
                }
            case jsonparser.Boolean:
            default:
                values[skey] = string(value)
            }
            return
        },
    )
    return values, nil
}

func _parse_json_array_string(str []byte, sequence *obiseq.BioSequence) ([]string, error) {
    values := make([]string, 0)
    jsonparser.ArrayEach(str,
        func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
            if dataType == jsonparser.String {
                skey := string(value)
                values = append(values, skey)
            }
        },
    )
    return values, nil
}

func _parse_json_array_int(str []byte, sequence *obiseq.BioSequence) ([]int, error) {
    values := make([]int, 0)
    jsonparser.ArrayEach(str,
        func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
            if dataType == jsonparser.Number {
                intval, err := jsonparser.ParseInt(value)
                if err != nil {
                    log.Fatalf("%s: Parsing int failed on value %s: %s", sequence.Id(), value, err)
                }
                values = append(values, int(intval))
            }
        },
    )
    return values, nil
}

func _parse_json_array_float(str []byte, sequence *obiseq.BioSequence) ([]float64, error) {
    values := make([]float64, 0)
    jsonparser.ArrayEach(str,
        func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
            if dataType == jsonparser.Number {
                floatval, err := strconv.ParseFloat(obiutils.UnsafeString(value), 64)
                if err == nil {
                    values = append(values, float64(floatval))
                } else {
                    log.Fatalf("%s: Parsing float failed on value %s: %s", sequence.Id(), value, err)
                }
            }
        },
    )
    return values, nil
}

func _parse_json_array_bool(str []byte, sequence *obiseq.BioSequence) ([]bool, error) {
    values := make([]bool, 0)
    jsonparser.ArrayEach(str,
        func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
            if dataType == jsonparser.Boolean {
                boolval, err := jsonparser.ParseBoolean(value)
                if err != nil {
                    log.Fatalf("%s: Parsing bool failed on value %s: %s", sequence.Id(), value, err)
                }
                values = append(values, boolval)
            }
        },
    )
    return values, nil
}

func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]interface{}, error) {
    values := make([]interface{}, 0)
    jsonparser.ArrayEach(str,
        func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
            switch dataType {
            case jsonparser.String:
                values = append(values, string(value))
            case jsonparser.Number:
                // Try to parse the number as an int at first then as float if that fails.
                intval, err := jsonparser.ParseInt(value)
                if err != nil {
                    floatval, err := strconv.ParseFloat(obiutils.UnsafeString(value), 64)
                    if err != nil {
                        values = append(values, string(value))
                    } else {
                        values = append(values, floatval)
                    }
                } else {
                    values = append(values, intval)
                }
            case jsonparser.Boolean:
                boolval, err := jsonparser.ParseBoolean(value)
                if err != nil {
                    values = append(values, string(value))
                } else {
                    values = append(values, boolval)
                }

            default:
                values = append(values, string(value))
            }

        },
    )
    return values, nil
}

func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
    taxonomy := obitax.DefaultTaxonomy()

    annotations := sequence.Annotations()
    start := -1
    stop := -1
    level := 0
@@ -51,23 +241,136 @@ func _parse_json_header_(header string, annotations obiseq.Annotation) string {

    stop++

    err := json.Unmarshal([]byte(header)[start:stop], &annotations)
    jsonparser.ObjectEach(obiutils.UnsafeBytes(header[start:stop]),
        func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) error {
            var err error

    for k, v := range annotations {
        switch vt := v.(type) {
        case float64:
            if vt == math.Floor(vt) {
                annotations[k] = int(vt)
            }
            {
                annotations[k] = vt
            }
        }
    }
            skey := obiutils.UnsafeString(key)

    if err != nil {
        log.Fatalf("annotation parsing error on %s : %v\n", header, err)
    }
            switch {
            case skey == "id":
                sequence.SetId(string(value))
            case skey == "definition":
                sequence.SetDefinition(string(value))

            case skey == "count":
                if dataType != jsonparser.Number {
                    log.Fatalf("%s: Count attribute must be numeric: %s", sequence.Id(), string(value))
                }
                count, err := jsonparser.ParseInt(value)
                if err != nil {
                    log.Fatalf("%s: Cannot parse count %s", sequence.Id(), string(value))
                }
                sequence.SetCount(int(count))

            case skey == "obiclean_weight":
                weight, err := _parse_json_map_int(value, sequence)
                if err != nil {
                    log.Fatalf("%s: Cannot parse obiclean weight %s", sequence.Id(), string(value))
                }
                annotations[skey] = weight

            case skey == "obiclean_status":
                status, err := _parse_json_map_string(value, sequence)
                if err != nil {
                    log.Fatalf("%s: Cannot parse obiclean status %s", sequence.Id(), string(value))
                }
                annotations[skey] = status

            case strings.HasPrefix(skey, "merged_"):
                if dataType == jsonparser.Object {
                    data, err := _parse_json_map_int(value, sequence)
                    if err != nil {
                        log.Fatalf("%s: Cannot parse merged slot %s: %v", sequence.Id(), skey, err)
                    } else {
                        annotations[skey] = data
                    }
                } else {
                    log.Fatalf("%s: Cannot parse merged slot %s", sequence.Id(), skey)
                }

case skey == "taxid":
|
||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||
taxid := obiutils.UnsafeString(value)
|
||||
taxon := taxonomy.Taxon(taxid)
|
||||
if taxon != nil {
|
||||
sequence.SetTaxon(taxon)
|
||||
} else {
|
||||
sequence.SetTaxid(string(value))
|
||||
}
|
||||
} else {
|
||||
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))
|
||||
}
|
||||
|
||||
case strings.HasSuffix(skey, "_taxid"):
|
||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||
rank, _ := obiutils.SplitInTwo(skey, '_')
|
||||
|
||||
taxid := obiutils.UnsafeString(value)
|
||||
taxon := taxonomy.Taxon(taxid)
|
||||
|
||||
if taxon != nil {
|
||||
taxid = taxon.String()
|
||||
} else {
|
||||
taxid = string(value)
|
||||
}
|
||||
|
||||
sequence.SetTaxid(taxid, rank)
|
||||
} else {
|
||||
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))
|
||||
}
|
||||
|
||||
            default:
                skey = strings.Clone(skey)
                switch dataType {
                case jsonparser.String:
                    annotations[skey] = string(value)
                case jsonparser.Number:
                    // Try to parse the number as an int at first then as float if that fails.
                    annotations[skey], err = jsonparser.ParseInt(value)
                    if err != nil {
                        annotations[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64)
                    }
                case jsonparser.Array:
                    annotations[skey], err = _parse_json_array_interface(value, sequence)
                case jsonparser.Object:
                    annotations[skey], err = _parse_json_map_interface(value, sequence)
                case jsonparser.Boolean:
                    annotations[skey], err = jsonparser.ParseBoolean(value)
                case jsonparser.Null:
                    annotations[skey] = nil
                default:
                    log.Fatalf("Unknown data type %v", dataType)
                }
            }

            if err != nil {
                annotations[skey] = "NaN"
                log.Fatalf("%s: Cannot parse value %s associated to key %s into a %s value",
                    sequence.Id(), string(value), skey, dataType.String())
            }

            return err
        },
    )

    // err := json.Unmarshal([]byte(header)[start:stop], &annotations)

    // for k, v := range annotations {
    //     switch vt := v.(type) {
    //     case float64:
    //         if vt == math.Floor(vt) {
    //             annotations[k] = int(vt)
    //         }
    //         {
    //             annotations[k] = vt
    //         }
    //     }
    // }

    // if err != nil {
    //     log.Fatalf("annotation parsing error on %s : %v\n", header, err)
    // }

    return strings.TrimSpace(header[stop:])
}
@@ -78,7 +381,9 @@ func ParseFastSeqJsonHeader(sequence *obiseq.BioSequence) {

    definition_part := _parse_json_header_(
        definition,
        sequence.Annotations())
        sequence,
    )

    if len(definition_part) > 0 {
        if sequence.HasDefinition() {
            definition_part = sequence.Definition() + " " + definition_part
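Editor's note: the reworked header parser above is built on buger/jsonparser's ObjectEach callback and its "int first, float as fallback" number handling. The standalone sketch below is illustrative only and not part of the commit; the sample JSON and variable names are invented, only the jsonparser calls mirror the code above.

package main

import (
    "fmt"
    "strconv"

    "github.com/buger/jsonparser"
)

func main() {
    header := []byte(`{"count": 4, "score": 0.83, "sample": "A01", "merged_sample": {"A01": 3, "B02": 1}}`)

    // Walk every key/value pair of the JSON object; value holds the raw bytes of each field.
    jsonparser.ObjectEach(header,
        func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) error {
            switch dataType {
            case jsonparser.Number:
                // Same strategy as _parse_json_header_: try int first, fall back to float.
                if i, err := jsonparser.ParseInt(value); err == nil {
                    fmt.Printf("%s -> int %d\n", key, i)
                } else if f, err := strconv.ParseFloat(string(value), 64); err == nil {
                    fmt.Printf("%s -> float %g\n", key, f)
                }
            case jsonparser.Object:
                fmt.Printf("%s -> nested object %s\n", key, value)
            default:
                fmt.Printf("%s -> %s (%s)\n", key, value, dataType.String())
            }
            return nil
        },
    )
}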
@@ -15,6 +15,19 @@ import (
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

// loadNodeTable reads a node table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The node table is expected to be in CSV format with a custom delimiter ('|') and comments
// starting with '#'. Each record in the table represents a taxon with its taxid, parent taxid,
// and rank.
//
// Parameters:
//   - reader: An io.Reader from which the node table is read.
//   - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon data will be added.
//
// The function reads each record from the input, trims whitespace from the taxid, parent, and rank,
// and adds the taxon to the taxonomy. If an error occurs while adding a taxon, the function logs
// a fatal error and terminates the program.
func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) {
    file := csv.NewReader(reader)
    file.Comma = '|'
@@ -38,6 +51,21 @@ func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) {
    }
}

// loadNameTable reads a name table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The name table is expected to be in a custom format with fields separated by the '|' character.
// Each record in the table represents a taxon with its taxid, name, and class name.
//
// Parameters:
//   - reader: An io.Reader from which the name table is read.
//   - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon names will be set.
//   - onlysn: A boolean flag indicating whether to only process records with the class name "scientific name".
//
// Returns:
//
// The number of taxon names successfully loaded into the taxonomy. If a line is too long, -1 is returned.
// The function processes each line, trims whitespace from the taxid, name, and class name, and sets
// the name in the taxonomy if the conditions are met.
func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int {
    // file := csv.NewReader(reader)
    // file.Comma = '|'
@@ -71,6 +99,19 @@ func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int
    return n
}

// loadMergedTable reads a merged table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The merged table is expected to be in CSV format with a custom delimiter ('|') and comments
// starting with '#'. Each record in the table represents a mapping between an old taxid and a new taxid.
//
// Parameters:
//   - reader: An io.Reader from which the merged table is read.
//   - taxonomy: A pointer to an obitax.Taxonomy instance where the alias mappings will be added.
//
// Returns:
//
// The number of alias mappings successfully loaded into the taxonomy. The function processes
// each record, trims whitespace from the old and new taxid, and adds the alias to the taxonomy.
func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int {
    file := csv.NewReader(reader)
    file.Comma = '|'
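Editor's note: the loaders above describe their input as '|'-delimited records with '#' comment lines and whitespace padding around each field. The self-contained sketch below shows that reading pattern with encoding/csv; the sample data and field layout are illustrative, not the actual taxdump contents.

package main

import (
    "encoding/csv"
    "fmt"
    "strings"
)

func main() {
    // Two fake node records in the nodes.dmp style: taxid | parent taxid | rank | ...
    data := "# comment lines start with '#'\n" +
        "1\t|\t1\t|\tno rank\t|\n" +
        "2\t|\t131567\t|\tsuperkingdom\t|\n"

    file := csv.NewReader(strings.NewReader(data))
    file.Comma = '|'   // custom delimiter, as set in loadNodeTable
    file.Comment = '#' // skip comment lines

    records, err := file.ReadAll()
    if err != nil {
        panic(err)
    }

    for _, record := range records {
        // Fields keep their tab padding, hence the TrimSpace calls, as in the loaders above.
        taxid := strings.TrimSpace(record[0])
        parent := strings.TrimSpace(record[1])
        rank := strings.TrimSpace(record[2])
        fmt.Printf("taxon %s (parent %s, rank %s)\n", taxid, parent, rank)
    }
}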
@@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This induces that the version
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "795df34"
var _Commit = "abfa8f3"
var _Version = "Release 4.2.0"

// Version returns the version of the obitools package.
@@ -19,6 +19,8 @@ func TaxonomyClassifier(taxonomicRank string,
    taxonomy *obitax.Taxonomy,
    abortOnMissing bool) *BioSequenceClassifier {

    taxonomy = taxonomy.OrDefault(true)

    keys := make(map[*obitax.TaxNode]int)
    codes := make([]*obitax.TaxNode, 1)
    codes[0] = nil
@@ -12,6 +12,8 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma
    taxids := sequence.StatsOn(MakeStatsOnDescription("taxid"), "na")
    taxons := make(map[*obitax.TaxNode]int, len(taxids))

    taxonomy = taxonomy.OrDefault(true)

    for taxid, v := range taxids {
        t := taxonomy.Taxon(taxid)
        if t == nil {
@@ -27,6 +29,9 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma
    }
}

func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) (*obitax.Taxon, float64, int) {

    taxonomy = taxonomy.OrDefault(true)

    taxons := sequence.TaxonomicDistribution(taxonomy)
    paths := make(map[*obitax.TaxNode]*obitax.TaxonSlice, len(taxons))
    answer := (*obitax.TaxNode)(nil)
@@ -34,11 +39,11 @@ func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) (
    granTotal := 0

    for t, w := range taxons {
        p := (&obitax.Taxon{Taxonomy: taxonomy,
            Node: t,
        }).Path()
        taxon := &obitax.Taxon{Taxonomy: taxonomy, Node: t}
        p := taxon.Path()

        if p == nil {
            log.Panicf("Sequence %s: taxonomic path cannot be retrieved from Taxid %d : %v", sequence.Id(), t.String(taxonomy.Code()))
            log.Panicf("Sequence %s: taxonomic path cannot be retrieved from Taxid : %s", sequence.Id(), taxon.String())
        }

        p.Reverse(true)
@@ -103,6 +108,8 @@ func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) (

func AddLCAWorker(taxonomy *obitax.Taxonomy, slot_name string, threshold float64) SeqWorker {

    taxonomy = taxonomy.OrDefault(true)

    if !strings.HasSuffix(slot_name, "taxid") {
        slot_name = slot_name + "_taxid"
    }
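Editor's note: several hunks in this commit replace an explicitly loaded taxonomy with `taxonomy = taxonomy.OrDefault(true)`. The obitax implementation of OrDefault is not shown in this diff; the standalone sketch below only illustrates the nil-safe "fall back to a package default" pattern such a method typically follows, with assumed names and behaviour.

package main

import "log"

// Taxonomy is a simplified stand-in type for the purpose of this sketch.
type Taxonomy struct{ name string }

var defaultTaxonomy *Taxonomy

// OrDefault returns the receiver when it is non-nil, otherwise the package-level
// default. With required == true, a missing default is treated as a fatal error.
func (t *Taxonomy) OrDefault(required bool) *Taxonomy {
    if t != nil {
        return t
    }
    if defaultTaxonomy == nil && required {
        log.Fatal("no taxonomy loaded and no default available")
    }
    return defaultTaxonomy
}

func main() {
    defaultTaxonomy = &Taxonomy{name: "ncbi"}

    var t *Taxonomy       // nil: the caller did not provide a taxonomy
    t = t.OrDefault(true) // falls back to the package default
    log.Printf("using taxonomy %q", t.name)
}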
@@ -9,6 +9,7 @@ import (
)

func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {

    taxid := s.Taxid()
    if taxid == "NA" {
        return nil
@@ -21,16 +22,39 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
// Parameters:
//
//   taxid - the taxid to set.
func (s *BioSequence) SetTaxid(taxid string) {
func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
    if taxid == "" {
        taxid = "NA"
    } else {
        taxonomy := obitax.DefaultTaxonomy()
        taxon := (*obitax.Taxon)(nil)

        if taxonomy != nil {
            taxon = taxonomy.Taxon(taxid)
        }

        if taxon != nil {
            taxid = taxon.String()
        }
    }

    if len(rank) > 0 {
        r := rank[0]
        s.SetAttribute(r+"_taxid", taxid)
    } else {
        s.SetAttribute("taxid", taxid)
    }
    s.SetAttribute("taxid", taxid)
}

func (s *BioSequence) SetTaxon(taxon *obitax.Taxon) {
func (s *BioSequence) SetTaxon(taxon *obitax.Taxon, rank ...string) {
    taxid := taxon.String()
    s.SetTaxid(taxid)

    if len(rank) > 0 {
        r := rank[0]
        s.SetAttribute(r+"_taxid", taxid)
    } else {
        s.SetAttribute("taxid", taxid)
    }
}

// Taxid returns the taxonomic ID associated with the BioSequence.
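Editor's note: SetTaxid and SetTaxon now take an optional rank through a variadic string parameter and route the value either to the generic `taxid` attribute or to a rank-specific `<rank>_taxid` slot. The minimal sketch below only illustrates that variadic-optional-argument pattern; the type and attribute store are simplified stand-ins, not the actual obiseq implementation.

package main

import "fmt"

// Sequence is a simplified stand-in with a plain map as its attribute store.
type Sequence struct{ attributes map[string]string }

func (s *Sequence) SetAttribute(key, value string) {
    if s.attributes == nil {
        s.attributes = make(map[string]string)
    }
    s.attributes[key] = value
}

// SetTaxid stores the taxid either under "taxid" or, when a rank is given,
// under "<rank>_taxid", mirroring the optional-rank signature added above.
func (s *Sequence) SetTaxid(taxid string, rank ...string) {
    if len(rank) > 0 {
        s.SetAttribute(rank[0]+"_taxid", taxid)
    } else {
        s.SetAttribute("taxid", taxid)
    }
}

func main() {
    s := &Sequence{}
    s.SetTaxid("taxon:9606 [Homo sapiens]@species")           // default "taxid" slot
    s.SetTaxid("taxon:9604 [Hominidae]@family", "family")     // rank-specific "family_taxid" slot
    fmt.Println(s.attributes)
}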
@@ -35,13 +35,15 @@ type TaxNode struct {
// - taxonomyCode: A string representing the code of the taxonomy to which the node belongs.
//
// Returns:
//   - A formatted string representing the TaxNode in the form "taxonomyCode:id [scientificName]".
//   - A formatted string representing the TaxNode in the form "taxonomyCode:id [scientificName]@rank".
func (node *TaxNode) String(taxonomyCode string) string {
    if node.HasScientificName() {
        return fmt.Sprintf("%s:%v [%s]",
        return fmt.Sprintf("%s:%v [%s]@%s",
            taxonomyCode,
            *node.id,
            node.ScientificName())
            node.ScientificName(),
            node.Rank(),
        )
    }

    return fmt.Sprintf("%s:%v",
@@ -273,31 +273,31 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
    }

    if CLIHasTaxonAtRank() {
        taxo := obigrep.CLILoadSelectedTaxonomy()
        taxo := obitax.DefaultTaxonomy()
        w := AddTaxonAtRankWorker(taxo, CLITaxonAtRank()...)
        annotator = annotator.ChainWorkers(w)
    }

    if CLISetTaxonomicPath() {
        taxo := obigrep.CLILoadSelectedTaxonomy()
        taxo := obitax.DefaultTaxonomy()
        w := obiseq.MakeSetPathWorker(taxo)
        annotator = annotator.ChainWorkers(w)
    }

    if CLISetTaxonomicRank() {
        taxo := obigrep.CLILoadSelectedTaxonomy()
        taxo := obitax.DefaultTaxonomy()
        w := AddTaxonRankWorker(taxo)
        annotator = annotator.ChainWorkers(w)
    }

    if CLISetScientificName() {
        taxo := obigrep.CLILoadSelectedTaxonomy()
        taxo := obitax.DefaultTaxonomy()
        w := AddScientificNameWorker(taxo)
        annotator = annotator.ChainWorkers(w)
    }

    if CLIHasAddLCA() {
        taxo := obigrep.CLILoadSelectedTaxonomy()
        taxo := obitax.DefaultTaxonomy()
        w := obiseq.AddLCAWorker(taxo, CLILCASlotName(), CLILCAThreshold())
        annotator = annotator.ChainWorkers(w)
    }
@@ -11,6 +11,7 @@ import (
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obigrep"
)

@@ -245,7 +246,7 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
        log.Fatal(err)
    }

    taxonomy := obigrep.CLILoadSelectedTaxonomy()
    taxonomy := obitax.DefaultTaxonomy()

    if len(obigrep.CLIRequiredRanks()) > 0 {
        rankPredicate = obigrep.CLIHasRankDefinedPredicate()
@@ -3,6 +3,7 @@ package obiconvert
import (
    "os"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
    log "github.com/sirupsen/logrus"

    "github.com/DavidGamba/go-getoptions"
@@ -115,6 +116,7 @@ func PairedFilesOptionSet(options *getoptions.GetOpt) {
}

func OptionSet(options *getoptions.GetOpt) {
    obioptions.LoadTaxonomyOptionSet(options, false, false)
    InputOptionSet(options)
    OutputOptionSet(options)
    PairedFilesOptionSet(options)
@@ -6,7 +6,7 @@ import (
    log "github.com/sirupsen/logrus"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
@@ -33,7 +33,6 @@ var _Predicats = make([]string, 0)
var _IdList = ""

var _Taxdump = ""
var _Taxonomy = (*obitax.Taxonomy)(nil)

var _RequiredAttributes = make([]string, 0)
var _AttributePatterns = make(map[string]string, 0)
@@ -49,10 +48,7 @@ var _pattern_indel = false
var _pattern_only_forward = false

func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {

    options.StringVar(&_Taxdump, "taxdump", _Taxdump,
        options.Alias("t"),
        options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
    obioptions.LoadTaxonomyOptionSet(options, false, false)

    options.StringSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
        options.Alias("r"),
@@ -246,31 +242,12 @@ func CLIPatternBothStrand() bool {
    return !_pattern_only_forward
}

func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
    if CLISelectedNCBITaxDump() != "" {
        if _Taxonomy == nil {
            var err error
            _Taxonomy, err = ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), true)
            if err != nil {
                log.Fatalf("cannot load taxonomy %s : %v",
                    CLISelectedNCBITaxDump(), err)
                return nil
            }
        }
        return _Taxonomy
    }

    log.Fatalln("no NCBI taxdump selected using option -t|--taxdump")

    return nil
}

func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
    var p obiseq.SequencePredicate
    var p2 obiseq.SequencePredicate

    if len(_BelongTaxa) > 0 {
        taxonomy := CLILoadSelectedTaxonomy()
        taxonomy := obitax.DefaultTaxonomy()

        taxon := taxonomy.Taxon(_BelongTaxa[0])
        if taxon == nil {
@@ -300,7 +277,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
    var p2 obiseq.SequencePredicate

    if len(_NotBelongTaxa) > 0 {
        taxonomy := CLILoadSelectedTaxonomy()
        taxonomy := obitax.DefaultTaxonomy()

        taxon := taxonomy.Taxon(_NotBelongTaxa[0])
        if taxon == nil {
@@ -329,7 +306,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {

    if len(_RequiredRanks) > 0 {
        taxonomy := CLILoadSelectedTaxonomy()
        taxonomy := obitax.DefaultTaxonomy()
        p := obiseq.HasRequiredRank(taxonomy, _RequiredRanks[0])

        for _, rank := range _RequiredRanks[1:] {
@@ -70,11 +70,7 @@ func GeomIndexSesquence(seqidx int,
        new_lca, _ := lca.LCA(taxa.Taxon(o))
        if new_lca.SameAs(lca) {
            lca = new_lca
            index[int(seq_dist[o])] = fmt.Sprintf(
                "%s@%s",
                lca.String(),
                lca.Rank(),
            )
            index[int(seq_dist[o])] = lca.String()

            if lca.IsRoot() {
                break
@@ -1,7 +1,6 @@
package obirefidx

import (
    "fmt"
    "os"

    log "github.com/sirupsen/logrus"
@@ -172,11 +171,7 @@ func IndexSequence(seqidx int,
    for i, d := range closest {
        if i < (len(closest)-1) && d < closest[i+1] {
            current_taxon := pseq.Taxon(i)
            obitag_index[d] = fmt.Sprintf(
                "%s@%s",
                current_taxon.String(),
                current_taxon.Rank(),
            )
            obitag_index[d] = current_taxon.String()
        }
    }

@@ -197,9 +192,10 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
    source, references := iterator.Load()
    log.Infof("Done. Database contains %d sequences", len(references))

    taxo, error := obioptions.CLILoadSelectedTaxonomy()
    if error != nil {
        log.Panicln(error)
    taxo := obitax.DefaultTaxonomy()

    if taxo == nil {
        log.Fatal("No taxonomy loaded.")
    }

    log.Infoln("Indexing sequence taxids...")
pkg/obiutils/unsafe.go (new file, 26 lines)
@@ -0,0 +1,26 @@
package obiutils

import "unsafe"

// UnsafeBytes converts a string into a byte slice without making a copy of the data.
// This function is considered unsafe because it directly manipulates memory and does not
// perform any checks on the string's contents. It should be used with caution.
//
// Parameters:
//   - str: The input string to be converted into a byte slice.
//
// Returns:
//
// A byte slice representation of the input string. The returned slice shares the same
// underlying data as the original string, so modifications to the byte slice may affect
// the original string and vice versa.
func UnsafeBytes(str string) []byte {
    d := unsafe.StringData(str)
    b := unsafe.Slice(d, len(str))

    return b
}

func UnsafeString(b []byte) string {
    return unsafe.String(unsafe.SliceData(b), len(b))
}
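Editor's note: these zero-copy helpers are what the JSON header parser above uses to convert between string and []byte without allocating. A short usage sketch follows; the surrounding main function is illustrative, only UnsafeBytes and UnsafeString come from the commit.

package main

import (
    "fmt"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

func main() {
    header := `{"count": 10}`

    // Zero-copy view of the string's bytes: no allocation, but the slice must be
    // treated as read-only because it aliases the string's backing memory.
    raw := obiutils.UnsafeBytes(header)
    fmt.Println(len(raw), string(raw[:8]))

    // And back again: wraps the byte slice without copying.
    back := obiutils.UnsafeString(raw)
    fmt.Println(back == header)
}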