From 5d0f9966251d23905bc1aa69be800f2dbde81ff7 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 20 Dec 2024 19:42:03 +0100 Subject: [PATCH] Patch a small bug on json write --- pkg/obiformats/fastseq_json_header.go | 343 ++++++++++++++++++++++-- pkg/obiformats/ncbitaxdump/read.go | 41 +++ pkg/obioptions/version.go | 2 +- pkg/obiseq/taxonomy_classifier.go | 2 + pkg/obiseq/taxonomy_lca.go | 15 +- pkg/obiseq/taxonomy_methods.go | 32 ++- pkg/obitax/taxonnode.go | 8 +- pkg/obitools/obiannotate/obiannotate.go | 10 +- pkg/obitools/obicleandb/obicleandb.go | 3 +- pkg/obitools/obiconvert/options.go | 2 + pkg/obitools/obigrep/options.go | 33 +-- pkg/obitools/obirefidx/geomindexing.go | 6 +- pkg/obitools/obirefidx/obirefidx.go | 14 +- pkg/obiutils/unsafe.go | 26 ++ 14 files changed, 458 insertions(+), 79 deletions(-) create mode 100644 pkg/obiutils/unsafe.go diff --git a/pkg/obiformats/fastseq_json_header.go b/pkg/obiformats/fastseq_json_header.go index 00b3886..9459b9f 100644 --- a/pkg/obiformats/fastseq_json_header.go +++ b/pkg/obiformats/fastseq_json_header.go @@ -2,18 +2,208 @@ package obiformats import ( "bytes" - "math" + "strconv" "strings" "unsafe" log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" - "github.com/goccy/go-json" + "github.com/buger/jsonparser" ) -func _parse_json_header_(header string, annotations obiseq.Annotation) string { +func _parse_json_map_string(str []byte, sequence *obiseq.BioSequence) (map[string]string, error) { + values := make(map[string]string) + jsonparser.ObjectEach(str, + func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { + skey := string(key) + values[skey] = string(value) + return + }, + ) + return values, nil +} + +func _parse_json_map_int(str []byte, sequence *obiseq.BioSequence) (map[string]int, error) { + values := make(map[string]int) + jsonparser.ObjectEach(str, + func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { + skey := string(key) + intval, err := jsonparser.ParseInt(value) + if err != nil { + return err + } + values[skey] = int(intval) + return nil + }, + ) + return values, nil +} + +func _parse_json_map_float(str []byte, sequence *obiseq.BioSequence) (map[string]float64, error) { + values := make(map[string]float64) + jsonparser.ObjectEach(str, + func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { + skey := string(key) + floatval, err := strconv.ParseFloat(obiutils.UnsafeString(value), 64) + if err != nil { + return err + } + values[skey] = float64(floatval) + return nil + }, + ) + return values, nil +} + +func _parse_json_map_bool(str []byte, sequence *obiseq.BioSequence) (map[string]bool, error) { + values := make(map[string]bool) + jsonparser.ObjectEach(str, + func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { + skey := string(key) + boolval, err := jsonparser.ParseBoolean(value) + if err != nil { + return err + } + values[skey] = boolval + return nil + }, + ) + return values, nil +} + +func _parse_json_map_interface(str []byte, sequence *obiseq.BioSequence) (map[string]interface{}, error) { + values := make(map[string]interface{}) + jsonparser.ObjectEach(str, + func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { + skey := string(key) + switch dataType { + case jsonparser.String: + 
values[skey] = string(value) + case jsonparser.Number: + // Try to parse the number as an int at first then as float if that fails. + values[skey], err = jsonparser.ParseInt(value) + if err != nil { + values[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64) + } + if err != nil { + return + } + case jsonparser.Boolean: + default: + values[skey] = string(value) + } + return + }, + ) + return values, nil +} + +func _parse_json_array_string(str []byte, sequence *obiseq.BioSequence) ([]string, error) { + values := make([]string, 0) + jsonparser.ArrayEach(str, + func(value []byte, dataType jsonparser.ValueType, offset int, err error) { + if dataType == jsonparser.String { + skey := string(value) + values = append(values, skey) + } + }, + ) + return values, nil +} + +func _parse_json_array_int(str []byte, sequence *obiseq.BioSequence) ([]int, error) { + values := make([]int, 0) + jsonparser.ArrayEach(str, + func(value []byte, dataType jsonparser.ValueType, offset int, err error) { + if dataType == jsonparser.Number { + intval, err := jsonparser.ParseInt(value) + if err != nil { + log.Fatalf("%s: Parsing int failed on value %s: %s", sequence.Id(), value, err) + } + values = append(values, int(intval)) + } + }, + ) + return values, nil +} + +func _parse_json_array_float(str []byte, sequence *obiseq.BioSequence) ([]float64, error) { + values := make([]float64, 0) + jsonparser.ArrayEach(str, + func(value []byte, dataType jsonparser.ValueType, offset int, err error) { + if dataType == jsonparser.Number { + floatval, err := strconv.ParseFloat(obiutils.UnsafeString(value), 64) + if err == nil { + values = append(values, float64(floatval)) + } else { + log.Fatalf("%s: Parsing float failed on value %s: %s", sequence.Id(), value, err) + } + } + }, + ) + return values, nil +} + +func _parse_json_array_bool(str []byte, sequence *obiseq.BioSequence) ([]bool, error) { + values := make([]bool, 0) + jsonparser.ArrayEach(str, + func(value []byte, dataType jsonparser.ValueType, offset int, err error) { + if dataType == jsonparser.Boolean { + boolval, err := jsonparser.ParseBoolean(value) + if err != nil { + log.Fatalf("%s: Parsing bool failed on value %s: %s", sequence.Id(), value, err) + } + values = append(values, boolval) + } + }, + ) + return values, nil +} + +func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]interface{}, error) { + values := make([]interface{}, 0) + jsonparser.ArrayEach(str, + func(value []byte, dataType jsonparser.ValueType, offset int, err error) { + switch dataType { + case jsonparser.String: + values = append(values, string(value)) + case jsonparser.Number: + // Try to parse the number as an int at first then as float if that fails. 
+ intval, err := jsonparser.ParseInt(value) + if err != nil { + floatval, err := strconv.ParseFloat(obiutils.UnsafeString(value), 64) + if err != nil { + values = append(values, string(value)) + } else { + values = append(values, floatval) + } + } else { + values = append(values, intval) + } + case jsonparser.Boolean: + boolval, err := jsonparser.ParseBoolean(value) + if err != nil { + values = append(values, string(value)) + } else { + values = append(values, boolval) + } + + default: + values = append(values, string(value)) + } + + }, + ) + return values, nil +} + +func _parse_json_header_(header string, sequence *obiseq.BioSequence) string { + taxonomy := obitax.DefaultTaxonomy() + + annotations := sequence.Annotations() start := -1 stop := -1 level := 0 @@ -51,23 +241,136 @@ func _parse_json_header_(header string, annotations obiseq.Annotation) string { stop++ - err := json.Unmarshal([]byte(header)[start:stop], &annotations) + jsonparser.ObjectEach(obiutils.UnsafeBytes(header[start:stop]), + func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) error { + var err error - for k, v := range annotations { - switch vt := v.(type) { - case float64: - if vt == math.Floor(vt) { - annotations[k] = int(vt) - } - { - annotations[k] = vt - } - } - } + skey := obiutils.UnsafeString(key) - if err != nil { - log.Fatalf("annotation parsing error on %s : %v\n", header, err) - } + switch { + case skey == "id": + sequence.SetId(string(value)) + case skey == "definition": + sequence.SetDefinition(string(value)) + + case skey == "count": + if dataType != jsonparser.Number { + log.Fatalf("%s: Count attribut must be numeric: %s", sequence.Id(), string(value)) + } + count, err := jsonparser.ParseInt(value) + if err != nil { + log.Fatalf("%s: Cannot parse count %s", sequence.Id(), string(value)) + } + sequence.SetCount(int(count)) + + case skey == "obiclean_weight": + weight, err := _parse_json_map_int(value, sequence) + if err != nil { + log.Fatalf("%s: Cannot parse obiclean weight %s", sequence.Id(), string(value)) + } + annotations[skey] = weight + + case skey == "obiclean_status": + status, err := _parse_json_map_string(value, sequence) + if err != nil { + log.Fatalf("%s: Cannot parse obiclean status %s", sequence.Id(), string(value)) + } + annotations[skey] = status + + case strings.HasPrefix(skey, "merged_"): + if dataType == jsonparser.Object { + data, err := _parse_json_map_int(value, sequence) + if err != nil { + log.Fatalf("%s: Cannot parse merged slot %s: %v", sequence.Id(), skey, err) + } else { + annotations[skey] = data + } + } else { + log.Fatalf("%s: Cannot parse merged slot %s", sequence.Id(), skey) + } + + case skey == "taxid": + if dataType == jsonparser.Number || dataType == jsonparser.String { + taxid := obiutils.UnsafeString(value) + taxon := taxonomy.Taxon(taxid) + if taxon != nil { + sequence.SetTaxon(taxon) + } else { + sequence.SetTaxid(string(value)) + } + } else { + log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value)) + } + + case strings.HasSuffix(skey, "_taxid"): + if dataType == jsonparser.Number || dataType == jsonparser.String { + rank, _ := obiutils.SplitInTwo(skey, '_') + + taxid := obiutils.UnsafeString(value) + taxon := taxonomy.Taxon(taxid) + + if taxon != nil { + taxid = taxon.String() + } else { + taxid = string(value) + } + + sequence.SetTaxid(taxid, rank) + } else { + log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value)) + } + + default: + skey = strings.Clone(skey) + switch dataType { + case jsonparser.String: + 
annotations[skey] = string(value) + case jsonparser.Number: + // Try to parse the number as an int at first then as float if that fails. + annotations[skey], err = jsonparser.ParseInt(value) + if err != nil { + annotations[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64) + } + case jsonparser.Array: + annotations[skey], err = _parse_json_array_interface(value, sequence) + case jsonparser.Object: + annotations[skey], err = _parse_json_map_interface(value, sequence) + case jsonparser.Boolean: + annotations[skey], err = jsonparser.ParseBoolean(value) + case jsonparser.Null: + annotations[skey] = nil + default: + log.Fatalf("Unknown data type %v", dataType) + } + } + + if err != nil { + annotations[skey] = "NaN" + log.Fatalf("%s: Cannot parse value %s assicated to key %s into a %s value", + sequence.Id(), string(value), skey, dataType.String()) + } + + return err + }, + ) + + // err := json.Unmarshal([]byte(header)[start:stop], &annotations) + + // for k, v := range annotations { + // switch vt := v.(type) { + // case float64: + // if vt == math.Floor(vt) { + // annotations[k] = int(vt) + // } + // { + // annotations[k] = vt + // } + // } + // } + + // if err != nil { + // log.Fatalf("annotation parsing error on %s : %v\n", header, err) + // } return strings.TrimSpace(header[stop:]) } @@ -78,7 +381,9 @@ func ParseFastSeqJsonHeader(sequence *obiseq.BioSequence) { definition_part := _parse_json_header_( definition, - sequence.Annotations()) + sequence, + ) + if len(definition_part) > 0 { if sequence.HasDefinition() { definition_part = sequence.Definition() + " " + definition_part diff --git a/pkg/obiformats/ncbitaxdump/read.go b/pkg/obiformats/ncbitaxdump/read.go index a656167..3ba0bea 100644 --- a/pkg/obiformats/ncbitaxdump/read.go +++ b/pkg/obiformats/ncbitaxdump/read.go @@ -15,6 +15,19 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) +// loadNodeTable reads a node table from the provided reader and populates the given taxonomy. +// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader. +// The node table is expected to be in CSV format with a custom delimiter ('|') and comments +// starting with '#'. Each record in the table represents a taxon with its taxid, parent taxid, +// and rank. +// +// Parameters: +// - reader: An io.Reader from which the node table is read. +// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon data will be added. +// +// The function reads each record from the input, trims whitespace from the taxid, parent, and rank, +// and adds the taxon to the taxonomy. If an error occurs while adding a taxon, the function logs +// a fatal error and terminates the program. func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) { file := csv.NewReader(reader) file.Comma = '|' @@ -38,6 +51,21 @@ func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) { } } +// loadNameTable reads a name table from the provided reader and populates the given taxonomy. +// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader. +// The name table is expected to be in a custom format with fields separated by the '|' character. +// Each record in the table represents a taxon with its taxid, name, and class name. +// +// Parameters: +// - reader: An io.Reader from which the name table is read. +// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon names will be set. 
+// - onlysn: A boolean flag indicating whether to only process records with the class name "scientific name". +// +// Returns: +// +// The number of taxon names successfully loaded into the taxonomy. If a line is too long, -1 is returned. +// The function processes each line, trims whitespace from the taxid, name, and class name, and sets +// the name in the taxonomy if the conditions are met. func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int { // file := csv.NewReader(reader) // file.Comma = '|' @@ -71,6 +99,19 @@ func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int return n } +// loadMergedTable reads a merged table from the provided reader and populates the given taxonomy. +// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader. +// The merged table is expected to be in CSV format with a custom delimiter ('|') and comments +// starting with '#'. Each record in the table represents a mapping between an old taxid and a new taxid. +// +// Parameters: +// - reader: An io.Reader from which the merged table is read. +// - taxonomy: A pointer to an obitax.Taxonomy instance where the alias mappings will be added. +// +// Returns: +// +// The number of alias mappings successfully loaded into the taxonomy. The function processes +// each record, trims whitespace from the old and new taxid, and adds the alias to the taxonomy. func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int { file := csv.NewReader(reader) file.Comma = '|' diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 73398c6..d85fc22 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "795df34" +var _Commit = "abfa8f3" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. 
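For reference, a minimal self-contained sketch of the number handling used by the new jsonparser-based _parse_json_header_ above: a numeric value is first tried as an integer with jsonparser.ParseInt, then as a float64 via strconv.ParseFloat, mirroring the patch. The header literal and annotation keys below are invented for the example; only github.com/buger/jsonparser and the standard library are assumed.

package main

import (
	"fmt"
	"strconv"

	"github.com/buger/jsonparser"
)

func main() {
	header := []byte(`{"count": 12, "gc_ratio": 0.42, "obiclean_head": true, "sample": "A01"}`)

	annotations := map[string]interface{}{}
	err := jsonparser.ObjectEach(header,
		func(key []byte, value []byte, dataType jsonparser.ValueType, _ int) error {
			k := string(key)
			switch dataType {
			case jsonparser.Number:
				// Prefer an int; fall back to float64 when ParseInt fails.
				if i, ierr := jsonparser.ParseInt(value); ierr == nil {
					annotations[k] = int(i)
				} else if f, ferr := strconv.ParseFloat(string(value), 64); ferr == nil {
					annotations[k] = f
				}
			case jsonparser.Boolean:
				b, berr := jsonparser.ParseBoolean(value)
				if berr != nil {
					return berr
				}
				annotations[k] = b
			default:
				// Strings and anything else are kept as their raw text.
				annotations[k] = string(value)
			}
			return nil
		})
	if err != nil {
		panic(err)
	}
	fmt.Printf("%#v\n", annotations)
}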
diff --git a/pkg/obiseq/taxonomy_classifier.go b/pkg/obiseq/taxonomy_classifier.go index 0eba39e..d6fb6fe 100644 --- a/pkg/obiseq/taxonomy_classifier.go +++ b/pkg/obiseq/taxonomy_classifier.go @@ -19,6 +19,8 @@ func TaxonomyClassifier(taxonomicRank string, taxonomy *obitax.Taxonomy, abortOnMissing bool) *BioSequenceClassifier { + taxonomy = taxonomy.OrDefault(true) + keys := make(map[*obitax.TaxNode]int) codes := make([]*obitax.TaxNode, 1) codes[0] = nil diff --git a/pkg/obiseq/taxonomy_lca.go b/pkg/obiseq/taxonomy_lca.go index 02aa847..bde85c7 100644 --- a/pkg/obiseq/taxonomy_lca.go +++ b/pkg/obiseq/taxonomy_lca.go @@ -12,6 +12,8 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma taxids := sequence.StatsOn(MakeStatsOnDescription("taxid"), "na") taxons := make(map[*obitax.TaxNode]int, len(taxids)) + taxonomy = taxonomy.OrDefault(true) + for taxid, v := range taxids { t := taxonomy.Taxon(taxid) if t == nil { @@ -27,6 +29,9 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma } func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) (*obitax.Taxon, float64, int) { + + taxonomy = taxonomy.OrDefault(true) + taxons := sequence.TaxonomicDistribution(taxonomy) paths := make(map[*obitax.TaxNode]*obitax.TaxonSlice, len(taxons)) answer := (*obitax.TaxNode)(nil) @@ -34,11 +39,11 @@ func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) ( granTotal := 0 for t, w := range taxons { - p := (&obitax.Taxon{Taxonomy: taxonomy, - Node: t, - }).Path() + taxon := &obitax.Taxon{Taxonomy: taxonomy, Node: t} + p := taxon.Path() + if p == nil { - log.Panicf("Sequence %s: taxonomic path cannot be retreived from Taxid %d : %v", sequence.Id(), t.String(taxonomy.Code())) + log.Panicf("Sequence %s: taxonomic path cannot be retreived from Taxid : %s", sequence.Id(), taxon.String()) } p.Reverse(true) @@ -103,6 +108,8 @@ func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) ( func AddLCAWorker(taxonomy *obitax.Taxonomy, slot_name string, threshold float64) SeqWorker { + taxonomy = taxonomy.OrDefault(true) + if !strings.HasSuffix(slot_name, "taxid") { slot_name = slot_name + "_taxid" } diff --git a/pkg/obiseq/taxonomy_methods.go b/pkg/obiseq/taxonomy_methods.go index 8f4bbf5..fa2b7c6 100644 --- a/pkg/obiseq/taxonomy_methods.go +++ b/pkg/obiseq/taxonomy_methods.go @@ -9,6 +9,7 @@ import ( ) func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon { + taxid := s.Taxid() if taxid == "NA" { return nil @@ -21,16 +22,39 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon { // Parameters: // // taxid - the taxid to set. -func (s *BioSequence) SetTaxid(taxid string) { +func (s *BioSequence) SetTaxid(taxid string, rank ...string) { if taxid == "" { taxid = "NA" + } else { + taxonomy := obitax.DefaultTaxonomy() + taxon := (*obitax.Taxon)(nil) + + if taxonomy != nil { + taxon = taxonomy.Taxon(taxid) + } + + if taxon != nil { + taxid = taxon.String() + } + } + + if len(rank) > 0 { + r := rank[0] + s.SetAttribute(r+"_taxid", taxid) + } else { + s.SetAttribute("taxid", taxid) } - s.SetAttribute("taxid", taxid) } -func (s *BioSequence) SetTaxon(taxon *obitax.Taxon) { +func (s *BioSequence) SetTaxon(taxon *obitax.Taxon, rank ...string) { taxid := taxon.String() - s.SetTaxid(taxid) + + if len(rank) > 0 { + r := rank[0] + s.SetAttribute(r+"_taxid", taxid) + } else { + s.SetAttribute("taxid", taxid) + } } // Taxid returns the taxonomic ID associated with the BioSequence. 
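The changes below to SetTaxid and SetTaxon introduce an optional-rank convention: with no rank the value is stored under the "taxid" attribute, with a rank it is stored under "<rank>_taxid". The following stand-alone sketch only illustrates that convention; the record type and setTaxid helper are hypothetical stand-ins for obiseq.BioSequence and its SetAttribute method, and the taxid strings are invented examples in the new "code:id [name]@rank" format.

package main

import "fmt"

// record and SetAttribute are hypothetical stand-ins for obiseq.BioSequence
// and BioSequence.SetAttribute; they only mirror the attribute-map behaviour.
type record struct{ attrs map[string]string }

func (r *record) SetAttribute(k, v string) { r.attrs[k] = v }

// setTaxid mirrors the variadic signature added to SetTaxid in the patch:
// without a rank the value lands in "taxid", with a rank in "<rank>_taxid".
func (r *record) setTaxid(taxid string, rank ...string) {
	if taxid == "" {
		taxid = "NA"
	}
	if len(rank) > 0 {
		r.SetAttribute(rank[0]+"_taxid", taxid)
		return
	}
	r.SetAttribute("taxid", taxid)
}

func main() {
	r := &record{attrs: map[string]string{}}
	r.setTaxid("taxdump:9606 [Homo sapiens]@species")       // stored under "taxid"
	r.setTaxid("taxdump:9604 [Hominidae]@family", "family") // stored under "family_taxid"
	fmt.Println(r.attrs)
}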
diff --git a/pkg/obitax/taxonnode.go b/pkg/obitax/taxonnode.go index dbfd761..5e3e7ec 100644 --- a/pkg/obitax/taxonnode.go +++ b/pkg/obitax/taxonnode.go @@ -35,13 +35,15 @@ type TaxNode struct { // - taxonomyCode: A string representing the code of the taxonomy to which the node belongs. // // Returns: -// - A formatted string representing the TaxNode in the form "taxonomyCode:id [scientificName]". +// - A formatted string representing the TaxNode in the form "taxonomyCode:id [scientificName]@rank". func (node *TaxNode) String(taxonomyCode string) string { if node.HasScientificName() { - return fmt.Sprintf("%s:%v [%s]", + return fmt.Sprintf("%s:%v [%s]@%s", taxonomyCode, *node.id, - node.ScientificName()) + node.ScientificName(), + node.Rank(), + ) } return fmt.Sprintf("%s:%v", diff --git a/pkg/obitools/obiannotate/obiannotate.go b/pkg/obitools/obiannotate/obiannotate.go index be1608a..5a7fc84 100644 --- a/pkg/obitools/obiannotate/obiannotate.go +++ b/pkg/obitools/obiannotate/obiannotate.go @@ -273,31 +273,31 @@ func CLIAnnotationWorker() obiseq.SeqWorker { } if CLIHasTaxonAtRank() { - taxo := obigrep.CLILoadSelectedTaxonomy() + taxo := obitax.DefaultTaxonomy() w := AddTaxonAtRankWorker(taxo, CLITaxonAtRank()...) annotator = annotator.ChainWorkers(w) } if CLISetTaxonomicPath() { - taxo := obigrep.CLILoadSelectedTaxonomy() + taxo := obitax.DefaultTaxonomy() w := obiseq.MakeSetPathWorker(taxo) annotator = annotator.ChainWorkers(w) } if CLISetTaxonomicRank() { - taxo := obigrep.CLILoadSelectedTaxonomy() + taxo := obitax.DefaultTaxonomy() w := AddTaxonRankWorker(taxo) annotator = annotator.ChainWorkers(w) } if CLISetScientificName() { - taxo := obigrep.CLILoadSelectedTaxonomy() + taxo := obitax.DefaultTaxonomy() w := AddScientificNameWorker(taxo) annotator = annotator.ChainWorkers(w) } if CLIHasAddLCA() { - taxo := obigrep.CLILoadSelectedTaxonomy() + taxo := obitax.DefaultTaxonomy() w := obiseq.AddLCAWorker(taxo, CLILCASlotName(), CLILCAThreshold()) annotator = annotator.ChainWorkers(w) } diff --git a/pkg/obitools/obicleandb/obicleandb.go b/pkg/obitools/obicleandb/obicleandb.go index f5465e4..6a5b1e5 100644 --- a/pkg/obitools/obicleandb/obicleandb.go +++ b/pkg/obitools/obicleandb/obicleandb.go @@ -11,6 +11,7 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obigrep" ) @@ -245,7 +246,7 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence { log.Fatal(err) } - taxonomy := obigrep.CLILoadSelectedTaxonomy() + taxonomy := obitax.DefaultTaxonomy() if len(obigrep.CLIRequiredRanks()) > 0 { rankPredicate = obigrep.CLIHasRankDefinedPredicate() diff --git a/pkg/obitools/obiconvert/options.go b/pkg/obitools/obiconvert/options.go index 2f2c2f1..5b96925 100644 --- a/pkg/obitools/obiconvert/options.go +++ b/pkg/obitools/obiconvert/options.go @@ -3,6 +3,7 @@ package obiconvert import ( "os" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" log "github.com/sirupsen/logrus" "github.com/DavidGamba/go-getoptions" @@ -115,6 +116,7 @@ func PairedFilesOptionSet(options *getoptions.GetOpt) { } func OptionSet(options *getoptions.GetOpt) { + obioptions.LoadTaxonomyOptionSet(options, false, false) InputOptionSet(options) OutputOptionSet(options) PairedFilesOptionSet(options) diff --git 
a/pkg/obitools/obigrep/options.go b/pkg/obitools/obigrep/options.go index c44a23e..e11a03f 100644 --- a/pkg/obitools/obigrep/options.go +++ b/pkg/obitools/obigrep/options.go @@ -6,7 +6,7 @@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -33,7 +33,6 @@ var _Predicats = make([]string, 0) var _IdList = "" var _Taxdump = "" -var _Taxonomy = (*obitax.Taxonomy)(nil) var _RequiredAttributes = make([]string, 0) var _AttributePatterns = make(map[string]string, 0) @@ -49,10 +48,7 @@ var _pattern_indel = false var _pattern_only_forward = false func TaxonomySelectionOptionSet(options *getoptions.GetOpt) { - - options.StringVar(&_Taxdump, "taxdump", _Taxdump, - options.Alias("t"), - options.Description("Points to the directory containing the NCBI Taxonomy database dump.")) + obioptions.LoadTaxonomyOptionSet(options, false, false) options.StringSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1, options.Alias("r"), @@ -246,31 +242,12 @@ func CLIPatternBothStrand() bool { return !_pattern_only_forward } -func CLILoadSelectedTaxonomy() *obitax.Taxonomy { - if CLISelectedNCBITaxDump() != "" { - if _Taxonomy == nil { - var err error - _Taxonomy, err = ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), true) - if err != nil { - log.Fatalf("cannot load taxonomy %s : %v", - CLISelectedNCBITaxDump(), err) - return nil - } - } - return _Taxonomy - } - - log.Fatalln("no NCBI taxdump selected using option -t|--taxdump") - - return nil -} - func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate { var p obiseq.SequencePredicate var p2 obiseq.SequencePredicate if len(_BelongTaxa) > 0 { - taxonomy := CLILoadSelectedTaxonomy() + taxonomy := obitax.DefaultTaxonomy() taxon := taxonomy.Taxon(_BelongTaxa[0]) if taxon == nil { @@ -300,7 +277,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate { var p2 obiseq.SequencePredicate if len(_NotBelongTaxa) > 0 { - taxonomy := CLILoadSelectedTaxonomy() + taxonomy := obitax.DefaultTaxonomy() taxon := taxonomy.Taxon(_NotBelongTaxa[0]) if taxon == nil { @@ -329,7 +306,7 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate { func CLIHasRankDefinedPredicate() obiseq.SequencePredicate { if len(_RequiredRanks) > 0 { - taxonomy := CLILoadSelectedTaxonomy() + taxonomy := obitax.DefaultTaxonomy() p := obiseq.HasRequiredRank(taxonomy, _RequiredRanks[0]) for _, rank := range _RequiredRanks[1:] { diff --git a/pkg/obitools/obirefidx/geomindexing.go b/pkg/obitools/obirefidx/geomindexing.go index 3df6bda..22eb4e6 100644 --- a/pkg/obitools/obirefidx/geomindexing.go +++ b/pkg/obitools/obirefidx/geomindexing.go @@ -70,11 +70,7 @@ func GeomIndexSesquence(seqidx int, new_lca, _ := lca.LCA(taxa.Taxon(o)) if new_lca.SameAs(lca) { lca = new_lca - index[int(seq_dist[o])] = fmt.Sprintf( - "%s@%s", - lca.String(), - lca.Rank(), - ) + index[int(seq_dist[o])] = lca.String() if lca.IsRoot() { break diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go index 943b863..a7e0763 100644 --- a/pkg/obitools/obirefidx/obirefidx.go +++ b/pkg/obitools/obirefidx/obirefidx.go @@ -1,7 +1,6 @@ package obirefidx import ( - "fmt" "os" log 
"github.com/sirupsen/logrus" @@ -172,11 +171,7 @@ func IndexSequence(seqidx int, for i, d := range closest { if i < (len(closest)-1) && d < closest[i+1] { current_taxon := pseq.Taxon(i) - obitag_index[d] = fmt.Sprintf( - "%s@%s", - current_taxon.String(), - current_taxon.Rank(), - ) + obitag_index[d] = current_taxon.String() } } @@ -197,9 +192,10 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { source, references := iterator.Load() log.Infof("Done. Database contains %d sequences", len(references)) - taxo, error := obioptions.CLILoadSelectedTaxonomy() - if error != nil { - log.Panicln(error) + taxo := obitax.DefaultTaxonomy() + + if taxo == nil { + log.Fatal("No taxonomy loaded.") } log.Infoln("Indexing sequence taxids...") diff --git a/pkg/obiutils/unsafe.go b/pkg/obiutils/unsafe.go new file mode 100644 index 0000000..a43c540 --- /dev/null +++ b/pkg/obiutils/unsafe.go @@ -0,0 +1,26 @@ +package obiutils + +import "unsafe" + +// UnsafeBytes converts a string into a byte slice without making a copy of the data. +// This function is considered unsafe because it directly manipulates memory and does not +// perform any checks on the string's contents. It should be used with caution. +// +// Parameters: +// - str: The input string to be converted into a byte slice. +// +// Returns: +// +// A byte slice representation of the input string. The returned slice shares the same +// underlying data as the original string, so modifications to the byte slice may affect +// the original string and vice versa. +func UnsafeBytes(str string) []byte { + d := unsafe.StringData(str) + b := unsafe.Slice(d, len(str)) + + return b +} + +func UnsafeString(b []byte) string { + return unsafe.String(unsafe.SliceData(b), len(b)) +}