diff --git a/cmd/obitools/obifind/main.go b/cmd/obitools/obifind/main.go index b51ef3e..0b3f38a 100644 --- a/cmd/obitools/obifind/main.go +++ b/cmd/obitools/obifind/main.go @@ -22,26 +22,26 @@ func main() { } switch { - case obifind.CLIRequestsPathForTaxid() >= 0: + case obifind.CLIRequestsPathForTaxid() != "NA": taxonomy, err := obifind.CLILoadSelectedTaxonomy() if err != nil { fmt.Printf("%+v", err) } - taxon, err := taxonomy.Taxon(obifind.CLIRequestsPathForTaxid()) + taxon := taxonomy.Taxon(obifind.CLIRequestsPathForTaxid()) - if err != nil { + if taxon == nil { fmt.Printf("%+v", err) } - s, err := taxon.Path() + s := taxon.Path() if err != nil { fmt.Printf("%+v", err) } obifind.TaxonWriter(s.Iterator(), - fmt.Sprintf("path:%d", taxon.Taxid())) + fmt.Sprintf("path:%s", taxon.String())) case len(args) == 0: taxonomy, err := obifind.CLILoadSelectedTaxonomy() diff --git a/pkg/obiformats/csv_writer.go b/pkg/obiformats/csv_writer.go index 7d22ff9..acfb604 100644 --- a/pkg/obiformats/csv_writer.go +++ b/pkg/obiformats/csv_writer.go @@ -30,14 +30,10 @@ func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string { if opt.CSVTaxon() { taxid := sequence.Taxid() - sn, ok := sequence.GetAttribute("scientific_name") + sn, ok := sequence.GetStringAttribute("scientific_name") if !ok { - if taxid == 1 { - sn = "root" - } else { - sn = opt.CSVNAValue() - } + sn = opt.CSVNAValue() } record = append(record, fmt.Sprint(taxid), fmt.Sprint(sn)) diff --git a/pkg/obiformats/ncbitaxdump/read.go b/pkg/obiformats/ncbitaxdump/read.go index c304961..ae5b7e3 100644 --- a/pkg/obiformats/ncbitaxdump/read.go +++ b/pkg/obiformats/ncbitaxdump/read.go @@ -7,7 +7,6 @@ import ( "io" "os" "path" - "strconv" "strings" log "github.com/sirupsen/logrus" @@ -26,24 +25,16 @@ func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) { for record, err := file.Read(); err == nil; record, err = file.Read() { n++ - taxid, err := strconv.Atoi(strings.TrimSpace(record[0])) - - if err != nil { - log.Panicf("Cannot read taxon taxid at line %d: %v", n, err) - } - - parent, err := strconv.Atoi(strings.TrimSpace(record[1])) - - if err != nil { - log.Panicf("Cannot read taxon parent taxid at line %d: %v", n, err) - } - + taxid := strings.TrimSpace(record[0]) + parent := strings.TrimSpace(record[1]) rank := strings.TrimSpace(record[2]) - taxonomy.AddNewTaxa(taxid, parent, rank, true, true) - } + _, err := taxonomy.AddTaxon(taxid, parent, rank, taxid == "1", false) - taxonomy.ReindexParent() + if err != nil { + log.Fatalf("Error adding taxon %s: %v\n", taxid, err) + } + } } func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int { @@ -65,18 +56,14 @@ func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int } record := strings.Split(string(line), "|") - taxid, err := strconv.Atoi(strings.TrimSpace(record[0])) - - if err != nil { - log.Panicf("Cannot read taxon name taxid at line %d: %v", l, err) - } + taxid := strings.TrimSpace(record[0]) name := strings.TrimSpace(record[1]) classname := strings.TrimSpace(record[3]) if !onlysn || classname == "scientific name" { n++ - taxonomy.AddNewName(taxid, &name, &classname) + taxonomy.Taxon(taxid).SetName(name, classname) } } @@ -94,18 +81,10 @@ func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int { for record, err := file.Read(); err == nil; record, err = file.Read() { n++ - oldtaxid, err := strconv.Atoi(strings.TrimSpace(record[0])) + oldtaxid := strings.TrimSpace(record[0]) + newtaxid := strings.TrimSpace(record[1]) - if err != nil { - log.Panicf("Cannot read alias taxid at line %d: %v", n, err) - } - newtaxid, err := strconv.Atoi(strings.TrimSpace(record[1])) - - if err != nil { - log.Panicf("Cannot read alias new taxid at line %d: %v", n, err) - } - - taxonomy.AddNewAlias(newtaxid, oldtaxid) + taxonomy.AddAlias(newtaxid, oldtaxid, false) } return n @@ -113,7 +92,7 @@ func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int { func LoadNCBITaxDump(directory string, onlysn bool) (*obitax.Taxonomy, error) { - taxonomy := obitax.NewTaxonomy() + taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", "[[:digit:]]") // // Load the Taxonomy nodes diff --git a/pkg/obilua/obiseq.go b/pkg/obilua/obiseq.go index d99c634..e7967ab 100644 --- a/pkg/obilua/obiseq.go +++ b/pkg/obilua/obiseq.go @@ -147,10 +147,10 @@ func bioSequenceGetSetCount(luaState *lua.LState) int { func bioSequenceGetSetTaxid(luaState *lua.LState) int { s := checkBioSequence(luaState) if luaState.GetTop() == 2 { - s.SetTaxid(luaState.CheckInt(2)) + s.SetTaxid(luaState.CheckString(2)) return 0 } - luaState.Push(lua.LNumber(s.Taxid())) + luaState.Push(lua.LString(s.Taxid())) return 1 } diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 93f0bee..231d812 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "3e00d39" +var _Commit = "9471fed" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index de4ea31..5a604d9 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -9,17 +9,18 @@ import ( ) // AttributeKeys returns the keys of the attributes in the BioSequence. +// It optionally skips keys associated with container values based on the skip_container parameter. // -// It does not take any parameters. +// Parameters: +// - skip_container: A boolean indicating whether to skip keys associated with a container value. // // Returns: -// -// []string: The keys of the BioSequence. -func (s *BioSequence) AttributeKeys(skip_map bool) obiutils.Set[string] { +// - A set of strings containing the keys of the BioSequence attributes. +func (s *BioSequence) AttributeKeys(skip_container bool) obiutils.Set[string] { keys := obiutils.MakeSet[string]() for k, v := range s.Annotations() { - if !skip_map || !obiutils.IsAMap(v) { + if !skip_container || !obiutils.IsAContainer(v) { keys.Add(k) } } @@ -27,17 +28,18 @@ func (s *BioSequence) AttributeKeys(skip_map bool) obiutils.Set[string] { return keys } -// Keys returns the keys of the BioSequence. +// Keys returns the keys of the BioSequence, including standard keys and attribute keys. // -// It returns a slice of strings containing the keys of the BioSequence. -// The keys include "id", "sequence", "qualities", and the attribute keys -// of the BioSequence. +// It returns a set of strings containing the keys of the BioSequence. +// The keys include "id", "sequence", "qualities", and the attribute keys of the BioSequence. +// +// Parameters: +// - skip_container: A boolean indicating whether to skip keys associated with container values. // // Returns: -// -// []string: The keys of the BioSequence. -func (s *BioSequence) Keys(skip_map bool) obiutils.Set[string] { - keys := s.AttributeKeys(skip_map) +// - A set of strings containing the keys of the BioSequence. +func (s *BioSequence) Keys(skip_container bool) obiutils.Set[string] { + keys := s.AttributeKeys(skip_container) keys.Add("id") if s.HasSequence() { @@ -53,10 +55,10 @@ func (s *BioSequence) Keys(skip_map bool) obiutils.Set[string] { // HasAttribute checks if the BioSequence has the specified attribute. // // Parameters: -// - key: a string representing the attribute key to check. +// - key: A string representing the attribute key to check. // // Returns: -// - a boolean indicating whether the BioSequence has the attribute. +// - A boolean indicating whether the BioSequence has the attribute. func (s *BioSequence) HasAttribute(key string) bool { if key == "id" { return true @@ -386,31 +388,14 @@ func (s *BioSequence) SetCount(count int) { s.SetAttribute("count", count) } -// Taxid returns the taxonomic ID associated with the BioSequence. -// -// It retrieves the "taxid" attribute from the BioSequence's attributes map. -// If the attribute is not found, the function returns 1 as the default taxonomic ID. -// The taxid 1 corresponds to the root taxonomic level. -// -// The function returns an integer representing the taxonomic ID. -func (s *BioSequence) Taxid() int { - taxid, ok := s.GetIntAttribute("taxid") - - if !ok { - taxid = 1 - } - - return taxid -} - // SetTaxid sets the taxid for the BioSequence. // // Parameters: // // taxid - the taxid to set. -func (s *BioSequence) SetTaxid(taxid int) { - if taxid < 1 { - taxid = 1 +func (s *BioSequence) SetTaxid(taxid string) { + if taxid == "" { + taxid = "NA" } s.SetAttribute("taxid", taxid) } diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 101e333..3a5b5d2 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -18,6 +18,7 @@ import ( "unsafe" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" log "github.com/sirupsen/logrus" ) @@ -63,6 +64,7 @@ type BioSequence struct { sequence []byte // The sequence itself, it is accessible by the methode Sequence qualities []byte // The quality scores of the sequence. feature []byte + taxon *obitax.Taxon paired *BioSequence // A pointer to the paired sequence revcomp *BioSequence // A pointer to the reverse complemented sequence annotations Annotation @@ -90,6 +92,7 @@ func NewEmptyBioSequence(preallocate int) *BioSequence { sequence: seq, qualities: nil, feature: nil, + taxon: nil, paired: nil, revcomp: nil, annotations: nil, @@ -223,7 +226,7 @@ func (s *BioSequence) HasDefinition() bool { // No parameters. // Returns a boolean. func (s *BioSequence) HasSequence() bool { - return s.sequence != nil && len(s.sequence) > 0 + return len(s.sequence) > 0 } // Sequence returns the sequence of the BioSequence. @@ -258,7 +261,7 @@ func (s *BioSequence) Len() int { // This function does not have any parameters. // It returns a boolean value indicating whether the BioSequence has qualities. func (s *BioSequence) HasQualities() bool { - return s.qualities != nil && len(s.qualities) > 0 + return len(s.qualities) > 0 } // Qualities returns the sequence quality scores of the BioSequence. diff --git a/pkg/obiseq/taxonomy_classifier.go b/pkg/obiseq/taxonomy_classifier.go new file mode 100644 index 0000000..0eba39e --- /dev/null +++ b/pkg/obiseq/taxonomy_classifier.go @@ -0,0 +1,79 @@ +package obiseq + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + log "github.com/sirupsen/logrus" +) + +// TaxonomyClassifier is a function that creates a new instance of the BioSequenceClassifier +// for taxonomic classification based on a given taxonomic rank, taxonomy, and abort flag. +// +// Parameters: +// - taxonomicRank: the taxonomic rank to classify the sequences at. +// - taxonomy: the taxonomy object used for classification. +// - abortOnMissing: a flag indicating whether to abort if a taxon is missing in the taxonomy. +// +// Return: +// - *obiseq.BioSequenceClassifier: the new instance of the BioSequenceClassifier. +func TaxonomyClassifier(taxonomicRank string, + taxonomy *obitax.Taxonomy, + abortOnMissing bool) *BioSequenceClassifier { + + keys := make(map[*obitax.TaxNode]int) + codes := make([]*obitax.TaxNode, 1) + codes[0] = nil + keys[nil] = 0 + + code := func(sequence *BioSequence) int { + taxon := sequence.Taxon(taxonomy) + if taxon != nil { + ttaxon := taxon.TaxonAtRank(taxonomicRank) + if abortOnMissing && ttaxon == nil { + log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %d", taxonomicRank, taxon.String()) + } + } else { + if abortOnMissing { + log.Fatalf("Sequence %s: Taxid %s not found in taxonomy", + sequence.Id(), + sequence.Taxid()) + } + taxon = nil + } + + k, ok := keys[taxon.Node] + + if ok { + return k + } + + k = len(codes) + keys[taxon.Node] = k + codes = append(codes, taxon.Node) + + return k + } + + value := func(k int) string { + taxon := codes[k] + return taxon.ScientificName() + } + + reset := func() { + keys = make(map[*obitax.TaxNode]int) + codes = make([]*obitax.TaxNode, 1) + codes[0] = nil + keys[nil] = 0 + } + + clone := func() *BioSequenceClassifier { + return TaxonomyClassifier(taxonomicRank, taxonomy, abortOnMissing) + } + + c := BioSequenceClassifier{ + Code: code, + Value: value, + Reset: reset, + Clone: clone, + Type: "TaxonomyClassifier"} + return &c +} diff --git a/pkg/obiseq/taxonomy_lca.go b/pkg/obiseq/taxonomy_lca.go new file mode 100644 index 0000000..5ed32ce --- /dev/null +++ b/pkg/obiseq/taxonomy_lca.go @@ -0,0 +1,131 @@ +package obiseq + +import ( + "math" + "strings" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + log "github.com/sirupsen/logrus" +) + +func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) map[*obitax.TaxNode]int { + taxids := sequence.StatsOn(MakeStatsOnDescription("taxid"), "na") + taxons := make(map[*obitax.TaxNode]int, len(taxids)) + + for taxid, v := range taxids { + t := taxonomy.Taxon(taxid) + if t == nil { + log.Fatalf( + "On sequence %s taxid %s is not defined in taxonomy: %s", + sequence.Id(), + taxid, + taxonomy.Name()) + } + taxons[t.Node] = v + } + return taxons +} + +func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) (*obitax.Taxon, float64, int) { + taxons := sequence.TaxonomicDistribution(taxonomy) + paths := make(map[*obitax.TaxNode]*obitax.TaxonSlice, len(taxons)) + answer := (*obitax.TaxNode)(nil) + rans := 1.0 + granTotal := 0 + + for t, w := range taxons { + p := (&obitax.Taxon{Taxonomy: taxonomy, + Node: t, + }).Path() + if p == nil { + log.Panicf("Sequence %s: taxonomic path cannot be retreived from Taxid %d : %v", sequence.Id(), t.String(taxonomy.Code())) + } + + p.Reverse(true) + paths[t] = p + answer = p.Get(0) + granTotal += w + } + + rmax := 1.0 + levels := make(map[*obitax.TaxNode]int, len(paths)) + taxonMax := answer + + for i := 0; rmax >= threshold; i++ { + answer = taxonMax + rans = rmax + taxonMax = nil + total := 0 + for taxon, weight := range taxons { + path := paths[taxon] + if path.Len() > i { + levels[path.Get(i)] += weight + } + total += weight + } + weighMax := 0 + for taxon, weight := range levels { + if weight > weighMax { + weighMax = weight + taxonMax = taxon + } + } + + if total > 0 { + rmax *= float64(weighMax) / float64(total) + } else { + rmax = 0.0 + } + + for taxon := range levels { + delete(levels, taxon) + } + for taxon := range taxons { + path := paths[taxon] + if i < path.Len() { + if path.Get(i) != taxonMax { + delete(paths, taxon) + delete(taxons, taxon) + } + } + } + // if taxonMax != nil { + // log.Println("@@@>", i, taxonMax.ScientificName(), taxonMax.Taxid(), rans, weighMax, total, rmax) + // } else { + // log.Println("@@@>", "--", 0, rmax) + // } + } + // log.Println("###>", answer.ScientificName(), answer.Taxid(), rans) + // log.Print("========================================") + return &obitax.Taxon{Taxonomy: taxonomy, Node: answer}, rans, granTotal + +} + +func AddLCAWorker(taxonomy *obitax.Taxonomy, slot_name string, threshold float64) SeqWorker { + + if !strings.HasSuffix(slot_name, "taxid") { + slot_name = slot_name + "_taxid" + } + + lca_error := strings.Replace(slot_name, "taxid", "error", 1) + if lca_error == "error" { + lca_error = "lca_error" + } + + lca_name := strings.Replace(slot_name, "taxid", "name", 1) + if lca_name == "name" { + lca_name = "scientific_name" + } + + f := func(sequence *BioSequence) (BioSequenceSlice, error) { + lca, rans, _ := sequence.LCA(taxonomy, threshold) + + sequence.SetAttribute(slot_name, lca.String()) + sequence.SetAttribute(lca_name, lca.ScientificName()) + sequence.SetAttribute(lca_error, math.Round((1-rans)*1000)/1000) + + return BioSequenceSlice{sequence}, nil + } + + return f +} diff --git a/pkg/obiseq/taxonomy_methods.go b/pkg/obiseq/taxonomy_methods.go new file mode 100644 index 0000000..b6b46fa --- /dev/null +++ b/pkg/obiseq/taxonomy_methods.go @@ -0,0 +1,106 @@ +package obiseq + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" +) + +func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon { + taxid := s.Taxid() + if taxid == "NA" { + return nil + } + return taxonomy.Taxon(taxid) +} + +// Taxid returns the taxonomic ID associated with the BioSequence. +// +// It retrieves the "taxid" attribute from the BioSequence's attributes map. +// If the attribute is not found, the function returns 1 as the default taxonomic ID. +// The taxid 1 corresponds to the root taxonomic level. +// +// The function returns an integer representing the taxonomic ID. +func (s *BioSequence) Taxid() (taxid string) { + var ok bool + if s.taxon != nil { + taxid = s.taxon.String() + ok = true + } else { + taxid, ok = s.GetStringAttribute("taxid") + } + + if !ok { + taxid = "NA" + } + + return taxid +} + +// Setting the taxon at a given rank for a given sequence. +// +// Two attributes are added to the sequence. One named by the rank name stores +// the taxid, a second named by the rank name suffixed with '_name' contains the +// Scientific name of the genus. +// If the taxon at the given rank doesn't exist for the taxonomy annotation +// of the sequence, nothing happens. +func (sequence *BioSequence) SetTaxonAtRank(taxonomy *obitax.Taxonomy, rank string) *obitax.Taxon { + var taxonAtRank *obitax.Taxon + + taxon := sequence.Taxon(taxonomy) + taxonAtRank = nil + if taxon != nil { + taxonAtRank = taxon.TaxonAtRank(rank) + if taxonAtRank != nil { + // log.Printf("Taxid: %d Rank: %s --> proposed : %d (%s)", taxid, rank, taxonAtRank.taxid, *(taxonAtRank.scientificname)) + sequence.SetAttribute(rank+"_taxid", taxonAtRank.String()) + sequence.SetAttribute(rank+"_name", taxonAtRank.ScientificName()) + } else { + sequence.SetAttribute(rank+"_taxid", "NA") + sequence.SetAttribute(rank+"_name", "NA") + } + } + + return taxonAtRank +} + +// Setting the species of a sequence. +func (sequence *BioSequence) SetSpecies(taxonomy *obitax.Taxonomy) *obitax.Taxon { + return sequence.SetTaxonAtRank(taxonomy, "species") +} + +// Setting the genus of a sequence. +func (sequence *BioSequence) SetGenus(taxonomy *obitax.Taxonomy) *obitax.Taxon { + return sequence.SetTaxonAtRank(taxonomy, "genus") +} + +// Setting the family of a sequence. +func (sequence *BioSequence) SetFamily(taxonomy *obitax.Taxonomy) *obitax.Taxon { + return sequence.SetTaxonAtRank(taxonomy, "family") +} + +func (sequence *BioSequence) SetPath(taxonomy *obitax.Taxonomy) string { + taxon := sequence.Taxon(taxonomy) + path := taxon.Path() + + tpath := path.String() + sequence.SetAttribute("taxonomic_path", tpath) + + return tpath +} + +func (sequence *BioSequence) SetScientificName(taxonomy *obitax.Taxonomy) string { + taxon := sequence.Taxon(taxonomy) + name := taxon.ScientificName() + + sequence.SetAttribute("scienctific_name", name) + + return name +} + +func (sequence *BioSequence) SetTaxonomicRank(taxonomy *obitax.Taxonomy) string { + taxon := sequence.Taxon(taxonomy) + rank := taxon.Rank() + + sequence.SetAttribute("taxonomic_rank", rank) + + return rank +} diff --git a/pkg/obiseq/taxonomy_predicate.go b/pkg/obiseq/taxonomy_predicate.go new file mode 100644 index 0000000..e8c2f24 --- /dev/null +++ b/pkg/obiseq/taxonomy_predicate.go @@ -0,0 +1,98 @@ +package obiseq + +import ( + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" +) + +func IsAValidTaxon(taxonomy *obitax.Taxonomy, withAutoCorrection ...bool) SequencePredicate { + // deprecatedTaxidsWarning := make(map[string]bool) + + autocorrection := false + if len(withAutoCorrection) > 0 { + autocorrection = withAutoCorrection[0] + } + + f := func(sequence *BioSequence) bool { + taxon := sequence.Taxon(taxonomy) + + if taxon != nil { + taxid := sequence.Taxid() + ttaxid := taxon.String() + if taxid != ttaxid { + if autocorrection { + sequence.SetTaxid(ttaxid) + log.Printf( + "Sequence %s : Taxid %d updated with %d", + sequence.Id(), + taxid, + ttaxid, + ) + } // else { + // if _, ok := deprecatedTaxidsWarning[taxid]; !ok { + // deprecatedTaxidsWarning[taxid] = true + // log.Printf("Taxid %d is deprecated and must be replaced by %d", taxid, taxon.taxid) + // } + // } + } + } + + return taxon != nil + } + + return f +} + +// A function that takes a taxonomy and a taxid as arguments and returns a function that takes a +// pointer to a BioSequence as an argument and returns a boolean. +func IsSubCladeOf(taxonomy *obitax.Taxonomy, taxid string) SequencePredicate { + parent := taxonomy.Taxon(taxid) + + if parent == nil { + log.Fatalf("Cannot find taxon : %s in taxonomy %s", + taxid, + taxonomy.Name()) + } + + f := func(sequence *BioSequence) bool { + taxon := sequence.Taxon(taxonomy) + return taxon != nil && taxon.IsSubCladeOf(parent) + } + + return f +} + +func IsSubCladeOfSlot(taxonomy *obitax.Taxonomy, key string) SequencePredicate { + + f := func(sequence *BioSequence) bool { + val, ok := sequence.GetStringAttribute(key) + + if ok { + parent := taxonomy.Taxon(val) + taxon := sequence.Taxon(taxonomy) + return parent != nil && taxon != nil && taxon.IsSubCladeOf(parent) + } + + return false + } + + return f +} + +func HasRequiredRank(taxonomy *obitax.Taxonomy, rank string) SequencePredicate { + + if !obiutils.Contains(taxonomy.RankList(), rank) { + log.Fatalf("%s is not a valid rank (allowed ranks are %v)", + rank, + taxonomy.RankList()) + } + + f := func(sequence *BioSequence) bool { + taxon := sequence.Taxon(taxonomy) + return taxon != nil && taxon.HasRankDefined(rank) + } + + return f +} diff --git a/pkg/obiseq/taxonomy_workers.go b/pkg/obiseq/taxonomy_workers.go new file mode 100644 index 0000000..241de96 --- /dev/null +++ b/pkg/obiseq/taxonomy_workers.go @@ -0,0 +1,64 @@ +package obiseq + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + log "github.com/sirupsen/logrus" +) + +func MakeSetTaxonAtRankWorker(taxonomy *obitax.Taxonomy, rank string) SeqWorker { + + if !obiutils.Contains(taxonomy.RankList(), rank) { + log.Fatalf("%s is not a valid rank (allowed ranks are %v)", + rank, + taxonomy.RankList()) + } + + w := func(sequence *BioSequence) (BioSequenceSlice, error) { + sequence.SetTaxonAtRank(taxonomy, rank) + return BioSequenceSlice{sequence}, nil + } + + return w +} + +func MakeSetSpeciesWorker(taxonomy *obitax.Taxonomy) SeqWorker { + + w := func(sequence *BioSequence) (BioSequenceSlice, error) { + sequence.SetSpecies(taxonomy) + return BioSequenceSlice{sequence}, nil + } + + return w +} + +func MakeSetGenusWorker(taxonomy *obitax.Taxonomy) SeqWorker { + + w := func(sequence *BioSequence) (BioSequenceSlice, error) { + sequence.SetGenus(taxonomy) + return BioSequenceSlice{sequence}, nil + } + + return w +} + +func MakeSetFamilyWorker(taxonomy *obitax.Taxonomy) SeqWorker { + + w := func(sequence *BioSequence) (BioSequenceSlice, error) { + sequence.SetFamily(taxonomy) + return BioSequenceSlice{sequence}, nil + } + + return w +} + +func MakeSetPathWorker(taxonomy *obitax.Taxonomy) SeqWorker { + + w := func(sequence *BioSequence) (BioSequenceSlice, error) { + sequence.SetPath(taxonomy) + return BioSequenceSlice{sequence}, nil + } + + return w + +} diff --git a/pkg/obitax/filter_on_name.go b/pkg/obitax/filter_on_name.go index d8f689b..b4321f5 100644 --- a/pkg/obitax/filter_on_name.go +++ b/pkg/obitax/filter_on_name.go @@ -4,31 +4,32 @@ import ( "regexp" ) -func (taxonomy *Taxonomy) IFilterOnName(name string, strict bool) *ITaxonSet { +func (taxonomy *Taxonomy) IFilterOnName(name string, strict bool) *ITaxon { if strict { - nodes, ok := taxonomy.index[name] + nodes, ok := taxonomy.index[taxonomy.names.Innerize(name)] if ok { return nodes.Iterator() } else { - empty := make(TaxonSet) - return (&empty).Iterator() + empty := taxonomy.NewTaxonSet() + return empty.Iterator() } } return taxonomy.Iterator().IFilterOnName(name, strict) } -func (iterator *ITaxonSet) IFilterOnName(name string, strict bool) *ITaxonSet { - newIterator := NewITaxonSet() - sentTaxa := make(map[int]bool) +func (iterator *ITaxon) IFilterOnName(name string, strict bool) *ITaxon { + newIterator := NewITaxon() + sentTaxa := make(map[*string]bool) if strict { go func() { for iterator.Next() { taxon := iterator.Get() - if _, ok := sentTaxa[taxon.taxid]; !ok { + node := taxon.Node + if _, ok := sentTaxa[node.id]; !ok { if taxon.IsNameEqual(name) { - sentTaxa[taxon.taxid] = true + sentTaxa[node.id] = true newIterator.source <- taxon } } @@ -41,9 +42,10 @@ func (iterator *ITaxonSet) IFilterOnName(name string, strict bool) *ITaxonSet { go func() { for iterator.Next() { taxon := iterator.Get() - if _, ok := sentTaxa[taxon.taxid]; !ok { + node := taxon.Node + if _, ok := sentTaxa[node.id]; !ok { if taxon.IsNameMatching(pattern) { - sentTaxa[taxon.taxid] = true + sentTaxa[node.id] = true newIterator.source <- taxon } } diff --git a/pkg/obitax/filter_on_rank.go b/pkg/obitax/filter_on_rank.go index 8a1809f..08df40d 100644 --- a/pkg/obitax/filter_on_rank.go +++ b/pkg/obitax/filter_on_rank.go @@ -1,12 +1,20 @@ package obitax -func (iterator *ITaxonSet) IFilterOnTaxRank(rank string) *ITaxonSet { - newIter := NewITaxonSet() +func (iterator *ITaxon) IFilterOnTaxRank(rank string) *ITaxon { + newIter := NewITaxon() + var prank *string + var ptax *Taxonomy go func() { for iterator.Next() { + taxon := iterator.Get() - if taxon.rank == rank { + if ptax != taxon.Taxonomy { + ptax = taxon.Taxonomy + prank = ptax.ranks.Innerize(rank) + } + + if taxon.Node.rank == prank { newIter.source <- taxon } } @@ -16,14 +24,14 @@ func (iterator *ITaxonSet) IFilterOnTaxRank(rank string) *ITaxonSet { return newIter } -func (set *TaxonSet) IFilterOnTaxRank(rank string) *ITaxonSet { +func (set *TaxonSet) IFilterOnTaxRank(rank string) *ITaxon { return set.Iterator().IFilterOnTaxRank(rank) } -func (slice *TaxonSlice) IFilterOnTaxRank(rank string) *ITaxonSet { +func (slice *TaxonSlice) IFilterOnTaxRank(rank string) *ITaxon { return slice.Iterator().IFilterOnTaxRank(rank) } -func (taxonomy *Taxonomy) IFilterOnTaxRank(rank string) *ITaxonSet { +func (taxonomy *Taxonomy) IFilterOnTaxRank(rank string) *ITaxon { return taxonomy.Iterator().IFilterOnTaxRank(rank) } diff --git a/pkg/obitax/filter_on_subclade_of.go b/pkg/obitax/filter_on_subclade_of.go index 85268a1..0cb6883 100644 --- a/pkg/obitax/filter_on_subclade_of.go +++ b/pkg/obitax/filter_on_subclade_of.go @@ -1,9 +1,7 @@ package obitax -import "reflect" - -func (iterator *ITaxonSet) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet { - newIter := NewITaxonSet() +func (iterator *ITaxon) IFilterOnSubcladeOf(taxon *Taxon) *ITaxon { + newIter := NewITaxon() go func() { for iterator.Next() { @@ -18,32 +16,36 @@ func (iterator *ITaxonSet) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet { return newIter } -func (set *TaxonSet) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet { +func (set *TaxonSet) IFilterOnSubcladeOf(taxon *Taxon) *ITaxon { return set.Iterator().IFilterOnSubcladeOf(taxon) } -func (slice *TaxonSlice) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet { +func (slice *TaxonSlice) IFilterOnSubcladeOf(taxon *Taxon) *ITaxon { return slice.Iterator().IFilterOnSubcladeOf(taxon) } -func (taxonomy *Taxonomy) IFilterOnSubcladeOf(taxon *TaxNode) *ITaxonSet { +func (taxonomy *Taxonomy) IFilterOnSubcladeOf(taxon *Taxon) *ITaxon { return taxonomy.Iterator().IFilterOnSubcladeOf(taxon) } -func (iterator *ITaxonSet) IFilterBelongingSubclades(clades *TaxonSet) *ITaxonSet { +func (iterator *ITaxon) IFilterBelongingSubclades(clades *TaxonSet) *ITaxon { - if len(*clades) == 0 { + if clades.Len() == 0 { return iterator } // Considers the second simplest case when only // a single subclase is provided - if len(*clades) == 1 { - keys := reflect.ValueOf(*clades).MapKeys() - return iterator.IFilterOnSubcladeOf((*clades)[int(keys[0].Int())]) + if clades.Len() == 1 { + keys := make([]*string, 0, len(clades.set)) + for k := range clades.set { + keys = append(keys, k) + } + + return iterator.IFilterOnSubcladeOf(clades.Get(keys[0])) } - newIter := NewITaxonSet() + newIter := NewITaxon() go func() { for iterator.Next() { diff --git a/pkg/obitax/inner.go b/pkg/obitax/inner.go index 3a248f2..8053f13 100644 --- a/pkg/obitax/inner.go +++ b/pkg/obitax/inner.go @@ -5,7 +5,7 @@ import "sync" // InnerString is a struct that holds a map of strings and a read-write lock for concurrent access. // The index map is used to store key-value pairs of strings. type InnerString struct { - index map[string]string + index map[string]*string lock sync.RWMutex } @@ -13,7 +13,7 @@ type InnerString struct { // The lock is set to false. func NewInnerString() *InnerString { return &InnerString{ - index: make(map[string]string), + index: make(map[string]*string), } } @@ -26,13 +26,13 @@ func NewInnerString() *InnerString { // // Returns: // - The string value associated with the key. -func (i *InnerString) Innerize(value string) string { +func (i *InnerString) Innerize(value string) *string { i.lock.Lock() defer i.lock.Unlock() s, ok := i.index[value] if !ok { - i.index[value] = value - s = value + s = &value + i.index[value] = s } return s @@ -42,7 +42,7 @@ func (i *InnerString) Slice() []string { rep := make([]string, len(i.index)) j := 0 for _, v := range i.index { - rep[j] = v + rep[j] = *v j++ } return rep diff --git a/pkg/obitax/issuubcladeof.go b/pkg/obitax/issuubcladeof.go index 82a08ae..50d186f 100644 --- a/pkg/obitax/issuubcladeof.go +++ b/pkg/obitax/issuubcladeof.go @@ -1,6 +1,6 @@ package obitax -import "log" +import log "github.com/sirupsen/logrus" func (taxon *Taxon) IsSubCladeOf(parent *Taxon) bool { @@ -20,3 +20,18 @@ func (taxon *Taxon) IsSubCladeOf(parent *Taxon) bool { return false } + +func (taxon *Taxon) IsBelongingSubclades(clades *TaxonSet) bool { + ok := clades.Contains(taxon.Node.id) + + for !ok && !taxon.IsRoot() { + taxon = taxon.Parent() + ok = clades.Contains(taxon.Node.id) + } + + if taxon.IsRoot() { + ok = clades.Contains(taxon.Node.id) + } + + return ok +} diff --git a/pkg/obitax/iterator.go b/pkg/obitax/iterator.go index 5c29834..2b6568d 100644 --- a/pkg/obitax/iterator.go +++ b/pkg/obitax/iterator.go @@ -1,24 +1,31 @@ package obitax -type ITaxonSet struct { - source chan *TaxNode - current *TaxNode +type ITaxon struct { + source chan *Taxon + current *Taxon finished bool p_finished *bool } -func NewITaxonSet() *ITaxonSet { - i := ITaxonSet{make(chan *TaxNode), nil, false, nil} +func NewITaxon() *ITaxon { + i := ITaxon{ + source: make(chan *Taxon), + current: nil, + finished: false, + p_finished: nil} i.p_finished = &i.finished return &i } -func (set *TaxonSet) Iterator() *ITaxonSet { - i := NewITaxonSet() +func (set *TaxonSet) Iterator() *ITaxon { + i := NewITaxon() go func() { for _, t := range set.set { - i.source <- t + i.source <- &Taxon{ + Taxonomy: set.taxonomy, + Node: t, + } } close(i.source) }() @@ -26,12 +33,15 @@ func (set *TaxonSet) Iterator() *ITaxonSet { return i } -func (set *TaxonSlice) Iterator() *ITaxonSet { - i := NewITaxonSet() +func (set *TaxonSlice) Iterator() *ITaxon { + i := NewITaxon() go func() { for _, t := range set.slice { - i.source <- t + i.source <- &Taxon{ + Taxonomy: set.taxonomy, + Node: t, + } } close(i.source) }() @@ -39,11 +49,11 @@ func (set *TaxonSlice) Iterator() *ITaxonSet { return i } -func (taxonmy *Taxonomy) Iterator() *ITaxonSet { +func (taxonmy *Taxonomy) Iterator() *ITaxon { return taxonmy.nodes.Iterator() } -func (iterator *ITaxonSet) Next() bool { +func (iterator *ITaxon) Next() bool { if *(iterator.p_finished) { return false } @@ -63,37 +73,21 @@ func (iterator *ITaxonSet) Next() bool { // currently pointed by the iterator. You have to use the // 'Next' method to move to the next entry before calling // 'Get' to retreive the following instance. -func (iterator *ITaxonSet) Get() *TaxNode { +func (iterator *ITaxon) Get() *Taxon { return iterator.current } // Finished returns 'true' value if no more data is available // from the iterator. -func (iterator *ITaxonSet) Finished() bool { +func (iterator *ITaxon) Finished() bool { return *iterator.p_finished } -func (iterator *ITaxonSet) Split() *ITaxonSet { - newIter := ITaxonSet{iterator.source, nil, false, iterator.p_finished} - return &newIter -} - -func (iterator *ITaxonSet) TaxonSet() *TaxonSet { - set := make(TaxonSet) - - for iterator.Next() { - taxon := iterator.Get() - set[taxon.id] = taxon +func (iterator *ITaxon) Split() *ITaxon { + return &ITaxon{ + source: iterator.source, + current: nil, + finished: false, + p_finished: iterator.p_finished, } - return &set -} - -func (iterator *ITaxonSet) TaxonSlice() *TaxonSlice { - slice := make(TaxonSlice, 0) - - for iterator.Next() { - taxon := iterator.Get() - slice = append(slice, taxon) - } - return &slice } diff --git a/pkg/obitax/lca.go b/pkg/obitax/lca.go index 2b080ad..4909847 100644 --- a/pkg/obitax/lca.go +++ b/pkg/obitax/lca.go @@ -4,7 +4,7 @@ import ( log "github.com/sirupsen/logrus" ) -func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) { +func (t1 *Taxon) LCA(t2 *Taxon) (*Taxon, error) { if t1 == nil { log.Panicf("Try to get LCA of nil taxon") } @@ -13,25 +13,19 @@ func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) { log.Panicf("Try to get LCA of nil taxon") } - p1, err1 := t1.Path() + p1 := t1.Path() + p2 := t2.Path() - if err1 != nil { - return nil, err1 - } + i1 := p1.Len() - 1 + i2 := p2.Len() - 1 - p2, err2 := t2.Path() - - if err2 != nil { - return nil, err2 - } - - i1 := len(*p1) - 1 - i2 := len(*p2) - 1 - - for i1 >= 0 && i2 >= 0 && (*p1)[i1].taxid == (*p2)[i2].taxid { + for i1 >= 0 && i2 >= 0 && p1.slice[i1].id == p2.slice[i2].id { i1-- i2-- } - return (*p1)[i1+1], nil + return &Taxon{ + Taxonomy: t1.Taxonomy, + Node: p1.slice[i1+1], + }, nil } diff --git a/pkg/obitax/taxon.go b/pkg/obitax/taxon.go index 74c9a99..09401d5 100644 --- a/pkg/obitax/taxon.go +++ b/pkg/obitax/taxon.go @@ -24,6 +24,9 @@ type Taxon struct { // Returns: // - A formatted string representing the Taxon in the form "taxonomy_code:taxon_id [scientific_name]". func (taxon *Taxon) String() string { + if taxon == nil { + return "NA" + } return taxon.Node.String(taxon.Taxonomy.code) } @@ -33,24 +36,52 @@ func (taxon *Taxon) String() string { // Returns: // - The scientific name of the taxon as a string. func (taxon *Taxon) ScientificName() string { + if taxon == nil { + return "NA" + } return taxon.Node.ScientificName() } func (taxon *Taxon) Name(class string) string { - return taxon.Node.Name(class) + if taxon == nil { + return "NA" + } + pclass := taxon.Taxonomy.nameclasses.Innerize(class) + return taxon.Node.Name(pclass) } func (taxon *Taxon) IsNameEqual(name string) bool { + if taxon == nil { + return false + } + return taxon.Node.IsNameEqual(name) } func (taxon *Taxon) IsNameMatching(pattern *regexp.Regexp) bool { + if taxon == nil { + return false + } + return taxon.Node.IsNameMatching(pattern) } func (taxon *Taxon) SetName(name, class string) { - class = taxon.Taxonomy.nameclasses.Innerize(class) - taxon.Node.SetName(name, class) + if taxon == nil { + log.Panicf("nil taxon pointer for name %s [%s]", name, class) + } + + pclass := taxon.Taxonomy.nameclasses.Innerize(class) + pname := taxon.Taxonomy.names.Innerize(name) + taxon.Node.SetName(pname, pclass) +} + +func (taxon *Taxon) IsRoot() bool { + if taxon == nil { + return true + } + + return taxon.Taxonomy.root == taxon.Node } // Rank returns the rank of the Taxon. @@ -59,6 +90,9 @@ func (taxon *Taxon) SetName(name, class string) { // Returns: // - The rank of the taxon as a string (e.g., species, genus, family). func (taxon *Taxon) Rank() string { + if taxon == nil { + return "NA" + } return taxon.Node.Rank() } @@ -70,9 +104,12 @@ func (taxon *Taxon) Rank() string { // - A pointer to the parent Taxon[T]. If the parent does not exist, it returns // a Taxon with a nil Node. func (taxon *Taxon) Parent() *Taxon { + if taxon == nil { + return nil + } + pid := taxon.Node.ParentId() - return &Taxon{taxon.Taxonomy, - taxon.Taxonomy.nodes.Get(pid)} + return taxon.Taxonomy.nodes.Get(pid) } // IPath returns an iterator that yields the path from the current Taxon to the root Taxon @@ -83,12 +120,13 @@ func (taxon *Taxon) Parent() *Taxon { // is called with each Taxon in the path from the current taxon to the root. If the // taxonomy has no root node, the method logs a fatal error and terminates the program. func (taxon *Taxon) IPath() iter.Seq[*Taxon] { + if taxon.Taxonomy.root == nil { log.Fatalf("Taxon[%v].IPath(): Taxonomy has no root node", taxon.Taxonomy.name) } return func(yield func(*Taxon) bool) { - for taxon.Node.parent != taxon.Taxonomy.root.id { + for !taxon.IsRoot() { if !yield(taxon) { return } @@ -96,8 +134,9 @@ func (taxon *Taxon) IPath() iter.Seq[*Taxon] { taxon = taxon.Parent() } - yield(taxon) - + if taxon != nil { + yield(taxon) + } } } @@ -109,6 +148,10 @@ func (taxon *Taxon) IPath() iter.Seq[*Taxon] { // - A pointer to a TaxonSlice[T] containing the TaxNode[T] instances in the path // from the current taxon to the root. func (taxon *Taxon) Path() *TaxonSlice { + if taxon == nil { + return nil + } + s := make([]*TaxNode, 0, 10) for t := range taxon.IPath() { @@ -131,8 +174,13 @@ func (taxon *Taxon) Path() *TaxonSlice { // Returns: // - A boolean indicating whether any taxon in the path has the specified rank defined. func (taxon *Taxon) HasRankDefined(rank string) bool { + if taxon == nil { + return false + } + + prank := taxon.Taxonomy.ranks.Innerize(rank) for t := range taxon.IPath() { - if t.Node.Rank() == rank { + if t.Node.rank == prank { return true } } @@ -151,8 +199,14 @@ func (taxon *Taxon) HasRankDefined(rank string) bool { // - A pointer to the Taxon[T] that matches the specified rank, or nil if no such taxon exists // in the path to the root. func (taxon *Taxon) TaxonAtRank(rank string) *Taxon { + if taxon == nil { + return nil + } + + prank := taxon.Taxonomy.ranks.Innerize(rank) + for t := range taxon.IPath() { - if t.Node.Rank() == rank { + if t.Node.rank == prank { return t } } diff --git a/pkg/obitax/taxonnode.go b/pkg/obitax/taxonnode.go index 3c12a50..f5f67fc 100644 --- a/pkg/obitax/taxonnode.go +++ b/pkg/obitax/taxonnode.go @@ -2,6 +2,7 @@ package obitax import ( "fmt" + "log" "regexp" ) @@ -18,11 +19,11 @@ import ( // a string representing the class name and the value is a pointer to a string // representing the name. type TaxNode struct { - id string - parent string - rank string + id *string + parent *string + rank *string scientificname *string - alternatenames *map[string]*string + alternatenames *map[*string]*string } // String returns a string representation of the TaxNode, including the taxonomy code, @@ -36,7 +37,7 @@ type TaxNode struct { func (node *TaxNode) String(taxonomyCode string) string { return fmt.Sprintf("%s:%v [%s]", taxonomyCode, - node.id, + *node.id, node.ScientificName()) } @@ -45,7 +46,7 @@ func (node *TaxNode) String(taxonomyCode string) string { // // Returns: // - The unique identifier of the taxon node of type T. -func (node *TaxNode) Id() string { +func (node *TaxNode) Id() *string { return node.id } @@ -54,7 +55,7 @@ func (node *TaxNode) Id() string { // // Returns: // - The identifier of the parent taxon of type T. -func (node *TaxNode) ParentId() string { +func (node *TaxNode) ParentId() *string { return node.parent } @@ -66,6 +67,12 @@ func (node *TaxNode) ParentId() string { // - Note: This method assumes that scientificname is not nil; // if it may be nil, additional error handling should be implemented. func (node *TaxNode) ScientificName() string { + if node == nil { + return "NA" + } + if node.scientificname == nil { + return "NA" + } return *node.scientificname } @@ -80,8 +87,9 @@ func (node *TaxNode) ScientificName() string { // Returns: // - The name of the taxon as a string. If the class is not recognized or if no name is available, // an empty string is returned. -func (node *TaxNode) Name(class string) string { - if class == "scientificname" { +func (node *TaxNode) Name(class *string) string { + + if *class == "scientific name" { return *node.scientificname } @@ -98,17 +106,21 @@ func (node *TaxNode) Name(class string) string { return "" } -func (node *TaxNode) SetName(name, class string) { - if class == "scientificname" { - node.scientificname = &name +func (node *TaxNode) SetName(name, class *string) { + if node == nil { + log.Panic("Cannot set name of nil TaxNode") + } + + if *class == "scientific name" { + node.scientificname = name return } if node.alternatenames == nil { - node.alternatenames = &map[string]*string{} + node.alternatenames = &map[*string]*string{} } - (*node.alternatenames)[class] = &name + (*node.alternatenames)[class] = name } // Rank returns the rank of the TaxNode. @@ -117,7 +129,7 @@ func (node *TaxNode) SetName(name, class string) { // Returns: // - The rank of the taxon as a string (e.g., species, genus, family). func (node *TaxNode) Rank() string { - return node.rank + return *node.rank } // IsNameEqual checks if the provided name matches the scientific name or any alternate names @@ -154,9 +166,14 @@ func (node *TaxNode) IsNameEqual(name string) bool { // - A boolean indicating whether the scientific name or any alternate names match the // provided regular expression pattern. func (node *TaxNode) IsNameMatching(pattern *regexp.Regexp) bool { - if pattern.MatchString(*(node.scientificname)) { + if node == nil { + return false + } + + if node.scientificname != nil && pattern.MatchString(*(node.scientificname)) { return true } + if node.alternatenames != nil { for _, n := range *node.alternatenames { if n != nil && pattern.MatchString(*n) { diff --git a/pkg/obitax/taxonomy.go b/pkg/obitax/taxonomy.go index 47c0fdb..2f84334 100644 --- a/pkg/obitax/taxonomy.go +++ b/pkg/obitax/taxonomy.go @@ -21,12 +21,14 @@ import ( type Taxonomy struct { name string code string + ids *InnerString ranks *InnerString nameclasses *InnerString + names *InnerString nodes *TaxonSet root *TaxNode matcher *regexp.Regexp - index map[string]*TaxonSet + index map[*string]*TaxonSet } // NewTaxonomy creates and initializes a new Taxonomy instance with the specified name and code. @@ -39,7 +41,7 @@ type Taxonomy struct { // Returns: // - A pointer to the newly created Taxonomy instance. func NewTaxonomy(name, code, codeCharacters string) *Taxonomy { - set := make(map[string]*TaxNode) + set := make(map[*string]*TaxNode) // codeCharacters := "[[:alnum:]]" // [[:digit:]] @@ -48,12 +50,14 @@ func NewTaxonomy(name, code, codeCharacters string) *Taxonomy { taxonomy := &Taxonomy{ name: name, code: code, + ids: NewInnerString(), ranks: NewInnerString(), nameclasses: NewInnerString(), + names: NewInnerString(), nodes: &TaxonSet{set: set}, root: nil, matcher: matcher, - index: make(map[string]*TaxonSet), + index: make(map[*string]*TaxonSet), } taxonomy.nodes.taxonomy = taxonomy @@ -69,16 +73,16 @@ func NewTaxonomy(name, code, codeCharacters string) *Taxonomy { // - taxid: A string representation of the taxon identifier to be converted. // // Returns: -// - The taxon identifier of type T corresponding to the provided taxid. +// - The taxon identifier as a *string corresponding to the provided taxid. // - An error if the taxid is not valid or cannot be converted. -func (taxonomy *Taxonomy) Id(taxid string) (string, error) { +func (taxonomy *Taxonomy) Id(taxid string) (*string, error) { matches := taxonomy.matcher.FindStringSubmatch(taxid) if matches == nil { - return "", fmt.Errorf("Taxid %s is not a valid taxid", taxid) + return nil, fmt.Errorf("taxid %s is not a valid taxid", taxid) } - return matches[2], nil + return taxonomy.ids.Innerize(matches[2]), nil } // TaxidSting retrieves the string representation of a taxon node identified by the given ID. @@ -92,11 +96,19 @@ func (taxonomy *Taxonomy) Id(taxid string) (string, error) { // - A string representing the taxon node in the format "taxonomyCode:id [scientificName]", // or an error if the taxon node with the specified ID does not exist in the taxonomy. func (taxonomy *Taxonomy) TaxidSting(id string) (string, error) { - node := taxonomy.nodes.Get(id) - if node == nil { - return "", fmt.Errorf("Taxid %d is part of the taxonomy", id) + pid, err := taxonomy.Id(id) + + if err != nil { + return "", err } - return node.String(taxonomy.code), nil + + taxon := taxonomy.nodes.Get(pid) + + if taxon == nil { + return "", fmt.Errorf("taxid %s is not part of the taxonomy", id) + } + + return taxon.String(), nil } // Taxon retrieves the Taxon associated with the given taxid string. @@ -113,19 +125,18 @@ func (taxonomy *Taxonomy) Taxon(taxid string) *Taxon { id, err := taxonomy.Id(taxid) if err != nil { - log.Fatalf("Taxid %s is not a valid taxid", taxid) + log.Fatalf("Taxid %s: %v", taxid, err) } - node := taxonomy.nodes.Get(id) + taxon := taxonomy.nodes.Get(id) - if node == nil { - log.Fatalf("Taxid %s is an unknown taxid", taxid) + if taxon == nil { + log.Fatalf("Taxid %s is not part of the taxonomy %s", + taxid, + taxonomy.name) } - return &Taxon{ - Taxonomy: taxonomy, - Node: node, - } + return taxon } // TaxonSet returns the set of taxon nodes contained within the Taxonomy. @@ -133,7 +144,7 @@ func (taxonomy *Taxonomy) Taxon(taxid string) *Taxon { // // Returns: // - A pointer to the TaxonSet[T] representing the collection of taxon nodes in the taxonomy. -func (taxonomy *Taxonomy) TaxonSet() *TaxonSet { +func (taxonomy *Taxonomy) AsTaxonSet() *TaxonSet { return taxonomy.nodes } @@ -160,13 +171,25 @@ func (taxonomy *Taxonomy) Len() int { // - A pointer to the newly created Taxon[T] instance. // - An error if the taxon cannot be added (e.g., it already exists and replace is false). func (taxonomy *Taxonomy) AddTaxon(taxid, parent string, rank string, isRoot bool, replace bool) (*Taxon, error) { - if !replace && taxonomy.nodes.Contains(taxid) { - return nil, fmt.Errorf("trying to add taxon %d already present in the taxonomy", taxid) + + parentid, perr := taxonomy.Id(parent) + id, err := taxonomy.Id(taxid) + + if perr != nil { + return nil, fmt.Errorf("error in parsing parent taxid %s: %v", parent, perr) } - rank = taxonomy.ranks.Innerize(rank) + if err != nil { + return nil, fmt.Errorf("error in parsing taxid %s: %v", taxid, err) + } - n := &TaxNode{taxid, parent, rank, nil, nil} + if !replace && taxonomy.nodes.Contains(id) { + return nil, fmt.Errorf("trying to add taxon %s already present in the taxonomy", taxid) + } + + prank := taxonomy.ranks.Innerize(rank) + + n := &TaxNode{id, parentid, prank, nil, nil} taxonomy.nodes.Insert(n) @@ -197,18 +220,15 @@ func (taxonomy *Taxonomy) AddAlias(newtaxid, oldtaxid string, replace bool) (*Ta return nil, fmt.Errorf("trying to add alias %s already present in the taxonomy", newtaxid) } - n := taxonomy.nodes.Get(oldid) + t := taxonomy.nodes.Get(oldid) - if n == nil { + if t == nil { return nil, fmt.Errorf("trying to add alias %s to a taxon that does not exist", oldtaxid) } - taxonomy.nodes.Alias(newid, n) + taxonomy.nodes.Alias(newid, t) - return &Taxon{ - Taxonomy: taxonomy, - Node: n, - }, nil + return t, nil } // RankList returns a slice of strings representing the ranks of the taxa @@ -221,19 +241,14 @@ func (taxonomy *Taxonomy) RankList() []string { return taxonomy.ranks.Slice() } -// func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) { -// t, ok := (*taxonomy.nodes)[taxid] - -// if !ok { -// a, aok := taxonomy.alias[taxid] -// if !aok { -// return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid) -// } -// t = a -// } -// return t, nil -// } - -func (taxonomy *Taxonomy) Index() *map[string]*TaxonSet { +func (taxonomy *Taxonomy) Index() *map[*string]*TaxonSet { return &(taxonomy.index) } + +func (taxonomy *Taxonomy) Name() string { + return taxonomy.name +} + +func (taxonomy *Taxonomy) Code() string { + return taxonomy.code +} diff --git a/pkg/obitax/taxonset.go b/pkg/obitax/taxonset.go index 3f464e1..3d690ef 100644 --- a/pkg/obitax/taxonset.go +++ b/pkg/obitax/taxonset.go @@ -1,3 +1,4 @@ +// Package obitax provides functionality for managing taxonomic data structures. package obitax import log "github.com/sirupsen/logrus" @@ -7,25 +8,46 @@ import log "github.com/sirupsen/logrus" // as well as a reference to the associated Taxonomy. // // Fields: -// - set: A map that associates taxon identifiers of type T with their corresponding TaxNode[T] instances. -// - taxonomy: A pointer to the Taxonomy[T] instance that this TaxonSet belongs to. +// - set: A map that associates taxon identifiers of type *string with their corresponding TaxNode instances. +// - nalias: The number of aliases in the TaxonSet. +// - taxonomy: A pointer to the Taxonomy instance that this TaxonSet belongs to. type TaxonSet struct { - set map[string]*TaxNode + set map[*string]*TaxNode nalias int taxonomy *Taxonomy } -// Get retrieves the TaxNode[T] associated with the specified taxon identifier. +func (taxonomy *Taxonomy) NewTaxonSet() *TaxonSet { + return &TaxonSet{ + set: make(map[*string]*TaxNode), + nalias: 0, + taxonomy: taxonomy, + } +} + +// Get retrieves the TaxNode associated with the specified taxon identifier. // It returns the TaxNode if it exists in the TaxonSet; otherwise, it returns nil. // // Parameters: -// - i: The taxon identifier of type T for which the TaxNode is to be retrieved. +// - id: A pointer to the taxon identifier for which the TaxNode is to be retrieved. // // Returns: -// - A pointer to the TaxNode[T] associated with the provided identifier, or nil +// - A pointer to the TaxNode associated with the provided identifier, or nil // if no such taxon exists in the set. -func (set *TaxonSet) Get(i string) *TaxNode { - return set.set[i] +func (set *TaxonSet) Get(id *string) *Taxon { + if set == nil { + return nil + } + + node := set.set[id] + if node == nil { + return nil + } + + return &Taxon{ + Taxonomy: set.taxonomy, + Node: set.set[id], + } } // Len returns the number of unique taxa in the TaxonSet. @@ -38,27 +60,37 @@ func (set *TaxonSet) Len() int { return len(set.set) - set.nalias } -// Insert adds a TaxNode[T] to the TaxonSet. If a taxon with the same identifier +// Insert adds a TaxNode to the TaxonSet. If a taxon with the same identifier // already exists in the set, it updates the reference. If the existing taxon was // an alias, its alias count is decremented. // // Parameters: -// - taxon: A pointer to the TaxNode[T] instance to be added to the TaxonSet. +// - taxon: A pointer to the TaxNode instance to be added to the TaxonSet. // // Behavior: // - If a taxon with the same identifier already exists and is different from the // new taxon, the alias count is decremented. -func (set *TaxonSet) Insert(taxon *TaxNode) { - if old := set.set[taxon.id]; old != nil && old.id != taxon.id { +func (set *TaxonSet) Insert(node *TaxNode) { + if old := set.set[node.id]; old != nil && old.id != node.id { set.nalias-- } - set.set[taxon.id] = taxon + set.set[node.id] = node } -// Taxonomy returns a pointer to the Taxonomy[T] instance that this TaxonSet belongs to. +func (set *TaxonSet) InsertTaxon(taxon *Taxon) { + if set.taxonomy != taxon.Taxonomy { + log.Fatalf( + "Cannot insert taxon %s into taxon set belonging %s taxonomy", + taxon.String(), + set.taxonomy.name, + ) + } +} + +// Taxonomy returns a pointer to the Taxonomy instance that this TaxonSet belongs to. // // Returns: -// - A pointer to the Taxonomy[T] instance that this TaxonSet belongs to +// - A pointer to the Taxonomy instance that this TaxonSet belongs to. func (set *TaxonSet) Taxonomy() *Taxonomy { return set.taxonomy } @@ -68,18 +100,18 @@ func (set *TaxonSet) Taxonomy() *Taxonomy { // If the original taxon is not part of the taxon set, it logs a fatal error and terminates the program. // // Parameters: -// - alias: A string representing the alias to be associated with the taxon node. -// - node: A pointer to the TaxNode[T] instance that the alias will refer to. +// - alias: A pointer to a string representing the alias to be associated with the taxon node. +// - node: A pointer to the TaxNode instance that the alias will refer to. // // Behavior: // - If the original taxon corresponding to the alias is not part of the taxon set, // the method will log a fatal error and terminate the program. -func (set *TaxonSet) Alias(id string, node *TaxNode) { - original := set.Get(node.id) - if original != nil { +func (set *TaxonSet) Alias(id *string, taxon *Taxon) { + original := set.Get(taxon.Node.id) + if original == nil { log.Fatalf("Original taxon %v is not part of taxon set", id) } - set.set[id] = node + set.set[id] = taxon.Node set.nalias++ } @@ -88,39 +120,39 @@ func (set *TaxonSet) Alias(id string, node *TaxNode) { // node exists and its identifier is different from the provided identifier; otherwise, it returns false. // // Parameters: -// - id: The identifier of type T to be checked for alias status. +// - id: A pointer to the identifier to be checked for alias status. // // Returns: // - A boolean indicating whether the identifier corresponds to an alias in the set. -func (set *TaxonSet) IsAlias(id string) bool { - node := set.Get(id) - return node != nil && node.id != id +func (set *TaxonSet) IsAlias(id *string) bool { + taxon := set.Get(id) + return taxon != nil && taxon.Node.id != id } // IsATaxon checks if the given ID corresponds to a valid taxon node in the TaxonSet. // It returns true if the node exists and its ID matches the provided ID; otherwise, it returns false. -// id corresponding to alias returns false. +// If the ID corresponds to an alias, it will return false. // // Parameters: -// - id: The identifier of the taxon to check. +// - id: A pointer to the identifier of the taxon to check. // // Returns: // - A boolean indicating whether the specified ID corresponds to a valid taxon node. -func (set *TaxonSet) IsATaxon(id string) bool { - node := set.Get(id) - return node != nil && node.id == id +func (set *TaxonSet) IsATaxon(id *string) bool { + taxon := set.Get(id) + return taxon != nil && taxon.Node.id == id } // Contains checks if the TaxonSet contains a taxon node with the specified ID. // It returns true if the node exists in the set; otherwise, it returns false. -// id corresponding to alias or true taxa returns true. +// If the ID corresponds to an alias, it will return true if the alias exists. // // Parameters: -// - id: The identifier of the taxon to check for presence in the set. +// - id: A pointer to the identifier of the taxon to check for presence in the set. // // Returns: // - A boolean indicating whether the TaxonSet contains a taxon node with the specified ID. -func (set *TaxonSet) Contains(id string) bool { +func (set *TaxonSet) Contains(id *string) bool { node := set.Get(id) return node != nil } diff --git a/pkg/obitax/taxonslice.go b/pkg/obitax/taxonslice.go index 8c567d9..7acf119 100644 --- a/pkg/obitax/taxonslice.go +++ b/pkg/obitax/taxonslice.go @@ -3,6 +3,8 @@ package obitax import ( "bytes" "fmt" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) // TaxonSlice represents a slice of TaxNode[T] instances within a taxonomy. @@ -16,6 +18,13 @@ type TaxonSlice struct { taxonomy *Taxonomy } +func (taxonomy *Taxonomy) NewTaxonSlice(size, capacity int) *TaxonSlice { + return &TaxonSlice{ + slice: make([]*TaxNode, size, capacity), + taxonomy: taxonomy, + } +} + // Get retrieves the TaxNode[T] at the specified index from the TaxonSlice. // It returns the taxon node corresponding to the provided index. // @@ -25,6 +34,9 @@ type TaxonSlice struct { // Returns: // - A pointer to the TaxNode[T] at the specified index in the slice. func (slice *TaxonSlice) Get(i int) *TaxNode { + if slice == nil { + return nil + } return slice.slice[i] } @@ -34,6 +46,9 @@ func (slice *TaxonSlice) Get(i int) *TaxNode { // Returns: // - An integer representing the total number of taxon nodes in the TaxonSlice. func (slice *TaxonSlice) Len() int { + if slice == nil { + return 0 + } return len(slice.slice) } @@ -65,3 +80,19 @@ func (path *TaxonSlice) String() string { return buffer.String() } + +func (slice *TaxonSlice) Reverse(inplace bool) *TaxonSlice { + if slice == nil { + return nil + } + + rep := obiutils.Reverse(slice.slice, inplace) + if inplace { + return slice + } + + return &TaxonSlice{ + taxonomy: slice.taxonomy, + slice: rep, + } +} diff --git a/pkg/obitools/obifind/iterator.go b/pkg/obitools/obifind/iterator.go index 5884a2f..52e38af 100644 --- a/pkg/obitools/obifind/iterator.go +++ b/pkg/obitools/obifind/iterator.go @@ -7,13 +7,13 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" ) -func IFilterRankRestriction() func(*obitax.ITaxonSet) *obitax.ITaxonSet { - f := func(s *obitax.ITaxonSet) *obitax.ITaxonSet { +func IFilterRankRestriction() func(*obitax.ITaxon) *obitax.ITaxon { + f := func(s *obitax.ITaxon) *obitax.ITaxon { return s } if __restrict_rank__ != "" { - f = func(s *obitax.ITaxonSet) *obitax.ITaxonSet { + f = func(s *obitax.ITaxon) *obitax.ITaxon { return s.IFilterOnTaxRank(__restrict_rank__) } } @@ -21,21 +21,21 @@ func IFilterRankRestriction() func(*obitax.ITaxonSet) *obitax.ITaxonSet { return f } -func ITaxonNameMatcher() (func(string) *obitax.ITaxonSet, error) { +func ITaxonNameMatcher() (func(string) *obitax.ITaxon, error) { taxonomy, err := CLILoadSelectedTaxonomy() if err != nil { return nil, err } - fun := func(name string) *obitax.ITaxonSet { + fun := func(name string) *obitax.ITaxon { return taxonomy.IFilterOnName(name, __fixed_pattern__) } return fun, nil } -func ITaxonRestrictions() (func(*obitax.ITaxonSet) *obitax.ITaxonSet, error) { +func ITaxonRestrictions() (func(*obitax.ITaxon) *obitax.ITaxon, error) { clades, err := CLITaxonomicalRestrictions() @@ -45,23 +45,19 @@ func ITaxonRestrictions() (func(*obitax.ITaxonSet) *obitax.ITaxonSet, error) { rankfilter := IFilterRankRestriction() - fun := func(iterator *obitax.ITaxonSet) *obitax.ITaxonSet { + fun := func(iterator *obitax.ITaxon) *obitax.ITaxon { return rankfilter(iterator).IFilterBelongingSubclades(clades) } return fun, nil } -func TaxonAsString(taxon *obitax.TaxNode, pattern string) string { +func TaxonAsString(taxon *obitax.Taxon, pattern string) string { text := taxon.ScientificName() if __with_path__ { var bf bytes.Buffer - path, err := taxon.Path() - - if err != nil { - fmt.Printf("%+v", err) - } + path := taxon.Path() bf.WriteString(path.Get(path.Len() - 1).ScientificName()) @@ -72,15 +68,15 @@ func TaxonAsString(taxon *obitax.TaxNode, pattern string) string { text = bf.String() } - return fmt.Sprintf("%-20s | %10d | %10d | %-20s | %s", + return fmt.Sprintf("%-20s | %10s | %10s | %-20s | %s", pattern, - taxon.Taxid(), - taxon.Parent().Taxid(), + taxon.String(), + taxon.Parent().String(), taxon.Rank(), text) } -func TaxonWriter(itaxa *obitax.ITaxonSet, pattern string) { +func TaxonWriter(itaxa *obitax.ITaxon, pattern string) { for itaxa.Next() { fmt.Println(TaxonAsString(itaxa.Get(), pattern)) } diff --git a/pkg/obitools/obifind/options.go b/pkg/obitools/obifind/options.go index 18f5a04..2533b23 100644 --- a/pkg/obitools/obifind/options.go +++ b/pkg/obitools/obifind/options.go @@ -12,12 +12,12 @@ var __taxdump__ = "" var __alternative_name__ = false var __rank_list__ = false var __selected_taxonomy__ = (*obitax.Taxonomy)(nil) -var __taxonomical_restriction__ = make([]int, 0) +var __taxonomical_restriction__ = make([]string, 0) var __fixed_pattern__ = false var __with_path__ = false -var __taxid_path__ = -1 -var __taxid_sons__ = -1 +var __taxid_path__ = "NA" +var __taxid_sons__ = "NA" var __restrict_rank__ = "" func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bool) { @@ -43,7 +43,7 @@ func FilterTaxonomyOptionSet(options *getoptions.GetOpt) { options.Alias("l"), options.Description("List every taxonomic rank available in the taxonomy.")) - options.IntSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1, + options.StringSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1, options.Alias("r"), options.Description("Restrict output to some subclades.")) } @@ -67,18 +67,18 @@ func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) { return nil, err } - ts := make(obitax.TaxonSet) + ts := taxonomy.NewTaxonSet() for _, taxid := range __taxonomical_restriction__ { - tx, err := taxonomy.Taxon(taxid) + tx := taxonomy.Taxon(taxid) if err != nil { return nil, err } - ts.Inserts(tx) + ts.InsertTaxon(tx) } - return &ts, nil + return ts, nil } func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) { @@ -106,17 +106,17 @@ func OptionSet(options *getoptions.GetOpt) { options.BoolVar(&__with_path__, "with-path", false, options.Alias("P"), options.Description("Adds a column containing the full path for each displayed taxon.")) - options.IntVar(&__taxid_path__, "parents", -1, + options.StringVar(&__taxid_path__, "parents", "NA", options.Alias("p"), options.Description("Displays every parental tree's information for the provided taxid.")) options.StringVar(&__restrict_rank__, "rank", "", options.Description("Restrict to the given taxonomic rank.")) } -func CLIRequestsPathForTaxid() int { +func CLIRequestsPathForTaxid() string { return __taxid_path__ } -func CLIRequestsSonsForTaxid() int { +func CLIRequestsSonsForTaxid() string { return __taxid_sons__ }