diff --git a/cmd/obitools/obitag/main.go b/cmd/obitools/obitag/main.go index 0f4add0..81b17ab 100644 --- a/cmd/obitools/obitag/main.go +++ b/cmd/obitools/obitag/main.go @@ -7,8 +7,8 @@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitag" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" @@ -44,9 +44,9 @@ func main() { fs, err := obiconvert.CLIReadBioSequences(args...) obiconvert.OpenSequenceDataErrorMessage(args, err) - taxo, error := obifind.CLILoadSelectedTaxonomy() - if error != nil { - log.Panicln(error) + taxo := obitax.DefaultTaxonomy() + if taxo == nil { + log.Panicln("No loaded taxonomy") } references := obitag.CLIRefDB() diff --git a/cmd/obitools/obitag2/main.go b/cmd/obitools/obitag2/main.go index 2d82720..dc32e4c 100644 --- a/cmd/obitools/obitag2/main.go +++ b/cmd/obitools/obitag2/main.go @@ -7,8 +7,8 @@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitag" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitag2" @@ -45,9 +45,9 @@ func main() { fs, err := obiconvert.CLIReadBioSequences(args...) obiconvert.OpenSequenceDataErrorMessage(args, err) - taxo, error := obifind.CLILoadSelectedTaxonomy() - if error != nil { - log.Panicln(error) + taxo := obitax.DefaultTaxonomy() + if taxo == nil { + log.Panicln("No loaded taxonomy") } references := obitag.CLIRefDB() diff --git a/go.mod b/go.mod index bf96383..62a095f 100644 --- a/go.mod +++ b/go.mod @@ -25,6 +25,7 @@ require ( require ( github.com/Clever/csvlint v0.3.0 // indirect + github.com/buger/jsonparser v1.1.1 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect github.com/kr/pretty v0.3.0 // indirect diff --git a/go.sum b/go.sum index 39449ce..688bba1 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM= +github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= +github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q= github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= diff --git a/pkg/obiformats/ncbitaxdump/read.go b/pkg/obiformats/ncbitaxdump/read.go index 1342594..a656167 100644 --- a/pkg/obiformats/ncbitaxdump/read.go +++ b/pkg/obiformats/ncbitaxdump/read.go @@ -12,6 +12,7 
@@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) { @@ -103,7 +104,7 @@ func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int { // if any of the files cannot be opened or read. func LoadNCBITaxDump(directory string, onlysn bool) (*obitax.Taxonomy, error) { - taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", "[[:digit:]]") + taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet) // // Load the Taxonomy nodes diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index b061222..3d103a6 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "00b0edc" +var _Commit = "f41a6fb" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index 26fff9c..b3bea02 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -389,18 +389,6 @@ func (s *BioSequence) SetCount(count int) { s.SetAttribute("count", count) } -// SetTaxid sets the taxid for the BioSequence. -// -// Parameters: -// -// taxid - the taxid to set. -func (s *BioSequence) SetTaxid(taxid string) { - if taxid == "" { - taxid = "NA" - } - s.SetAttribute("taxid", taxid) -} - func (s *BioSequence) OBITagRefIndex(slot ...string) map[int]string { key := "obitag_ref_index" diff --git a/pkg/obiseq/taxonomy_lca.go b/pkg/obiseq/taxonomy_lca.go index 5ed32ce..02aa847 100644 --- a/pkg/obiseq/taxonomy_lca.go +++ b/pkg/obiseq/taxonomy_lca.go @@ -123,7 +123,6 @@ func AddLCAWorker(taxonomy *obitax.Taxonomy, slot_name string, threshold float64 sequence.SetAttribute(slot_name, lca.String()) sequence.SetAttribute(lca_name, lca.ScientificName()) sequence.SetAttribute(lca_error, math.Round((1-rans)*1000)/1000) - return BioSequenceSlice{sequence}, nil } diff --git a/pkg/obiseq/taxonomy_methods.go b/pkg/obiseq/taxonomy_methods.go index b6b46fa..8f4bbf5 100644 --- a/pkg/obiseq/taxonomy_methods.go +++ b/pkg/obiseq/taxonomy_methods.go @@ -1,6 +1,10 @@ package obiseq import ( + "fmt" + + log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" ) @@ -12,6 +16,23 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon { return taxonomy.Taxon(taxid) } +// SetTaxid sets the taxid for the BioSequence. +// +// Parameters: +// +// taxid - the taxid to set. +func (s *BioSequence) SetTaxid(taxid string) { + if taxid == "" { + taxid = "NA" + } + s.SetAttribute("taxid", taxid) +} + +func (s *BioSequence) SetTaxon(taxon *obitax.Taxon) { + taxid := taxon.String() + s.SetTaxid(taxid) +} + // Taxid returns the taxonomic ID associated with the BioSequence. // // It retrieves the "taxid" attribute from the BioSequence's attributes map. 
@@ -25,7 +46,20 @@ func (s *BioSequence) Taxid() (taxid string) { taxid = s.taxon.String() ok = true } else { - taxid, ok = s.GetStringAttribute("taxid") + var ta interface{} + ta, ok = s.GetAttribute("taxid") + if ok { + switch tv := ta.(type) { + case string: + taxid = tv + case int: + taxid = fmt.Sprintf("%d", tv) + case float64: + taxid = fmt.Sprintf("%d", int(tv)) + default: + log.Fatalf("Taxid: %v is not a string or an integer (%T)", ta, ta) + } + } } if !ok { diff --git a/pkg/obiseq/taxonomy_predicate.go b/pkg/obiseq/taxonomy_predicate.go index e8c2f24..265c1bc 100644 --- a/pkg/obiseq/taxonomy_predicate.go +++ b/pkg/obiseq/taxonomy_predicate.go @@ -47,14 +47,7 @@ func IsAValidTaxon(taxonomy *obitax.Taxonomy, withAutoCorrection ...bool) Sequen // A function that takes a taxonomy and a taxid as arguments and returns a function that takes a // pointer to a BioSequence as an argument and returns a boolean. -func IsSubCladeOf(taxonomy *obitax.Taxonomy, taxid string) SequencePredicate { - parent := taxonomy.Taxon(taxid) - - if parent == nil { - log.Fatalf("Cannot find taxon : %s in taxonomy %s", - taxid, - taxonomy.Name()) - } +func IsSubCladeOf(taxonomy *obitax.Taxonomy, parent *obitax.Taxon) SequencePredicate { f := func(sequence *BioSequence) bool { taxon := sequence.Taxon(taxonomy) diff --git a/pkg/obitax/inner.go b/pkg/obitax/inner.go index f86a029..ede921c 100644 --- a/pkg/obitax/inner.go +++ b/pkg/obitax/inner.go @@ -1,6 +1,9 @@ package obitax -import "sync" +import ( + "strings" + "sync" +) // InnerString is a struct that holds a map of strings and a read-write lock for concurrent access. // The index map is used to store key-value pairs of strings. @@ -31,10 +34,10 @@ func (i *InnerString) Innerize(value string) *string { defer i.lock.Unlock() s, ok := i.index[value] if !ok { + value = strings.Clone(value) s = &value i.index[value] = s } - return s } diff --git a/pkg/obitax/lca.go b/pkg/obitax/lca.go index 271a6a4..de538ab 100644 --- a/pkg/obitax/lca.go +++ b/pkg/obitax/lca.go @@ -16,11 +16,24 @@ import ( // if either of the taxa is nil, if they are not in the same taxonomy, or // if the taxonomy is unrooted. func (t1 *Taxon) LCA(t2 *Taxon) (*Taxon, error) { - if t1 == nil || t1.Node == nil { - return nil, fmt.Errorf("try to get LCA of nil taxon") + if t1 == nil && t2 != nil { + return t2, nil } - if t2 == nil || t2.Node == nil { + if t2 == nil && t1 != nil { + + return t1, nil + } + + if t1 == nil && t2 == nil { + return nil, fmt.Errorf("try to get LCA of nil taxa") + } + + if t1.Node == nil { + return nil, fmt.Errorf("try to get LCA of nil taxa") + } + + if t2.Node == nil { return nil, fmt.Errorf("try to get LCA of nil taxon") } diff --git a/pkg/obitax/taxid.go b/pkg/obitax/taxid.go new file mode 100644 index 0000000..b4c0b35 --- /dev/null +++ b/pkg/obitax/taxid.go @@ -0,0 +1,60 @@ +package obitax + +import ( + "fmt" + "strconv" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" +) + +// Taxid represents a taxonomic identifier as a pointer to a string. +type Taxid *string + +// TaxidFactory is a factory for creating Taxid instances from strings and integers. +type TaxidFactory struct { + inner *InnerString + code string + alphabet obiutils.AsciiSet +} + +// NewTaxidFactory creates and returns a new instance of TaxidFactory. +func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory { + return &TaxidFactory{ + inner: NewInnerString(), + code: code + ":", + alphabet: alphabet, + } + // Initialize and return a new TaxidFactory. 
+} + +// FromString converts a string representation of a taxonomic identifier into a Taxid. +// It extracts the relevant part of the string after the first colon (':') if present. +func (f *TaxidFactory) FromString(taxid string) (Taxid, error) { + taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid) + part1, part2 := obiutils.SplitInTwo(taxid, ':') + if len(part2) == 0 { + taxid = part1 + } else { + if part1 != f.code { + return nil, fmt.Errorf("taxid %s string does not start with taxonomy code %s", taxid, f.code) + } + taxid = part2 + } + + taxid, err := f.alphabet.FirstWord(taxid) // Get the first word from the input string. + + if err != nil { + return nil, err + } + + // Return a new Taxid by innerizing the extracted taxid string. + rep := Taxid(f.inner.Innerize(taxid)) + return rep, nil +} + +// FromInt converts an integer taxonomic identifier into a Taxid. +// It first converts the integer to a string and then innerizes it. +func (f *TaxidFactory) FromInt(taxid int) (Taxid, error) { + s := strconv.Itoa(taxid) // Convert the integer to a string. + return f.inner.Innerize(s), nil // Return a new Taxid by innerizing the string. +} diff --git a/pkg/obitax/taxon.go b/pkg/obitax/taxon.go index b32eba0..d0817cb 100644 --- a/pkg/obitax/taxon.go +++ b/pkg/obitax/taxon.go @@ -179,9 +179,9 @@ func (taxon *Taxon) IPath() iter.Seq[*Taxon] { } } -// Path returns a slice of TaxNode representing the path from the current Taxon -// to the root Taxon in the associated Taxonomy. It collects all the nodes in the path -// using the IPath method and returns them as a TaxonSlice. +// Path returns a slice of TaxNode representing the path from the current Taxon. +// The first element of the slice is the current Taxon, and the last element is the +// to the root Taxon in the associated Taxonomy. // // Returns: // - A pointer to a TaxonSlice containing the TaxNode instances in the path @@ -371,3 +371,11 @@ func (taxon *Taxon) MetadataStringValues() []string { } return values } + +func (taxon *Taxon) SameAs(other *Taxon) bool { + if taxon == nil || other == nil { + return false + } + + return taxon.Taxonomy == other.Taxonomy && taxon.Node.id == other.Node.id +} diff --git a/pkg/obitax/taxonomy.go b/pkg/obitax/taxonomy.go index ab0d58b..3bc53b7 100644 --- a/pkg/obitax/taxonomy.go +++ b/pkg/obitax/taxonomy.go @@ -8,9 +8,10 @@ and retrieving information about taxa. package obitax import ( + "errors" "fmt" - "regexp" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" log "github.com/sirupsen/logrus" ) @@ -32,13 +33,12 @@ import ( type Taxonomy struct { name string code string - ids *InnerString + ids *TaxidFactory ranks *InnerString nameclasses *InnerString names *InnerString nodes *TaxonSet root *TaxNode - matcher *regexp.Regexp index map[*string]*TaxonSet } @@ -52,21 +52,18 @@ type Taxonomy struct { // // Returns: // - A pointer to the newly created Taxonomy instance. 
-func NewTaxonomy(name, code, codeCharacters string) *Taxonomy { +func NewTaxonomy(name, code string, codeCharacters obiutils.AsciiSet) *Taxonomy { set := make(map[*string]*TaxNode) - matcher := regexp.MustCompile(fmt.Sprintf("^[[:blank:]]*(%s:)?(%s+)", code, codeCharacters)) - taxonomy := &Taxonomy{ name: name, code: code, - ids: NewInnerString(), + ids: NewTaxidFactory(code, codeCharacters), ranks: NewInnerString(), nameclasses: NewInnerString(), names: NewInnerString(), nodes: &TaxonSet{set: set}, root: nil, - matcher: matcher, index: make(map[*string]*TaxonSet), } @@ -85,23 +82,17 @@ func NewTaxonomy(name, code, codeCharacters string) *Taxonomy { // Returns: // - The taxon identifier as a *string corresponding to the provided taxid. // - An error if the taxid is not valid or cannot be converted. -func (taxonomy *Taxonomy) Id(taxid string) (*string, error) { +func (taxonomy *Taxonomy) Id(taxid string) (Taxid, error) { taxonomy = taxonomy.OrDefault(false) if taxonomy == nil { - return nil, fmt.Errorf("Cannot extract Id from nil Taxonomy") + return nil, errors.New("Cannot extract Id from nil Taxonomy") } - matches := taxonomy.matcher.FindStringSubmatch(taxid) - - if matches == nil { - return nil, fmt.Errorf("taxid %s is not a valid taxid", taxid) - } - - return taxonomy.ids.Innerize(matches[2]), nil + return taxonomy.ids.FromString(taxid) } -// TaxidSting retrieves the string representation of a taxon node identified by the given ID. +// TaxidString retrieves the string representation of a taxon node identified by the given ID. // It looks up the node in the taxonomy and returns its formatted string representation // along with the taxonomy code. If the node does not exist, it returns an error. // @@ -111,7 +102,7 @@ func (taxonomy *Taxonomy) Id(taxid string) (*string, error) { // Returns: // - A string representing the taxon node in the format "taxonomyCode:id [scientificName]", // or an error if the taxon node with the specified ID does not exist in the taxonomy. -func (taxonomy *Taxonomy) TaxidSting(id string) (string, error) { +func (taxonomy *Taxonomy) TaxidString(id string) (string, error) { taxonomy = taxonomy.OrDefault(false) pid, err := taxonomy.Id(id) diff --git a/pkg/obitax/taxonslice.go b/pkg/obitax/taxonslice.go index 37066da..f5a956d 100644 --- a/pkg/obitax/taxonslice.go +++ b/pkg/obitax/taxonslice.go @@ -13,6 +13,7 @@ package obitax import ( "bytes" "fmt" + "log" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) @@ -59,6 +60,16 @@ func (slice *TaxonSlice) Get(i int) *TaxNode { return slice.slice[i] } +func (slice *TaxonSlice) Taxon(i int) *Taxon { + if slice == nil { + return nil + } + return &Taxon{ + Node: slice.slice[i], + Taxonomy: slice.taxonomy, + } +} + // Len returns the number of TaxNode instances in the TaxonSlice. // It provides the count of taxon nodes contained within the slice. 
// @@ -124,3 +135,13 @@ func (slice *TaxonSlice) Reverse(inplace bool) *TaxonSlice { slice: rep, } } + +func (slice *TaxonSlice) Set(index int, taxon *Taxon) *TaxonSlice { + if slice.taxonomy != taxon.Taxonomy { + log.Panic("Cannot add taxon from a different taxonomy") + } + + slice.slice[index] = taxon.Node + + return slice +} diff --git a/pkg/obitools/obicleandb/obicleandb.go b/pkg/obitools/obicleandb/obicleandb.go index b93c3c6..f5465e4 100644 --- a/pkg/obitools/obicleandb/obicleandb.go +++ b/pkg/obitools/obicleandb/obicleandb.go @@ -250,22 +250,26 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence { if len(obigrep.CLIRequiredRanks()) > 0 { rankPredicate = obigrep.CLIHasRankDefinedPredicate() } else { - rankPredicate = taxonomy.HasRequiredRank("species").And(taxonomy.HasRequiredRank("genus")).And(taxonomy.HasRequiredRank("family")) + rankPredicate = obiseq.HasRequiredRank( + taxonomy, "species", + ).And(obiseq.HasRequiredRank( + taxonomy, "genus", + )).And(obiseq.HasRequiredRank(taxonomy, "family")) } - goodTaxa := taxonomy.IsAValidTaxon(CLIUpdateTaxids()).And(rankPredicate) + goodTaxa := obiseq.IsAValidTaxon(taxonomy, CLIUpdateTaxids()).And(rankPredicate) usable := unique.FilterOn(goodTaxa, obioptions.CLIBatchSize(), obioptions.CLIParallelWorkers()) - annotated := usable.MakeIWorker(taxonomy.MakeSetSpeciesWorker(), + annotated := usable.MakeIWorker(obiseq.MakeSetSpeciesWorker(taxonomy), false, obioptions.CLIParallelWorkers(), - ).MakeIWorker(taxonomy.MakeSetGenusWorker(), + ).MakeIWorker(obiseq.MakeSetGenusWorker(taxonomy), false, obioptions.CLIParallelWorkers(), - ).MakeIWorker(taxonomy.MakeSetFamilyWorker(), + ).MakeIWorker(obiseq.MakeSetFamilyWorker(taxonomy), false, obioptions.CLIParallelWorkers(), ) diff --git a/pkg/obitools/obigrep/options.go b/pkg/obitools/obigrep/options.go index 8295dc4..c44a23e 100644 --- a/pkg/obitools/obigrep/options.go +++ b/pkg/obitools/obigrep/options.go @@ -1,7 +1,6 @@ package obigrep import ( - "strconv" "strings" log "github.com/sirupsen/logrus" @@ -16,7 +15,7 @@ import ( ) var _BelongTaxa = make([]string, 0) -var _NotBelongTaxa = make([]int, 0) +var _NotBelongTaxa = make([]string, 0) var _RequiredRanks = make([]string, 0) var _MinimumLength = 1 @@ -60,7 +59,7 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) { options.ArgName("TAXID"), options.Description("Require that the actual taxon of the sequence belongs the provided taxid.")) - options.IntSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1, + options.StringSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1, options.Alias("i"), options.ArgName("TAXID"), options.Description("Require that the actual taxon of the sequence doesn't belong the provided taxid.")) @@ -273,18 +272,18 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate { if len(_BelongTaxa) > 0 { taxonomy := CLILoadSelectedTaxonomy() - taxid, err := strconv.Atoi(_BelongTaxa[0]) - if err != nil { - p = taxonomy.IsSubCladeOfSlot(_BelongTaxa[0]) + taxon := taxonomy.Taxon(_BelongTaxa[0]) + if taxon == nil { + p = obiseq.IsSubCladeOfSlot(taxonomy, _BelongTaxa[0]) } else { - p = taxonomy.IsSubCladeOf(taxid) + p = obiseq.IsSubCladeOf(taxonomy, taxon) } for _, staxid := range _BelongTaxa[1:] { - taxid, err := strconv.Atoi(staxid) - if err != nil { - p2 = taxonomy.IsSubCladeOfSlot(staxid) + taxon := taxonomy.Taxon(staxid) + if taxon == nil { + p2 = obiseq.IsSubCladeOfSlot(taxonomy, staxid) } else { - p2 = taxonomy.IsSubCladeOf(taxid) + p2 = obiseq.IsSubCladeOf(taxonomy, taxon) } p = p.Or(p2) @@ -297,13 +296,28 @@ 
func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate { } func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate { + var p obiseq.SequencePredicate + var p2 obiseq.SequencePredicate if len(_NotBelongTaxa) > 0 { taxonomy := CLILoadSelectedTaxonomy() - p := taxonomy.IsSubCladeOf(_NotBelongTaxa[0]) + + taxon := taxonomy.Taxon(_NotBelongTaxa[0]) + if taxon == nil { + p = obiseq.IsSubCladeOfSlot(taxonomy, _NotBelongTaxa[0]) + } else { + p = obiseq.IsSubCladeOf(taxonomy, taxon) + } for _, taxid := range _NotBelongTaxa[1:] { - p = p.Or(taxonomy.IsSubCladeOf(taxid)) + taxon := taxonomy.Taxon(taxid) + if taxon == nil { + p2 = obiseq.IsSubCladeOfSlot(taxonomy, taxid) + } else { + p2 = obiseq.IsSubCladeOf(taxonomy, taxon) + } + + p = p.Or(p2) } return p.Not() @@ -316,10 +330,10 @@ func CLIHasRankDefinedPredicate() obiseq.SequencePredicate { if len(_RequiredRanks) > 0 { taxonomy := CLILoadSelectedTaxonomy() - p := taxonomy.HasRequiredRank(_RequiredRanks[0]) + p := obiseq.HasRequiredRank(taxonomy, _RequiredRanks[0]) for _, rank := range _RequiredRanks[1:] { - p = p.And(taxonomy.HasRequiredRank(rank)) + p = p.And(obiseq.HasRequiredRank(taxonomy, rank)) } return p diff --git a/pkg/obitools/obilandmark/obilandmark.go b/pkg/obitools/obilandmark/obilandmark.go index 1700def..7f68f86 100644 --- a/pkg/obitools/obilandmark/obilandmark.go +++ b/pkg/obitools/obilandmark/obilandmark.go @@ -11,7 +11,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obirefidx" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "github.com/schollz/progressbar/v3" @@ -155,19 +154,20 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque } } - if obifind.CLIHasSelectedTaxonomy() { - taxo, err := obifind.CLILoadSelectedTaxonomy() - if err != nil { - log.Fatal(err) + if obioptions.CLIHasSelectedTaxonomy() { + taxo := obitax.DefaultTaxonomy() + if taxo == nil { + log.Fatal("No taxonomy available") } - taxa := make(obitax.TaxonSet, len(library)) + taxa := obitax.DefaultTaxonomy().NewTaxonSlice(len(library), len(library)) for i, seq := range library { - taxa[i], err = taxo.Taxon(seq.Taxid()) - if err != nil { - log.Fatal(err) + taxon := seq.Taxon(taxo) + if taxon == nil { + log.Fatal("%s: Cannot identify taxid %s in %s", seq.Id(), seq.Taxid(), taxo.Name()) } + taxa.Set(i, taxon) } pbopt := make([]progressbar.Option, 0, 5) @@ -182,7 +182,7 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque bar := progressbar.NewOptions(len(library), pbopt...) 
for i, seq := range library { - idx := obirefidx.GeomIndexSesquence(i, library, &taxa, taxo) + idx := obirefidx.GeomIndexSesquence(i, library, taxa, taxo) seq.SetOBITagGeomRefIndex(idx) if i%10 == 0 { diff --git a/pkg/obitools/obilandmark/options.go b/pkg/obitools/obilandmark/options.go index b1208e1..777e431 100644 --- a/pkg/obitools/obilandmark/options.go +++ b/pkg/obitools/obilandmark/options.go @@ -1,8 +1,8 @@ package obilandmark import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "github.com/DavidGamba/go-getoptions" ) @@ -25,7 +25,7 @@ func LandmarkOptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) { obiconvert.InputOptionSet(options) obiconvert.OutputOptionSet(options) - obifind.LoadTaxonomyOptionSet(options, false, false) + obioptions.LoadTaxonomyOptionSet(options, false, false) LandmarkOptionSet(options) } diff --git a/pkg/obitools/obirefidx/famlilyindexing.go b/pkg/obitools/obirefidx/famlilyindexing.go index 7f3637f..91efdf6 100644 --- a/pkg/obitools/obirefidx/famlilyindexing.go +++ b/pkg/obitools/obirefidx/famlilyindexing.go @@ -13,7 +13,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "github.com/schollz/progressbar/v3" log "github.com/sirupsen/logrus" ) @@ -73,9 +72,7 @@ func MakeIndexingSliceWorker(indexslot, idslot string, []*obikmer.Table4mer, len(sequences)) - taxa := make( - obitax.TaxonSet, - len(sequences)) + taxa := taxonomy.NewTaxonSlice(sequences.Len(), sequences.Len()) for i, seq := range sequences { j, ok := seq.GetIntAttribute(idslot) @@ -85,8 +82,11 @@ func MakeIndexingSliceWorker(indexslot, idslot string, } kmercounts[i] = (*kmers)[j] - - taxa[i], _ = taxonomy.Taxon(seq.Taxid()) + taxon := seq.Taxon(taxonomy) + if taxon == nil { + log.Panicf("%s: Cannot get the taxon %s in %s", seq.Id(), seq.Taxid(), taxonomy.Name()) + } + taxa.Set(i, taxon) } @@ -103,7 +103,7 @@ func MakeIndexingSliceWorker(indexslot, idslot string, f := func() { for l := range limits { for i := l[0]; i < l[1]; i++ { - idx := IndexSequence(i, sequences, &kmercounts, &taxa, taxonomy) + idx := IndexSequence(i, sequences, &kmercounts, taxa, taxonomy) sequences[i].SetAttribute(indexslot, idx) } } @@ -134,7 +134,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { nref := len(references) log.Infof("Done. 
Database contains %d sequences", nref) - taxonomy, error := obifind.CLILoadSelectedTaxonomy() + taxonomy, error := obioptions.CLILoadSelectedTaxonomy() if error != nil { log.Panicln(error) } @@ -155,13 +155,13 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { log.Info("done") partof := obiiter.IBatchOver(source, references, - obioptions.CLIBatchSize()).MakeIWorker(taxonomy.MakeSetSpeciesWorker(), + obioptions.CLIBatchSize()).MakeIWorker(obiseq.MakeSetSpeciesWorker(taxonomy), false, obioptions.CLIParallelWorkers(), - ).MakeIWorker(taxonomy.MakeSetGenusWorker(), + ).MakeIWorker(obiseq.MakeSetGenusWorker(taxonomy), false, obioptions.CLIParallelWorkers(), - ).MakeIWorker(taxonomy.MakeSetFamilyWorker(), + ).MakeIWorker(obiseq.MakeSetFamilyWorker(taxonomy), false, obioptions.CLIParallelWorkers(), ) @@ -187,14 +187,20 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { clusters := obiseq.MakeBioSequenceSlice(0) kcluster := make([]*obikmer.Table4mer, 0) - taxa := make(obitax.TaxonSet, 0) + taxa := taxonomy.NewTaxonSlice(references.Len(), references.Len()) j := 0 for i, seq := range references { if is_centrer, _ := seq.GetBoolAttribute("reffamidx_clusterhead"); is_centrer { clusters = append(clusters, seq) kcluster = append(kcluster, refcounts[i]) - taxa[j], _ = taxonomy.Taxon(seq.Taxid()) + + taxon := seq.Taxon(taxonomy) + if taxon == nil { + log.Panicf("%s: Cannot get the taxon %s in %s", seq.Id(), seq.Taxid(), taxonomy.Name()) + } + taxa.Set(j, taxon) + j++ } } @@ -225,7 +231,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { f := func() { for l := range limits { for i := l[0]; i < l[1]; i++ { - idx := IndexSequence(i, clusters, &kcluster, &taxa, taxonomy) + idx := IndexSequence(i, clusters, &kcluster, taxa, taxonomy) clusters[i].SetOBITagRefIndex(idx) bar.Add(1) } diff --git a/pkg/obitools/obirefidx/geomindexing.go b/pkg/obitools/obirefidx/geomindexing.go index c80d1c7..3df6bda 100644 --- a/pkg/obitools/obirefidx/geomindexing.go +++ b/pkg/obitools/obirefidx/geomindexing.go @@ -2,10 +2,11 @@ package obirefidx import ( "fmt" - log "github.com/sirupsen/logrus" "sort" "sync" + log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" @@ -15,7 +16,7 @@ import ( func GeomIndexSesquence(seqidx int, references obiseq.BioSequenceSlice, - taxa *obitax.TaxonSet, + taxa *obitax.TaxonSlice, taxo *obitax.Taxonomy) map[int]string { sequence := references[seqidx] @@ -56,28 +57,28 @@ func GeomIndexSesquence(seqidx int, order := obiutils.Order(sort.IntSlice(seq_dist)) - lca := (*taxa)[seqidx] + lca := taxa.Taxon(seqidx) index := make(map[int]string) index[0] = fmt.Sprintf( - "%d@%s@%s", - lca.Taxid(), - lca.ScientificName(), - lca.Rank()) + "%s@%s", + lca.String(), + lca.Rank(), + ) for _, o := range order { - new_lca, _ := lca.LCA((*taxa)[o]) - if new_lca.Taxid() != lca.Taxid() { + new_lca, _ := lca.LCA(taxa.Taxon(o)) + if new_lca.SameAs(lca) { lca = new_lca index[int(seq_dist[o])] = fmt.Sprintf( - "%d@%s@%s", - lca.Taxid(), - lca.ScientificName(), - lca.Rank()) - } + "%s@%s", + lca.String(), + lca.Rank(), + ) - if lca.Taxid() == 1 { - break + if lca.IsRoot() { + break + } } } diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go index a756217..943b863 100644 --- a/pkg/obitools/obirefidx/obirefidx.go +++ 
b/pkg/obitools/obirefidx/obirefidx.go @@ -12,102 +12,171 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "github.com/schollz/progressbar/v3" ) +// IndexSequence processes a biological sequence and indexes it based on taxonomic information. +// It computes the least common ancestors (LCA) for the sequence and its references, +// evaluates common k-mers, and calculates differences in alignment scores. +// +// Parameters: +// - seqidx: The index of the sequence to process within the references slice. +// - references: A slice of biological sequences to compare against. +// - kmers: A pointer to a slice of k-mer tables used for common k-mer calculations. +// - taxa: A slice of taxonomic information corresponding to the sequences. +// - taxo: A taxonomy object used for LCA calculations. +// +// Returns: +// +// A map where the keys are integers representing alignment differences, +// and the values are strings formatted as "Taxon@Rank" indicating the taxonomic +// classification of the sequence based on the computed differences. func IndexSequence(seqidx int, references obiseq.BioSequenceSlice, kmers *[]*obikmer.Table4mer, - taxa *obitax.TaxonSet, + taxa *obitax.TaxonSlice, taxo *obitax.Taxonomy) map[int]string { + // Retrieve the biological sequence at the specified index from the references slice. sequence := references[seqidx] + seq_len := sequence.Len() + // Get the taxon corresponding to the current sequence index. + tseq := taxa.Taxon(seqidx) + // Get the taxonomic path for the current sequence. + pseq := tseq.Path() + path_len := pseq.Len() + + // For each taxonomic ancestor in the path, a biosequence slice is created to store + // the reference sequences having that ancestor as their LCA with the current sequence. + refs := make(map[*obitax.TaxNode]*[]int, path_len) + + for i := 0; i < path_len; i++ { + temp := make([]int, 0, 100) + refs[pseq.Taxon(i).Node] = &temp + } + + // log.Infof("%s length of path: %d", sequence.Id(), len(refs)) + + n := taxa.Len() + lcaCache := make(map[*obitax.TaxNode]*obitax.TaxNode, n) + + for i := 0; i < n; i++ { + taxon := taxa.Taxon(i) // Get the taxon at index i. + // Compute the LCA between the current taxon and the taxon of the sequence. + node, ok := lcaCache[taxon.Node] + if !ok { + lca, err := tseq.LCA(taxon) + if err != nil { + // Log a fatal error if the LCA computation fails, including the taxon details. + log.Fatalf("(%s,%s): %+v", tseq.String(), taxon.String(), err) + } + node = lca.Node + lcaCache[taxon.Node] = node + } + + // log.Infof("%s Processing taxon: %s x %s -> %s", sequence.Id(), tseq.String(), taxon.String(), node.String(taxo.Code())) + + // Append the current sequence to the LCA's reference sequence slice. 
+ *refs[node] = append(*refs[node], i) + } + + closest := make([]int, path_len) + + closest[0] = 0 + + // Initialize a matrix to store alignment scores var matrix []uint64 - lca := make(obitax.TaxonSet, len(references)) - tseq := (*taxa)[seqidx] + // log.Warnf("%s : %s", sequence.Id(), pseq.String()) + for idx_path := 1; idx_path < path_len; idx_path++ { + mini := -1 + seqidcs := refs[pseq.Taxon(idx_path).Node] - for i, taxon := range *taxa { - lca[i], _ = tseq.LCA(taxon) - } + ns := len(*seqidcs) - cw := make([]int, len(references)) - sw := (*kmers)[seqidx] - for i, ref := range *kmers { - cw[i] = obikmer.Common4Mer(sw, ref) - } + if ns > 0 { - ow := obiutils.Reverse(obiutils.IntOrder(cw), true) - pseq, _ := tseq.Path() - obiutils.Reverse(*pseq, true) - // score := make([]int, len(references)) - mindiff := make([]int, len(*pseq)) - /* - nseq := make([]int, len(*pseq)) - nali := make([]int, len(*pseq)) - nok := make([]int, len(*pseq)) - nfast := make([]int, len(*pseq)) - nfastok := make([]int, len(*pseq)) - */ + shared := make([]int, ns) - lseq := sequence.Len() + for j, is := range *seqidcs { + shared[j] = obikmer.Common4Mer((*kmers)[seqidx], (*kmers)[is]) + } - mini := -1 - wordmin := 0 + ow := obiutils.Reverse(obiutils.IntOrder(shared), true) + + for _, order := range ow { + is := (*seqidcs)[order] + suject := references[is] - for i, ancestor := range *pseq { - for _, order := range ow { - if lca[order] == ancestor { - // nseq[i]++ if mini != -1 { - wordmin = max(sequence.Len(), references[order].Len()) - 3 - 4*mini - } - - if cw[order] < wordmin { - break + wordmin := max(seq_len, suject.Len()) - 3 - 4*mini + + // If the common k-mer count for the current order is less than the + // minimum word length, break the loop. + if shared[order] < wordmin { + break + } } + // Initialize variables for Longest Common Subsequence (LCS) score and alignment length. lcs, alilength := -1, -1 - errs := int(1e9) - if mini != -1 && mini <= 1 { - // nfast[i]++ - d, _, _, _ := obialign.D1Or0(sequence, references[order]) - if d >= 0 { - errs = d - // nfastok[i]++ + errs := int(1e9) // Initialize errors to a large number. + + // If mini is set and less than or equal to 1, perform a specific alignment. + if mini == 0 || mini == 1 { + // Perform a specific alignment and get the distance. + d, _, _, _ := obialign.D1Or0(sequence, suject) + if d >= 0 { // If the distance is valid (non-negative). + errs = d // Update errors with the distance. } } else { - // nali[i]++ - lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix) - if lcs >= 0 { - // nok[i]++ - errs = alilength - lcs + // Perform a Fast LCS score calculation for the sequence and reference. + lcs, alilength = obialign.FastLCSScore(sequence, suject, mini, &matrix) + if lcs >= 0 { // If LCS score is valid (non-negative). + errs = alilength - lcs // Calculate errors based on alignment length. } - } + + // Update mini with the minimum errors found. if mini == -1 || errs < mini { mini = errs } + + if mini == 0 { + // log.Warnf("%s: %s", sequence.Id(), sequence.String()) + // log.Warnf("%s: %s", suject.Id(), suject.String()) + break + } } + + if mini == -1 { + log.Fatalf("(%s,%s): No alignment found.", sequence.Id(), pseq.Taxon(idx_path).String()) + } + + closest[idx_path] = mini + // insure than closest is strictly increasing + for k := idx_path - 1; k >= 0 && mini < closest[k]; k-- { + closest[k] = mini + // log.Warnf("(%s,%s) Smaller alignment found than previous (%d,%d). 
Resetting closest.", sequence.Id(), pseq.Taxon(idx_path).String(), mini, closest[k]) + } + } else { + closest[idx_path] = seq_len } - mindiff[i] = mini } - obitag_index := make(map[int]string, len(*pseq)) + obitag_index := make(map[int]string, pseq.Len()) - old := lseq - for i, d := range mindiff { - if d != -1 && d < old { - current_taxid := (*pseq)[i] + // log.Warnf("(%s,%s): %v", sequence.Id(), pseq.Taxon(0).String(), closest) + for i, d := range closest { + if i < (len(closest)-1) && d < closest[i+1] { + current_taxon := pseq.Taxon(i) obitag_index[d] = fmt.Sprintf( - "%d@%s@%s", - current_taxid.Taxid(), - current_taxid.ScientificName(), - current_taxid.Rank()) - old = d + "%s@%s", + current_taxon.String(), + current_taxon.Rank(), + ) } } @@ -128,22 +197,21 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { source, references := iterator.Load() log.Infof("Done. Database contains %d sequences", len(references)) - taxo, error := obifind.CLILoadSelectedTaxonomy() + taxo, error := obioptions.CLILoadSelectedTaxonomy() if error != nil { log.Panicln(error) } log.Infoln("Indexing sequence taxids...") - taxa := make( - obitax.TaxonSet, - len(references)) + n := len(references) + taxa := taxo.NewTaxonSlice(n, n) j := 0 for i, seq := range references { - taxon, err := taxo.Taxon(seq.Taxid()) - if err == nil { - taxa[j] = taxon + taxon := seq.Taxon(taxo) + if taxon != nil { + taxa.Set(j, taxon) references[j] = references[i] j++ } @@ -198,7 +266,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { for l := range limits { sl := obiseq.MakeBioSequenceSlice() for i := l[0]; i < l[1]; i++ { - idx := IndexSequence(i, references, &refcounts, &taxa, taxo) + idx := IndexSequence(i, references, &refcounts, taxa, taxo) iref := references[i].Copy() iref.SetOBITagRefIndex(idx) sl = append(sl, iref) diff --git a/pkg/obitools/obirefidx/options.go b/pkg/obitools/obirefidx/options.go index 2a56566..b56d2c3 100644 --- a/pkg/obitools/obirefidx/options.go +++ b/pkg/obitools/obirefidx/options.go @@ -1,8 +1,8 @@ package obirefidx import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "github.com/DavidGamba/go-getoptions" ) @@ -10,5 +10,5 @@ import ( // the obiuniq command func OptionSet(options *getoptions.GetOpt) { obiconvert.OptionSet(options) - obifind.LoadTaxonomyOptionSet(options, true, false) + obioptions.LoadTaxonomyOptionSet(options, true, false) } diff --git a/pkg/obitools/obitag/obigeomtag.go b/pkg/obitools/obitag/obigeomtag.go index c46d187..21f21aa 100644 --- a/pkg/obitools/obitag/obigeomtag.go +++ b/pkg/obitools/obitag/obigeomtag.go @@ -48,19 +48,18 @@ func ExtractLandmarkSeqs(references *obiseq.BioSequenceSlice) *obiseq.BioSequenc // - taxonomy: a pointer to a Taxonomy object. // // The function returns a pointer to a TaxonSet, which is a set of taxa. 
-func ExtractTaxonSet(references *obiseq.BioSequenceSlice, taxonomy *obitax.Taxonomy) *obitax.TaxonSet { - var err error - taxa := make(obitax.TaxonSet, len(*references)) +func ExtractTaxonSet(references *obiseq.BioSequenceSlice, taxonomy *obitax.Taxonomy) *obitax.TaxonSlice { + taxa := taxonomy.NewTaxonSlice(references.Len(), references.Len()) for i, ref := range *references { - taxid := ref.Taxid() - taxa[i], err = taxonomy.Taxon(taxid) - if err != nil { - log.Panicf("Taxid %d, for sequence %s not found in taxonomy", taxid, ref.Id()) + taxon := ref.Taxon(taxonomy) + if taxon == nil { + log.Panicf("%s: Cannot get the taxon %s in %s", ref.Id(), ref.Taxid(), taxonomy.Name()) } + taxa.Set(i, taxon) } - return &taxa + return taxa } // MapOnLandmarkSequences calculates the coordinates of landmarks on a given sequence. @@ -149,29 +148,26 @@ func FindGeomClosest(sequence *obiseq.BioSequence, func GeomIdentify(sequence *obiseq.BioSequence, landmarks *obiseq.BioSequenceSlice, references *obiseq.BioSequenceSlice, - taxa *obitax.TaxonSet, + taxa *obitax.TaxonSlice, taxo *obitax.Taxonomy, buffer *[]uint64) *obiseq.BioSequence { best_seq, min_dist, best_id, query_location, matches := FindGeomClosest(sequence, landmarks, references, buffer) - taxon := (*obitax.TaxNode)(nil) + taxon := (*obitax.Taxon)(nil) var err error if best_id > 0.5 { - taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[0].OBITagGeomRefIndex()) - taxon, _ = taxo.Taxon(taxid) + taxon = MatchDistanceIndex(taxo, min_dist, (*matches)[0].OBITagGeomRefIndex()) for i := 1; i < len(*matches); i++ { - taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[i].OBITagGeomRefIndex()) - newTaxon, _ := taxo.Taxon(taxid) + newTaxon := MatchDistanceIndex(taxo, min_dist, (*matches)[i].OBITagGeomRefIndex()) taxon, err = newTaxon.LCA(taxon) if err != nil { log.Panicf("LCA error: %v", err) } } - sequence.SetTaxid(taxon.Taxid()) + sequence.SetTaxon(taxon) } else { - taxon, _ = taxo.Taxon(1) - sequence.SetTaxid(1) + sequence.SetTaxon(taxo.Root()) } sequence.SetAttribute("scientific_name", taxon.ScientificName()) diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 5dd4db2..3b704f1 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -2,8 +2,6 @@ package obitag import ( "sort" - "strconv" - "strings" log "github.com/sirupsen/logrus" "golang.org/x/exp/maps" @@ -29,7 +27,9 @@ import ( // - taxid: The taxid associated with the matched distance. // - rank: The rank associated with the matched distance. // - scientificName: The scientific name associated with the matched distance. 
-func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) { +func MatchDistanceIndex(taxonomy *obitax.Taxonomy, distance int, distanceIdx map[int]string) *obitax.Taxon { + taxonomy = taxonomy.OrDefault(true) + keys := maps.Keys(distanceIdx) slices.Sort(keys) @@ -37,24 +37,20 @@ func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, return distance <= keys[i] }) - var taxid int - var rank string - var scientificName string + var taxon *obitax.Taxon if i == len(keys) || distance > keys[len(keys)-1] { - taxid = 1 - rank = "no rank" - scientificName = "root" + taxon = taxonomy.Root() } else { - parts := strings.Split(distanceIdx[keys[i]], "@") - taxid, _ = strconv.Atoi(parts[0]) - rank = parts[1] - scientificName = parts[2] + taxon = taxonomy.Taxon(distanceIdx[keys[i]]) + if taxon == nil { + log.Panicf("Cannot identify taxon %s in %s", distanceIdx[keys[i]], taxonomy.Name()) + } } // log.Info("taxid:", taxid, " rank:", rank, " scientificName:", scientificName) - return taxid, rank, scientificName + return taxon } // FindClosests finds the closest bio sequence from a given sequence and a slice of reference sequences. @@ -169,12 +165,12 @@ func FindClosests(sequence *obiseq.BioSequence, func Identify(sequence *obiseq.BioSequence, references obiseq.BioSequenceSlice, refcounts []*obikmer.Table4mer, - taxa obitax.TaxonSet, + taxa *obitax.TaxonSlice, taxo *obitax.Taxonomy, runExact bool) *obiseq.BioSequence { bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact) - taxon := (*obitax.TaxNode)(nil) + taxon := (*obitax.Taxon)(nil) if identity >= 0.5 && differences >= 0 { newidx := 0 @@ -183,56 +179,24 @@ func Identify(sequence *obiseq.BioSequence, if idx == nil { // log.Debugln("Need of indexing") newidx++ - idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo) + idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, taxa, taxo) references[seqidxs[i]].SetOBITagRefIndex(idx) log.Debugln(references[seqidxs[i]].Id(), idx) } d := differences identification, ok := idx[d] - found := false - var parts []string - /* - Here is an horrible hack for xprize challence. - With Euka01 the part[0] was equal to "" for at - least a sequence consensus. Which is not normal. 
- - TO BE CHECKED AND CORRECTED - - The problem seems related to idx that doesn't have - a 0 distance - */ - for !found && d >= 0 { - for !ok && d >= 0 { - identification, ok = idx[d] - d-- - } - - parts = strings.Split(identification, "@") - - found = parts[0] != "" - if !found { - log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d) - for !ok && d <= 1000 { - identification, ok = idx[d] - d++ - } - - } + for !ok && d >= 0 { + d-- + identification, ok = idx[d] } - match_taxid, err := strconv.Atoi(parts[0]) - - if err != nil { - log.Panicln("Cannot extract taxid from :", identification) + if !ok { + log.Panic("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d) } - match_taxon, err := taxo.Taxon(match_taxid) - - if err != nil { - log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid) - } + match_taxon := taxo.Taxon(identification) if taxon != nil { taxon, _ = taxon.LCA(match_taxon) @@ -241,16 +205,16 @@ func Identify(sequence *obiseq.BioSequence, } } + log.Debugln(sequence.Id(), "Best matches:", len(bests), "New index:", newidx) - sequence.SetTaxid(taxon.Taxid()) + sequence.SetTaxon(taxon) } else { - taxon, _ = taxo.Taxon(1) - sequence.SetTaxid(1) + taxon = taxo.Root() + sequence.SetTaxon(taxon) } - sequence.SetAttribute("scientific_name", taxon.ScientificName()) sequence.SetAttribute("obitag_rank", taxon.Rank()) sequence.SetAttribute("obitag_bestid", identity) sequence.SetAttribute("obitag_bestmatch", bestmatch) @@ -262,7 +226,7 @@ func Identify(sequence *obiseq.BioSequence, func IdentifySeqWorker(references obiseq.BioSequenceSlice, refcounts []*obikmer.Table4mer, - taxa obitax.TaxonSet, + taxa *obitax.TaxonSlice, taxo *obitax.Taxonomy, runExact bool) obiseq.SeqWorker { return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { @@ -279,23 +243,21 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence, []*obikmer.Table4mer, len(references)) - taxa := make(obitax.TaxonSet, - len(references)) - + taxa := taxo.NewTaxonSlice(references.Len(), references.Len()) buffer := make([]byte, 0, 1000) - var err error j := 0 for _, seq := range references { references[j] = seq refcounts[j] = obikmer.Count4Mer(seq, &buffer, nil) - taxa[j], err = taxo.Taxon(seq.Taxid()) - if err == nil { + taxon := seq.Taxon(taxo) + taxa.Set(j, taxon) + if taxon != nil { j++ } else { - log.Warnf("Taxid %d is not described in the taxonomy."+ + log.Warnf("Taxid %d is not described in the taxonomy %s."+ " Sequence %s is discared from the reference database", - seq.Taxid(), seq.Id()) + seq.Taxid(), taxo.Name(), seq.Id()) } } diff --git a/pkg/obitools/obitag/options.go b/pkg/obitools/obitag/options.go index ce5d233..1f162f3 100644 --- a/pkg/obitools/obitag/options.go +++ b/pkg/obitools/obitag/options.go @@ -8,7 +8,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "github.com/DavidGamba/go-getoptions" ) @@ -42,7 +41,7 @@ func TagOptionSet(options *getoptions.GetOpt) { // the obiuniq command func OptionSet(options *getoptions.GetOpt) { obiconvert.OptionSet(options) - obifind.LoadTaxonomyOptionSet(options, true, false) + obioptions.LoadTaxonomyOptionSet(options, true, false) TagOptionSet(options) } diff --git a/pkg/obiutils/strings.go b/pkg/obiutils/strings.go index 3182031..09807c1 100644 --- 
a/pkg/obiutils/strings.go +++ b/pkg/obiutils/strings.go @@ -1,12 +1,158 @@ package obiutils -import "unsafe" +import ( + "fmt" + "unsafe" +) +type AsciiSet [256]bool + +var AsciiSpaceSet = AsciiSetFromString("\t\n\v\f\r ") +var AsciiDigitSet = AsciiSetFromString("0123456789") +var AsciiUpperSet = AsciiSetFromString("ABCDEFGHIJKLMNOPQRSTUVWXYZ") +var AsciiLowerSet = AsciiSetFromString("abcdefghijklmnopqrstuvwxyz") +var AsciiAlphaSet = AsciiUpperSet.Union(AsciiLowerSet) +var AsciiAlphaNumSet = AsciiAlphaSet.Union(AsciiDigitSet) + +// UnsafeStringFromBytes converts a byte slice into a string without making a copy of the data. +// This function is considered unsafe because it directly manipulates memory and does not +// perform any checks on the byte slice's contents. It should be used with caution. +// +// Parameters: +// - data: A byte slice that contains the data to be converted to a string. +// +// Returns: +// +// A string representation of the byte slice. If the input byte slice is empty, +// an empty string is returned. func UnsafeStringFromBytes(data []byte) string { if len(data) > 0 { + // Convert the byte slice to a string using unsafe operations. s := unsafe.String(unsafe.SliceData(data), len(data)) return s } - return "" + return "" // Return an empty string if the input slice is empty. +} + +func AsciiSetFromString(s string) AsciiSet { + r := [256]bool{} + for _, c := range s { + r[c] = true + } + return r +} + +func (r *AsciiSet) Contains(c byte) bool { + return r[c] +} + +func (r *AsciiSet) Union(s AsciiSet) AsciiSet { + for i := 0; i < 256; i++ { + s[i] = r[i] || s[i] + } + + return s +} + +func (r *AsciiSet) Intersect(s AsciiSet) AsciiSet { + for i := 0; i < 256; i++ { + s[i] = r[i] && s[i] + } + + return s +} + +// FirstWord extracts the first word from a given string. +// A word is defined as a sequence of non-space characters. +// It ignores leading whitespace and stops at the first whitespace character encountered. +// +// Parameters: +// - s: The input string from which the first word is to be extracted. +// +// Returns: +// +// A string containing the first word found in the input string. If the input string +// is empty or contains only whitespace, an empty string is returned. +func FirstWord(s string) string { + // Fast path for ASCII: look for the first ASCII non-space byte + start := 0 + for ; start < len(s); start++ { + c := s[start] + if !AsciiSpaceSet.Contains(c) { + break + } + } + + stop := start + for ; stop < len(s); stop++ { + c := s[stop] + if AsciiSpaceSet.Contains(c) { + break + } + } + return s[start:stop] +} + +// FirstRestrictedWord extracts the first word from a given string while enforcing character restrictions. +// A word is defined as a sequence of non-space characters. The function checks each character +// against the provided restriction array, which indicates whether a character is allowed. +// +// Parameters: +// - s: The input string from which the first restricted word is to be extracted. +// - restriction: A boolean array of size 256 where each index represents a character's ASCII value. +// If restriction[c] is false, the character c is not allowed in the word. +// +// Returns: +// +// A string containing the first word found in the input string that does not contain any restricted characters. +// If a restricted character is found, an error is returned indicating the invalid character. +// If the input string is empty or contains only whitespace, an empty string is returned with no error. 
+func (restriction *AsciiSet) FirstWord(s string) (string, error) { + // Fast path for ASCII: look for the first ASCII non-space byte + start := 0 + for ; start < len(s); start++ { + c := s[start] + if !AsciiSpaceSet.Contains(c) { + break + } + } + + stop := start + for ; stop < len(s); stop++ { + c := s[stop] + if AsciiSpaceSet.Contains(c) { + break + } + if !restriction.Contains(c) { + return "", fmt.Errorf("invalid character '%c' in string: %s", c, s) + } + } + + return s[start:stop], nil +} + +func (r *AsciiSet) TrimLeft(s string) string { + i := 0 + for ; i < len(s); i++ { + c := s[i] + if !AsciiSpaceSet.Contains(c) { + break + } + } + return s[i:] +} + +func SplitInTwo(s string, sep byte) (string, string) { + i := 0 + for ; i < len(s); i++ { + c := s[i] + if c == sep { + break + } + } + if i == len(s) { + return s, "" + } + return s[:i], s[i+1:] }
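A few notes and sketches to accompany the changes above.

The new obiutils.AsciiSet helpers replace the regexp matcher that Taxonomy used to carry, and TaxidFactory.FromString chains them: trim leading blanks, split on the first ':' to separate the taxonomy code from the identifier, then take the first word restricted to the taxonomy's alphabet. Below is a minimal usage sketch, assuming the obitools4 module is importable; the input value is illustrative. One detail worth double-checking in FromString itself: SplitInTwo returns the prefix without the separator, while f.code is built as code + ":", so the part1 != f.code guard looks like it would reject well-formed prefixed taxids such as "taxon:9606".

package main

import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

func main() {
	s := "  taxon:9606 Homo sapiens"

	// Drop leading blanks, then split on the first ':' to separate the
	// taxonomy code from the identifier itself.
	s = obiutils.AsciiSpaceSet.TrimLeft(s)
	code, rest := obiutils.SplitInTwo(s, ':')
	fmt.Println(code) // taxon

	// The identifier may only contain characters from the taxonomy's
	// alphabet (digits for the NCBI dump); anything else is an error.
	id, err := obiutils.AsciiDigitSet.FirstWord(rest)
	if err != nil {
		panic(err)
	}
	fmt.Println(id) // 9606
}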
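Innerize now clones its argument with strings.Clone before storing it. The interner's contract is that equal taxid strings collapse onto a single *string, so downstream code can compare and index taxa through the identity of the interned pointer (Taxon.SameAs, the map[*string]*TaxonSet index); cloning also keeps the intern map from pinning whatever larger buffer a caller's substring was sliced from. A self-contained sketch of the pattern, with hypothetical names rather than the obitax type:

package main

import (
	"fmt"
	"strings"
	"sync"
)

type interner struct {
	lock  sync.RWMutex
	index map[string]*string
}

func newInterner() *interner {
	return &interner{index: make(map[string]*string)}
}

func (i *interner) innerize(value string) *string {
	i.lock.Lock()
	defer i.lock.Unlock()
	s, ok := i.index[value]
	if !ok {
		// Detach the stored key from the caller's backing array before keeping it.
		value = strings.Clone(value)
		s = &value
		i.index[value] = s
	}
	return s
}

func main() {
	in := newInterner()
	line := "9606\t|\tHomo sapiens"
	a := in.innerize(line[:4]) // a substring sliced out of a larger buffer
	b := in.innerize("9606")
	fmt.Println(*a, a == b) // 9606 true: both callers share one interned pointer
}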
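IndexSequence now groups the reference sequences by taxonomy before scoring: it walks the query taxon's path to the root, buckets each reference under the path node that is the LCA of its taxon with the query (caching the LCA per distinct reference taxon), then computes alignment errors bucket by bucket and forces the per-ancestor minima to be non-decreasing toward the root. The toy sketch below reproduces only the grouping step with a hand-rolled parent table and LCA; the ids are real NCBI taxids but the taxonomy and helpers are illustrative, not obitax calls. Separately, in GeomIndexSesquence the guard around extending the index changed from new_lca.Taxid() != lca.Taxid() to new_lca.SameAs(lca); as written it records an entry only when the LCA does not change, so !new_lca.SameAs(lca) may have been the intent.

package main

import "fmt"

// parent maps each taxid to its parent; "1" is the root.
var parent = map[string]string{
	"9606": "9605", // Homo sapiens -> Homo
	"9605": "9604", // Homo -> Hominidae
	"9598": "9596", // Pan troglodytes -> Pan
	"9596": "9604", // Pan -> Hominidae
	"9604": "1",    // Hominidae -> root
}

// path returns the taxa from t up to the root, t first.
func path(t string) []string {
	p := []string{t}
	for t != "1" {
		t = parent[t]
		p = append(p, t)
	}
	return p
}

// lca returns the lowest common ancestor of a and b in the toy taxonomy.
func lca(a, b string) string {
	onA := map[string]bool{}
	for _, n := range path(a) {
		onA[n] = true
	}
	for _, n := range path(b) {
		if onA[n] {
			return n
		}
	}
	return "1"
}

func main() {
	query := "9606"                                  // taxon of the sequence being indexed
	refs := []string{"9606", "9598", "9605", "9598"} // taxa of the reference sequences

	buckets := map[string][]int{} // ancestor on the query's path -> reference indices
	cache := map[string]string{}  // reference taxon -> its LCA with the query
	for i, r := range refs {
		node, ok := cache[r]
		if !ok {
			node = lca(query, r)
			cache[r] = node
		}
		buckets[node] = append(buckets[node], i)
	}

	// References are now grouped by how deep they share ancestry with the query;
	// the real IndexSequence aligns the query against each bucket separately.
	for _, n := range path(query) {
		fmt.Println(n, buckets[n])
	}
}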
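MatchDistanceIndex now returns a *obitax.Taxon resolved through taxonomy.Taxon instead of splitting the old taxid@name@rank triplet, but the distance rule itself is unchanged: the per-reference index maps a number of differences to the deepest taxon still assignable at that distance, a query resolves to the entry at the smallest indexed distance greater than or equal to it, and anything past the largest key falls back to the root. A self-contained sketch of that rule; the index strings are placeholders in the new taxon@rank form, not real entries.

package main

import (
	"fmt"
	"sort"
)

// lookup mirrors the rule used by MatchDistanceIndex: resolve a distance to
// the entry at the smallest indexed distance >= it, or to the root beyond
// the largest key.
func lookup(distance int, distanceIdx map[int]string) string {
	keys := make([]int, 0, len(distanceIdx))
	for k := range distanceIdx {
		keys = append(keys, k)
	}
	sort.Ints(keys)

	i := sort.Search(len(keys), func(i int) bool { return distance <= keys[i] })
	if i == len(keys) || distance > keys[len(keys)-1] {
		return "root" // too divergent: fall back to the taxonomy root
	}
	return distanceIdx[keys[i]]
}

func main() {
	idx := map[int]string{
		0: "taxon:9606 [Homo sapiens]@species",
		2: "taxon:9443 [Primates]@order",
	}
	for _, d := range []int{0, 1, 2, 3} {
		fmt.Printf("%d differences -> %s\n", d, lookup(d, idx))
	}
}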