From 60c187404de6568734511a90ebf8d2a911b61b26 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 2 May 2023 10:43:22 +0200 Subject: [PATCH] Restore old obisort and add LCA functionnality to obiannotate. Former-commit-id: aecaacc9dae49f74bd888a8eb8140822d31a42a6 --- pkg/obitax/lca.go | 128 ++++++++++++++++++++++++ pkg/obitools/obiannotate/obiannotate.go | 12 ++- pkg/obitools/obiannotate/options.go | 31 +++++- pkg/obitools/obifind/options.go | 12 +++ pkg/obiutils/slices.go | 13 +++ 5 files changed, 190 insertions(+), 6 deletions(-) diff --git a/pkg/obitax/lca.go b/pkg/obitax/lca.go index 2bd14de..725700b 100644 --- a/pkg/obitax/lca.go +++ b/pkg/obitax/lca.go @@ -1,5 +1,15 @@ package obitax +import ( + "math" + "strconv" + "strings" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" + log "github.com/sirupsen/logrus" +) + func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) { p1, err1 := t1.Path() @@ -23,3 +33,121 @@ func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) { return (*p1)[i1+1], nil } + +func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int { + taxids := sequence.StatsOn("taxid", "na") + taxons := make(map[*TaxNode]int, len(taxids)) + + for k, v := range taxids { + taxid, _ := strconv.Atoi(k) + + t, et := taxonomy.Taxon(taxid) + if et != nil { + log.Panic("Taxid %d not defined in taxonomy : %v", k, et) + } + taxons[t] = v + } + return taxons +} + +func (taxonomy *Taxonomy) LCA(sequence *obiseq.BioSequence, threshold float64) (*TaxNode, float64, int) { + taxons := taxonomy.TaxonomicDistribution(sequence) + paths := make(map[*TaxNode]*TaxonSlice, len(taxons)) + answer := (*TaxNode)(nil) + rans := 1.0 + granTotal := 0 + + for t, w := range taxons { + p, ep := t.Path() + if ep != nil { + log.Panic("Taxonomic path cannot be retreived from Taxid %d : %v", t.Taxid(), ep) + } + + obiutils.Reverse(*p, true) + paths[t] = p + answer = (*p)[0] + granTotal += w + } + + rmax := 1.0 + levels := make(map[*TaxNode]int, len(paths)) + taxonMax := answer + + for i := 0; rmax >= threshold; i++ { + answer = taxonMax + rans = rmax + taxonMax = nil + total := 0 + for taxon, weight := range taxons { + path := paths[taxon] + if len(*path) > i { + levels[(*path)[i]] += weight + } + total += weight + } + weighMax := 0 + for taxon, weight := range levels { + if weight > weighMax { + weighMax = weight + taxonMax = taxon + } + } + + if total > 0 { + rmax *= float64(weighMax) / float64(total) + } else { + rmax = 0.0 + } + + for taxon := range levels { + delete(levels, taxon) + } + for taxon := range taxons { + path := paths[taxon] + if i < len(*path) { + if (*path)[i] != taxonMax { + delete(paths, taxon) + delete(taxons, taxon) + } + } + } + // if taxonMax != nil { + // log.Println("@@@>", i, taxonMax.ScientificName(), taxonMax.Taxid(), rans, weighMax, total, rmax) + // } else { + // log.Println("@@@>", "--", 0, rmax) + // } + } + // log.Println("###>", answer.ScientificName(), answer.Taxid(), rans) + // log.Print("========================================") + return answer, rans, granTotal + +} + +func AddLCAWorker(taxonomy *Taxonomy, slot_name string, threshold float64) obiseq.SeqWorker { + + if !strings.HasSuffix(slot_name, "taxid") { + slot_name = slot_name + "_taxid" + } + + lca_error := strings.Replace(slot_name, "taxid", "error", 1) + if lca_error == "error" { + lca_error = "lca_error" + } + + lca_name := strings.Replace(slot_name, "taxid", "name", 1) + if lca_name == "name" { + lca_name = "scientific_name" + } + + f := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { + lca, rans, _ := taxonomy.LCA(sequence, threshold) + + sequence.SetAttribute(slot_name, lca.Taxid()) + sequence.SetAttribute(lca_name, lca.ScientificName()) + sequence.SetAttribute(lca_error, math.Round((1-rans)*1000)/1000) + + return sequence + } + + return f +} diff --git a/pkg/obitools/obiannotate/obiannotate.go b/pkg/obitools/obiannotate/obiannotate.go index 356cd34..caff95e 100644 --- a/pkg/obitools/obiannotate/obiannotate.go +++ b/pkg/obitools/obiannotate/obiannotate.go @@ -70,11 +70,11 @@ func EvalAttributeWorker(expression map[string]string) obiseq.SeqWorker { var w obiseq.SeqWorker w = nil - for a,e := range expression { + for a, e := range expression { if w == nil { - w = obiseq.EditAttributeWorker(a,e) + w = obiseq.EditAttributeWorker(a, e) } else { - w.ChainWorkers(obiseq.EditAttributeWorker(a,e)) + w.ChainWorkers(obiseq.EditAttributeWorker(a, e)) } } @@ -132,6 +132,12 @@ func CLIAnnotationWorker() obiseq.SeqWorker { annotator = annotator.ChainWorkers(w) } + if CLIHasAddLCA() { + taxo := obigrep.CLILoadSelectedTaxonomy() + w := obitax.AddLCAWorker(taxo, CLILCASlotName(), CLILCAThreshold()) + annotator = annotator.ChainWorkers(w) + } + if CLIHasSetLengthFlag() { w := AddSeqLengthWorker() annotator = annotator.ChainWorkers(w) diff --git a/pkg/obitools/obiannotate/options.go b/pkg/obitools/obiannotate/options.go index b3d61c0..542f348 100644 --- a/pkg/obitools/obiannotate/options.go +++ b/pkg/obitools/obiannotate/options.go @@ -22,6 +22,8 @@ var _clearAll = false var _setSeqLength = false var _uniqueID = false var _ahoCorazick = "" +var _lcaSlot = "" +var _lcaError = 0.0 func SequenceAnnotationOptionSet(options *getoptions.GetOpt) { // options.BoolVar(&_addRank, "seq-rank", _addRank, @@ -38,6 +40,20 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) { options.StringVar(&_ahoCorazick, "aho-corasick", _ahoCorazick, options.Description("Adds an aho-corasick attribut with the count of matches of the provided patterns.")) + + options.StringVar(&_lcaSlot, "add-lca", _lcaSlot, + options.ArgName("SLOT_NAME"), + options.Description("From the taxonomic annotation of the sequence (taxid slot or merged_taxid slot), "+ + "a new slot named is added with the taxid of the lowest common ancester corresponding "+ + "to the current annotation.")) + + options.Float64Var(&_lcaError, "lca-error", _lcaError, + options.ArgName("#.###"), + options.Description("Error rate tolerated on the taxonomical discription during the lowest common "+ + "ancestor. At most a fraction of lca-error of the taxonomic information can disagree with the "+ + "estimated LCA."), + ) + // options.BoolVar(&_uniqueID, "uniq-id", _uniqueID, // options.Description("Forces sequence record ids to be unique."), // ) @@ -155,9 +171,6 @@ func CLISetAttributeExpression() map[string]string { return _evalAttribute } - - - func CLIHasAhoCorasick() bool { _, err := os.Stat(_ahoCorazick) return err == nil @@ -182,3 +195,15 @@ func CLIAhoCorazick() []string { return lines } + +func CLILCASlotName() string { + return _lcaSlot +} + +func CLIHasAddLCA() bool { + return _lcaSlot != "" +} + +func CLILCAThreshold() float64 { + return 1 - _lcaError +} diff --git a/pkg/obitools/obifind/options.go b/pkg/obitools/obifind/options.go index c9279da..d250ac2 100644 --- a/pkg/obitools/obifind/options.go +++ b/pkg/obitools/obifind/options.go @@ -45,6 +45,17 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo options.Description("Restrict output to some subclades.")) } +func FilterTaxonomyOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&__rank_list__, "rank-list", false, + options.Alias("l"), + options.Description("List every taxonomic rank available in the taxonomy.")) + + options.IntSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1, + options.Alias("r"), + options.Description("Restrict output to some subclades.")) +} + + func CLISelectedNCBITaxDump() string { return __taxdump__ } @@ -92,6 +103,7 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) { func OptionSet(options *getoptions.GetOpt) { LoadTaxonomyOptionSet(options, true, true) + FilterTaxonomyOptionSet(options) options.BoolVar(&__fixed_pattern__, "fixed", false, options.Alias("F"), options.Description("Match taxon names using a fixed pattern, not a regular expression")) diff --git a/pkg/obiutils/slices.go b/pkg/obiutils/slices.go index c5860fa..9073abf 100644 --- a/pkg/obiutils/slices.go +++ b/pkg/obiutils/slices.go @@ -21,3 +21,16 @@ func LookFor[T comparable](arr []T, x T) int { func RemoveIndex[T comparable](s []T, index int) []T { return append(s[:index], s[index+1:]...) } + +func Reverse[S ~[]E, E any](s S, inplace bool) S { + if !inplace { + c := make([]E,len(s)) + copy(c,s) + s = c + } + for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 { + s[i], s[j] = s[j], s[i] + } + + return s +}