mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Restore old obisort and add LCA functionnality to obiannotate.
Former-commit-id: aecaacc9dae49f74bd888a8eb8140822d31a42a6
This commit is contained in:
@ -1,5 +1,15 @@
|
|||||||
package obitax
|
package obitax
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) {
|
func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) {
|
||||||
p1, err1 := t1.Path()
|
p1, err1 := t1.Path()
|
||||||
|
|
||||||
@ -23,3 +33,121 @@ func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) {
|
|||||||
|
|
||||||
return (*p1)[i1+1], nil
|
return (*p1)[i1+1], nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int {
|
||||||
|
taxids := sequence.StatsOn("taxid", "na")
|
||||||
|
taxons := make(map[*TaxNode]int, len(taxids))
|
||||||
|
|
||||||
|
for k, v := range taxids {
|
||||||
|
taxid, _ := strconv.Atoi(k)
|
||||||
|
|
||||||
|
t, et := taxonomy.Taxon(taxid)
|
||||||
|
if et != nil {
|
||||||
|
log.Panic("Taxid %d not defined in taxonomy : %v", k, et)
|
||||||
|
}
|
||||||
|
taxons[t] = v
|
||||||
|
}
|
||||||
|
return taxons
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) LCA(sequence *obiseq.BioSequence, threshold float64) (*TaxNode, float64, int) {
|
||||||
|
taxons := taxonomy.TaxonomicDistribution(sequence)
|
||||||
|
paths := make(map[*TaxNode]*TaxonSlice, len(taxons))
|
||||||
|
answer := (*TaxNode)(nil)
|
||||||
|
rans := 1.0
|
||||||
|
granTotal := 0
|
||||||
|
|
||||||
|
for t, w := range taxons {
|
||||||
|
p, ep := t.Path()
|
||||||
|
if ep != nil {
|
||||||
|
log.Panic("Taxonomic path cannot be retreived from Taxid %d : %v", t.Taxid(), ep)
|
||||||
|
}
|
||||||
|
|
||||||
|
obiutils.Reverse(*p, true)
|
||||||
|
paths[t] = p
|
||||||
|
answer = (*p)[0]
|
||||||
|
granTotal += w
|
||||||
|
}
|
||||||
|
|
||||||
|
rmax := 1.0
|
||||||
|
levels := make(map[*TaxNode]int, len(paths))
|
||||||
|
taxonMax := answer
|
||||||
|
|
||||||
|
for i := 0; rmax >= threshold; i++ {
|
||||||
|
answer = taxonMax
|
||||||
|
rans = rmax
|
||||||
|
taxonMax = nil
|
||||||
|
total := 0
|
||||||
|
for taxon, weight := range taxons {
|
||||||
|
path := paths[taxon]
|
||||||
|
if len(*path) > i {
|
||||||
|
levels[(*path)[i]] += weight
|
||||||
|
}
|
||||||
|
total += weight
|
||||||
|
}
|
||||||
|
weighMax := 0
|
||||||
|
for taxon, weight := range levels {
|
||||||
|
if weight > weighMax {
|
||||||
|
weighMax = weight
|
||||||
|
taxonMax = taxon
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if total > 0 {
|
||||||
|
rmax *= float64(weighMax) / float64(total)
|
||||||
|
} else {
|
||||||
|
rmax = 0.0
|
||||||
|
}
|
||||||
|
|
||||||
|
for taxon := range levels {
|
||||||
|
delete(levels, taxon)
|
||||||
|
}
|
||||||
|
for taxon := range taxons {
|
||||||
|
path := paths[taxon]
|
||||||
|
if i < len(*path) {
|
||||||
|
if (*path)[i] != taxonMax {
|
||||||
|
delete(paths, taxon)
|
||||||
|
delete(taxons, taxon)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// if taxonMax != nil {
|
||||||
|
// log.Println("@@@>", i, taxonMax.ScientificName(), taxonMax.Taxid(), rans, weighMax, total, rmax)
|
||||||
|
// } else {
|
||||||
|
// log.Println("@@@>", "--", 0, rmax)
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
// log.Println("###>", answer.ScientificName(), answer.Taxid(), rans)
|
||||||
|
// log.Print("========================================")
|
||||||
|
return answer, rans, granTotal
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func AddLCAWorker(taxonomy *Taxonomy, slot_name string, threshold float64) obiseq.SeqWorker {
|
||||||
|
|
||||||
|
if !strings.HasSuffix(slot_name, "taxid") {
|
||||||
|
slot_name = slot_name + "_taxid"
|
||||||
|
}
|
||||||
|
|
||||||
|
lca_error := strings.Replace(slot_name, "taxid", "error", 1)
|
||||||
|
if lca_error == "error" {
|
||||||
|
lca_error = "lca_error"
|
||||||
|
}
|
||||||
|
|
||||||
|
lca_name := strings.Replace(slot_name, "taxid", "name", 1)
|
||||||
|
if lca_name == "name" {
|
||||||
|
lca_name = "scientific_name"
|
||||||
|
}
|
||||||
|
|
||||||
|
f := func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||||
|
lca, rans, _ := taxonomy.LCA(sequence, threshold)
|
||||||
|
|
||||||
|
sequence.SetAttribute(slot_name, lca.Taxid())
|
||||||
|
sequence.SetAttribute(lca_name, lca.ScientificName())
|
||||||
|
sequence.SetAttribute(lca_error, math.Round((1-rans)*1000)/1000)
|
||||||
|
|
||||||
|
return sequence
|
||||||
|
}
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
@ -70,11 +70,11 @@ func EvalAttributeWorker(expression map[string]string) obiseq.SeqWorker {
|
|||||||
var w obiseq.SeqWorker
|
var w obiseq.SeqWorker
|
||||||
w = nil
|
w = nil
|
||||||
|
|
||||||
for a,e := range expression {
|
for a, e := range expression {
|
||||||
if w == nil {
|
if w == nil {
|
||||||
w = obiseq.EditAttributeWorker(a,e)
|
w = obiseq.EditAttributeWorker(a, e)
|
||||||
} else {
|
} else {
|
||||||
w.ChainWorkers(obiseq.EditAttributeWorker(a,e))
|
w.ChainWorkers(obiseq.EditAttributeWorker(a, e))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -132,6 +132,12 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
|
|||||||
annotator = annotator.ChainWorkers(w)
|
annotator = annotator.ChainWorkers(w)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if CLIHasAddLCA() {
|
||||||
|
taxo := obigrep.CLILoadSelectedTaxonomy()
|
||||||
|
w := obitax.AddLCAWorker(taxo, CLILCASlotName(), CLILCAThreshold())
|
||||||
|
annotator = annotator.ChainWorkers(w)
|
||||||
|
}
|
||||||
|
|
||||||
if CLIHasSetLengthFlag() {
|
if CLIHasSetLengthFlag() {
|
||||||
w := AddSeqLengthWorker()
|
w := AddSeqLengthWorker()
|
||||||
annotator = annotator.ChainWorkers(w)
|
annotator = annotator.ChainWorkers(w)
|
||||||
|
@ -22,6 +22,8 @@ var _clearAll = false
|
|||||||
var _setSeqLength = false
|
var _setSeqLength = false
|
||||||
var _uniqueID = false
|
var _uniqueID = false
|
||||||
var _ahoCorazick = ""
|
var _ahoCorazick = ""
|
||||||
|
var _lcaSlot = ""
|
||||||
|
var _lcaError = 0.0
|
||||||
|
|
||||||
func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||||
// options.BoolVar(&_addRank, "seq-rank", _addRank,
|
// options.BoolVar(&_addRank, "seq-rank", _addRank,
|
||||||
@ -38,6 +40,20 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
|||||||
|
|
||||||
options.StringVar(&_ahoCorazick, "aho-corasick", _ahoCorazick,
|
options.StringVar(&_ahoCorazick, "aho-corasick", _ahoCorazick,
|
||||||
options.Description("Adds an aho-corasick attribut with the count of matches of the provided patterns."))
|
options.Description("Adds an aho-corasick attribut with the count of matches of the provided patterns."))
|
||||||
|
|
||||||
|
options.StringVar(&_lcaSlot, "add-lca", _lcaSlot,
|
||||||
|
options.ArgName("SLOT_NAME"),
|
||||||
|
options.Description("From the taxonomic annotation of the sequence (taxid slot or merged_taxid slot), "+
|
||||||
|
"a new slot named <SLOT_NAME> is added with the taxid of the lowest common ancester corresponding "+
|
||||||
|
"to the current annotation."))
|
||||||
|
|
||||||
|
options.Float64Var(&_lcaError, "lca-error", _lcaError,
|
||||||
|
options.ArgName("#.###"),
|
||||||
|
options.Description("Error rate tolerated on the taxonomical discription during the lowest common "+
|
||||||
|
"ancestor. At most a fraction of lca-error of the taxonomic information can disagree with the "+
|
||||||
|
"estimated LCA."),
|
||||||
|
)
|
||||||
|
|
||||||
// options.BoolVar(&_uniqueID, "uniq-id", _uniqueID,
|
// options.BoolVar(&_uniqueID, "uniq-id", _uniqueID,
|
||||||
// options.Description("Forces sequence record ids to be unique."),
|
// options.Description("Forces sequence record ids to be unique."),
|
||||||
// )
|
// )
|
||||||
@ -155,9 +171,6 @@ func CLISetAttributeExpression() map[string]string {
|
|||||||
return _evalAttribute
|
return _evalAttribute
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
func CLIHasAhoCorasick() bool {
|
func CLIHasAhoCorasick() bool {
|
||||||
_, err := os.Stat(_ahoCorazick)
|
_, err := os.Stat(_ahoCorazick)
|
||||||
return err == nil
|
return err == nil
|
||||||
@ -182,3 +195,15 @@ func CLIAhoCorazick() []string {
|
|||||||
|
|
||||||
return lines
|
return lines
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func CLILCASlotName() string {
|
||||||
|
return _lcaSlot
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIHasAddLCA() bool {
|
||||||
|
return _lcaSlot != ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLILCAThreshold() float64 {
|
||||||
|
return 1 - _lcaError
|
||||||
|
}
|
||||||
|
@ -45,6 +45,17 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo
|
|||||||
options.Description("Restrict output to some subclades."))
|
options.Description("Restrict output to some subclades."))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.BoolVar(&__rank_list__, "rank-list", false,
|
||||||
|
options.Alias("l"),
|
||||||
|
options.Description("List every taxonomic rank available in the taxonomy."))
|
||||||
|
|
||||||
|
options.IntSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1,
|
||||||
|
options.Alias("r"),
|
||||||
|
options.Description("Restrict output to some subclades."))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
func CLISelectedNCBITaxDump() string {
|
func CLISelectedNCBITaxDump() string {
|
||||||
return __taxdump__
|
return __taxdump__
|
||||||
}
|
}
|
||||||
@ -92,6 +103,7 @@ func CLILoadSelectedTaxonomy() (*obitax.Taxonomy, error) {
|
|||||||
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
LoadTaxonomyOptionSet(options, true, true)
|
LoadTaxonomyOptionSet(options, true, true)
|
||||||
|
FilterTaxonomyOptionSet(options)
|
||||||
options.BoolVar(&__fixed_pattern__, "fixed", false,
|
options.BoolVar(&__fixed_pattern__, "fixed", false,
|
||||||
options.Alias("F"),
|
options.Alias("F"),
|
||||||
options.Description("Match taxon names using a fixed pattern, not a regular expression"))
|
options.Description("Match taxon names using a fixed pattern, not a regular expression"))
|
||||||
|
@ -21,3 +21,16 @@ func LookFor[T comparable](arr []T, x T) int {
|
|||||||
func RemoveIndex[T comparable](s []T, index int) []T {
|
func RemoveIndex[T comparable](s []T, index int) []T {
|
||||||
return append(s[:index], s[index+1:]...)
|
return append(s[:index], s[index+1:]...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Reverse[S ~[]E, E any](s S, inplace bool) S {
|
||||||
|
if !inplace {
|
||||||
|
c := make([]E,len(s))
|
||||||
|
copy(c,s)
|
||||||
|
s = c
|
||||||
|
}
|
||||||
|
for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 {
|
||||||
|
s[i], s[j] = s[j], s[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user