First functional version

This commit is contained in:
Eric Coissac
2024-11-14 19:10:23 +01:00
parent 9471fedfa1
commit 03f4e88a17
26 changed files with 908 additions and 307 deletions

View File

@ -9,17 +9,18 @@ import (
)
// AttributeKeys returns the keys of the attributes in the BioSequence.
// It optionally skips keys associated with container values based on the skip_container parameter.
//
// It does not take any parameters.
// Parameters:
// - skip_container: A boolean indicating whether to skip keys associated with a container value.
//
// Returns:
//
// []string: The keys of the BioSequence.
func (s *BioSequence) AttributeKeys(skip_map bool) obiutils.Set[string] {
// - A set of strings containing the keys of the BioSequence attributes.
func (s *BioSequence) AttributeKeys(skip_container bool) obiutils.Set[string] {
keys := obiutils.MakeSet[string]()
for k, v := range s.Annotations() {
if !skip_map || !obiutils.IsAMap(v) {
if !skip_container || !obiutils.IsAContainer(v) {
keys.Add(k)
}
}
@ -27,17 +28,18 @@ func (s *BioSequence) AttributeKeys(skip_map bool) obiutils.Set[string] {
return keys
}
// Keys returns the keys of the BioSequence.
// Keys returns the keys of the BioSequence, including standard keys and attribute keys.
//
// It returns a slice of strings containing the keys of the BioSequence.
// The keys include "id", "sequence", "qualities", and the attribute keys
// of the BioSequence.
// It returns a set of strings containing the keys of the BioSequence.
// The keys include "id", "sequence", "qualities", and the attribute keys of the BioSequence.
//
// Parameters:
// - skip_container: A boolean indicating whether to skip keys associated with container values.
//
// Returns:
//
// []string: The keys of the BioSequence.
func (s *BioSequence) Keys(skip_map bool) obiutils.Set[string] {
keys := s.AttributeKeys(skip_map)
// - A set of strings containing the keys of the BioSequence.
func (s *BioSequence) Keys(skip_container bool) obiutils.Set[string] {
keys := s.AttributeKeys(skip_container)
keys.Add("id")
if s.HasSequence() {
@ -53,10 +55,10 @@ func (s *BioSequence) Keys(skip_map bool) obiutils.Set[string] {
// HasAttribute checks if the BioSequence has the specified attribute.
//
// Parameters:
// - key: a string representing the attribute key to check.
// - key: A string representing the attribute key to check.
//
// Returns:
// - a boolean indicating whether the BioSequence has the attribute.
// - A boolean indicating whether the BioSequence has the attribute.
func (s *BioSequence) HasAttribute(key string) bool {
if key == "id" {
return true
@ -386,31 +388,14 @@ func (s *BioSequence) SetCount(count int) {
s.SetAttribute("count", count)
}
// Taxid returns the taxonomic ID associated with the BioSequence.
//
// It retrieves the "taxid" attribute from the BioSequence's attributes map.
// If the attribute is not found, the function returns 1 as the default taxonomic ID.
// The taxid 1 corresponds to the root taxonomic level.
//
// The function returns an integer representing the taxonomic ID.
func (s *BioSequence) Taxid() int {
taxid, ok := s.GetIntAttribute("taxid")
if !ok {
taxid = 1
}
return taxid
}
// SetTaxid sets the taxid for the BioSequence.
//
// Parameters:
//
// taxid - the taxid to set.
func (s *BioSequence) SetTaxid(taxid int) {
if taxid < 1 {
taxid = 1
func (s *BioSequence) SetTaxid(taxid string) {
if taxid == "" {
taxid = "NA"
}
s.SetAttribute("taxid", taxid)
}

View File

@ -18,6 +18,7 @@ import (
"unsafe"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
@ -63,6 +64,7 @@ type BioSequence struct {
sequence []byte // The sequence itself, it is accessible by the methode Sequence
qualities []byte // The quality scores of the sequence.
feature []byte
taxon *obitax.Taxon
paired *BioSequence // A pointer to the paired sequence
revcomp *BioSequence // A pointer to the reverse complemented sequence
annotations Annotation
@ -90,6 +92,7 @@ func NewEmptyBioSequence(preallocate int) *BioSequence {
sequence: seq,
qualities: nil,
feature: nil,
taxon: nil,
paired: nil,
revcomp: nil,
annotations: nil,
@ -223,7 +226,7 @@ func (s *BioSequence) HasDefinition() bool {
// No parameters.
// Returns a boolean.
func (s *BioSequence) HasSequence() bool {
return s.sequence != nil && len(s.sequence) > 0
return len(s.sequence) > 0
}
// Sequence returns the sequence of the BioSequence.
@ -258,7 +261,7 @@ func (s *BioSequence) Len() int {
// This function does not have any parameters.
// It returns a boolean value indicating whether the BioSequence has qualities.
func (s *BioSequence) HasQualities() bool {
return s.qualities != nil && len(s.qualities) > 0
return len(s.qualities) > 0
}
// Qualities returns the sequence quality scores of the BioSequence.

View File

@ -0,0 +1,79 @@
package obiseq
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
log "github.com/sirupsen/logrus"
)
// TaxonomyClassifier builds a BioSequenceClassifier that groups sequences by
// the taxon found at a given rank of their taxonomic annotation.
//
// Every distinct taxon node is given a small integer code the first time it
// is met. Code 0 is reserved for sequences that cannot be classified: either
// sequence.Taxon returns nil for the taxonomy, or the taxon has no ancestor at
// the requested rank.
//
// Parameters:
//   - taxonomicRank: the taxonomic rank to classify the sequences at.
//   - taxonomy: the taxonomy object used for classification.
//   - abortOnMissing: when true, the program is stopped through log.Fatalf as
//     soon as a sequence cannot be classified, instead of using code 0.
//
// Returns:
//   - a pointer to the new BioSequenceClassifier.
//
// NOTE(review): the keys/codes tables are mutated by Code without locking;
// confirm that one classifier instance is never shared between goroutines
// (Clone builds an independent instance).
func TaxonomyClassifier(taxonomicRank string,
	taxonomy *obitax.Taxonomy,
	abortOnMissing bool) *BioSequenceClassifier {

	// keys maps a taxon node to its code, codes is the reverse table.
	// The nil node (unclassified sequence) always owns code 0.
	keys := make(map[*obitax.TaxNode]int)
	codes := make([]*obitax.TaxNode, 1)
	codes[0] = nil
	keys[nil] = 0

	code := func(sequence *BioSequence) int {
		// node stays nil when the sequence cannot be classified.
		var node *obitax.TaxNode

		taxon := sequence.Taxon(taxonomy)
		if taxon == nil {
			if abortOnMissing {
				log.Fatalf("Sequence %s: Taxid %s not found in taxonomy",
					sequence.Id(),
					sequence.Taxid())
			}
		} else {
			ttaxon := taxon.TaxonAtRank(taxonomicRank)
			switch {
			case ttaxon != nil:
				// Classify on the taxon at the requested rank, not on the
				// sequence's own taxon.
				node = ttaxon.Node
			case abortOnMissing:
				log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %s", taxonomicRank, taxon.String())
			}
		}

		if k, ok := keys[node]; ok {
			return k
		}

		k := len(codes)
		keys[node] = k
		codes = append(codes, node)

		return k
	}

	value := func(k int) string {
		node := codes[k]
		if node == nil {
			// Code 0: no taxon available, use the package's missing-value marker
			// instead of calling a method on a nil node.
			return "NA"
		}
		return node.ScientificName()
	}

	reset := func() {
		keys = make(map[*obitax.TaxNode]int)
		codes = make([]*obitax.TaxNode, 1)
		codes[0] = nil
		keys[nil] = 0
	}

	clone := func() *BioSequenceClassifier {
		return TaxonomyClassifier(taxonomicRank, taxonomy, abortOnMissing)
	}

	c := BioSequenceClassifier{
		Code:  code,
		Value: value,
		Reset: reset,
		Clone: clone,
		Type:  "TaxonomyClassifier"}

	return &c
}

131
pkg/obiseq/taxonomy_lca.go Normal file
View File

@ -0,0 +1,131 @@
package obiseq
import (
"math"
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
log "github.com/sirupsen/logrus"
)
// TaxonomicDistribution converts the "taxid" statistics of the sequence
// (obtained through StatsOn, with "na" as the replacement value) into a map
// indexed by the corresponding taxonomy nodes.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve every taxid.
//
// Returns:
//   - a map associating each taxon node with the weight recorded for its taxid.
//
// The program is stopped through log.Fatalf when a taxid cannot be resolved
// in the taxonomy.
func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) map[*obitax.TaxNode]int {
	counts := sequence.StatsOn(MakeStatsOnDescription("taxid"), "na")
	distribution := make(map[*obitax.TaxNode]int, len(counts))

	for id, weight := range counts {
		resolved := taxonomy.Taxon(id)
		if resolved == nil {
			log.Fatalf(
				"On sequence %s taxid %s is not defined in taxonomy: %s",
				sequence.Id(),
				id,
				taxonomy.Name())
		}

		distribution[resolved.Node] = weight
	}

	return distribution
}
// LCA walks down the taxonomy, level by level, following at each level the
// node carrying the largest weight in the taxonomic distribution of the
// sequence, for as long as the cumulated ratio of retained weight stays
// greater than or equal to threshold.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve taxids and paths.
//   - threshold: the minimal cumulated ratio required to go one level deeper.
//
// Returns:
//   - a Taxon wrapping the last node accepted by the walk (its Node is nil
//     when the taxonomic distribution is empty),
//   - the cumulated ratio associated with that node,
//   - the sum of the weights of the whole distribution.
//
// The function panics through log.Panicf when the path of a taxon cannot be
// computed.
func (sequence *BioSequence) LCA(taxonomy *obitax.Taxonomy, threshold float64) (*obitax.Taxon, float64, int) {
	taxons := sequence.TaxonomicDistribution(taxonomy)
	paths := make(map[*obitax.TaxNode]*obitax.TaxonSlice, len(taxons))
	answer := (*obitax.TaxNode)(nil)
	rans := 1.0
	granTotal := 0

	for t, w := range taxons {
		p := (&obitax.Taxon{Taxonomy: taxonomy,
			Node: t,
		}).Path()

		if p == nil {
			log.Panicf("Sequence %s: taxonomic path cannot be retrieved from Taxid %s",
				sequence.Id(), t.String(taxonomy.Code()))
		}

		// presumably puts the path in root-first order — TODO confirm
		// against obitax.TaxonSlice.Reverse.
		p.Reverse(true)
		paths[t] = p
		answer = p.Get(0)
		granTotal += w
	}

	rmax := 1.0
	levels := make(map[*obitax.TaxNode]int, len(paths))
	taxonMax := answer

	for i := 0; rmax >= threshold; i++ {
		// The candidate selected at the previous level is accepted.
		answer = taxonMax
		rans = rmax
		taxonMax = nil
		total := 0

		// Sum the weights of the nodes found at depth i on the remaining paths.
		for taxon, weight := range taxons {
			path := paths[taxon]
			if path.Len() > i {
				levels[path.Get(i)] += weight
			}
			total += weight
		}

		weighMax := 0
		for taxon, weight := range levels {
			if weight > weighMax {
				weighMax = weight
				taxonMax = taxon
			}
		}

		// No node left at this depth: the walk is over. Without this guard
		// rmax drops to 0 and the loop never ends when threshold <= 0.
		if taxonMax == nil {
			break
		}

		if total > 0 {
			rmax *= float64(weighMax) / float64(total)
		} else {
			rmax = 0.0
		}

		for taxon := range levels {
			delete(levels, taxon)
		}

		// Drop the taxa whose path leaves the selected branch at this depth.
		for taxon := range taxons {
			path := paths[taxon]
			if i < path.Len() {
				if path.Get(i) != taxonMax {
					delete(paths, taxon)
					delete(taxons, taxon)
				}
			}
		}
	}

	return &obitax.Taxon{Taxonomy: taxonomy, Node: answer}, rans, granTotal
}
// AddLCAWorker builds a SeqWorker annotating each sequence with the result of
// BioSequence.LCA.
//
// Three attributes are written on the sequence:
//   - slot_name (suffixed with "_taxid" when it does not already end with
//     "taxid"): the string form of the LCA taxon,
//   - the same name with the trailing "taxid" replaced by "name"
//     ("scientific_name" when slot_name is exactly "taxid"): its scientific name,
//   - the same name with the trailing "taxid" replaced by "error"
//     ("lca_error" when slot_name is exactly "taxid"): 1 minus the ratio
//     returned by LCA, rounded to three decimals.
//
// Parameters:
//   - taxonomy: the taxonomy used to compute the LCA.
//   - slot_name: the name of the attribute receiving the LCA taxid.
//   - threshold: the threshold forwarded to BioSequence.LCA.
//
// Returns:
//   - the SeqWorker; it always returns the annotated sequence and a nil error.
func AddLCAWorker(taxonomy *obitax.Taxonomy, slot_name string, threshold float64) SeqWorker {
	if !strings.HasSuffix(slot_name, "taxid") {
		slot_name = slot_name + "_taxid"
	}

	// slot_name now always ends with "taxid": derive the sibling slot names
	// from that suffix only, so that a "taxid" occurring earlier in the name
	// is left untouched.
	prefix := strings.TrimSuffix(slot_name, "taxid")

	lcaError := prefix + "error"
	if lcaError == "error" {
		lcaError = "lca_error"
	}

	lcaName := prefix + "name"
	if lcaName == "name" {
		lcaName = "scientific_name"
	}

	f := func(sequence *BioSequence) (BioSequenceSlice, error) {
		lca, rans, _ := sequence.LCA(taxonomy, threshold)

		sequence.SetAttribute(slot_name, lca.String())
		sequence.SetAttribute(lcaName, lca.ScientificName())
		sequence.SetAttribute(lcaError, math.Round((1-rans)*1000)/1000)

		return BioSequenceSlice{sequence}, nil
	}

	return f
}

View File

@ -0,0 +1,106 @@
package obiseq
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)
// Taxon resolves the taxid of the sequence in the given taxonomy.
//
// Parameters:
//   - taxonomy: the taxonomy in which the taxid is looked up.
//
// Returns:
//   - nil when the taxid of the sequence is "NA", otherwise the result of
//     taxonomy.Taxon for that taxid.
func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
	if id := s.Taxid(); id != "NA" {
		return taxonomy.Taxon(id)
	}
	return nil
}
// Taxid returns the taxonomic identifier associated with the BioSequence.
//
// When the taxon field of the sequence is set, its string form is returned.
// Otherwise the "taxid" string attribute of the sequence is used.
// If neither is available, the function returns "NA".
//
// Returns:
//   - the taxid as a string, or "NA" when the sequence has none.
func (s *BioSequence) Taxid() (taxid string) {
	if s.taxon != nil {
		return s.taxon.String()
	}

	if value, found := s.GetStringAttribute("taxid"); found {
		return value
	}

	return "NA"
}
// SetTaxonAtRank annotates the sequence with its taxon at a given rank.
//
// Two attributes are written: rank suffixed with "_taxid" stores the string
// form of the taxon found at that rank, and rank suffixed with "_name" stores
// its scientific name. Both are set to "NA" when the taxon of the sequence
// has no taxon at the requested rank.
// When the sequence has no taxon in the taxonomy, no attribute is written.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve the taxon of the sequence.
//   - rank: the name of the requested rank.
//
// Returns:
//   - the taxon at the requested rank, or nil when it is not available.
func (sequence *BioSequence) SetTaxonAtRank(taxonomy *obitax.Taxonomy, rank string) *obitax.Taxon {
	current := sequence.Taxon(taxonomy)
	if current == nil {
		return nil
	}

	taxidSlot := rank + "_taxid"
	nameSlot := rank + "_name"

	atRank := current.TaxonAtRank(rank)
	if atRank == nil {
		sequence.SetAttribute(taxidSlot, "NA")
		sequence.SetAttribute(nameSlot, "NA")
		return atRank
	}

	sequence.SetAttribute(taxidSlot, atRank.String())
	sequence.SetAttribute(nameSlot, atRank.ScientificName())

	return atRank
}
// SetSpecies annotates the sequence with its taxon at the "species" rank.
// It is a shortcut for SetTaxonAtRank(taxonomy, "species") and returns
// the same value.
func (sequence *BioSequence) SetSpecies(taxonomy *obitax.Taxonomy) *obitax.Taxon {
	const rank = "species"
	return sequence.SetTaxonAtRank(taxonomy, rank)
}
// SetGenus annotates the sequence with its taxon at the "genus" rank.
// It is a shortcut for SetTaxonAtRank(taxonomy, "genus") and returns
// the same value.
func (sequence *BioSequence) SetGenus(taxonomy *obitax.Taxonomy) *obitax.Taxon {
	const rank = "genus"
	return sequence.SetTaxonAtRank(taxonomy, rank)
}
// SetFamily annotates the sequence with its taxon at the "family" rank.
// It is a shortcut for SetTaxonAtRank(taxonomy, "family") and returns
// the same value.
func (sequence *BioSequence) SetFamily(taxonomy *obitax.Taxonomy) *obitax.Taxon {
	const rank = "family"
	return sequence.SetTaxonAtRank(taxonomy, rank)
}
// SetPath stores the string form of the taxonomic path of the sequence in
// its "taxonomic_path" attribute.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve the taxon of the sequence.
//
// Returns:
//   - the string stored in the attribute.
//
// NOTE(review): sequence.Taxon returns nil when the taxid is "NA"; the calls
// below are then made on a nil taxon — confirm that obitax tolerates it.
func (sequence *BioSequence) SetPath(taxonomy *obitax.Taxonomy) string {
	lineage := sequence.Taxon(taxonomy).Path().String()
	sequence.SetAttribute("taxonomic_path", lineage)

	return lineage
}
// SetScientificName stores the scientific name of the taxon of the sequence
// in its "scientific_name" attribute.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve the taxon of the sequence.
//
// Returns:
//   - the scientific name stored in the attribute.
//
// NOTE(review): sequence.Taxon returns nil when the taxid is "NA";
// ScientificName is then called on a nil taxon — confirm that obitax
// tolerates it.
func (sequence *BioSequence) SetScientificName(taxonomy *obitax.Taxonomy) string {
	taxon := sequence.Taxon(taxonomy)
	name := taxon.ScientificName()

	// The key was previously misspelled "scienctific_name".
	sequence.SetAttribute("scientific_name", name)

	return name
}
// SetTaxonomicRank stores the rank of the taxon of the sequence in its
// "taxonomic_rank" attribute.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve the taxon of the sequence.
//
// Returns:
//   - the rank stored in the attribute.
//
// NOTE(review): sequence.Taxon returns nil when the taxid is "NA"; Rank is
// then called on a nil taxon — confirm that obitax tolerates it.
func (sequence *BioSequence) SetTaxonomicRank(taxonomy *obitax.Taxonomy) string {
	level := sequence.Taxon(taxonomy).Rank()
	sequence.SetAttribute("taxonomic_rank", level)

	return level
}

View File

@ -0,0 +1,98 @@
package obiseq
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
// IsAValidTaxon builds a predicate that is true for the sequences whose
// taxid can be resolved in the given taxonomy.
//
// Parameters:
//   - taxonomy: the taxonomy in which taxids are looked up.
//   - withAutoCorrection: optional; when its first value is true and the
//     string form of the resolved taxon differs from the taxid carried by the
//     sequence, the taxid of the sequence is replaced by the resolved one and
//     the change is logged.
//
// Returns:
//   - the SequencePredicate.
func IsAValidTaxon(taxonomy *obitax.Taxonomy, withAutoCorrection ...bool) SequencePredicate {
	autocorrection := len(withAutoCorrection) > 0 && withAutoCorrection[0]

	f := func(sequence *BioSequence) bool {
		taxon := sequence.Taxon(taxonomy)
		if taxon == nil {
			return false
		}

		taxid := sequence.Taxid()
		ttaxid := taxon.String()

		// Taxids are strings: they are logged with %s.
		if autocorrection && taxid != ttaxid {
			sequence.SetTaxid(ttaxid)
			log.Printf(
				"Sequence %s : Taxid %s updated with %s",
				sequence.Id(),
				taxid,
				ttaxid,
			)
		}

		return true
	}

	return f
}
// IsSubCladeOf builds a predicate that is true for the sequences whose taxon
// belongs to the clade rooted at the taxon identified by taxid.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve taxids.
//   - taxid: the identifier of the taxon rooting the clade.
//
// Returns:
//   - the SequencePredicate; it is false for sequences without a taxon in
//     the taxonomy.
//
// The program is stopped through log.Fatalf when taxid cannot be found in
// the taxonomy.
func IsSubCladeOf(taxonomy *obitax.Taxonomy, taxid string) SequencePredicate {
	ancestor := taxonomy.Taxon(taxid)
	if ancestor == nil {
		log.Fatalf("Cannot find taxon : %s in taxonomy %s",
			taxid,
			taxonomy.Name())
	}

	return func(seq *BioSequence) bool {
		candidate := seq.Taxon(taxonomy)
		if candidate == nil {
			return false
		}
		return candidate.IsSubCladeOf(ancestor)
	}
}
// IsSubCladeOfSlot builds a predicate that is true for the sequences whose
// taxon belongs to the clade rooted at the taxon named by one of their own
// string attributes.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve taxids.
//   - key: the name of the attribute holding the taxid rooting the clade.
//
// Returns:
//   - the SequencePredicate; it is false when the attribute is missing, when
//     its value is not found in the taxonomy, or when the sequence has no
//     taxon in the taxonomy.
func IsSubCladeOfSlot(taxonomy *obitax.Taxonomy, key string) SequencePredicate {
	return func(seq *BioSequence) bool {
		slotValue, found := seq.GetStringAttribute(key)
		if !found {
			return false
		}

		ancestor := taxonomy.Taxon(slotValue)
		candidate := seq.Taxon(taxonomy)
		if ancestor == nil || candidate == nil {
			return false
		}

		return candidate.IsSubCladeOf(ancestor)
	}
}
// HasRequiredRank builds a predicate that is true for the sequences whose
// taxon has the given rank defined.
//
// Parameters:
//   - taxonomy: the taxonomy used to resolve the taxon of each sequence.
//   - rank: the name of the required rank.
//
// Returns:
//   - the SequencePredicate; it is false for sequences without a taxon in
//     the taxonomy.
//
// The program is stopped through log.Fatalf when rank is not part of the
// rank list of the taxonomy.
func HasRequiredRank(taxonomy *obitax.Taxonomy, rank string) SequencePredicate {
	if known := obiutils.Contains(taxonomy.RankList(), rank); !known {
		log.Fatalf("%s is not a valid rank (allowed ranks are %v)",
			rank,
			taxonomy.RankList())
	}

	return func(seq *BioSequence) bool {
		candidate := seq.Taxon(taxonomy)
		if candidate == nil {
			return false
		}
		return candidate.HasRankDefined(rank)
	}
}

View File

@ -0,0 +1,64 @@
package obiseq
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// MakeSetTaxonAtRankWorker builds a SeqWorker calling SetTaxonAtRank with the
// given taxonomy and rank on every sequence it receives.
//
// Parameters:
//   - taxonomy: the taxonomy used for the annotation.
//   - rank: the name of the rank to annotate.
//
// Returns:
//   - the SeqWorker; it always returns the sequence itself and a nil error.
//
// The program is stopped through log.Fatalf when rank is not part of the
// rank list of the taxonomy.
func MakeSetTaxonAtRankWorker(taxonomy *obitax.Taxonomy, rank string) SeqWorker {
	if known := obiutils.Contains(taxonomy.RankList(), rank); !known {
		log.Fatalf("%s is not a valid rank (allowed ranks are %v)",
			rank,
			taxonomy.RankList())
	}

	return func(seq *BioSequence) (BioSequenceSlice, error) {
		seq.SetTaxonAtRank(taxonomy, rank)
		return BioSequenceSlice{seq}, nil
	}
}
// MakeSetSpeciesWorker builds a SeqWorker calling SetSpecies with the given
// taxonomy on every sequence it receives. The worker always returns the
// sequence itself and a nil error.
func MakeSetSpeciesWorker(taxonomy *obitax.Taxonomy) SeqWorker {
	return func(seq *BioSequence) (BioSequenceSlice, error) {
		seq.SetSpecies(taxonomy)
		return BioSequenceSlice{seq}, nil
	}
}
// MakeSetGenusWorker builds a SeqWorker calling SetGenus with the given
// taxonomy on every sequence it receives. The worker always returns the
// sequence itself and a nil error.
func MakeSetGenusWorker(taxonomy *obitax.Taxonomy) SeqWorker {
	return func(seq *BioSequence) (BioSequenceSlice, error) {
		seq.SetGenus(taxonomy)
		return BioSequenceSlice{seq}, nil
	}
}
// MakeSetFamilyWorker builds a SeqWorker calling SetFamily with the given
// taxonomy on every sequence it receives. The worker always returns the
// sequence itself and a nil error.
func MakeSetFamilyWorker(taxonomy *obitax.Taxonomy) SeqWorker {
	return func(seq *BioSequence) (BioSequenceSlice, error) {
		seq.SetFamily(taxonomy)
		return BioSequenceSlice{seq}, nil
	}
}
// MakeSetPathWorker builds a SeqWorker calling SetPath with the given
// taxonomy on every sequence it receives. The worker always returns the
// sequence itself and a nil error.
func MakeSetPathWorker(taxonomy *obitax.Taxonomy) SeqWorker {
	return func(seq *BioSequence) (BioSequenceSlice, error) {
		seq.SetPath(taxonomy)
		return BioSequenceSlice{seq}, nil
	}
}