Adds the possibility to extract a taxonomy from the taxonomic paths included in sequence files

This commit is contained in:
Eric Coissac
2025-01-30 11:18:21 +01:00
parent 2452aef7a9
commit 0df082da06
20 changed files with 460 additions and 173 deletions

View File

@ -218,9 +218,9 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
}
func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
taxon := taxonomy.Taxon(taxid)
taxon, err := taxonomy.Taxon(taxid)
if taxon == nil {
if err != nil {
return nil
}

View File

@ -91,7 +91,13 @@ func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int {
if !onlysn || classname == "scientific name" {
n++
taxonomy.Taxon(taxid).SetName(name, classname)
taxon, err := taxonomy.Taxon(taxid)
if err != nil {
log.Fatalf("%s: is unknown from the taxonomy", taxid)
}
taxon.SetName(name, classname)
}
}
@ -196,7 +202,11 @@ func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) {
n = loadMergedTable(buffered, taxonomy)
log.Printf("%d merged taxa read\n", n)
root := taxonomy.Taxon("1")
root, err := taxonomy.Taxon("1")
if err != nil {
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
}
taxonomy.SetRoot(root)
return taxonomy, nil

View File

@ -134,7 +134,12 @@ func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) {
n = loadMergedTable(buffered, taxonomy)
log.Printf("%d merged taxa read\n", n)
root := taxonomy.Taxon("1")
root, err := taxonomy.Taxon("1")
if err != nil {
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
}
taxonomy.SetRoot(root)
return taxonomy, nil

View File

@ -0,0 +1,64 @@
package obitax
import (
"errors"
"strings"
)
// ParseTaxonString parses a taxon description in the format
// "code:taxid [scientific name]@rank" and returns its individual components.
// Whitespace surrounding any component is ignored.
//
// The "@rank" suffix is optional: when it is absent, or present but empty
// (e.g. "code:taxid [name]@"), the rank defaults to "no rank".
//
// Parameters:
//   - taxonStr: The string to parse in the format "code:taxid [scientific name]@rank"
//
// Returns:
//   - code: The taxonomy code
//   - taxid: The taxon identifier
//   - scientificName: The scientific name (without brackets)
//   - rank: The rank, or "no rank" when none is provided
//   - error: An error if the string format is invalid
func ParseTaxonString(taxonStr string) (code, taxid, scientificName, rank string, err error) {
	// Trim any leading/trailing whitespace from the entire string.
	taxonStr = strings.TrimSpace(taxonStr)

	// Split by '@' to separate the optional rank from the main part.
	parts := strings.Split(taxonStr, "@")
	if len(parts) > 2 {
		return "", "", "", "", errors.New("invalid format: multiple '@' characters found")
	}

	mainPart := strings.TrimSpace(parts[0])

	// An empty rank after '@' is treated like a missing rank so that callers
	// never receive an empty rank string.
	rank = "no rank"
	if len(parts) == 2 {
		if declared := strings.TrimSpace(parts[1]); declared != "" {
			rank = declared
		}
	}

	// Find the scientific name part (enclosed in square brackets). The last
	// ']' is used so that names containing brackets are kept whole.
	startBracket := strings.Index(mainPart, "[")
	endBracket := strings.LastIndex(mainPart, "]")
	if startBracket == -1 || endBracket == -1 || startBracket > endBracket {
		return "", "", "", "", errors.New("invalid format: scientific name must be enclosed in square brackets")
	}

	// Extract and clean the scientific name.
	scientificName = strings.TrimSpace(mainPart[startBracket+1 : endBracket])

	// Process the code:taxid part located before the opening bracket.
	idPart := strings.TrimSpace(mainPart[:startBracket])
	idComponents := strings.Split(idPart, ":")
	switch {
	case len(idComponents) < 2:
		return "", "", "", "", errors.New("invalid format: missing taxonomy code separator ':'")
	case len(idComponents) > 2:
		return "", "", "", "", errors.New("invalid format: multiple ':' characters found in code:taxid part")
	}

	code = strings.TrimSpace(idComponents[0])
	taxid = strings.TrimSpace(idComponents[1])

	if code == "" || taxid == "" || scientificName == "" {
		return "", "", "", "", errors.New("invalid format: code, taxid and scientific name cannot be empty")
	}

	return code, taxid, scientificName, rank, nil
}

View File

@ -1,6 +1,7 @@
package obitax
import (
"errors"
"iter"
"regexp"
@ -379,3 +380,29 @@ func (taxon *Taxon) SameAs(other *Taxon) bool {
return taxon.Taxonomy == other.Taxonomy && taxon.Node.id == other.Node.id
}
// AddChild parses child, a taxon description in the
// "code:taxid [scientific name]@rank" format understood by ParseTaxonString,
// and adds it to the receiver's taxonomy using the receiver's node id as the
// parent identifier.
//
// Parameters:
//   - child: The textual description of the taxon to add.
//   - replace: Forwarded as the last argument of Taxonomy.AddTaxon.
//
// Returns:
//   - The taxon returned by Taxonomy.AddTaxon, with its "scientific name" set
//     to the parsed name.
//   - An error if the receiver is nil, if child cannot be parsed, if the
//     parsed code differs from the taxonomy code, or if AddTaxon fails.
func (taxon *Taxon) AddChild(child string, replace bool) (*Taxon, error) {
	if taxon == nil {
		return nil, errors.New("nil taxon")
	}

	childCode, childID, childName, childRank, parseErr := ParseTaxonString(child)
	switch {
	case parseErr != nil:
		return nil, parseErr
	case childCode != taxon.Taxonomy.code:
		// A child may only be attached to a taxonomy sharing its code.
		return nil, errors.New("taxonomy code mismatch")
	}

	parentID := *taxon.Node.id
	added, addErr := taxon.Taxonomy.AddTaxon(childID, parentID, childRank, false, replace)
	if addErr != nil {
		return nil, addErr
	}

	added.SetName(childName, "scientific name")

	return added, nil
}

View File

@ -12,7 +12,6 @@ import (
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// Taxonomy represents a hierarchical classification of taxa.
@ -130,27 +129,28 @@ func (taxonomy *Taxonomy) TaxidString(id string) (string, error) {
// Returns:
// - A pointer to the Taxon instance associated with the provided taxid.
// - If the taxid is unknown, a nil Taxon and a non-nil error are returned.
func (taxonomy *Taxonomy) Taxon(taxid string) *Taxon {
func (taxonomy *Taxonomy) Taxon(taxid string) (*Taxon, error) {
taxonomy = taxonomy.OrDefault(false)
if taxonomy == nil {
return nil
return nil, errors.New("cannot extract taxon from nil taxonomy")
}
id, err := taxonomy.Id(taxid)
if err != nil {
log.Fatalf("Taxid %s: %v", taxid, err)
return nil, fmt.Errorf("Taxid %s: %v", taxid, err)
}
taxon := taxonomy.nodes.Get(id)
if taxon == nil {
log.Fatalf("Taxid %s is not part of the taxonomy %s",
taxid,
taxonomy.name)
return nil,
fmt.Errorf("Taxid %s is not part of the taxonomy %s",
taxid,
taxonomy.name)
}
return taxon
return taxon, nil
}
// AsTaxonSet returns the set of taxon nodes contained within the Taxonomy.
@ -353,3 +353,63 @@ func (taxonomy *Taxonomy) HasRoot() bool {
taxonomy = taxonomy.OrDefault(false)
return taxonomy != nil && taxonomy.root != nil
}
// InsertPathString inserts a taxonomic path into the taxonomy.
//
// Each element of path is a taxon description in the
// "code:taxid [scientific name]@rank" format understood by ParseTaxonString.
// The first element must designate the root of the taxonomy; every following
// element is expected to be a child of the element preceding it.
//
// If the receiver is nil, a new taxonomy is created whose name and code are
// both the code parsed from the first element. If the taxonomy is empty, the
// first element is added as a taxon that is its own parent and receives its
// scientific name. Elements already present in the taxonomy are only checked
// for consistency (their parent must be the previous element); unknown
// elements are added through Taxon.AddChild.
//
// Parameters:
//   - path: The taxon descriptions, ordered from the root downwards.
//
// Returns:
//   - The taxonomy the path has been inserted into (possibly newly created).
//   - An error if path is empty, if an element cannot be parsed or added, if
//     the first element is not a root, or if the path contradicts the
//     parent/child relationships already stored in the taxonomy.
func (taxonomy *Taxonomy) InsertPathString(path []string) (*Taxonomy, error) {
	if len(path) == 0 {
		return nil, errors.New("path is empty")
	}

	code, taxid, scientificName, rank, err := ParseTaxonString(path[0])
	// Validate the first element before using its code to build a taxonomy,
	// so that no taxonomy is created from a malformed description.
	if err != nil {
		return nil, err
	}

	if taxonomy == nil {
		taxonomy = NewTaxonomy(code, code, obiutils.AsciiAlphaNumSet)
	}

	if taxonomy.Len() == 0 {
		if code != taxonomy.code {
			return nil, fmt.Errorf("cannot insert taxon %s into taxonomy %s with code %s",
				path[0], taxonomy.name, taxonomy.code)
		}

		// The root is declared as its own parent.
		root, err := taxonomy.AddTaxon(taxid, taxid, rank, true, true)
		if err != nil {
			return nil, err
		}

		// Use the same name class as AddChild and the NCBI name table loader.
		root.SetName(scientificName, "scientific name")
	}

	current, err := taxonomy.Taxon(taxid)
	if err != nil {
		return nil, err
	}

	if !current.IsRoot() {
		return nil, errors.New("path does not start with a root node")
	}

	for _, id := range path[1:] {
		taxon, err := taxonomy.Taxon(id)
		if err != nil {
			// Unknown taxon: attach it below the current end of the path.
			current, err = current.AddChild(id, false)
			if err != nil {
				return nil, err
			}
			continue
		}

		// Known taxon: it must already hang below the previous element.
		if !current.SameAs(taxon.Parent()) {
			return nil, errors.New("path is not consistent with the taxonomy, parent mismatch")
		}
		current = taxon
	}

	return taxonomy, nil
}

View File

@ -212,7 +212,8 @@ func (set *TaxonSet) Sort() *TaxonSlice {
pushed = false
for _, node := range set.set {
if !parent[node] && (parent[set.Get(node.parent).Node] ||
!set.Contains(node.parent)) {
!set.Contains(node.parent) ||
node == taxonomy.Root().Node) {
pushed = true
taxa.slice = append(taxa.slice, node)
parent[node] = true