mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds possibility to extract a taxonomy from taxonomic path included in sequence files
This commit is contained in:
64
pkg/obitax/string_parser.go
Normal file
64
pkg/obitax/string_parser.go
Normal file
@ -0,0 +1,64 @@
|
||||
package obitax
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ParseTaxonString parses a string in the format "code:taxid [scientific name]@rank"
|
||||
// and returns the individual components. It handles extra whitespace around components.
|
||||
//
|
||||
// Parameters:
|
||||
// - taxonStr: The string to parse in the format "code:taxid [scientific name]@rank"
|
||||
//
|
||||
// Returns:
|
||||
// - code: The taxonomy code
|
||||
// - taxid: The taxon identifier
|
||||
// - scientificName: The scientific name (without brackets)
|
||||
// - rank: The rank
|
||||
// - error: An error if the string format is invalid
|
||||
func ParseTaxonString(taxonStr string) (code, taxid, scientificName, rank string, err error) {
|
||||
// Trim any leading/trailing whitespace from the entire string
|
||||
taxonStr = strings.TrimSpace(taxonStr)
|
||||
|
||||
// Split by '@' to separate rank
|
||||
parts := strings.Split(taxonStr, "@")
|
||||
if len(parts) > 2 {
|
||||
return "", "", "", "", errors.New("invalid format: multiple '@' characters found")
|
||||
}
|
||||
|
||||
mainPart := strings.TrimSpace(parts[0])
|
||||
if len(parts) == 2 {
|
||||
rank = strings.TrimSpace(parts[1])
|
||||
} else {
|
||||
rank = "no rank"
|
||||
}
|
||||
|
||||
// Find scientific name part (enclosed in square brackets)
|
||||
startBracket := strings.Index(mainPart, "[")
|
||||
endBracket := strings.LastIndex(mainPart, "]")
|
||||
|
||||
if startBracket == -1 || endBracket == -1 || startBracket > endBracket {
|
||||
return "", "", "", "", errors.New("invalid format: scientific name must be enclosed in square brackets")
|
||||
}
|
||||
|
||||
// Extract and clean scientific name
|
||||
scientificName = strings.TrimSpace(mainPart[startBracket+1 : endBracket])
|
||||
|
||||
// Process code:taxid part
|
||||
idPart := strings.TrimSpace(mainPart[:startBracket])
|
||||
idComponents := strings.Split(idPart, ":")
|
||||
|
||||
if len(idComponents) != 2 {
|
||||
return "", "", "", "", errors.New("invalid format: missing taxonomy code separator ':'")
|
||||
}
|
||||
|
||||
code = strings.TrimSpace(idComponents[0])
|
||||
taxid = strings.TrimSpace(idComponents[1])
|
||||
|
||||
if code == "" || taxid == "" || scientificName == "" {
|
||||
return "", "", "", "", errors.New("invalid format: code, taxid and scientific name cannot be empty")
|
||||
}
|
||||
|
||||
return code, taxid, scientificName, rank, nil
|
||||
}
|
Reference in New Issue
Block a user