2025-03-14 14:22:22 +01:00
|
|
|
package obiformats
|
2025-01-24 18:09:59 +01:00
|
|
|
|
|
|
|
import (
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"strings"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

	log "github.com/sirupsen/logrus"
)
|
|
|
|
|
2025-06-04 09:48:10 +02:00
|
|
|
func LoadCSVTaxonomy(path string, onlysn, seqAsTaxa bool) (*obitax.Taxonomy, error) {
|
2025-01-24 18:09:59 +01:00
|
|
|
|
2025-01-29 10:45:26 +01:00
|
|
|
log.Infof("Loading taxonomy from csv file: %s", path)
|
|
|
|
|
2025-01-24 18:09:59 +01:00
|
|
|
file, err := obiutils.Ropen(path)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
csvfile := csv.NewReader(file)
|
|
|
|
|
|
|
|
csvfile.Comma = ','
|
|
|
|
csvfile.ReuseRecord = false
|
|
|
|
csvfile.LazyQuotes = true
|
|
|
|
csvfile.Comment = '#'
|
|
|
|
csvfile.FieldsPerRecord = -1
|
|
|
|
csvfile.TrimLeadingSpace = true
|
|
|
|
|
|
|
|
header, err := csvfile.Read()
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
taxidColIndex := -1
|
|
|
|
parentColIndex := -1
|
|
|
|
scientific_nameColIndex := -1
|
|
|
|
rankColIndex := -1
|
|
|
|
|
|
|
|
for i, colName := range header {
|
|
|
|
switch colName {
|
|
|
|
case "taxid":
|
|
|
|
taxidColIndex = i
|
|
|
|
case "parent":
|
|
|
|
parentColIndex = i
|
|
|
|
case "scientific_name":
|
|
|
|
scientific_nameColIndex = i
|
2025-01-29 10:45:26 +01:00
|
|
|
case "taxonomic_rank":
|
2025-01-24 18:09:59 +01:00
|
|
|
rankColIndex = i
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if taxidColIndex == -1 {
|
|
|
|
return nil, errors.New("taxonomy file does not contain taxid column")
|
|
|
|
}
|
|
|
|
|
|
|
|
if parentColIndex == -1 {
|
|
|
|
return nil, errors.New("taxonomy file does not contain parent column")
|
|
|
|
}
|
|
|
|
|
|
|
|
if scientific_nameColIndex == -1 {
|
|
|
|
return nil, errors.New("taxonomy file does not contain scientific_name column")
|
|
|
|
}
|
|
|
|
|
|
|
|
if rankColIndex == -1 {
|
|
|
|
return nil, errors.New("taxonomy file does not contain rank column")
|
|
|
|
}
|
|
|
|
|
|
|
|
name := obiutils.RemoveAllExt(path)
|
|
|
|
short := obiutils.Basename(path)
|
|
|
|
|
|
|
|
line, err := csvfile.Read()
|
2025-01-29 10:45:26 +01:00
|
|
|
if err == nil {
|
|
|
|
parts := strings.Split(line[taxidColIndex], " ")
|
|
|
|
parts = strings.Split(parts[0], ":")
|
|
|
|
if len(parts) > 1 {
|
|
|
|
short = parts[0]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Infof("Taxonomy name: %s", name)
|
|
|
|
log.Infof("Taxon code: %s", short)
|
|
|
|
|
2025-03-14 14:22:22 +01:00
|
|
|
taxonomy := obitax.NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
|
2025-01-24 18:09:59 +01:00
|
|
|
|
2025-01-29 10:45:26 +01:00
|
|
|
root := true
|
2025-03-14 14:22:22 +01:00
|
|
|
var taxon *obitax.Taxon
|
2025-01-29 10:45:26 +01:00
|
|
|
|
|
|
|
for err == nil {
|
2025-01-24 18:09:59 +01:00
|
|
|
taxid := line[taxidColIndex]
|
|
|
|
parent := line[parentColIndex]
|
|
|
|
scientific_name := line[scientific_nameColIndex]
|
|
|
|
rank := line[rankColIndex]
|
|
|
|
|
2025-01-29 10:45:26 +01:00
|
|
|
taxon, err = taxonomy.AddTaxon(taxid, parent, rank, root, false)
|
2025-01-24 18:09:59 +01:00
|
|
|
|
2025-01-29 10:45:26 +01:00
|
|
|
if err != nil {
|
|
|
|
log.Fatalf("cannot add taxon %s: %v", taxid, err)
|
|
|
|
}
|
2025-01-24 18:09:59 +01:00
|
|
|
|
2025-01-29 10:45:26 +01:00
|
|
|
root = false
|
2025-01-24 18:09:59 +01:00
|
|
|
|
|
|
|
taxon.SetName(scientific_name, "scientific name")
|
|
|
|
|
2025-01-29 10:45:26 +01:00
|
|
|
line, err = csvfile.Read()
|
2025-01-24 18:09:59 +01:00
|
|
|
}
|
|
|
|
|
2025-01-29 10:45:26 +01:00
|
|
|
log.Infof("%d Taxa loaded", taxonomy.Len())
|
|
|
|
|
2025-01-24 18:09:59 +01:00
|
|
|
if !taxonomy.HasRoot() {
|
|
|
|
return nil, errors.New("taxonomy file does not contain root node")
|
|
|
|
}
|
|
|
|
|
|
|
|
return taxonomy, nil
|
|
|
|
}
|