Rename obifind to obitaxonomy and introduce the new CSV format for taxonomy.

This commit is contained in:
Eric Coissac
2025-01-29 10:45:26 +01:00
parent c50a0f409d
commit 00f2dc2697
11 changed files with 4571 additions and 55 deletions

299
pkg/obitax/csviterator.go Normal file
View File

@ -0,0 +1,299 @@
package obitax
import (
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
)
// __options__ is the record of settings read by CSVTaxaIterator. It is only
// reached through the pointer held by Options, so every copy of an Options
// value sees the same settings.
type __options__ struct {
batch_size int // Number of items to process in a batch
with_pattern bool // Emit the "query" column
with_parent bool // Emit the "parent" column
with_path bool // Emit the "path" column
with_rank bool // Emit the "taxonomic_rank" column
with_scientific_name bool // Emit the "scientific_name" column
raw_taxid bool // Write the node identifiers instead of the taxon String() form
with_metadata []string // Metadata keys emitted as additional columns
source string // Source of the data
}
// Options wraps a pointer to an __options__ record. Because only the pointer
// is stored, an Options passed by value to a WithOption setter still updates
// the settings seen by the caller.
type Options struct {
pointer *__options__ // Shared settings record; set by MakeOptions
}
// WithOption is a setter applied by MakeOptions: it receives the Options
// being built and updates the settings behind its pointer.
type WithOption func(Options)
// MakeOptions builds an Options value initialised with the package defaults
// and then runs every setter on it, in slice order.
//
// Defaults: batch size taken from obidefault.BatchSize(), "query" and
// "taxonomic_rank" columns enabled, every other column disabled, raw taxids
// disabled, no metadata keys, and source "unknown".
//
// Parameters:
//   - setters: WithOption functions applied after the defaults are set.
//
// Returns:
//   - The configured Options.
func MakeOptions(setters []WithOption) Options {
	// Fields left out of the literal keep their zero value (false / nil),
	// which is the default for the remaining switches.
	opt := Options{
		pointer: &__options__{
			batch_size:   obidefault.BatchSize(),
			with_pattern: true,
			with_rank:    true,
			source:       "unknown",
		},
	}

	for i := range setters {
		setters[i](opt)
	}

	return opt
}
// BatchSize reports the batch_size setting: the number of items grouped
// into one batch.
func (o *Options) BatchSize() int {
	settings := o.pointer
	return settings.batch_size
}
// WithPattern reports the with_pattern setting. CSVTaxaIterator emits the
// "query" column when it is true.
func (o *Options) WithPattern() bool {
	settings := o.pointer
	return settings.with_pattern
}
// WithParent reports the with_parent setting. CSVTaxaIterator emits the
// "parent" column when it is true.
func (o *Options) WithParent() bool {
	settings := o.pointer
	return settings.with_parent
}
// WithPath reports the with_path setting. CSVTaxaIterator emits the "path"
// column when it is true.
func (o *Options) WithPath() bool {
	settings := o.pointer
	return settings.with_path
}
// WithRank reports the with_rank setting. CSVTaxaIterator emits the
// "taxonomic_rank" column when it is true.
func (o *Options) WithRank() bool {
	settings := o.pointer
	return settings.with_rank
}
// WithScientificName reports the with_scientific_name setting.
// CSVTaxaIterator emits the "scientific_name" column when it is true.
func (o *Options) WithScientificName() bool {
	settings := o.pointer
	return settings.with_scientific_name
}
// RawTaxid reports the raw_taxid setting. When it is true CSVTaxaIterator
// writes the node identifiers instead of the taxon String() form.
func (o *Options) RawTaxid() bool {
	settings := o.pointer
	return settings.raw_taxid
}
// Source reports the source setting, the label CSVTaxaIterator attaches to
// every record batch it creates.
func (o *Options) Source() string {
	settings := o.pointer
	return settings.source
}
// WithMetadata returns the metadata keys to emit as additional columns.
//
// When the pattern option is enabled the "query" column is produced on its
// own, so every "query" entry is left out of the returned keys to avoid a
// duplicated column.
//
// The call never modifies the stored key list: when filtering is needed a new
// slice is returned. Editing the stored slice in place (as slices.Delete
// does) would also shift and zero elements of the slice the caller handed to
// OptionsWithMetadata, because both share one backing array.
//
// Returns:
//   - The stored key list itself when nothing has to be filtered (it may be
//     nil), otherwise a freshly allocated, non-nil slice without "query".
func (o *Options) WithMetadata() []string {
	keys := o.pointer.with_metadata

	if !o.WithPattern() || !slices.Contains(keys, "query") {
		return keys
	}

	filtered := make([]string, 0, len(keys))
	for _, key := range keys {
		if key != "query" {
			filtered = append(filtered, key)
		}
	}

	return filtered
}
// OptionsBatchSize builds a WithOption that stores size in the batch_size
// setting of the options it is applied to.
//
// Parameters:
//   - size: number of items to place in each batch.
func OptionsBatchSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.batch_size = size
	}
}
// OptionsWithPattern builds a WithOption that stores value in the
// with_pattern setting, which controls the "query" column.
func OptionsWithPattern(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_pattern = value
	}
}
// OptionsWithParent builds a WithOption that stores value in the
// with_parent setting, which controls the "parent" column.
func OptionsWithParent(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_parent = value
	}
}
// OptionsWithPath builds a WithOption that stores value in the with_path
// setting, which controls the "path" column.
func OptionsWithPath(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_path = value
	}
}
// OptionsWithRank builds a WithOption that stores value in the with_rank
// setting, which controls the "taxonomic_rank" column.
func OptionsWithRank(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_rank = value
	}
}
// OptionsWithScientificName builds a WithOption that stores value in the
// with_scientific_name setting, which controls the "scientific_name" column.
func OptionsWithScientificName(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_scientific_name = value
	}
}
// OptionsRawTaxid builds a WithOption that stores value in the raw_taxid
// setting, which selects node identifiers over the taxon String() form.
func OptionsRawTaxid(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.raw_taxid = value
	}
}
// OptionsSource builds a WithOption that stores value in the source
// setting, the label attached to the record batches.
func OptionsSource(value string) WithOption {
	return func(opt Options) {
		opt.pointer.source = value
	}
}
// OptionsWithMetadata builds a WithOption that sets the metadata keys to
// emit as additional columns.
//
// Parameters:
//   - values: the metadata keys, in output order.
//
// Each application stores its own copy of values. A caller that spreads an
// existing slice (keys...) shares its backing array with the variadic
// parameter, so storing it directly would let later appends to, or deletions
// from, the stored list rewrite the caller's slice.
func OptionsWithMetadata(values ...string) WithOption {
	return func(opt Options) {
		// slices.Clone keeps nil as nil, so "no metadata" stays detectable.
		opt.pointer.with_metadata = slices.Clone(values)
	}
}
// CSVTaxaIterator converts the taxa produced by the iterator into batches of
// CSV records delivered through a new obiitercsv.ICSVRecord.
//
// The header fields are declared in this order, each one depending on the
// options: "query" (WithPattern), "taxid" (always), "parent" (WithParent),
// "taxonomic_rank" (WithRank), "scientific_name" (WithScientificName), one
// field per metadata key (WithMetadata), and "path" (WithPath). When the
// "query" column is enabled, "query" is never repeated as a metadata column.
//
// Parameters:
//   - options: WithOption setters applied on top of the MakeOptions defaults.
//
// Returns:
//   - The CSV record iterator. Records are pushed by a background goroutine
//     in batches of at most BatchSize records (a batch size below 1 is
//     treated as 1); a final partial batch is pushed when the taxa are
//     exhausted. A nil receiver produces the header and no record.
func (iterator *ITaxon) CSVTaxaIterator(options ...WithOption) *obiitercsv.ICSVRecord {
	opt := MakeOptions(options)

	// Options do not change while iterating: read each of them once instead
	// of once per taxon.
	batchSize := opt.BatchSize()
	if batchSize < 1 {
		// make() panics on a negative capacity; one record per batch is what
		// a zero batch size already produced.
		batchSize = 1
	}
	withPattern := opt.WithPattern()
	withParent := opt.WithParent()
	withRank := opt.WithRank()
	withScientificName := opt.WithScientificName()
	withPath := opt.WithPath()
	rawTaxid := opt.RawTaxid()
	source := opt.Source()

	// Build a private list of metadata keys. Copying keeps the goroutine
	// below independent from the option storage, and "query" is skipped here
	// because it already has its own column when the pattern is reported.
	var metakeys []string
	for _, key := range opt.WithMetadata() {
		if withPattern && key == "query" {
			continue
		}
		metakeys = append(metakeys, key)
	}

	newIter := obiitercsv.NewICSVRecord()
	newIter.Add(1)

	if withPattern {
		newIter.AppendField("query")
	}

	newIter.AppendField("taxid")

	if withParent {
		newIter.AppendField("parent")
	}

	if withRank {
		newIter.AppendField("taxonomic_rank")
	}

	if withScientificName {
		newIter.AppendField("scientific_name")
	}

	for _, key := range metakeys {
		newIter.AppendField(key)
	}

	if withPath {
		newIter.AppendField("path")
	}

	go func() {
		newIter.WaitAndClose()
	}()

	go func() {
		order := 0
		data := make([]obiitercsv.CSVRecord, 0, batchSize)

		// Taxonomy.ISubTaxonomy returns a nil iterator for an unknown taxid;
		// treat it as an empty stream rather than dereferencing it here.
		for iterator != nil && iterator.Next() {
			taxon := iterator.Get()
			record := make(obiitercsv.CSVRecord)

			if withPattern {
				record["query"] = taxon.MetadataAsString("query")
			}

			if rawTaxid {
				record["taxid"] = *taxon.Node.Id()
			} else {
				record["taxid"] = taxon.String()
			}

			if withParent {
				// NOTE(review): assumes Parent() / ParentId() are usable for
				// every taxon, including the root; confirm.
				if rawTaxid {
					record["parent"] = *taxon.Node.ParentId()
				} else {
					record["parent"] = taxon.Parent().String()
				}
			}

			if withRank {
				record["taxonomic_rank"] = taxon.Rank()
			}

			if withScientificName {
				record["scientific_name"] = taxon.ScientificName()
			}

			if withPath {
				record["path"] = taxon.Path().String()
			}

			for _, key := range metakeys {
				record[key] = taxon.MetadataAsString(key)
			}

			data = append(data, record)

			if len(data) >= batchSize {
				newIter.Push(obiitercsv.MakeCSVRecordBatch(source, order, data))
				// The pushed batch keeps the slice; start a new one.
				data = make([]obiitercsv.CSVRecord, 0, batchSize)
				order++
			}
		}

		if len(data) > 0 {
			newIter.Push(obiitercsv.MakeCSVRecordBatch(source, order, data))
		}

		newIter.Done()
	}()

	return newIter
}

View File

@ -11,6 +11,8 @@ import (
func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
log.Infof("Loading taxonomy from csv file: %s", path)
file, err := obiutils.Ropen(path)
if err != nil {
@ -47,7 +49,7 @@ func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
parentColIndex = i
case "scientific_name":
scientific_nameColIndex = i
case "rank":
case "taxonomic_rank":
rankColIndex = i
}
}
@ -70,31 +72,45 @@ func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
name := obiutils.RemoveAllExt(path)
short := obiutils.Basename(path)
taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
line, err := csvfile.Read()
if err == nil {
parts := strings.Split(line[taxidColIndex], " ")
parts = strings.Split(parts[0], ":")
if len(parts) > 1 {
short = parts[0]
}
}
for err != nil {
log.Infof("Taxonomy name: %s", name)
log.Infof("Taxon code: %s", short)
taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
root := true
var taxon *Taxon
for err == nil {
taxid := line[taxidColIndex]
parent := line[parentColIndex]
scientific_name := line[scientific_nameColIndex]
rank := line[rankColIndex]
parts := strings.Split(rank, ":")
rank = parts[0]
root := len(parts) > 1 && parts[1] == "root"
taxon, err := taxonomy.AddTaxon(taxid, parent, rank, false, root)
taxon.SetName(scientific_name, "scientific name")
taxon, err = taxonomy.AddTaxon(taxid, parent, rank, root, false)
if err != nil {
return nil, err
log.Fatalf("cannot add taxon %s: %v", taxid, err)
}
root = false
taxon.SetName(scientific_name, "scientific name")
line, err = csvfile.Read()
}
log.Infof("%d Taxa loaded", taxonomy.Len())
if !taxonomy.HasRoot() {
return nil, errors.New("taxonomy file does not contain root node")
}

View File

@ -1,11 +1 @@
package obitax
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
)
// WriteTaxonomyCSV is a stub: it ignores iterator, terminalAction and
// filenames, and always returns nil.
func WriteTaxonomyCSV(iterator ITaxon, terminalAction bool, filenames ...string) *obiitercsv.ICSVRecord {
	return nil
}

View File

@ -2,6 +2,7 @@ package obitax
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// ITaxon represents an iterator for traversing Taxon instances.
@ -189,12 +190,12 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
go func() {
for i := lpath - 1; i >= 0; i-- {
taxon := path.Taxon(i)
parents[taxon.Node] = true
iter.Push(taxon)
}
pushed := true
log.Warn(parents)
for pushed {
itaxo := taxo.Iterator()
pushed = false
@ -215,3 +216,13 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
return iter
}
// ISubTaxonomy looks up taxid with taxonomy.Taxon and returns the result of
// calling ISubTaxonomy on the taxon found. It returns nil when the lookup
// yields nil.
func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
	if taxon := taxonomy.Taxon(taxid); taxon != nil {
		return taxon.ISubTaxonomy()
	}

	return nil
}

View File

@ -197,7 +197,7 @@ func (taxonomy *Taxonomy) Len() int {
// - A pointer to the newly created Taxon instance.
// - An error if the taxon cannot be added (e.g., it already exists and replace is false).
func (taxonomy *Taxonomy) AddTaxon(taxid, parent string, rank string, isRoot bool, replace bool) (*Taxon, error) {
taxonomy = taxonomy.OrDefault(false)
taxonomy = taxonomy.OrDefault(true)
parentid, perr := taxonomy.Id(parent)
if perr != nil {