Make sequence files recognized as a taxonomy

This commit is contained in:
Eric Coissac
2025-03-14 14:22:22 +01:00
parent d1c31c54de
commit 8448783499
21 changed files with 657 additions and 467 deletions

View File

@ -1,299 +0,0 @@
package obitax
import (
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
)
// __options__ holds the raw configuration consumed by CSVTaxaIterator when
// rendering taxa as CSV records. It is always manipulated through the
// Options wrapper, so every copy of an Options shares this single instance.
type __options__ struct {
	batch_size           int      // Number of items to process in a batch
	with_pattern         bool     // Emit the "query" column (pattern matched against the taxon)
	with_parent          bool     // Emit the "parent" column
	with_path            bool     // Emit the "path" column (full taxonomic path)
	with_rank            bool     // Emit the "taxonomic_rank" column
	with_scientific_name bool     // Emit the "scientific_name" column
	raw_taxid            bool     // Emit bare taxids instead of formatted taxon strings
	with_metadata        []string // Extra metadata keys emitted as additional columns
	source               string   // Source of the data
}
// Options wraps the __options__ struct to provide a pointer to the options.
// Copying an Options value is cheap, and all copies share (and mutate) the
// same underlying __options__ instance through this pointer.
type Options struct {
	pointer *__options__ // Pointer to the underlying options
}
// WithOption is a function type that takes an Options parameter and modifies it.
// Values of this type are produced by the Options* constructors below and are
// applied in order by MakeOptions.
type WithOption func(Options)
// MakeOptions builds an Options value initialized with the default settings,
// then applies each provided setter in order.
//
// Parameters:
//   - setters: A slice of WithOption functions to customize the options.
//
// Returns:
//   - An Options instance with the specified settings.
func MakeOptions(setters []WithOption) Options {
	defaults := __options__{
		batch_size:           obidefault.BatchSize(), // Number of items to process in a batch
		with_pattern:         true,
		with_parent:          false,
		with_path:            false,
		with_rank:            true,
		with_scientific_name: false,
		raw_taxid:            false,
		source:               "unknown",
	}

	options := Options{pointer: &defaults}
	for _, setter := range setters {
		setter(options)
	}

	return options
}
// BatchSize reports how many records are grouped into one processing batch.
func (o *Options) BatchSize() int {
	size := o.pointer.batch_size
	return size
}
// WithPattern reports whether the "query" pattern column is enabled.
func (o *Options) WithPattern() bool {
	enabled := o.pointer.with_pattern
	return enabled
}
// WithParent reports whether the "parent" column is enabled.
func (o *Options) WithParent() bool {
	enabled := o.pointer.with_parent
	return enabled
}
// WithPath reports whether the taxonomic "path" column is enabled.
func (o *Options) WithPath() bool {
	enabled := o.pointer.with_path
	return enabled
}
// WithRank reports whether the "taxonomic_rank" column is enabled.
func (o *Options) WithRank() bool {
	enabled := o.pointer.with_rank
	return enabled
}
// WithScientificName reports whether the "scientific_name" column is enabled.
func (o *Options) WithScientificName() bool {
	enabled := o.pointer.with_scientific_name
	return enabled
}
// RawTaxid reports whether taxids are emitted raw rather than formatted.
func (o *Options) RawTaxid() bool {
	enabled := o.pointer.raw_taxid
	return enabled
}
// Source returns the configured origin label attached to produced batches.
func (o *Options) Source() string {
	src := o.pointer.source
	return src
}
// WithMetadata returns the metadata keys that must be emitted as extra
// columns. When the pattern option is active, the internally managed
// "query" key is stripped from the stored slice before it is returned
// (CSVTaxaIterator handles that column itself).
func (o *Options) WithMetadata() []string {
	if o.WithPattern() {
		if idx := slices.Index(o.pointer.with_metadata, "query"); idx >= 0 {
			o.pointer.with_metadata = slices.Delete(o.pointer.with_metadata, idx, idx+1)
		}
	}
	return o.pointer.with_metadata
}
// OptionsBatchSize returns a WithOption function that sets the batch_size option.
//
// Parameters:
//   - size: An integer specifying the size of the batch to be processed.
func OptionsBatchSize(size int) WithOption {
	return func(opt Options) {
		opt.pointer.batch_size = size
	}
}
// OptionsWithPattern returns a WithOption that enables or disables the
// "query" pattern column.
func OptionsWithPattern(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_pattern = value
	}
}
// OptionsWithParent returns a WithOption that enables or disables the
// "parent" column.
func OptionsWithParent(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_parent = value
	}
}
// OptionsWithPath returns a WithOption that enables or disables the
// taxonomic "path" column.
func OptionsWithPath(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_path = value
	}
}
// OptionsWithRank returns a WithOption that enables or disables the
// "taxonomic_rank" column.
func OptionsWithRank(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_rank = value
	}
}
// OptionsWithScientificName returns a WithOption that enables or disables
// the "scientific_name" column.
func OptionsWithScientificName(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.with_scientific_name = value
	}
}
// OptionsRawTaxid returns a WithOption that selects raw taxids instead of
// formatted taxon strings.
func OptionsRawTaxid(value bool) WithOption {
	return func(opt Options) {
		opt.pointer.raw_taxid = value
	}
}
// OptionsSource returns a WithOption that sets the origin label attached
// to produced batches.
func OptionsSource(value string) WithOption {
	return func(opt Options) {
		opt.pointer.source = value
	}
}
// OptionsWithMetadata returns a WithOption that sets the metadata keys to
// emit as extra columns.
func OptionsWithMetadata(values ...string) WithOption {
	return func(opt Options) {
		opt.pointer.with_metadata = values
	}
}
// CSVTaxaIterator consumes the taxon iterator and produces an iterator of
// CSV records, one record per taxon, pushed downstream in batches of
// BatchSize() records. The column order is: "query" (if the pattern option
// is set), "taxid", "parent", "taxonomic_rank", "scientific_name", any
// metadata columns, and finally "path" — each controlled by the
// corresponding option.
func (iterator *ITaxon) CSVTaxaIterator(options ...WithOption) *obiitercsv.ICSVRecord {
	opt := MakeOptions(options)
	metakeys := make([]string, 0)

	newIter := obiitercsv.NewICSVRecord()

	// Register the single producer goroutine started below; it calls Done()
	// when the source iterator is exhausted.
	newIter.Add(1)
	batch_size := opt.BatchSize()

	if opt.WithPattern() {
		newIter.AppendField("query")
		// NOTE(review): "query" is appended to with_metadata here and removed
		// again inside WithMetadata(), so it never reaches metakeys and the
		// column is not emitted twice — presumably this marks the key as
		// consumed for other readers of the options; confirm intent.
		opt.pointer.with_metadata = append(opt.pointer.with_metadata, "query")
	}

	newIter.AppendField("taxid")
	// Captured once so the producer goroutine does not re-read the options.
	rawtaxid := opt.RawTaxid()

	if opt.WithParent() {
		newIter.AppendField("parent")
	}

	if opt.WithRank() {
		newIter.AppendField("taxonomic_rank")
	}

	if opt.WithScientificName() {
		newIter.AppendField("scientific_name")
	}

	if opt.WithMetadata() != nil {
		metakeys = opt.WithMetadata()
		for _, metadata := range metakeys {
			newIter.AppendField(metadata)
		}
	}

	if opt.WithPath() {
		newIter.AppendField("path")
	}

	// Close the output iterator once the producer goroutine has finished.
	go func() {
		newIter.WaitAndClose()
	}()

	// Producer: convert each taxon to a CSVRecord and push full batches.
	go func() {
		o := 0 // sequential batch order number
		data := make([]obiitercsv.CSVRecord, 0, batch_size)

		for iterator.Next() {
			taxon := iterator.Get()
			record := make(obiitercsv.CSVRecord)

			if opt.WithPattern() {
				record["query"] = taxon.MetadataAsString("query")
			}

			// Either the bare taxid or the formatted taxon string.
			if rawtaxid {
				record["taxid"] = *taxon.Node.Id()
			} else {
				record["taxid"] = taxon.String()
			}

			if opt.WithParent() {
				if rawtaxid {
					record["parent"] = *taxon.Node.ParentId()
				} else {
					record["parent"] = taxon.Parent().String()
				}
			}

			if opt.WithRank() {
				record["taxonomic_rank"] = taxon.Rank()
			}

			if opt.WithScientificName() {
				record["scientific_name"] = taxon.ScientificName()
			}

			if opt.WithPath() {
				record["path"] = taxon.Path().String()
			}

			for _, key := range metakeys {
				record[key] = taxon.MetadataAsString(key)
			}

			data = append(data, record)

			// Push a full batch and start a fresh one.
			if len(data) >= batch_size {
				newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data))
				data = make([]obiitercsv.CSVRecord, 0, batch_size)
				o++
			}
		}

		// Push the last, possibly partial, batch.
		if len(data) > 0 {
			newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data))
		}

		newIter.Done()
	}()

	return newIter
}

View File

@ -1,119 +0,0 @@
package obitax
import (
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"strings"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

	log "github.com/sirupsen/logrus"
)
// LoadCSVTaxonomy loads a taxonomy from a CSV file.
//
// The file must contain at least the columns "taxid", "parent",
// "scientific_name" and "taxonomic_rank"; extra columns are ignored.
// The taxon code is deduced from the first taxid when it has the form
// "code:id".
//
// Parameters:
//   - path: path of the CSV file (possibly compressed; opened via obiutils.Ropen).
//   - onlysn: kept for TaxonomyLoader signature compatibility; CSV files
//     only carry scientific names, so the flag is unused here.
//
// Returns the loaded taxonomy, or an error if the file cannot be read,
// a mandatory column is missing, a record is malformed, or no root node
// is present. Read errors other than EOF are now reported instead of
// silently truncating the taxonomy, and taxon-insertion errors are
// returned rather than terminating the process.
func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
	log.Infof("Loading taxonomy from csv file: %s", path)

	file, err := obiutils.Ropen(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	csvfile := csv.NewReader(file)
	csvfile.Comma = ','
	csvfile.ReuseRecord = false
	csvfile.LazyQuotes = true
	csvfile.Comment = '#'
	csvfile.FieldsPerRecord = -1 // records may have a variable number of fields
	csvfile.TrimLeadingSpace = true

	header, err := csvfile.Read()
	if err != nil {
		return nil, fmt.Errorf("cannot read header of taxonomy file %s: %w", path, err)
	}

	taxidColIndex := -1
	parentColIndex := -1
	scientific_nameColIndex := -1
	rankColIndex := -1

	for i, colName := range header {
		switch colName {
		case "taxid":
			taxidColIndex = i
		case "parent":
			parentColIndex = i
		case "scientific_name":
			scientific_nameColIndex = i
		case "taxonomic_rank":
			rankColIndex = i
		}
	}

	if taxidColIndex == -1 {
		return nil, errors.New("taxonomy file does not contain taxid column")
	}
	if parentColIndex == -1 {
		return nil, errors.New("taxonomy file does not contain parent column")
	}
	if scientific_nameColIndex == -1 {
		return nil, errors.New("taxonomy file does not contain scientific_name column")
	}
	if rankColIndex == -1 {
		// The header name looked for is "taxonomic_rank", not "rank".
		return nil, errors.New("taxonomy file does not contain taxonomic_rank column")
	}

	// Every record must reach the right-most mandatory column; shorter
	// records previously caused an index-out-of-range panic.
	minFields := max(taxidColIndex, parentColIndex, scientific_nameColIndex, rankColIndex) + 1

	name := obiutils.RemoveAllExt(path)
	short := obiutils.Basename(path)

	line, err := csvfile.Read()
	if err != nil && err != io.EOF {
		return nil, fmt.Errorf("cannot read taxonomy file %s: %w", path, err)
	}
	if err == nil && len(line) > taxidColIndex {
		// If taxids look like "code:1234", use "code" as the taxon code.
		parts := strings.Split(line[taxidColIndex], " ")
		parts = strings.Split(parts[0], ":")
		if len(parts) > 1 {
			short = parts[0]
		}
	}

	log.Infof("Taxonomy name: %s", name)
	log.Infof("Taxon code: %s", short)

	taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)

	root := true // the first record is registered as the root taxon
	for err == nil {
		if len(line) < minFields {
			return nil, fmt.Errorf("taxonomy file %s: record %v has fewer than %d fields", path, line, minFields)
		}

		taxid := line[taxidColIndex]
		parent := line[parentColIndex]
		scientific_name := line[scientific_nameColIndex]
		rank := line[rankColIndex]

		taxon, aerr := taxonomy.AddTaxon(taxid, parent, rank, root, false)
		if aerr != nil {
			return nil, fmt.Errorf("cannot add taxon %s: %w", taxid, aerr)
		}
		root = false

		taxon.SetName(scientific_name, "scientific name")

		line, err = csvfile.Read()
	}

	// Previously any read error other than EOF silently truncated the
	// taxonomy, which was then returned as a success.
	if err != io.EOF {
		return nil, fmt.Errorf("error while reading taxonomy file %s: %w", path, err)
	}

	log.Infof("%d Taxa loaded", taxonomy.Len())

	if !taxonomy.HasRoot() {
		return nil, errors.New("taxonomy file does not contain root node")
	}

	return taxonomy, nil
}

View File

@ -1,38 +0,0 @@
package obitax
import (
"strings"
"github.com/TuftsBCB/io/newick"
)
// Newick renders the taxonomy as one Newick tree string per root, each
// terminated by a newline.
//
// The previous implementation appended *tree (a value copy) to the
// parent's Children slice while later descendants were attached to the
// heap node held in the map — so every grandchild was silently lost.
// The tree is now materialized bottom-up from an id->children index once
// all taxa have been seen, and a self-parented root no longer registers
// itself as its own child.
func (taxonomy *Taxonomy) Newick() string {
	if taxonomy == nil {
		return ""
	}

	iterator := taxonomy.AsTaxonSet().Sort().Iterator()

	labels := make(map[*string]string, taxonomy.Len())
	children := make(map[*string][]*string, taxonomy.Len())
	roots := make([]*string, 0)

	// First pass: record every taxon's label and its parent/child edge.
	// The iterator is sorted, so parents are seen before their children.
	for iterator.Next() {
		taxon := iterator.Get()
		id := taxon.Node.id
		labels[id] = taxon.String()

		parent := taxon.Parent()
		switch {
		case parent == nil || parent.Node.id == id:
			// Self-parented (NCBI-style root) or parentless: a tree root.
			roots = append(roots, id)
		default:
			pid := parent.Node.id
			if _, known := labels[pid]; known {
				children[pid] = append(children[pid], id)
			} else {
				// Parent not part of this taxonomy view: treat as a root.
				roots = append(roots, id)
			}
		}
	}

	// Second pass: build each newick.Tree bottom-up so children are
	// complete before being embedded by value into their parent.
	var build func(id *string) newick.Tree
	build = func(id *string) newick.Tree {
		tree := newick.Tree{Label: labels[id]}
		for _, cid := range children[id] {
			tree.Children = append(tree.Children, build(cid))
		}
		return tree
	}

	rep := strings.Builder{}
	for _, rootID := range roots {
		tree := build(rootID)
		rep.WriteString(tree.String())
		rep.WriteString("\n")
	}

	return rep.String()
}

View File

@ -3,7 +3,6 @@ package obitax
import (
"sync"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
log "github.com/sirupsen/logrus"
)
@ -32,24 +31,5 @@ func IsDefaultTaxonomyDefined() bool {
}
// DefaultTaxonomy returns the process-wide default taxonomy, lazily loading
// it on first use from the taxonomy selected in obidefault. It returns nil
// when no taxonomy has been selected, and terminates the process if the
// selected taxonomy cannot be loaded.
//
// The previous implementation read __defaut_taxonomy__ outside the mutex
// (broken double-checked locking), which is a data race; the lock is now
// taken before the first read.
func DefaultTaxonomy() *Taxonomy {
	__defaut_taxonomy_mutex__.Lock()
	defer __defaut_taxonomy_mutex__.Unlock()

	if __defaut_taxonomy__ == nil && obidefault.HasSelectedTaxonomy() {
		taxonomy, err := LoadTaxonomy(
			obidefault.SelectedTaxonomy(),
			!obidefault.AreAlternativeNamesSelected(),
		)
		if err != nil {
			log.Fatalf("Cannot load default taxonomy: %v", err)
		}
		__defaut_taxonomy__ = taxonomy
	}

	return __defaut_taxonomy__
}

View File

@ -224,3 +224,9 @@ func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
return taxon.ISubTaxonomy()
}
// Consume exhausts the iterator, fetching and discarding every remaining taxon.
func (iterator *ITaxon) Consume() {
	for {
		if !iterator.Next() {
			return
		}
		_ = iterator.Get()
	}
}

View File

@ -1,213 +0,0 @@
package obitax
import (
"bufio"
"encoding/csv"
"fmt"
"io"
"os"
"path"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
// loadNodeTable reads a node table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The node table is expected to be in CSV format with a custom delimiter ('|') and comments
// starting with '#'. Each record represents a taxon with its taxid, parent taxid, and rank.
//
// Parameters:
//   - reader: An io.Reader from which the node table is read.
//   - taxonomy: A pointer to the Taxonomy instance where the taxon data will be added.
//
// The function trims whitespace from the taxid, parent, and rank and adds each taxon to the
// taxonomy; the taxon with taxid "1" is registered as the root. Any read or insertion error
// logs a fatal error and terminates the program — previously, read errors other than EOF
// silently ended the loop and the truncated taxonomy was used as if complete.
func loadNodeTable(reader io.Reader, taxonomy *Taxonomy) {
	file := csv.NewReader(reader)
	file.Comma = '|'
	file.Comment = '#'
	file.TrimLeadingSpace = true
	file.ReuseRecord = true

	n := 0
	for {
		record, err := file.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatalf("Error reading node table after %d records: %v", n, err)
		}
		n++

		// Guard against malformed records that would otherwise panic.
		if len(record) < 3 {
			log.Fatalf("Node table record %d has only %d fields (3 required)", n, len(record))
		}

		taxid := strings.TrimSpace(record[0])
		parent := strings.TrimSpace(record[1])
		rank := strings.TrimSpace(record[2])

		if _, err := taxonomy.AddTaxon(taxid, parent, rank, taxid == "1", false); err != nil {
			log.Fatalf("Error adding taxon %s: %v\n", taxid, err)
		}
	}
}
// loadNameTable reads a name table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The name table is a '|'-separated format where each record carries a taxid, a name, and a
// class name (field 4).
//
// Parameters:
//   - reader: An io.Reader from which the name table is read.
//   - taxonomy: A pointer to the Taxonomy instance where the taxon names will be set.
//   - onlysn: When true, only records whose class name is "scientific name" are processed.
//
// Returns the number of taxon names loaded into the taxonomy, or -1 if a line exceeds the
// buffer size. Malformed records (fewer than 4 fields) previously caused an
// index-out-of-range panic; they now terminate with a diagnostic that includes the
// line number.
func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int {
	file := bufio.NewReader(reader)

	n := 0 // number of names actually set
	l := 0 // current line number, for error reporting
	for {
		line, prefix, err := file.ReadLine()
		if err != nil {
			break
		}
		l++

		// A true prefix means the line was longer than the buffer.
		if prefix {
			return -1
		}

		record := strings.Split(string(line), "|")
		if len(record) < 4 {
			log.Fatalf("Name table line %d has only %d fields (4 required)", l, len(record))
		}

		taxid := strings.TrimSpace(record[0])
		name := strings.TrimSpace(record[1])
		classname := strings.TrimSpace(record[3])

		if !onlysn || classname == "scientific name" {
			n++
			taxon, _, err := taxonomy.Taxon(taxid)
			if err != nil {
				log.Fatalf("%s: is unknown from the taxonomy", taxid)
			}
			taxon.SetName(name, classname)
		}
	}

	return n
}
// loadMergedTable reads a merged table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The merged table is expected to be in CSV format with a custom delimiter ('|') and comments
// starting with '#'. Each record maps an old taxid to a new taxid.
//
// Parameters:
//   - reader: An io.Reader from which the merged table is read.
//   - taxonomy: A pointer to the Taxonomy instance where the alias mappings will be added.
//
// Returns the number of alias mappings loaded. Read errors other than EOF and malformed
// records now terminate with a diagnostic — previously they silently ended the loop,
// leaving the alias table incomplete.
func loadMergedTable(reader io.Reader, taxonomy *Taxonomy) int {
	file := csv.NewReader(reader)
	file.Comma = '|'
	file.Comment = '#'
	file.TrimLeadingSpace = true
	file.ReuseRecord = true

	n := 0
	for {
		record, err := file.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatalf("Error reading merged table after %d records: %v", n, err)
		}
		if len(record) < 2 {
			log.Fatalf("Merged table record %d has only %d fields (2 required)", n+1, len(record))
		}
		n++

		oldtaxid := strings.TrimSpace(record[0])
		newtaxid := strings.TrimSpace(record[1])

		taxonomy.AddAlias(oldtaxid, newtaxid, false)
	}

	return n
}
// LoadNCBITaxDump loads the NCBI taxonomy data from the specified directory.
// It reads the taxonomy nodes (nodes.dmp), taxon names (names.dmp), and merged
// taxa (merged.dmp) and constructs a Taxonomy object rooted at taxid "1".
//
// Parameters:
//   - directory: path to the directory containing the NCBI taxonomy dump files.
//   - onlysn: whether to load only scientific names (true) or all names (false).
//
// Returns the loaded taxonomy, or an error if any of the files cannot be
// opened or the root taxon is missing. Underlying open errors are now
// wrapped with %w instead of being discarded.
func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) {
	taxonomy := NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)

	//
	// Load the Taxonomy nodes
	//
	log.Printf("Loading Taxonomy nodes\n")

	nodefile, err := os.Open(path.Join(directory, "nodes.dmp"))
	if err != nil {
		return nil, fmt.Errorf("cannot open nodes file from '%s': %w", directory, err)
	}
	defer nodefile.Close()

	loadNodeTable(bufio.NewReader(nodefile), taxonomy)
	log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())

	//
	// Load the taxon names
	//
	log.Printf("Loading Taxon names\n")

	namefile, nerr := os.Open(path.Join(directory, "names.dmp"))
	if nerr != nil {
		return nil, fmt.Errorf("cannot open names file from '%s': %w", directory, nerr)
	}
	defer namefile.Close()

	// Buffer the reader for consistency with the other tables.
	n := loadNameTable(bufio.NewReader(namefile), taxonomy, onlysn)
	log.Printf("%d taxon names read\n", n)

	//
	// Load the merged taxa
	//
	log.Printf("Loading Merged taxa\n")

	aliasfile, aerr := os.Open(path.Join(directory, "merged.dmp"))
	if aerr != nil {
		return nil, fmt.Errorf("cannot open merged file from '%s': %w", directory, aerr)
	}
	defer aliasfile.Close()

	n = loadMergedTable(bufio.NewReader(aliasfile), taxonomy)
	log.Printf("%d merged taxa read\n", n)

	root, _, err := taxonomy.Taxon("1")
	if err != nil {
		return nil, fmt.Errorf("cannot find the root taxon (1) in the NCBI tax dump: %w", err)
	}
	taxonomy.SetRoot(root)

	return taxonomy, nil
}

View File

@ -1,146 +0,0 @@
package obitax
import (
"archive/tar"
"bufio"
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// IsNCBITarTaxDump reports whether the (possibly compressed) tar archive at
// path contains every regular file expected in an NCBI taxdump archive.
// It returns false when the archive cannot be opened.
func IsNCBITarTaxDump(path string) bool {
	file, err := obiutils.Ropen(path)
	if err != nil {
		return false
	}
	defer file.Close()

	// Every required member starts as unseen.
	required := map[string]bool{
		"citations.dmp": false,
		"division.dmp":  false,
		"gencode.dmp":   false,
		"names.dmp":     false,
		"delnodes.dmp":  false,
		"gc.prt":        false,
		"merged.dmp":    false,
		"nodes.dmp":     false,
	}

	archive := tar.NewReader(file)
	for {
		header, err := archive.Next()
		if err != nil {
			break
		}
		if header.Typeflag != tar.TypeReg {
			continue
		}
		if _, wanted := required[header.Name]; wanted {
			required[header.Name] = true
		}
	}

	for _, seen := range required {
		if !seen {
			return false
		}
	}
	return true
}
// LoadNCBITarTaxDump loads the NCBI taxonomy from a (possibly compressed)
// tar archive of the taxdump files.
//
// Parameters:
//   - path: path to the tar archive containing nodes.dmp, names.dmp and merged.dmp.
//   - onlysn: when true, only scientific names are loaded.
//
// Returns the loaded taxonomy, or an error if the archive or one of its
// members cannot be opened, or the root taxon is missing. The archive is
// reopened for each member because a tar stream can only be read forward.
// Underlying errors are now wrapped with %w, and the handle used for the
// merged table — previously leaked — is closed.
func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) {
	taxonomy := NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)

	//
	// Load the Taxonomy nodes
	//
	log.Printf("Loading Taxonomy nodes\n")

	file, err := obiutils.Ropen(path)
	if err != nil {
		return nil, fmt.Errorf("cannot open taxonomy file from '%s': %w", path, err)
	}
	nodefile, err := obiutils.TarFileReader(file, "nodes.dmp")
	if err != nil {
		file.Close()
		return nil, fmt.Errorf("cannot open nodes file from '%s': %w", path, err)
	}
	loadNodeTable(bufio.NewReader(nodefile), taxonomy)
	log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
	file.Close()

	//
	// Load the taxon names
	//
	log.Printf("Loading Taxon names\n")

	file, err = obiutils.Ropen(path)
	if err != nil {
		return nil, fmt.Errorf("cannot open taxonomy file from '%s': %w", path, err)
	}
	namefile, nerr := obiutils.TarFileReader(file, "names.dmp")
	if nerr != nil {
		file.Close()
		return nil, fmt.Errorf("cannot open names file from '%s': %w", path, nerr)
	}
	n := loadNameTable(namefile, taxonomy, onlysn)
	log.Printf("%d taxon names read\n", n)
	file.Close()

	//
	// Load the merged taxa
	//
	log.Printf("Loading Merged taxa\n")

	file, err = obiutils.Ropen(path)
	if err != nil {
		return nil, fmt.Errorf("cannot open taxonomy file from '%s': %w", path, err)
	}
	aliasfile, aerr := obiutils.TarFileReader(file, "merged.dmp")
	if aerr != nil {
		file.Close()
		return nil, fmt.Errorf("cannot open merged file from '%s': %w", path, aerr)
	}
	n = loadMergedTable(bufio.NewReader(aliasfile), taxonomy)
	log.Printf("%d merged taxa read\n", n)
	// This handle was previously never closed.
	file.Close()

	root, _, err := taxonomy.Taxon("1")
	if err != nil {
		return nil, fmt.Errorf("cannot find the root taxon (1) in the NCBI tax dump: %w", err)
	}
	taxonomy.SetRoot(root)

	return taxonomy, nil
}

View File

@ -1 +0,0 @@
package obitax

View File

@ -1,84 +0,0 @@
package obitax
import (
"fmt"
"os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/gabriel-vasile/mimetype"
log "github.com/sirupsen/logrus"
)
type TaxonomyLoader func(path string, onlysn bool) (*Taxonomy, error)
// DetectTaxonomyTarFormat inspects a tar archive and returns the loader
// able to read the taxonomy it contains, or an error when no known
// taxonomy layout is recognized.
func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) {
	if IsNCBITarTaxDump(path) {
		log.Infof("NCBI Taxdump Tar Archive detected: %s", path)
		return LoadNCBITarTaxDump, nil
	}

	return nil, fmt.Errorf("unknown taxonomy format: %s", path)
}
// DetectTaxonomyFormat determines which TaxonomyLoader can read the
// taxonomy at path. A directory is assumed to be an NCBI taxdump; a file
// is sniffed by MIME type (CSV or tar archive).
//
// Returns the matching loader, or an error when the path cannot be
// inspected or the format is unknown. Previously an unknown format hit
// log.Fatalf (killing the process) and the trailing `return nil, nil`
// could hand callers a nil loader; both paths now return an error.
func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) {
	// os.Stat replaces the former open/stat/close dance.
	fileInfo, err := os.Stat(path)
	if err != nil {
		return nil, err
	}

	if fileInfo.IsDir() {
		// For the moment, we only support the NCBI Taxdump directory format.
		log.Infof("NCBI Taxdump detected: %s", path)
		return LoadNCBITaxDump, nil
	}

	file, err := obiutils.Ropen(path)
	if err != nil {
		return nil, err
	}
	mime, err := mimetype.DetectReader(file)
	file.Close()
	if err != nil {
		return nil, err
	}

	switch mime.String() {
	case "text/csv":
		return LoadCSVTaxonomy, nil
	case "application/x-tar":
		return DetectTaxonomyTarFormat(path)
	}

	return nil, fmt.Errorf("unknown taxonomy format %s: %s", mime.String(), path)
}
// LoadTaxonomy reads the taxonomy stored at path, auto-detecting its
// format. onlysn restricts name loading to scientific names when true.
func LoadTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
	loader, err := DetectTaxonomyFormat(path)
	if err != nil {
		return nil, err
	}

	return loader(path, onlysn)
}