introduce obidefault
pkg/obitax/csvtaxdump_read.go (new file, 103 lines)
@@ -0,0 +1,103 @@
package obitax

import (
    "encoding/csv"
    "errors"
    "strings"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
    log "github.com/sirupsen/logrus"
)

func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {

    file, err := obiutils.Ropen(path)

    if err != nil {
        return nil, err
    }

    defer file.Close()

    csvfile := csv.NewReader(file)

    csvfile.Comma = ','
    csvfile.ReuseRecord = false
    csvfile.LazyQuotes = true
    csvfile.Comment = '#'
    csvfile.FieldsPerRecord = -1
    csvfile.TrimLeadingSpace = true

    header, err := csvfile.Read()

    if err != nil {
        log.Fatal(err)
    }

    taxidColIndex := -1
    parentColIndex := -1
    scientific_nameColIndex := -1
    rankColIndex := -1

    for i, colName := range header {
        switch colName {
        case "taxid":
            taxidColIndex = i
        case "parent":
            parentColIndex = i
        case "scientific_name":
            scientific_nameColIndex = i
        case "rank":
            rankColIndex = i
        }
    }

    if taxidColIndex == -1 {
        return nil, errors.New("taxonomy file does not contain taxid column")
    }

    if parentColIndex == -1 {
        return nil, errors.New("taxonomy file does not contain parent column")
    }

    if scientific_nameColIndex == -1 {
        return nil, errors.New("taxonomy file does not contain scientific_name column")
    }

    if rankColIndex == -1 {
        return nil, errors.New("taxonomy file does not contain rank column")
    }

    name := obiutils.RemoveAllExt(path)
    short := obiutils.Basename(path)
    taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)

    // Read data records until the first read error (typically io.EOF);
    // each record is added to the taxonomy and tagged with its scientific name.
    for line, err := csvfile.Read(); err == nil; line, err = csvfile.Read() {
        taxid := line[taxidColIndex]
        parent := line[parentColIndex]
        scientific_name := line[scientific_nameColIndex]
        rank := line[rankColIndex]

        parts := strings.Split(rank, ":")

        rank = parts[0]

        root := len(parts) > 1 && parts[1] == "root"

        taxon, err := taxonomy.AddTaxon(taxid, parent, rank, false, root)

        if err != nil {
            return nil, err
        }

        taxon.SetName(scientific_name, "scientific name")
    }

    if !taxonomy.HasRoot() {
        return nil, errors.New("taxonomy file does not contain root node")
    }

    return taxonomy, nil
}
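For reference, LoadCSVTaxonomy expects a header row naming at least the taxid, parent, scientific_name and rank columns, and a rank value suffixed with ":root" marks the root taxon. Below is a minimal usage sketch, not part of the commit: the import path for pkg/obitax mirrors the pkg/obiutils path used in this commit, the sample rows (a self-parented root plus one child) and the temporary file are illustrative, and it assumes obiutils.Ropen accepts a plain uncompressed file.

package main

import (
    "log"
    "os"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)

func main() {
    // Illustrative CSV taxonomy: header plus a self-parented root and one child.
    data := "taxid,parent,scientific_name,rank\n" +
        "1,1,root,no rank:root\n" +
        "2,1,Bacteria,superkingdom\n"

    // Write the sample taxonomy to a temporary file.
    f, err := os.CreateTemp("", "taxo-*.csv")
    if err != nil {
        log.Fatal(err)
    }
    defer os.Remove(f.Name())
    if _, err := f.WriteString(data); err != nil {
        log.Fatal(err)
    }
    f.Close()

    // Second argument mirrors the onlysn flag of LoadCSVTaxonomy.
    taxonomy, err := obitax.LoadCSVTaxonomy(f.Name(), true)
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("loaded %d taxa", taxonomy.Len())
}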
pkg/obitax/csvtaxdump_write.go (new file, 11 lines)
@@ -0,0 +1,11 @@
package obitax

// import (
//     "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
// )

// func WriteTaxonomyCSV(iterator obiiter.IBioSequence,
//     terminalAction bool, filenames ...string) *obiiter.ICSVRecord {

//     return nil
// }
@@ -1,6 +1,9 @@
package obitax

import log "github.com/sirupsen/logrus"
import (
    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"

    log "github.com/sirupsen/logrus"
)

var __defaut_taxonomy__ *Taxonomy

@@ -26,5 +29,20 @@ func IsDefaultTaxonomyDefined() bool {
}

func DefaultTaxonomy() *Taxonomy {
    var err error
    if __defaut_taxonomy__ == nil {
        if obidefault.HasSelectedTaxonomy() {
            __defaut_taxonomy__, err = LoadTaxonomy(
                obidefault.SelectedTaxonomy(),
                !obidefault.AreAlternativeNamesSelected(),
            )

            if err != nil {
                log.Fatalf("Cannot load default taxonomy: %v", err)
            }
        }
    }

    return __defaut_taxonomy__
}
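With this change, DefaultTaxonomy resolves lazily: on first call, if obidefault reports a selected taxonomy, it is loaded through LoadTaxonomy (restricted to scientific names unless alternative names were selected) and cached; when nothing has been selected it returns nil. A short calling sketch, assuming the pkg/obitax import path used elsewhere in this commit:

package main

import (
    "log"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)

func main() {
    // Returns the cached default taxonomy, loading it on first use from the
    // obidefault selection; nil when no taxonomy has been selected.
    taxonomy := obitax.DefaultTaxonomy()
    if taxonomy == nil {
        log.Println("no default taxonomy selected")
        return
    }
    log.Printf("default taxonomy loaded: %d taxa", taxonomy.Len())
}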
pkg/obitax/ncbitaxdump_read.go (new file, 203 lines)
@@ -0,0 +1,203 @@
package obitax

import (
    "bufio"
    "encoding/csv"
    "fmt"
    "io"
    "os"
    "path"
    "strings"

    log "github.com/sirupsen/logrus"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

// loadNodeTable reads a node table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The node table is expected to be in CSV format with a custom delimiter ('|') and comments
// starting with '#'. Each record in the table represents a taxon with its taxid, parent taxid,
// and rank.
//
// Parameters:
// - reader: An io.Reader from which the node table is read.
// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon data will be added.
//
// The function reads each record from the input, trims whitespace from the taxid, parent, and rank,
// and adds the taxon to the taxonomy. If an error occurs while adding a taxon, the function logs
// a fatal error and terminates the program.
func loadNodeTable(reader io.Reader, taxonomy *Taxonomy) {
    file := csv.NewReader(reader)
    file.Comma = '|'
    file.Comment = '#'
    file.TrimLeadingSpace = true
    file.ReuseRecord = true

    n := 0

    for record, err := file.Read(); err == nil; record, err = file.Read() {
        n++
        taxid := strings.TrimSpace(record[0])
        parent := strings.TrimSpace(record[1])
        rank := strings.TrimSpace(record[2])

        _, err := taxonomy.AddTaxon(taxid, parent, rank, taxid == "1", false)

        if err != nil {
            log.Fatalf("Error adding taxon %s: %v\n", taxid, err)
        }
    }
}
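NCBI nodes.dmp rows conventionally separate fields with a tab/pipe/tab sequence, which is why loadNodeTable reads with '|' as the CSV delimiter and then trims the surrounding whitespace. A self-contained sketch of that parsing step on one illustrative record (the taxids and the embl-code field shown are examples, not taken from the commit):

package main

import (
    "encoding/csv"
    "fmt"
    "strings"
)

func main() {
    // One nodes.dmp-style row: taxid | parent taxid | rank | ...
    row := "9606\t|\t9605\t|\tspecies\t|\tHS\t|\n"

    r := csv.NewReader(strings.NewReader(row))
    r.Comma = '|'             // fields are pipe-separated
    r.TrimLeadingSpace = true // drops the tab that follows each '|'

    record, err := r.Read()
    if err != nil {
        panic(err)
    }

    taxid := strings.TrimSpace(record[0])
    parent := strings.TrimSpace(record[1])
    rank := strings.TrimSpace(record[2])

    fmt.Println(taxid, parent, rank) // Output: 9606 9605 species
}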

// loadNameTable reads a name table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The name table is expected to be in a custom format with fields separated by the '|' character.
// Each record in the table represents a taxon with its taxid, name, and class name.
//
// Parameters:
// - reader: An io.Reader from which the name table is read.
// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon names will be set.
// - onlysn: A boolean flag indicating whether to only process records with the class name "scientific name".
//
// Returns:
//
// The number of taxon names successfully loaded into the taxonomy. If a line is too long, -1 is returned.
// The function processes each line, trims whitespace from the taxid, name, and class name, and sets
// the name in the taxonomy if the conditions are met.
func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int {
    // file := csv.NewReader(reader)
    // file.Comma = '|'
    // file.Comment = '#'
    // file.TrimLeadingSpace = true
    // file.ReuseRecord = true
    // file.LazyQuotes = true
    file := bufio.NewReader(reader)

    n := 0
    l := 0

    for line, prefix, err := file.ReadLine(); err == nil; line, prefix, err = file.ReadLine() {
        l++
        if prefix {
            return -1
        }

        record := strings.Split(string(line), "|")
        taxid := strings.TrimSpace(record[0])

        name := strings.TrimSpace(record[1])
        classname := strings.TrimSpace(record[3])

        if !onlysn || classname == "scientific name" {
            n++
            taxonomy.Taxon(taxid).SetName(name, classname)
        }
    }

    return n
}
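names.dmp uses the same tab/pipe/tab layout with four fields per row (taxid, name text, unique name, name class), which is why the name class sits at index 3 after splitting on '|'. A small sketch of that split, on an illustrative record:

package main

import (
    "fmt"
    "strings"
)

func main() {
    // One names.dmp-style row: taxid | name text | unique name | name class |
    line := "9606\t|\tHomo sapiens\t|\t\t|\tscientific name\t|"

    record := strings.Split(line, "|")
    taxid := strings.TrimSpace(record[0])
    name := strings.TrimSpace(record[1])
    classname := strings.TrimSpace(record[3])

    // When onlysn is true, only rows whose class is "scientific name" are kept.
    fmt.Println(taxid, name, classname) // Output: 9606 Homo sapiens scientific name
}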

// loadMergedTable reads a merged table from the provided reader and populates the given taxonomy.
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
// The merged table is expected to be in CSV format with a custom delimiter ('|') and comments
// starting with '#'. Each record in the table represents a mapping between an old taxid and a new taxid.
//
// Parameters:
// - reader: An io.Reader from which the merged table is read.
// - taxonomy: A pointer to an obitax.Taxonomy instance where the alias mappings will be added.
//
// Returns:
//
// The number of alias mappings successfully loaded into the taxonomy. The function processes
// each record, trims whitespace from the old and new taxid, and adds the alias to the taxonomy.
func loadMergedTable(reader io.Reader, taxonomy *Taxonomy) int {
    file := csv.NewReader(reader)
    file.Comma = '|'
    file.Comment = '#'
    file.TrimLeadingSpace = true
    file.ReuseRecord = true

    n := 0

    for record, err := file.Read(); err == nil; record, err = file.Read() {
        n++
        oldtaxid := strings.TrimSpace(record[0])
        newtaxid := strings.TrimSpace(record[1])

        taxonomy.AddAlias(newtaxid, oldtaxid, false)
    }

    return n
}

// LoadNCBITaxDump loads the NCBI taxonomy data from the specified directory.
// It reads the taxonomy nodes, taxon names, and merged taxa from the corresponding files
// and constructs a Taxonomy object.
//
// Parameters:
// - directory: A string representing the path to the directory containing the NCBI taxonomy dump files.
// - onlysn: A boolean indicating whether to load only scientific names (true) or all names (false).
//
// Returns:
// - A pointer to the obitax.Taxonomy object containing the loaded taxonomy data, or an error
//   if any of the files cannot be opened or read.
func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) {

    taxonomy := NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)

    //
    // Load the Taxonomy nodes
    //

    log.Printf("Loading Taxonomy nodes\n")

    nodefile, err := os.Open(path.Join(directory, "nodes.dmp"))
    if err != nil {
        return nil, fmt.Errorf("cannot open nodes file from '%s'",
            directory)
    }
    defer nodefile.Close()

    buffered := bufio.NewReader(nodefile)
    loadNodeTable(buffered, taxonomy)
    log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())

    //
    // Load the Taxon names
    //

    log.Printf("Loading Taxon names\n")

    namefile, nerr := os.Open(path.Join(directory, "names.dmp"))
    if nerr != nil {
        return nil, fmt.Errorf("cannot open names file from '%s'",
            directory)
    }
    defer namefile.Close()

    n := loadNameTable(namefile, taxonomy, onlysn)
    log.Printf("%d taxon names read\n", n)

    //
    // Load the merged taxa
    //

    log.Printf("Loading Merged taxa\n")

    aliasfile, aerr := os.Open(path.Join(directory, "merged.dmp"))
    if aerr != nil {
        return nil, fmt.Errorf("cannot open merged file from '%s'",
            directory)
    }
    defer aliasfile.Close()

    buffered = bufio.NewReader(aliasfile)
    n = loadMergedTable(buffered, taxonomy)
    log.Printf("%d merged taxa read\n", n)

    root := taxonomy.Taxon("1")
    taxonomy.SetRoot(root)

    return taxonomy, nil
}
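A minimal sketch of loading an unpacked taxdump directory and looking up a taxid afterwards. The directory path and the taxid are placeholders, and the pkg/obitax import path follows the pattern used elsewhere in this commit:

package main

import (
    "log"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)

func main() {
    // The directory must contain nodes.dmp, names.dmp and merged.dmp.
    taxonomy, err := obitax.LoadNCBITaxDump("/data/ncbitaxo", true) // onlysn = true
    if err != nil {
        log.Fatal(err)
    }

    human := taxonomy.Taxon("9606")
    log.Printf("loaded %d taxa; taxid 9606 resolves to %v", taxonomy.Len(), human)
}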
pkg/obitax/ncbitaxdump_readtar.go (new file, 141 lines)
@@ -0,0 +1,141 @@
package obitax

import (
    "archive/tar"
    "bufio"
    "fmt"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

    log "github.com/sirupsen/logrus"
)

func IsNCBITarTaxDump(path string) bool {

    file, err := obiutils.Ropen(path)

    if err != nil {
        return false
    }

    defer file.Close()

    citations := false
    division := false
    gencode := false
    names := false
    delnodes := false
    gc := false
    merged := false
    nodes := false

    tarfile := tar.NewReader(file)

    header, err := tarfile.Next()

    for err == nil {
        name := header.Name

        if header.Typeflag == tar.TypeReg {
            switch name {
            case "citations.dmp":
                citations = true
            case "division.dmp":
                division = true
            case "gencode.dmp":
                gencode = true
            case "names.dmp":
                names = true
            case "delnodes.dmp":
                delnodes = true
            case "gc.prt":
                gc = true
            case "merged.dmp":
                merged = true
            case "nodes.dmp":
                nodes = true
            }
        }
        header, err = tarfile.Next()
    }

    return citations && division && gencode && names && delnodes && gc && merged && nodes
}

func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) {

    taxonomy := NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)

    //
    // Load the Taxonomy nodes
    //

    log.Printf("Loading Taxonomy nodes\n")

    file, err := obiutils.Ropen(path)
    if err != nil {
        return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
            path)
    }

    nodefile, err := obiutils.TarFileReader(file, "nodes.dmp")
    if err != nil {
        file.Close()
        return nil, fmt.Errorf("cannot open nodes file from '%s'",
            path)
    }

    buffered := bufio.NewReader(nodefile)
    loadNodeTable(buffered, taxonomy)
    log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
    file.Close()

    //
    // Load the Taxon names
    //

    log.Printf("Loading Taxon names\n")

    file, err = obiutils.Ropen(path)
    if err != nil {
        return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
            path)
    }

    namefile, nerr := obiutils.TarFileReader(file, "names.dmp")
    if nerr != nil {
        file.Close()
        return nil, fmt.Errorf("cannot open names file from '%s'",
            path)
    }
    n := loadNameTable(namefile, taxonomy, onlysn)
    log.Printf("%d taxon names read\n", n)
    file.Close()

    //
    // Load the merged taxa
    //

    log.Printf("Loading Merged taxa\n")
    file, err = obiutils.Ropen(path)
    if err != nil {
        return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
            path)
    }

    aliasfile, aerr := obiutils.TarFileReader(file, "merged.dmp")
    if aerr != nil {
        file.Close()
        return nil, fmt.Errorf("cannot open merged file from '%s'",
            path)
    }

    buffered = bufio.NewReader(aliasfile)
    n = loadMergedTable(buffered, taxonomy)
    log.Printf("%d merged taxa read\n", n)

    root := taxonomy.Taxon("1")
    taxonomy.SetRoot(root)

    return taxonomy, nil
}
pkg/obitax/taxonomy_read.go (new file, 84 lines)
@@ -0,0 +1,84 @@
package obitax

import (
    "fmt"
    "os"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
    "github.com/gabriel-vasile/mimetype"

    log "github.com/sirupsen/logrus"
)

type TaxonomyLoader func(path string, onlysn bool) (*Taxonomy, error)

func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) {

    switch {
    case IsNCBITarTaxDump(path):
        log.Infof("NCBI Taxdump Tar Archive detected: %s", path)
        return LoadNCBITarTaxDump, nil
    }

    return nil, fmt.Errorf("unknown taxonomy format: %s", path)
}

func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) {

    file, err := os.Open(path)
    if err != nil {
        return nil, err
    }

    fileInfo, err := file.Stat()
    if err != nil {
        file.Close()
        return nil, err
    }

    file.Close()

    if fileInfo.IsDir() {
        // For the moment, we only support NCBI Taxdump directory format
        log.Infof("NCBI Taxdump detected: %s", path)
        return LoadNCBITaxDump, nil
    } else {
        file, err := obiutils.Ropen(path)

        if err != nil {
            return nil, err
        }

        mimetype, err := mimetype.DetectReader(file)

        if err != nil {
            file.Close()
            return nil, err
        }

        file.Close()

        switch mimetype.String() {
        case "text/csv":
            return LoadCSVTaxonomy, nil
        case "application/x-tar":
            return DetectTaxonomyTarFormat(path)
        }

        log.Fatalf("Detected file format: %s", mimetype.String())
    }

    return nil, nil
}

func LoadTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
    loader, err := DetectTaxonomyFormat(path)

    if err != nil {
        return nil, err
    }

    taxonomy, err := loader(path, onlysn)

    return taxonomy, err
}
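LoadTaxonomy ties the new loaders together: DetectTaxonomyFormat picks a loader from the path (an NCBI taxdump directory, a tar archive containing the expected taxdump members, or a CSV file detected by mimetype), and that loader is then applied to the same path. A short sketch with a placeholder archive name; it assumes such an archive exists and is recognized by the detection above:

package main

import (
    "log"

    "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)

func main() {
    // Works for an NCBI taxdump directory, a taxdump tar archive, or a CSV
    // dump; the format is auto-detected before loading.
    taxonomy, err := obitax.LoadTaxonomy("ncbitaxdump.tar", true)
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("taxonomy loaded: %d taxa", taxonomy.Len())
}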