Make sequence files recognized as a taxonomy

This commit is contained in:
Eric Coissac
2025-03-14 14:22:22 +01:00
parent d1c31c54de
commit 8448783499
21 changed files with 657 additions and 467 deletions

View File

@@ -0,0 +1,113 @@
package obiformats
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)
// CSVTaxaIterator converts a taxon iterator into an iterator over CSV records.
//
// The columns of the produced records depend on the options:
//   - "query" when WithPattern is enabled (the key is also appended to the
//     metadata list so its value is fetched through MetadataAsString),
//   - "taxid" always; its value is the raw node id when RawTaxid is enabled,
//     otherwise the formatted taxon string,
//   - "parent", "taxonomic_rank", "scientific_name" and "path" when the
//     corresponding option is enabled,
//   - one extra column per metadata key returned by WithMetadata.
//
// Records are accumulated into batches of opt.BatchSize() and pushed from a
// dedicated goroutine; the returned iterator is closed once the input
// iterator is exhausted.
func CSVTaxaIterator(iterator *obitax.ITaxon, options ...WithOption) *obiitercsv.ICSVRecord {

	opt := MakeOptions(options)
	metakeys := make([]string, 0)

	newIter := obiitercsv.NewICSVRecord()

	// One producer goroutine feeds the iterator; Done() below releases
	// the WaitAndClose() goroutine.
	newIter.Add(1)

	batch_size := opt.BatchSize()

	if opt.WithPattern() {
		newIter.AppendField("query")
		// "query" is read back via MetadataAsString, so register it as a
		// metadata key as well.
		opt.pointer.with_metadata = append(opt.pointer.with_metadata, "query")
	}

	newIter.AppendField("taxid")
	rawtaxid := opt.RawTaxid()

	if opt.WithParent() {
		newIter.AppendField("parent")
	}

	if opt.WithRank() {
		newIter.AppendField("taxonomic_rank")
	}

	if opt.WithScientificName() {
		newIter.AppendField("scientific_name")
	}

	if opt.WithMetadata() != nil {
		metakeys = opt.WithMetadata()
		for _, metadata := range metakeys {
			newIter.AppendField(metadata)
		}
	}

	if opt.WithPath() {
		newIter.AppendField("path")
	}

	// Closes the output iterator once the producer has called Done().
	go func() {
		newIter.WaitAndClose()
	}()

	// Producer: converts every taxon into a CSV record and pushes batches.
	go func() {
		o := 0
		data := make([]obiitercsv.CSVRecord, 0, batch_size)
		for iterator.Next() {
			taxon := iterator.Get()
			record := make(obiitercsv.CSVRecord)

			if opt.WithPattern() {
				record["query"] = taxon.MetadataAsString("query")
			}

			// Raw taxid = bare node id; otherwise the formatted string form.
			if rawtaxid {
				record["taxid"] = *taxon.Node.Id()
			} else {
				record["taxid"] = taxon.String()
			}

			if opt.WithParent() {
				if rawtaxid {
					record["parent"] = *taxon.Node.ParentId()
				} else {
					record["parent"] = taxon.Parent().String()
				}
			}

			if opt.WithRank() {
				record["taxonomic_rank"] = taxon.Rank()
			}

			if opt.WithScientificName() {
				record["scientific_name"] = taxon.ScientificName()
			}

			if opt.WithPath() {
				record["path"] = taxon.Path().String()
			}

			for _, key := range metakeys {
				record[key] = taxon.MetadataAsString(key)
			}

			data = append(data, record)

			// Push a full batch and start a new one.
			if len(data) >= batch_size {
				newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data))
				data = make([]obiitercsv.CSVRecord, 0, batch_size)
				o++
			}
		}

		// Flush the trailing, partially filled batch.
		if len(data) > 0 {
			newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data))
		}

		newIter.Done()
	}()

	return newIter
}

View File

@@ -0,0 +1,120 @@
package obiformats
import (
	"encoding/csv"
	"errors"
	"io"
	"strings"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

	log "github.com/sirupsen/logrus"
)
// LoadCSVTaxonomy loads a taxonomy from a CSV file.
//
// The file must start with a header line declaring at least the columns
// "taxid", "parent", "scientific_name" and "taxonomic_rank"; their order is
// free and extra columns are ignored. The first data line is used as the
// root taxon, and a prefixed taxid on that line (e.g. "code:1") defines the
// taxon code of the taxonomy.
//
// Parameters:
//   - path: path of the (possibly compressed) CSV file.
//   - onlysn: kept for signature compatibility with the other taxonomy
//     loaders; CSV files carry a single name per taxon, so it is unused here.
//
// Returns the loaded taxonomy, or an error if the file cannot be read, a
// required column is missing, or the taxonomy has no root node.
func LoadCSVTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
	log.Infof("Loading taxonomy from csv file: %s", path)

	file, err := obiutils.Ropen(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	csvfile := csv.NewReader(file)
	csvfile.Comma = ','
	csvfile.ReuseRecord = false
	csvfile.LazyQuotes = true
	csvfile.Comment = '#'
	csvfile.FieldsPerRecord = -1
	csvfile.TrimLeadingSpace = true

	header, err := csvfile.Read()
	if err != nil {
		// Return the failure to the caller instead of aborting the whole
		// process (the function already reports other problems as errors).
		return nil, err
	}

	taxidColIndex := -1
	parentColIndex := -1
	scientific_nameColIndex := -1
	rankColIndex := -1

	// Locate the mandatory columns in the header.
	for i, colName := range header {
		switch colName {
		case "taxid":
			taxidColIndex = i
		case "parent":
			parentColIndex = i
		case "scientific_name":
			scientific_nameColIndex = i
		case "taxonomic_rank":
			rankColIndex = i
		}
	}

	if taxidColIndex == -1 {
		return nil, errors.New("taxonomy file does not contain taxid column")
	}

	if parentColIndex == -1 {
		return nil, errors.New("taxonomy file does not contain parent column")
	}

	if scientific_nameColIndex == -1 {
		return nil, errors.New("taxonomy file does not contain scientific_name column")
	}

	if rankColIndex == -1 {
		// Message names the actual expected column ("taxonomic_rank"),
		// not the internal shorthand "rank".
		return nil, errors.New("taxonomy file does not contain taxonomic_rank column")
	}

	name := obiutils.RemoveAllExt(path)
	short := obiutils.Basename(path)

	line, err := csvfile.Read()

	if err == nil {
		// A taxid of the form "code:1234" on the first data line defines
		// the taxon code of the taxonomy.
		parts := strings.Split(line[taxidColIndex], " ")
		parts = strings.Split(parts[0], ":")
		if len(parts) > 1 {
			short = parts[0]
		}
	}

	log.Infof("Taxonomy name: %s", name)
	log.Infof("Taxon code: %s", short)

	taxonomy := obitax.NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)

	root := true
	var taxon *obitax.Taxon

	for err == nil {
		taxid := line[taxidColIndex]
		parent := line[parentColIndex]
		scientific_name := line[scientific_nameColIndex]
		rank := line[rankColIndex]

		// The first data line of the file becomes the root taxon.
		taxon, err = taxonomy.AddTaxon(taxid, parent, rank, root, false)

		if err != nil {
			log.Fatalf("cannot add taxon %s: %v", taxid, err)
		}

		root = false
		taxon.SetName(scientific_name, "scientific name")

		line, err = csvfile.Read()
	}

	// Only a normal end of file may end the loop silently; any other read
	// error (e.g. a CSV parse error) is reported to the caller.
	if !errors.Is(err, io.EOF) {
		return nil, err
	}

	log.Infof("%d Taxa loaded", taxonomy.Len())

	if !taxonomy.HasRoot() {
		return nil, errors.New("taxonomy file does not contain root node")
	}

	return taxonomy, nil
}

View File

@@ -0,0 +1,214 @@
package obiformats
import (
"bufio"
"encoding/csv"
"fmt"
"io"
"os"
"path"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
// loadNodeTable reads the NCBI taxdump node table from reader and inserts
// every taxon into taxonomy. It is an internal helper of the NCBI taxdump
// reader and should not be called directly.
//
// The table is a '|'-separated file with '#' comments, whose first three
// fields are taxid, parent taxid and rank. The taxon with taxid "1" is
// flagged as the root. Any failure to insert a taxon is fatal.
func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) {
	nodes := csv.NewReader(reader)
	nodes.Comma = '|'
	nodes.Comment = '#'
	nodes.TrimLeadingSpace = true
	nodes.ReuseRecord = true

	count := 0
	for {
		record, err := nodes.Read()
		if err != nil {
			break
		}
		count++

		taxid := strings.TrimSpace(record[0])
		parent := strings.TrimSpace(record[1])
		rank := strings.TrimSpace(record[2])

		if _, err := taxonomy.AddTaxon(taxid, parent, rank, taxid == "1", false); err != nil {
			log.Fatalf("Error adding taxon %s: %v\n", taxid, err)
		}
	}
}
// loadNameTable reads a name table from the provided reader and populates the
// given taxonomy. It is an internal function and should not be called
// directly. It is part of the NCBI taxdump reader.
// The name table is a '|'-separated file where each record carries at least
// four fields: taxid, name, unique name and class name.
//
// Parameters:
//   - reader: An io.Reader from which the name table is read.
//   - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon names will be set.
//   - onlysn: A boolean flag indicating whether to only process records with the class name "scientific name".
//
// Returns:
//
//	The number of taxon names successfully loaded into the taxonomy. If a
//	line is longer than the reader buffer, -1 is returned. Referencing an
//	unknown taxid or a malformed record is fatal.
func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int {
	file := bufio.NewReader(reader)

	n := 0
	l := 0

	for line, prefix, err := file.ReadLine(); err == nil; line, prefix, err = file.ReadLine() {
		l++

		// ReadLine reports a partial line through prefix: the file is not
		// parseable with this buffer size.
		if prefix {
			return -1
		}

		record := strings.Split(string(line), "|")

		// Guard against malformed lines that would otherwise panic on
		// the record[3] access below.
		if len(record) < 4 {
			log.Fatalf("line %d of the name table is malformed: %s", l, string(line))
		}

		taxid := strings.TrimSpace(record[0])
		name := strings.TrimSpace(record[1])
		classname := strings.TrimSpace(record[3])

		if !onlysn || classname == "scientific name" {
			n++
			taxon, _, err := taxonomy.Taxon(taxid)
			if err != nil {
				log.Fatalf("%s: is unknown from the taxonomy", taxid)
			}
			taxon.SetName(name, classname)
		}
	}

	return n
}
// loadMergedTable reads the NCBI taxdump merged table from reader and records
// every old-taxid → new-taxid mapping as an alias in taxonomy. It is an
// internal helper of the NCBI taxdump reader and should not be called
// directly.
//
// The table is a '|'-separated file with '#' comments whose first two fields
// are the old and the new taxid. It returns the number of aliases loaded.
func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int {
	merged := csv.NewReader(reader)
	merged.Comma = '|'
	merged.Comment = '#'
	merged.TrimLeadingSpace = true
	merged.ReuseRecord = true

	count := 0
	for {
		record, err := merged.Read()
		if err != nil {
			break
		}
		count++

		oldtaxid := strings.TrimSpace(record[0])
		newtaxid := strings.TrimSpace(record[1])

		taxonomy.AddAlias(oldtaxid, newtaxid, false)
	}

	return count
}
// LoadNCBITaxDump loads the NCBI taxonomy data from the specified directory.
// It reads the taxonomy nodes (nodes.dmp), taxon names (names.dmp), and
// merged taxa (merged.dmp) from the corresponding files and constructs a
// Taxonomy object whose root is the taxon with taxid "1".
//
// Parameters:
//   - directory: A string representing the path to the directory containing the NCBI taxonomy dump files.
//   - onlysn: A boolean indicating whether to load only scientific names (true) or all names (false).
//
// Returns:
//   - A pointer to the obitax.Taxonomy object containing the loaded taxonomy data, or an error
//     if any of the files cannot be opened or read.
func LoadNCBITaxDump(directory string, onlysn bool) (*obitax.Taxonomy, error) {

	taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)

	//
	// Load the Taxonomy nodes
	//

	log.Printf("Loading Taxonomy nodes\n")

	nodefile, err := os.Open(path.Join(directory, "nodes.dmp"))
	if err != nil {
		return nil, fmt.Errorf("cannot open nodes file from '%s'",
			directory)
	}
	defer nodefile.Close()

	buffered := bufio.NewReader(nodefile)
	loadNodeTable(buffered, taxonomy)
	log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())

	//
	// Load the taxon names
	//

	log.Printf("Loading Taxon names\n")

	namefile, nerr := os.Open(path.Join(directory, "names.dmp"))
	if nerr != nil {
		return nil, fmt.Errorf("cannot open names file from '%s'",
			directory)
	}
	defer namefile.Close()

	n := loadNameTable(namefile, taxonomy, onlysn)
	log.Printf("%d taxon names read\n", n)

	//
	// Load the merged taxa
	//

	log.Printf("Loading Merged taxa\n")

	aliasfile, aerr := os.Open(path.Join(directory, "merged.dmp"))
	if aerr != nil {
		return nil, fmt.Errorf("cannot open merged file from '%s'",
			directory)
	}
	defer aliasfile.Close()

	buffered = bufio.NewReader(aliasfile)
	n = loadMergedTable(buffered, taxonomy)
	log.Printf("%d merged taxa read\n", n)

	// Taxid "1" is the conventional NCBI root; its absence is fatal.
	root, _, err := taxonomy.Taxon("1")
	if err != nil {
		log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
	}

	taxonomy.SetRoot(root)

	return taxonomy, nil
}

View File

@@ -0,0 +1,147 @@
package obiformats
import (
"archive/tar"
"bufio"
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// IsNCBITarTaxDump reports whether the file at path is a tar archive
// containing all eight regular files of an NCBI taxonomy dump
// (citations.dmp, division.dmp, gencode.dmp, names.dmp, delnodes.dmp,
// gc.prt, merged.dmp and nodes.dmp). It returns false if the file cannot
// be opened or any expected member is missing.
func IsNCBITarTaxDump(path string) bool {
	file, err := obiutils.Ropen(path)
	if err != nil {
		return false
	}
	defer file.Close()

	// One flag per expected archive member.
	wanted := map[string]bool{
		"citations.dmp": false,
		"division.dmp":  false,
		"gencode.dmp":   false,
		"names.dmp":     false,
		"delnodes.dmp":  false,
		"gc.prt":        false,
		"merged.dmp":    false,
		"nodes.dmp":     false,
	}

	tarfile := tar.NewReader(file)

	for {
		header, err := tarfile.Next()
		if err != nil {
			break
		}

		// Only regular files count as dump members.
		if header.Typeflag != tar.TypeReg {
			continue
		}

		if _, expected := wanted[header.Name]; expected {
			wanted[header.Name] = true
		}
	}

	for _, found := range wanted {
		if !found {
			return false
		}
	}

	return true
}
// LoadNCBITarTaxDump loads the NCBI taxonomy data from a tar archive at the
// given path. The archive is re-opened once per table because the tar format
// only allows forward reads: first nodes.dmp, then names.dmp, then
// merged.dmp. The root of the resulting taxonomy is the taxon with taxid "1".
//
// Parameters:
//   - path: path of the (possibly compressed) tar archive.
//   - onlysn: whether to load only scientific names (true) or all names (false).
//
// Returns the loaded taxonomy, or an error if the archive or one of its
// members cannot be opened.
func LoadNCBITarTaxDump(path string, onlysn bool) (*obitax.Taxonomy, error) {

	taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)

	//
	// Load the Taxonomy nodes
	//

	log.Printf("Loading Taxonomy nodes\n")

	file, err := obiutils.Ropen(path)
	if err != nil {
		return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
			path)
	}

	nodefile, err := obiutils.TarFileReader(file, "nodes.dmp")
	if err != nil {
		file.Close()
		return nil, fmt.Errorf("cannot open nodes file from '%s'",
			path)
	}

	buffered := bufio.NewReader(nodefile)
	loadNodeTable(buffered, taxonomy)
	log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
	file.Close()

	//
	// Load the taxon names
	//

	log.Printf("Loading Taxon names\n")

	// Re-open the archive to seek back to its beginning.
	file, err = obiutils.Ropen(path)
	if err != nil {
		return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
			path)
	}

	namefile, nerr := obiutils.TarFileReader(file, "names.dmp")
	if nerr != nil {
		file.Close()
		return nil, fmt.Errorf("cannot open names file from '%s'",
			path)
	}

	n := loadNameTable(namefile, taxonomy, onlysn)
	log.Printf("%d taxon names read\n", n)
	file.Close()

	//
	// Load the merged taxa
	//

	log.Printf("Loading Merged taxa\n")

	// Re-open the archive once more for the merged table.
	file, err = obiutils.Ropen(path)
	if err != nil {
		return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
			path)
	}

	aliasfile, aerr := obiutils.TarFileReader(file, "merged.dmp")
	if aerr != nil {
		file.Close()
		return nil, fmt.Errorf("cannot open merged file from '%s'",
			path)
	}

	buffered = bufio.NewReader(aliasfile)
	n = loadMergedTable(buffered, taxonomy)
	log.Printf("%d merged taxa read\n", n)

	// Taxid "1" is the conventional NCBI root; its absence is fatal.
	root, _, err := taxonomy.Taxon("1")
	if err != nil {
		log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
	}

	taxonomy.SetRoot(root)

	return taxonomy, nil
}

View File

@@ -0,0 +1,175 @@
package obiformats
import (
"fmt"
"io"
"os"
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// Tree corresponds to any value representable in a Newick format. Each
// tree value corresponds to a single node.
type Tree struct {
	// All children of this node, which may be empty.
	Children []*Tree

	// The taxonomy node attached to this tree node; its id, scientific
	// name and rank are used to build the node label when serializing.
	TaxNode *obitax.TaxNode

	// The branch length of this node corresponding to the distance between
	// it and its parent node. If it's `nil`, then no distance exists.
	Length *float64
}
// Newick serializes the subtree rooted at tree into a Newick fragment,
// indented by level spaces. The node label is quoted and assembled from the
// scientific name, the taxid and/or the rank depending on the corresponding
// flags; commas are stripped from scientific names to keep the format valid.
// A branch length, when present, is appended as ":<length>", and the
// top-level call (level == 0) terminates the tree with ";\n".
func (tree *Tree) Newick(level int, taxid, scientific_name, rank bool) string {
	var out strings.Builder

	out.WriteString(strings.Repeat(" ", level))

	// Children are rendered first, wrapped in parentheses.
	if len(tree.Children) > 0 {
		out.WriteString("(\n")
		for i, child := range tree.Children {
			if i > 0 {
				out.WriteString(",\n")
			}
			out.WriteString(child.Newick(level+1, taxid, scientific_name, rank))
		}
		out.WriteByte('\n')
		out.WriteString(strings.Repeat(" ", level))
		out.WriteByte(')')
	}

	labelled := scientific_name || taxid || rank
	if labelled {
		out.WriteByte('\'')
	}

	if scientific_name {
		out.WriteString(strings.ReplaceAll(tree.TaxNode.ScientificName(), ",", ""))
	}

	// taxid/rank are framed by dashes: -taxid@rank-, -taxid- or -rank-.
	if taxid || rank {
		if scientific_name {
			out.WriteByte(' ')
		}
		out.WriteByte('-')
		if taxid {
			out.WriteString(*tree.TaxNode.Id())
			if rank {
				out.WriteByte('@')
			}
		}
		if rank {
			out.WriteString(tree.TaxNode.Rank())
		}
		out.WriteByte('-')
	}

	if labelled {
		out.WriteByte('\'')
	}

	if tree.Length != nil {
		fmt.Fprintf(&out, ":%f", *tree.Length)
	}

	if level == 0 {
		out.WriteString(";\n")
	}

	return out.String()
}
// Newick renders a taxon set as a Newick formatted string. The taxa are
// sorted and linked into trees by following parent ids; the first tree built
// from a taxon whose parent is not in the set is used as the root of the
// output. The taxid, scientific_name and rank flags select which pieces of
// information appear in node labels.
//
// An empty or nil taxon set yields the empty string (previously this
// panicked on the trees[0] access).
func Newick(taxa *obitax.TaxonSet, taxid, scientific_name, rank bool) string {
	if taxa == nil || taxa.Len() == 0 {
		return ""
	}

	iterator := taxa.Sort().Iterator()
	nodes := make(map[*string]*Tree, taxa.Len())
	roots := make([]*Tree, 0)

	for iterator.Next() {
		taxon := iterator.Get()
		tree := &Tree{TaxNode: taxon.Node}

		// Sorted iteration is expected to yield parents before their
		// children; a taxon whose parent has not been seen becomes an
		// extra root.
		if parent, ok := nodes[taxon.Parent().Node.Id()]; ok {
			parent.Children = append(parent.Children, tree)
		} else {
			roots = append(roots, tree)
		}

		nodes[taxon.Node.Id()] = tree
	}

	// Defensive: no tree could be built (should not happen when Len() > 0).
	if len(roots) == 0 {
		return ""
	}

	return roots[0].Newick(0, taxid, scientific_name, rank)
}
// WriteNewick writes the taxa read from iterator to file as a Newick tree,
// while forwarding every taxon unchanged to the returned iterator.
//
// The taxa are accumulated into a taxon set until the input iterator is
// exhausted; only then is the Newick string produced and written, so the
// output happens asynchronously in a goroutine. All taxa must belong to the
// same taxonomy (the first one seen); mixing taxonomies is fatal. The
// WithTaxid, WithScientificName and WithRank options select the content of
// the node labels, and OptionCloseFile controls whether file is closed once
// writing is done.
func WriteNewick(iterator *obitax.ITaxon,
	file io.WriteCloser,
	options ...WithOption) (*obitax.ITaxon, error) {

	newiterator := obitax.NewITaxon()

	var taxonomy *obitax.Taxonomy
	var taxa *obitax.TaxonSet

	opt := MakeOptions(options)

	// Wrap the destination in a compressing writer when requested.
	// NOTE(review): the error returned by CompressStream is ignored —
	// confirm this is intentional.
	file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())

	obiutils.RegisterAPipe()

	go func() {
		for iterator.Next() {
			taxon := iterator.Get()

			// The taxonomy is taken from the first taxon seen.
			if taxonomy == nil {
				taxonomy = taxon.Taxonomy
				taxa = taxonomy.NewTaxonSet()
			}

			if taxon.Taxonomy != taxonomy {
				log.Fatal("Newick writer cannot deal with multi-taxonomy iterator")
			}

			taxa.InsertTaxon(taxon)
			newiterator.Push(taxon)
		}

		// The whole tree is only known once the input is exhausted.
		newick := Newick(taxa, opt.WithTaxid(), opt.WithScientificName(), opt.WithRank())

		file.Write(obiutils.UnsafeBytes(newick))

		newiterator.Close()

		if opt.CloseFile() {
			file.Close()
		}

		obiutils.UnregisterPipe()
		log.Debugf("Writing newick file done")
	}()

	return newiterator, nil
}
// WriteNewickToFile writes the taxa read from iterator to the file named
// filename as a Newick tree, creating or truncating it, and returns an
// iterator forwarding the taxa. The file is always closed once writing is
// done (OptionCloseFile is forced).
//
// An error opening the file is logged and returned to the caller
// (previously a log.Fatalf aborted the process and made the subsequent
// "return nil, err" unreachable).
func WriteNewickToFile(iterator *obitax.ITaxon,
	filename string,
	options ...WithOption) (*obitax.ITaxon, error) {

	flags := os.O_WRONLY | os.O_CREATE
	flags |= os.O_TRUNC

	file, err := os.OpenFile(filename, flags, 0660)
	if err != nil {
		log.Errorf("open file error: %v", err)
		return nil, err
	}

	options = append(options, OptionCloseFile())

	return WriteNewick(iterator, file, options...)
}
// WriteNewickToStdout writes the taxa read from iterator to the standard
// output as a Newick tree and returns an iterator forwarding the taxa.
// OptionCloseFile is forced, as in WriteNewickToFile.
func WriteNewickToStdout(iterator *obitax.ITaxon,
	options ...WithOption) (*obitax.ITaxon, error) {
	opts := append(options, OptionCloseFile())
	return WriteNewick(iterator, os.Stdout, opts...)
}

View File

@@ -15,6 +15,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/gabriel-vasile/mimetype"
)
@@ -87,7 +88,7 @@ func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.T
}
func NGSFilterCsvDetector(raw []byte, limit uint32) bool {
r := csv.NewReader(bytes.NewReader(dropLastLine(raw, limit)))
r := csv.NewReader(bytes.NewReader(obiutils.DropLastLine(raw, limit)))
r.Comma = ','
r.ReuseRecord = true
r.LazyQuotes = true
@@ -121,18 +122,6 @@ func NGSFilterCsvDetector(raw []byte, limit uint32) bool {
}
// dropLastLine removes the trailing, possibly truncated, last line of b when
// the buffer was filled up to readLimit bytes. A readLimit of 0 or a buffer
// shorter than readLimit means b was read completely and is returned as-is.
// A newline at index 0 is deliberately ignored, matching the original scan
// that stopped before position 0.
func dropLastLine(b []byte, readLimit uint32) []byte {
	// Buffer not truncated by the read limit: keep it whole.
	if readLimit == 0 || uint32(len(b)) < readLimit {
		return b
	}
	// Cut at the last newline, since the final line may be incomplete.
	if i := bytes.LastIndexByte(b, '\n'); i > 0 {
		return b[:i]
	}
	return b
}
func OBIMimeNGSFilterTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
// Create a buffer to store the read data

View File

@@ -1,6 +1,8 @@
package obiformats
import (
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
@@ -34,6 +36,14 @@ type __options__ struct {
paired_filename string
source string
with_feature_table bool
with_pattern bool
with_parent bool
with_path bool
with_rank bool
with_taxid bool
with_scientific_name bool
raw_taxid bool
with_metadata []string
}
type Options struct {
@@ -72,6 +82,13 @@ func MakeOptions(setters []WithOption) Options {
paired_filename: "",
source: "unknown",
with_feature_table: false,
with_pattern: true,
with_parent: false,
with_path: false,
with_rank: true,
with_taxid: true,
with_scientific_name: false,
raw_taxid: false,
}
opt := Options{&o}
@@ -199,6 +216,60 @@ func (opt Options) WithFeatureTable() bool {
return opt.pointer.with_feature_table
}
// WithPattern returns whether the pattern option is enabled. When true, the
// taxa CSV output gains a "query" column and "query" is treated as a
// metadata key.
func (o *Options) WithPattern() bool {
	return o.pointer.with_pattern
}
// WithParent returns whether the parent option is enabled. When true, the
// taxa CSV output includes a "parent" column.
func (o *Options) WithParent() bool {
	return o.pointer.with_parent
}
// WithPath returns whether the path option is enabled. When true, the taxa
// CSV output includes a "path" column.
func (o *Options) WithPath() bool {
	return o.pointer.with_path
}
// WithRank returns whether the rank option is enabled. When true, the taxa
// CSV output includes a "taxonomic_rank" column and Newick node labels
// include the rank.
func (o *Options) WithRank() bool {
	return o.pointer.with_rank
}
// WithTaxid returns whether the taxid option is enabled. When true, Newick
// node labels include the taxid.
func (o *Options) WithTaxid() bool {
	return o.pointer.with_taxid
}
// WithScientificName returns whether the scientific name option is enabled.
// When true, the taxa CSV output includes a "scientific_name" column and
// Newick node labels include the scientific name.
func (o *Options) WithScientificName() bool {
	return o.pointer.with_scientific_name
}
// RawTaxid returns whether the raw taxid option is enabled. When true, bare
// node ids are written in taxid columns instead of the formatted taxon
// string.
func (o *Options) RawTaxid() bool {
	return o.pointer.raw_taxid
}
// WithMetadata returns the metadata keys attached to the Options instance.
// When the pattern option is enabled, the "query" key is removed from the
// stored list before returning it, since that value is reported through its
// own dedicated column.
func (o *Options) WithMetadata() []string {
	if o.WithPattern() {
		if idx := slices.Index(o.pointer.with_metadata, "query"); idx >= 0 {
			o.pointer.with_metadata = slices.Delete(o.pointer.with_metadata, idx, idx+1)
		}
	}
	return o.pointer.with_metadata
}
func OptionCloseFile() WithOption {
f := WithOption(func(opt Options) {
opt.pointer.closefile = true
@@ -456,3 +527,66 @@ func WithFeatureTable(with bool) WithOption {
return f
}
// OptionsWithPattern sets whether the "query" pattern column is reported.
func OptionsWithPattern(value bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.with_pattern = value
	})
}
// OptionsWithParent sets whether the "parent" column is reported.
func OptionsWithParent(value bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.with_parent = value
	})
}
// OptionsWithPath sets whether the "path" column is reported.
func OptionsWithPath(value bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.with_path = value
	})
}
// OptionsWithRank sets whether the taxonomic rank is reported.
func OptionsWithRank(value bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.with_rank = value
	})
}
// OptionsWithTaxid sets whether taxids are reported.
func OptionsWithTaxid(value bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.with_taxid = value
	})
}
// OptionsWithScientificName sets whether scientific names are reported.
func OptionsWithScientificName(value bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.with_scientific_name = value
	})
}
// OptionsRawTaxid sets whether bare numeric taxids are written instead of
// the formatted taxon string.
func OptionsRawTaxid(value bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.raw_taxid = value
	})
}
// OptionsWithMetadata sets the list of metadata keys to report, one column
// per key. The provided slice is stored as-is (not copied).
func OptionsWithMetadata(values ...string) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.with_metadata = values
	})
}

View File

@@ -0,0 +1,109 @@
package obiformats
import (
"fmt"
"os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/gabriel-vasile/mimetype"
log "github.com/sirupsen/logrus"
)
// TaxonomyLoader is the signature shared by all taxonomy readers: it loads
// the taxonomy stored at path, restricting loaded names to scientific names
// only when onlysn is true.
type TaxonomyLoader func(path string, onlysn bool) (*obitax.Taxonomy, error)
// DetectTaxonomyTarFormat returns the loader able to read the tar archive at
// path. Currently only the NCBI taxdump tar layout is recognized; any other
// archive yields an error.
func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) {
	if IsNCBITarTaxDump(path) {
		log.Infof("NCBI Taxdump Tar Archive detected: %s", path)
		return LoadNCBITarTaxDump, nil
	}

	return nil, fmt.Errorf("unknown taxonomy format: %s", path)
}
// DetectTaxonomyFormat inspects path and returns the TaxonomyLoader able to
// read it. A directory is assumed to be an NCBI taxdump; regular files are
// dispatched on their detected MIME type: CSV, tar archive (further probed by
// DetectTaxonomyTarFormat), fasta or fastq (taxonomy extracted from the
// sequence annotations).
//
// An unrecognized format is now reported as an error, matching
// DetectTaxonomyTarFormat (previously a log.Fatalf aborted the process and
// left an unreachable "return nil, nil").
func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) {
	obiutils.RegisterOBIMimeType()

	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}

	fileInfo, err := file.Stat()
	if err != nil {
		file.Close()
		return nil, err
	}
	file.Close()

	if fileInfo.IsDir() {
		// For the moment, we only support NCBI Taxdump directory format
		log.Infof("NCBI Taxdump detected: %s", path)
		return LoadNCBITaxDump, nil
	}

	// Regular file: sniff its MIME type through a decompressing reader.
	file, err = obiutils.Ropen(path)
	if err != nil {
		return nil, err
	}

	// "mime" avoids shadowing the mimetype package, which was previously
	// hidden by a local variable of the same name.
	mime, err := mimetype.DetectReader(file)
	if err != nil {
		file.Close()
		return nil, err
	}
	file.Close()

	switch mime.String() {
	case "text/csv":
		return LoadCSVTaxonomy, nil
	case "application/x-tar":
		return DetectTaxonomyTarFormat(path)
	case "text/fasta":
		return func(path string, onlysn bool) (*obitax.Taxonomy, error) {
			input, err := ReadFastaFromFile(path)
			if err != nil {
				return nil, err
			}
			_, data := input.Load()
			// NOTE(review): onlysn is ignored for sequence files —
			// confirm whether ExtractTaxonomy should honour it.
			return data.ExtractTaxonomy(nil)
		}, nil
	case "text/fastq":
		return func(path string, onlysn bool) (*obitax.Taxonomy, error) {
			input, err := ReadFastqFromFile(path)
			if err != nil {
				return nil, err
			}
			_, data := input.Load()
			// NOTE(review): onlysn is ignored for sequence files —
			// confirm whether ExtractTaxonomy should honour it.
			return data.ExtractTaxonomy(nil)
		}, nil
	}

	return nil, fmt.Errorf("unknown taxonomy format: %s (detected file format: %s)",
		path, mime.String())
}
// LoadTaxonomy loads the taxonomy stored at path, whatever its format, by
// first detecting the format and then delegating to the matching loader.
// When onlysn is true, only scientific names are loaded.
func LoadTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
	loader, err := DetectTaxonomyFormat(path)
	if err != nil {
		return nil, err
	}

	return loader(path, onlysn)
}

View File

@@ -3,11 +3,8 @@ package obiformats
import (
"bufio"
"bytes"
"encoding/csv"
"errors"
"io"
"path"
"regexp"
"github.com/gabriel-vasile/mimetype"
@@ -41,70 +38,7 @@ type SequenceReader func(reader io.Reader, options ...WithOption) (obiiter.IBioS
// - io.Reader: A modified reader with the read data.
// - error: Any error encountered during the process.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
csv := func(in []byte, limit uint32) bool {
in = dropLastLine(in, limit)
br := bytes.NewReader(in)
r := csv.NewReader(br)
r.Comma = ','
r.ReuseRecord = true
r.LazyQuotes = true
r.Comment = '#'
lines := 0
for {
_, err := r.Read()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
return false
}
lines++
}
return r.FieldsPerRecord > 1 && lines > 1
}
fastaDetector := func(raw []byte, limit uint32) bool {
ok, err := regexp.Match("^>[^ ]", raw)
return ok && err == nil
}
fastqDetector := func(raw []byte, limit uint32) bool {
ok, err := regexp.Match("^@[^ ].*\n[A-Za-z.-]+", raw)
return ok && err == nil
}
ecoPCR2Detector := func(raw []byte, limit uint32) bool {
ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
return ok
}
genbankDetector := func(raw []byte, limit uint32) bool {
ok2 := bytes.HasPrefix(raw, []byte("LOCUS "))
ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw)
return ok2 || (ok1 && err == nil)
}
emblDetector := func(raw []byte, limit uint32) bool {
ok := bytes.HasPrefix(raw, []byte("ID "))
return ok
}
mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
mimetype.Lookup("text/plain").Extend(csv, "text/csv", ".csv")
mimetype.Lookup("application/octet-stream").Extend(fastaDetector, "text/fasta", ".fasta")
mimetype.Lookup("application/octet-stream").Extend(fastqDetector, "text/fastq", ".fastq")
mimetype.Lookup("application/octet-stream").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
mimetype.Lookup("application/octet-stream").Extend(genbankDetector, "text/genbank", ".seq")
mimetype.Lookup("application/octet-stream").Extend(emblDetector, "text/embl", ".dat")
mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv")
obiutils.RegisterOBIMimeType()
// Create a buffer to store the read data
mimetype.SetLimit(1024 * 1024)