mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 08:40:26 +00:00
Make sequence files recognized as a taxonomy
This commit is contained in:
113
pkg/obiformats/csviterator.go
Normal file
113
pkg/obiformats/csviterator.go
Normal file
@@ -0,0 +1,113 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
)
|
||||
|
||||
// CSVTaxaIterator converts a taxon iterator into a CSV record iterator.
// The emitted columns depend on the options: "query" (when the pattern
// option is on), then "taxid" (always), then optionally "parent",
// "taxonomic_rank", "scientific_name", every requested metadata key,
// and finally "path". Records are accumulated and pushed downstream in
// batches of opt.BatchSize(); the returned iterator is closed once the
// input iterator is exhausted.
func CSVTaxaIterator(iterator *obitax.ITaxon, options ...WithOption) *obiitercsv.ICSVRecord {

	opt := MakeOptions(options)
	metakeys := make([]string, 0)

	newIter := obiitercsv.NewICSVRecord()

	// One producer goroutine feeds this iterator.
	newIter.Add(1)

	batch_size := opt.BatchSize()

	if opt.WithPattern() {
		newIter.AppendField("query")
		// "query" is also registered as a metadata key; WithMetadata()
		// strips it back out so the column is not emitted twice.
		opt.pointer.with_metadata = append(opt.pointer.with_metadata, "query")
	}

	newIter.AppendField("taxid")
	rawtaxid := opt.RawTaxid()

	if opt.WithParent() {
		newIter.AppendField("parent")
	}

	if opt.WithRank() {
		newIter.AppendField("taxonomic_rank")
	}

	if opt.WithScientificName() {
		newIter.AppendField("scientific_name")
	}

	if opt.WithMetadata() != nil {
		metakeys = opt.WithMetadata()
		for _, metadata := range metakeys {
			newIter.AppendField(metadata)
		}
	}

	if opt.WithPath() {
		newIter.AppendField("path")
	}

	// Closes the iterator once the producer below calls Done().
	go func() {
		newIter.WaitAndClose()
	}()

	// Producer: converts each taxon into a CSVRecord and pushes batches.
	go func() {
		o := 0 // batch ordinal
		data := make([]obiitercsv.CSVRecord, 0, batch_size)
		for iterator.Next() {

			taxon := iterator.Get()
			record := make(obiitercsv.CSVRecord)

			if opt.WithPattern() {
				record["query"] = taxon.MetadataAsString("query")
			}

			// Raw taxid is the bare identifier; otherwise the taxon's
			// string form is used.
			if rawtaxid {
				record["taxid"] = *taxon.Node.Id()
			} else {
				record["taxid"] = taxon.String()
			}

			if opt.WithParent() {
				if rawtaxid {
					record["parent"] = *taxon.Node.ParentId()
				} else {
					record["parent"] = taxon.Parent().String()
				}
			}

			if opt.WithRank() {
				record["taxonomic_rank"] = taxon.Rank()
			}

			if opt.WithScientificName() {
				record["scientific_name"] = taxon.ScientificName()
			}

			if opt.WithPath() {
				record["path"] = taxon.Path().String()
			}

			for _, key := range metakeys {
				record[key] = taxon.MetadataAsString(key)
			}

			data = append(data, record)
			if len(data) >= batch_size {
				newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data))
				data = make([]obiitercsv.CSVRecord, 0, batch_size)
				o++
			}

		}

		// Flush the last, possibly partial, batch.
		if len(data) > 0 {
			newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data))
		}

		newIter.Done()
	}()

	return newIter
}
|
||||
120
pkg/obiformats/csvtaxdump_read.go
Normal file
120
pkg/obiformats/csvtaxdump_read.go
Normal file
@@ -0,0 +1,120 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"errors"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func LoadCSVTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
|
||||
log.Infof("Loading taxonomy from csv file: %s", path)
|
||||
|
||||
file, err := obiutils.Ropen(path)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer file.Close()
|
||||
|
||||
csvfile := csv.NewReader(file)
|
||||
|
||||
csvfile.Comma = ','
|
||||
csvfile.ReuseRecord = false
|
||||
csvfile.LazyQuotes = true
|
||||
csvfile.Comment = '#'
|
||||
csvfile.FieldsPerRecord = -1
|
||||
csvfile.TrimLeadingSpace = true
|
||||
|
||||
header, err := csvfile.Read()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
taxidColIndex := -1
|
||||
parentColIndex := -1
|
||||
scientific_nameColIndex := -1
|
||||
rankColIndex := -1
|
||||
|
||||
for i, colName := range header {
|
||||
switch colName {
|
||||
case "taxid":
|
||||
taxidColIndex = i
|
||||
case "parent":
|
||||
parentColIndex = i
|
||||
case "scientific_name":
|
||||
scientific_nameColIndex = i
|
||||
case "taxonomic_rank":
|
||||
rankColIndex = i
|
||||
}
|
||||
}
|
||||
|
||||
if taxidColIndex == -1 {
|
||||
return nil, errors.New("taxonomy file does not contain taxid column")
|
||||
}
|
||||
|
||||
if parentColIndex == -1 {
|
||||
return nil, errors.New("taxonomy file does not contain parent column")
|
||||
}
|
||||
|
||||
if scientific_nameColIndex == -1 {
|
||||
return nil, errors.New("taxonomy file does not contain scientific_name column")
|
||||
}
|
||||
|
||||
if rankColIndex == -1 {
|
||||
return nil, errors.New("taxonomy file does not contain rank column")
|
||||
}
|
||||
|
||||
name := obiutils.RemoveAllExt(path)
|
||||
short := obiutils.Basename(path)
|
||||
|
||||
line, err := csvfile.Read()
|
||||
if err == nil {
|
||||
parts := strings.Split(line[taxidColIndex], " ")
|
||||
parts = strings.Split(parts[0], ":")
|
||||
if len(parts) > 1 {
|
||||
short = parts[0]
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Taxonomy name: %s", name)
|
||||
log.Infof("Taxon code: %s", short)
|
||||
|
||||
taxonomy := obitax.NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
|
||||
|
||||
root := true
|
||||
var taxon *obitax.Taxon
|
||||
|
||||
for err == nil {
|
||||
taxid := line[taxidColIndex]
|
||||
parent := line[parentColIndex]
|
||||
scientific_name := line[scientific_nameColIndex]
|
||||
rank := line[rankColIndex]
|
||||
|
||||
taxon, err = taxonomy.AddTaxon(taxid, parent, rank, root, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("cannot add taxon %s: %v", taxid, err)
|
||||
}
|
||||
|
||||
root = false
|
||||
|
||||
taxon.SetName(scientific_name, "scientific name")
|
||||
|
||||
line, err = csvfile.Read()
|
||||
}
|
||||
|
||||
log.Infof("%d Taxa loaded", taxonomy.Len())
|
||||
|
||||
if !taxonomy.HasRoot() {
|
||||
return nil, errors.New("taxonomy file does not contain root node")
|
||||
}
|
||||
|
||||
return taxonomy, nil
|
||||
}
|
||||
214
pkg/obiformats/ncbitaxdump_read.go
Normal file
214
pkg/obiformats/ncbitaxdump_read.go
Normal file
@@ -0,0 +1,214 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
// loadNodeTable reads a node table from the provided reader and populates the given taxonomy.
|
||||
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
|
||||
// The node table is expected to be in CSV format with a custom delimiter ('|') and comments
|
||||
// starting with '#'. Each record in the table represents a taxon with its taxid, parent taxid,
|
||||
// and rank.
|
||||
//
|
||||
// Parameters:
|
||||
// - reader: An io.Reader from which the node table is read.
|
||||
// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon data will be added.
|
||||
//
|
||||
// The function reads each record from the input, trims whitespace from the taxid, parent, and rank,
|
||||
// and adds the taxon to the taxonomy. If an error occurs while adding a taxon, the function logs
|
||||
// a fatal error and terminates the program.
|
||||
func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) {
|
||||
file := csv.NewReader(reader)
|
||||
file.Comma = '|'
|
||||
file.Comment = '#'
|
||||
file.TrimLeadingSpace = true
|
||||
file.ReuseRecord = true
|
||||
|
||||
n := 0
|
||||
|
||||
for record, err := file.Read(); err == nil; record, err = file.Read() {
|
||||
n++
|
||||
taxid := strings.TrimSpace(record[0])
|
||||
parent := strings.TrimSpace(record[1])
|
||||
rank := strings.TrimSpace(record[2])
|
||||
|
||||
_, err := taxonomy.AddTaxon(taxid, parent, rank, taxid == "1", false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error adding taxon %s: %v\n", taxid, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// loadNameTable reads a name table from the provided reader and populates the given taxonomy.
|
||||
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
|
||||
// The name table is expected to be in a custom format with fields separated by the '|' character.
|
||||
// Each record in the table represents a taxon with its taxid, name, and class name.
|
||||
//
|
||||
// Parameters:
|
||||
// - reader: An io.Reader from which the name table is read.
|
||||
// - taxonomy: A pointer to an obitax.Taxonomy instance where the taxon names will be set.
|
||||
// - onlysn: A boolean flag indicating whether to only process records with the class name "scientific name".
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// The number of taxon names successfully loaded into the taxonomy. If a line is too long, -1 is returned.
|
||||
// The function processes each line, trims whitespace from the taxid, name, and class name, and sets
|
||||
// the name in the taxonomy if the conditions are met.
|
||||
func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int {
|
||||
// file := csv.NewReader(reader)
|
||||
// file.Comma = '|'
|
||||
// file.Comment = '#'
|
||||
// file.TrimLeadingSpace = true
|
||||
// file.ReuseRecord = true
|
||||
// file.LazyQuotes = true
|
||||
file := bufio.NewReader(reader)
|
||||
|
||||
n := 0
|
||||
l := 0
|
||||
|
||||
for line, prefix, err := file.ReadLine(); err == nil; line, prefix, err = file.ReadLine() {
|
||||
l++
|
||||
if prefix {
|
||||
return -1
|
||||
}
|
||||
|
||||
record := strings.Split(string(line), "|")
|
||||
taxid := strings.TrimSpace(record[0])
|
||||
|
||||
name := strings.TrimSpace(record[1])
|
||||
classname := strings.TrimSpace(record[3])
|
||||
|
||||
if !onlysn || classname == "scientific name" {
|
||||
n++
|
||||
taxon, _, err := taxonomy.Taxon(taxid)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%s: is unknown from the taxonomy", taxid)
|
||||
}
|
||||
|
||||
taxon.SetName(name, classname)
|
||||
}
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// loadMergedTable reads a merged table from the provided reader and populates the given taxonomy.
|
||||
// It is an internal function and should not be called directly. It is part of the NCBI taxdump reader.
|
||||
// The merged table is expected to be in CSV format with a custom delimiter ('|') and comments
|
||||
// starting with '#'. Each record in the table represents a mapping between an old taxid and a new taxid.
|
||||
//
|
||||
// Parameters:
|
||||
// - reader: An io.Reader from which the merged table is read.
|
||||
// - taxonomy: A pointer to an obitax.Taxonomy instance where the alias mappings will be added.
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// The number of alias mappings successfully loaded into the taxonomy. The function processes
|
||||
// each record, trims whitespace from the old and new taxid, and adds the alias to the taxonomy.
|
||||
func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int {
|
||||
file := csv.NewReader(reader)
|
||||
file.Comma = '|'
|
||||
file.Comment = '#'
|
||||
file.TrimLeadingSpace = true
|
||||
file.ReuseRecord = true
|
||||
|
||||
n := 0
|
||||
|
||||
for record, err := file.Read(); err == nil; record, err = file.Read() {
|
||||
n++
|
||||
oldtaxid := strings.TrimSpace(record[0])
|
||||
newtaxid := strings.TrimSpace(record[1])
|
||||
|
||||
taxonomy.AddAlias(oldtaxid, newtaxid, false)
|
||||
}
|
||||
|
||||
return n
|
||||
}
|
||||
|
||||
// LoadNCBITaxDump loads the NCBI taxonomy data from the specified directory.
|
||||
// It reads the taxonomy nodes, taxon names, and merged taxa from the corresponding files
|
||||
// and constructs a Taxonomy object.
|
||||
//
|
||||
// Parameters:
|
||||
// - directory: A string representing the path to the directory containing the NCBI taxonomy dump files.
|
||||
// - onlysn: A boolean indicating whether to load only scientific names (true) or all names (false).
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to the obitax.Taxonomy object containing the loaded taxonomy data, or an error
|
||||
// if any of the files cannot be opened or read.
|
||||
func LoadNCBITaxDump(directory string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
|
||||
taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxonomy nodes\n")
|
||||
|
||||
nodefile, err := os.Open(path.Join(directory, "nodes.dmp"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open nodes file from '%s'",
|
||||
directory)
|
||||
}
|
||||
defer nodefile.Close()
|
||||
|
||||
buffered := bufio.NewReader(nodefile)
|
||||
loadNodeTable(buffered, taxonomy)
|
||||
log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxon names\n")
|
||||
|
||||
namefile, nerr := os.Open(path.Join(directory, "names.dmp"))
|
||||
if nerr != nil {
|
||||
return nil, fmt.Errorf("cannot open names file from '%s'",
|
||||
directory)
|
||||
}
|
||||
defer namefile.Close()
|
||||
|
||||
n := loadNameTable(namefile, taxonomy, onlysn)
|
||||
log.Printf("%d taxon names read\n", n)
|
||||
|
||||
//
|
||||
// Load the merged taxa
|
||||
//
|
||||
|
||||
log.Printf("Loading Merged taxa\n")
|
||||
|
||||
aliasfile, aerr := os.Open(path.Join(directory, "merged.dmp"))
|
||||
if aerr != nil {
|
||||
return nil, fmt.Errorf("cannot open merged file from '%s'",
|
||||
directory)
|
||||
}
|
||||
defer aliasfile.Close()
|
||||
|
||||
buffered = bufio.NewReader(aliasfile)
|
||||
n = loadMergedTable(buffered, taxonomy)
|
||||
log.Printf("%d merged taxa read\n", n)
|
||||
|
||||
root, _, err := taxonomy.Taxon("1")
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
|
||||
}
|
||||
taxonomy.SetRoot(root)
|
||||
|
||||
return taxonomy, nil
|
||||
}
|
||||
147
pkg/obiformats/ncbitaxdump_readtar.go
Normal file
147
pkg/obiformats/ncbitaxdump_readtar.go
Normal file
@@ -0,0 +1,147 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bufio"
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func IsNCBITarTaxDump(path string) bool {
|
||||
|
||||
file, err := obiutils.Ropen(path)
|
||||
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
defer file.Close()
|
||||
|
||||
citations := false
|
||||
division := false
|
||||
gencode := false
|
||||
names := false
|
||||
delnodes := false
|
||||
gc := false
|
||||
merged := false
|
||||
nodes := false
|
||||
|
||||
tarfile := tar.NewReader(file)
|
||||
|
||||
header, err := tarfile.Next()
|
||||
|
||||
for err == nil {
|
||||
name := header.Name
|
||||
|
||||
if header.Typeflag == tar.TypeReg {
|
||||
switch name {
|
||||
case "citations.dmp":
|
||||
citations = true
|
||||
case "division.dmp":
|
||||
division = true
|
||||
case "gencode.dmp":
|
||||
gencode = true
|
||||
case "names.dmp":
|
||||
names = true
|
||||
case "delnodes.dmp":
|
||||
delnodes = true
|
||||
case "gc.prt":
|
||||
gc = true
|
||||
case "merged.dmp":
|
||||
merged = true
|
||||
case "nodes.dmp":
|
||||
nodes = true
|
||||
}
|
||||
}
|
||||
header, err = tarfile.Next()
|
||||
}
|
||||
|
||||
return citations && division && gencode && names && delnodes && gc && merged && nodes
|
||||
}
|
||||
|
||||
func LoadNCBITarTaxDump(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
|
||||
taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet)
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxonomy nodes\n")
|
||||
|
||||
file, err := obiutils.Ropen(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
nodefile, err := obiutils.TarFileReader(file, "nodes.dmp")
|
||||
if err != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("cannot open nodes file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
buffered := bufio.NewReader(nodefile)
|
||||
loadNodeTable(buffered, taxonomy)
|
||||
log.Printf("%d Taxonomy nodes read\n", taxonomy.Len())
|
||||
file.Close()
|
||||
|
||||
//
|
||||
// Load the Taxonomy nodes
|
||||
//
|
||||
|
||||
log.Printf("Loading Taxon names\n")
|
||||
|
||||
file, err = obiutils.Ropen(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
namefile, nerr := obiutils.TarFileReader(file, "names.dmp")
|
||||
if nerr != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("cannot open names file from '%s'",
|
||||
path)
|
||||
}
|
||||
n := loadNameTable(namefile, taxonomy, onlysn)
|
||||
log.Printf("%d taxon names read\n", n)
|
||||
file.Close()
|
||||
|
||||
//
|
||||
// Load the merged taxa
|
||||
//
|
||||
|
||||
log.Printf("Loading Merged taxa\n")
|
||||
file, err = obiutils.Ropen(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot open taxonomy file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
aliasfile, aerr := obiutils.TarFileReader(file, "merged.dmp")
|
||||
if aerr != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("cannot open merged file from '%s'",
|
||||
path)
|
||||
}
|
||||
|
||||
buffered = bufio.NewReader(aliasfile)
|
||||
n = loadMergedTable(buffered, taxonomy)
|
||||
log.Printf("%d merged taxa read\n", n)
|
||||
|
||||
root, _, err := taxonomy.Taxon("1")
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
|
||||
}
|
||||
|
||||
taxonomy.SetRoot(root)
|
||||
|
||||
return taxonomy, nil
|
||||
}
|
||||
175
pkg/obiformats/newick_write.go
Normal file
175
pkg/obiformats/newick_write.go
Normal file
@@ -0,0 +1,175 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Tree corresponds to any value representable in a Newick format. Each
// tree value corresponds to a single node.
type Tree struct {
	// All children of this node, which may be empty.
	Children []*Tree

	// The taxonomy node this tree node represents; it provides the
	// taxid, scientific name and rank used when labelling the node.
	TaxNode *obitax.TaxNode

	// The branch length of this node corresponding to the distance between
	// it and its parent node. If it's `nil`, then no distance exists.
	Length *float64
}
|
||||
|
||||
// Newick serializes the tree rooted at this node into a pretty-printed
// Newick fragment, one node per line, indented by level spaces. The
// taxid, scientific_name and rank flags select which pieces of
// information decorate the node label; when any of them is set the
// label is wrapped in single quotes. When level is 0 the fragment is
// terminated with ";\n", yielding a complete Newick tree.
func (tree *Tree) Newick(level int, taxid, scientific_name, rank bool) string {
	var buffer strings.Builder

	buffer.WriteString(strings.Repeat(" ", level))

	// Internal node: children first, parenthesized and comma-separated.
	if len(tree.Children) > 0 {
		buffer.WriteString("(\n")
		for i, c := range tree.Children {
			if i > 0 {
				buffer.WriteString(",\n")
			}
			buffer.WriteString(c.Newick(level+1, taxid, scientific_name, rank))
		}
		buffer.WriteByte('\n')
		buffer.WriteString(strings.Repeat(" ", level))
		buffer.WriteByte(')')
	}
	// Quote the label: scientific names may contain spaces.
	if scientific_name || taxid || rank {
		buffer.WriteByte('\'')
	}
	if scientific_name {
		// Commas would clash with the Newick child separator.
		sn := strings.ReplaceAll(tree.TaxNode.ScientificName(), ",", "")
		buffer.WriteString(sn)
	}
	// Taxid and rank are grouped into a "-taxid@rank-" suffix.
	if taxid || rank {
		if scientific_name {
			buffer.WriteByte(' ')
		}
		buffer.WriteByte('-')
		if taxid {
			buffer.WriteString(*tree.TaxNode.Id())
			if rank {
				buffer.WriteByte('@')
			}
		}
		if rank {
			buffer.WriteString(tree.TaxNode.Rank())
		}
		buffer.WriteByte('-')
	}
	if scientific_name || taxid || rank {
		buffer.WriteByte('\'')
	}

	// Optional branch length, standard ":length" notation.
	if tree.Length != nil {
		buffer.WriteString(fmt.Sprintf(":%f", *tree.Length))
	}

	if level == 0 {
		buffer.WriteString(";\n")
	}
	return buffer.String()
}
|
||||
|
||||
func Newick(taxa *obitax.TaxonSet, taxid, scientific_name, rank bool) string {
|
||||
if taxa == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
iterator := taxa.Sort().Iterator()
|
||||
|
||||
nodes := make(map[*string]*Tree, taxa.Len())
|
||||
trees := make([]*Tree, 0)
|
||||
|
||||
for iterator.Next() {
|
||||
taxon := iterator.Get()
|
||||
|
||||
tree := &Tree{TaxNode: taxon.Node}
|
||||
if parent, ok := nodes[taxon.Parent().Node.Id()]; ok {
|
||||
parent.Children = append(parent.Children, tree)
|
||||
} else {
|
||||
trees = append(trees, tree)
|
||||
}
|
||||
nodes[taxon.Node.Id()] = tree
|
||||
}
|
||||
|
||||
return trees[0].Newick(0, taxid, scientific_name, rank)
|
||||
}
|
||||
|
||||
// WriteNewick writes the taxa produced by iterator to file as a Newick
// tree, and returns a new iterator replaying the same taxa so that
// downstream processing can continue.
//
// The whole input is accumulated into a TaxonSet first (the tree can
// only be serialized once complete), so the file is written only after
// the input iterator is exhausted. All taxa must belong to the same
// taxonomy; mixing taxonomies aborts the program.
//
// The returned error is always nil; it is part of the signature for
// homogeneity with the other writers.
func WriteNewick(iterator *obitax.ITaxon,
	file io.WriteCloser,
	options ...WithOption) (*obitax.ITaxon, error) {
	newiterator := obitax.NewITaxon()

	var taxonomy *obitax.Taxonomy
	var taxa *obitax.TaxonSet

	opt := MakeOptions(options)

	// Optionally wrap the output in a compression stream, and register
	// the writer as a pipe so the process can wait for it before exit.
	file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
	obiutils.RegisterAPipe()

	go func() {
		for iterator.Next() {
			taxon := iterator.Get()
			// The taxonomy is taken from the first taxon seen.
			if taxonomy == nil {
				taxonomy = taxon.Taxonomy
				taxa = taxonomy.NewTaxonSet()
			}
			if taxon.Taxonomy != taxonomy {
				log.Fatal("Newick writer cannot deal with multi-taxonomy iterator")
			}
			taxa.InsertTaxon(taxon)
			newiterator.Push(taxon)
		}

		// Input exhausted: serialize and write the whole tree at once.
		newick := Newick(taxa, opt.WithTaxid(), opt.WithScientificName(), opt.WithRank())
		file.Write(obiutils.UnsafeBytes(newick))

		newiterator.Close()
		if opt.CloseFile() {
			file.Close()
		}

		obiutils.UnregisterPipe()
		log.Debugf("Writing newick file done")
	}()

	return newiterator, nil
}
|
||||
|
||||
func WriteNewickToFile(iterator *obitax.ITaxon,
|
||||
filename string,
|
||||
options ...WithOption) (*obitax.ITaxon, error) {
|
||||
|
||||
flags := os.O_WRONLY | os.O_CREATE
|
||||
flags |= os.O_TRUNC
|
||||
|
||||
file, err := os.OpenFile(filename, flags, 0660)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("open file error: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
options = append(options, OptionCloseFile())
|
||||
|
||||
iterator, err = WriteNewick(iterator, file, options...)
|
||||
|
||||
return iterator, err
|
||||
}
|
||||
|
||||
func WriteNewickToStdout(iterator *obitax.ITaxon,
|
||||
options ...WithOption) (*obitax.ITaxon, error) {
|
||||
options = append(options, OptionCloseFile())
|
||||
return WriteNewick(iterator, os.Stdout, options...)
|
||||
}
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/gabriel-vasile/mimetype"
|
||||
)
|
||||
|
||||
@@ -87,7 +88,7 @@ func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.T
|
||||
}
|
||||
|
||||
func NGSFilterCsvDetector(raw []byte, limit uint32) bool {
|
||||
r := csv.NewReader(bytes.NewReader(dropLastLine(raw, limit)))
|
||||
r := csv.NewReader(bytes.NewReader(obiutils.DropLastLine(raw, limit)))
|
||||
r.Comma = ','
|
||||
r.ReuseRecord = true
|
||||
r.LazyQuotes = true
|
||||
@@ -121,18 +122,6 @@ func NGSFilterCsvDetector(raw []byte, limit uint32) bool {
|
||||
|
||||
}
|
||||
|
||||
// dropLastLine trims the trailing — possibly truncated — line from b
// when the buffer was filled up to readLimit bytes. A readLimit of 0,
// or a buffer shorter than readLimit, is returned untouched, as is a
// buffer whose only newline sits at index 0 or that has none at all.
func dropLastLine(b []byte, readLimit uint32) []byte {
	if readLimit == 0 || uint32(len(b)) < readLimit {
		return b
	}
	// Cut at the last newline strictly after the first byte, mirroring
	// the original backwards scan that stopped before index 0.
	if cut := bytes.LastIndexByte(b, '\n'); cut > 0 {
		return b[:cut]
	}
	return b
}
|
||||
|
||||
func OBIMimeNGSFilterTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
||||
|
||||
// Create a buffer to store the read data
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"slices"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
@@ -34,6 +36,14 @@ type __options__ struct {
|
||||
paired_filename string
|
||||
source string
|
||||
with_feature_table bool
|
||||
with_pattern bool
|
||||
with_parent bool
|
||||
with_path bool
|
||||
with_rank bool
|
||||
with_taxid bool
|
||||
with_scientific_name bool
|
||||
raw_taxid bool
|
||||
with_metadata []string
|
||||
}
|
||||
|
||||
type Options struct {
|
||||
@@ -72,6 +82,13 @@ func MakeOptions(setters []WithOption) Options {
|
||||
paired_filename: "",
|
||||
source: "unknown",
|
||||
with_feature_table: false,
|
||||
with_pattern: true,
|
||||
with_parent: false,
|
||||
with_path: false,
|
||||
with_rank: true,
|
||||
with_taxid: true,
|
||||
with_scientific_name: false,
|
||||
raw_taxid: false,
|
||||
}
|
||||
|
||||
opt := Options{&o}
|
||||
@@ -199,6 +216,60 @@ func (opt Options) WithFeatureTable() bool {
|
||||
return opt.pointer.with_feature_table
|
||||
}
|
||||
|
||||
// WithPattern returns whether the pattern option is enabled.
// It retrieves the setting from the underlying options.
func (o *Options) WithPattern() bool {
	return o.pointer.with_pattern
}

// WithParent returns whether the parent option is enabled.
// It retrieves the setting from the underlying options.
func (o *Options) WithParent() bool {
	return o.pointer.with_parent
}

// WithPath returns whether the path option is enabled.
// It retrieves the setting from the underlying options.
func (o *Options) WithPath() bool {
	return o.pointer.with_path
}

// WithRank returns whether the rank option is enabled.
// It retrieves the setting from the underlying options.
func (o *Options) WithRank() bool {
	return o.pointer.with_rank
}

// WithTaxid returns whether the taxid option is enabled.
// It retrieves the setting from the underlying options.
func (o *Options) WithTaxid() bool {
	return o.pointer.with_taxid
}

// WithScientificName returns whether the scientific name option is enabled.
// It retrieves the setting from the underlying options.
func (o *Options) WithScientificName() bool {
	return o.pointer.with_scientific_name
}

// RawTaxid returns whether the raw taxid option is enabled.
// It retrieves the setting from the underlying options.
func (o *Options) RawTaxid() bool {
	return o.pointer.raw_taxid
}
|
||||
|
||||
// WithMetadata returns a slice of strings containing the metadata
// associated with the Options instance. It retrieves the metadata
// from the pointer's with_metadata field.
//
// NOTE(review): this getter mutates the options as a side effect —
// when the pattern option is enabled, any "query" entry is removed
// from with_metadata before the slice is returned (the "query" column
// is emitted separately by the CSV writer). Callers must not rely on
// "query" surviving in with_metadata after a call.
func (o *Options) WithMetadata() []string {
	if o.WithPattern() {
		idx := slices.Index(o.pointer.with_metadata, "query")
		if idx >= 0 {
			o.pointer.with_metadata = slices.Delete(o.pointer.with_metadata, idx, idx+1)
		}
	}

	return o.pointer.with_metadata
}
|
||||
|
||||
func OptionCloseFile() WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.closefile = true
|
||||
@@ -456,3 +527,66 @@ func WithFeatureTable(with bool) WithOption {
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionsWithPattern returns a WithOption that enables or disables
// reporting of the matched pattern ("query" column).
func OptionsWithPattern(value bool) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.with_pattern = value
	})

	return f
}

// OptionsWithParent returns a WithOption that enables or disables
// reporting of the parent taxid.
func OptionsWithParent(value bool) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.with_parent = value
	})

	return f
}

// OptionsWithPath returns a WithOption that enables or disables
// reporting of the taxonomic path.
func OptionsWithPath(value bool) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.with_path = value
	})

	return f
}

// OptionsWithRank returns a WithOption that enables or disables
// reporting of the taxonomic rank.
func OptionsWithRank(value bool) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.with_rank = value
	})

	return f
}

// OptionsWithTaxid returns a WithOption that enables or disables
// reporting of the taxid.
func OptionsWithTaxid(value bool) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.with_taxid = value
	})

	return f
}

// OptionsWithScientificName returns a WithOption that enables or
// disables reporting of the scientific name.
func OptionsWithScientificName(value bool) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.with_scientific_name = value
	})

	return f
}

// OptionsRawTaxid returns a WithOption that enables or disables
// reporting taxids in their raw (bare identifier) form.
func OptionsRawTaxid(value bool) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.raw_taxid = value
	})

	return f
}

// OptionsWithMetadata returns a WithOption that sets the list of
// metadata keys to report, replacing any previously configured list.
func OptionsWithMetadata(values ...string) WithOption {
	f := WithOption(func(opt Options) {
		opt.pointer.with_metadata = values
	})
	return f
}
|
||||
|
||||
109
pkg/obiformats/taxonomy_read.go
Normal file
109
pkg/obiformats/taxonomy_read.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/gabriel-vasile/mimetype"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type TaxonomyLoader func(path string, onlysn bool) (*obitax.Taxonomy, error)
|
||||
|
||||
func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) {
|
||||
|
||||
switch {
|
||||
case IsNCBITarTaxDump(path):
|
||||
log.Infof("NCBI Taxdump Tar Archive detected: %s", path)
|
||||
return LoadNCBITarTaxDump, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unknown taxonomy format: %s", path)
|
||||
}
|
||||
|
||||
func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) {
|
||||
|
||||
obiutils.RegisterOBIMimeType()
|
||||
|
||||
file, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fileInfo, err := file.Stat()
|
||||
if err != nil {
|
||||
file.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
file.Close()
|
||||
|
||||
if fileInfo.IsDir() {
|
||||
// For the moment, we only support NCBI Taxdump directory format
|
||||
log.Infof("NCBI Taxdump detected: %s", path)
|
||||
return LoadNCBITaxDump, nil
|
||||
} else {
|
||||
file, err := obiutils.Ropen(path)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mimetype, err := mimetype.DetectReader(file)
|
||||
|
||||
if err != nil {
|
||||
file.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
file.Close()
|
||||
|
||||
switch mimetype.String() {
|
||||
case "text/csv":
|
||||
return LoadCSVTaxonomy, nil
|
||||
case "application/x-tar":
|
||||
return DetectTaxonomyTarFormat(path)
|
||||
case "text/fasta":
|
||||
return func(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
input, err := ReadFastaFromFile(path)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
_, data := input.Load()
|
||||
|
||||
return data.ExtractTaxonomy(nil)
|
||||
}, nil
|
||||
case "text/fastq":
|
||||
return func(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
input, err := ReadFastqFromFile(path)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
_, data := input.Load()
|
||||
|
||||
return data.ExtractTaxonomy(nil)
|
||||
}, nil
|
||||
}
|
||||
|
||||
log.Fatalf("Detected file format: %s", mimetype.String())
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func LoadTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) {
|
||||
loader, err := DetectTaxonomyFormat(path)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
taxonomy, err := loader(path, onlysn)
|
||||
|
||||
return taxonomy, err
|
||||
}
|
||||
@@ -3,11 +3,8 @@ package obiformats
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
"errors"
|
||||
"io"
|
||||
"path"
|
||||
"regexp"
|
||||
|
||||
"github.com/gabriel-vasile/mimetype"
|
||||
|
||||
@@ -41,70 +38,7 @@ type SequenceReader func(reader io.Reader, options ...WithOption) (obiiter.IBioS
|
||||
// - io.Reader: A modified reader with the read data.
|
||||
// - error: Any error encountered during the process.
|
||||
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
||||
csv := func(in []byte, limit uint32) bool {
|
||||
in = dropLastLine(in, limit)
|
||||
|
||||
br := bytes.NewReader(in)
|
||||
r := csv.NewReader(br)
|
||||
r.Comma = ','
|
||||
r.ReuseRecord = true
|
||||
r.LazyQuotes = true
|
||||
r.Comment = '#'
|
||||
|
||||
lines := 0
|
||||
for {
|
||||
_, err := r.Read()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
lines++
|
||||
}
|
||||
|
||||
return r.FieldsPerRecord > 1 && lines > 1
|
||||
}
|
||||
|
||||
fastaDetector := func(raw []byte, limit uint32) bool {
|
||||
ok, err := regexp.Match("^>[^ ]", raw)
|
||||
return ok && err == nil
|
||||
}
|
||||
|
||||
fastqDetector := func(raw []byte, limit uint32) bool {
|
||||
ok, err := regexp.Match("^@[^ ].*\n[A-Za-z.-]+", raw)
|
||||
return ok && err == nil
|
||||
}
|
||||
|
||||
ecoPCR2Detector := func(raw []byte, limit uint32) bool {
|
||||
ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
|
||||
return ok
|
||||
}
|
||||
|
||||
genbankDetector := func(raw []byte, limit uint32) bool {
|
||||
ok2 := bytes.HasPrefix(raw, []byte("LOCUS "))
|
||||
ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw)
|
||||
return ok2 || (ok1 && err == nil)
|
||||
}
|
||||
|
||||
emblDetector := func(raw []byte, limit uint32) bool {
|
||||
ok := bytes.HasPrefix(raw, []byte("ID "))
|
||||
return ok
|
||||
}
|
||||
|
||||
mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
|
||||
mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
|
||||
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
|
||||
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
|
||||
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
|
||||
mimetype.Lookup("text/plain").Extend(csv, "text/csv", ".csv")
|
||||
|
||||
mimetype.Lookup("application/octet-stream").Extend(fastaDetector, "text/fasta", ".fasta")
|
||||
mimetype.Lookup("application/octet-stream").Extend(fastqDetector, "text/fastq", ".fastq")
|
||||
mimetype.Lookup("application/octet-stream").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
|
||||
mimetype.Lookup("application/octet-stream").Extend(genbankDetector, "text/genbank", ".seq")
|
||||
mimetype.Lookup("application/octet-stream").Extend(emblDetector, "text/embl", ".dat")
|
||||
mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv")
|
||||
obiutils.RegisterOBIMimeType()
|
||||
|
||||
// Create a buffer to store the read data
|
||||
mimetype.SetLimit(1024 * 1024)
|
||||
|
||||
Reference in New Issue
Block a user