Rename obifind to obitaxonomy and introduce the new CSV format for taxonomy.

This commit is contained in:
Eric Coissac
2025-01-29 10:45:26 +01:00
parent c50a0f409d
commit 00f2dc2697
11 changed files with 4571 additions and 55 deletions

View File

@ -7,6 +7,8 @@
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list - In `obimultiplex`, the short version of the **--tag-list** option used to specify the list
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`. of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`.
- The command `obifind` is now renamed `obitaxonomy`.
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy - The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy
has been renamed to **--taxonomy**. has been renamed to **--taxonomy**.
@ -21,6 +23,11 @@
### New features ### New features
- In `obitaxonomy`, a new **--dump|D** option allows for dumping a sub-taxonomy.
- A taxonomy dump can now be provided as a four-column CSV file to the **--taxonomy**
option.
- NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The - NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The
path of the tar and gzipped dump file can be directly specified using the path of the tar and gzipped dump file can be directly specified using the
**--taxonomy** option. **--taxonomy** option.

View File

@ -6,25 +6,29 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitaxonomy"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
) )
func main() { func main() {
optionParser := obioptions.GenerateOptionParser(obifind.OptionSet) optionParser := obioptions.GenerateOptionParser(obitaxonomy.OptionSet)
_, args := optionParser(os.Args) _, args := optionParser(os.Args)
var iterator *obitax.ITaxon var iterator *obitax.ITaxon
switch { switch {
case obifind.CLIRequestsPathForTaxid() != "NA":
taxon := obitax.DefaultTaxonomy().Taxon(obifind.CLIRequestsPathForTaxid()) case obitaxonomy.CLIDumpSubtaxonomy():
iterator = obitaxonomy.CLISubTaxonomyIterator()
case obitaxonomy.CLIRequestsPathForTaxid() != "NA":
taxon := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid())
if taxon == nil { if taxon == nil {
log.Fatalf("Cannot identify the requested taxon: %s", log.Fatalf("Cannot identify the requested taxon: %s",
obifind.CLIRequestsPathForTaxid()) obitaxonomy.CLIRequestsPathForTaxid())
} }
s := taxon.Path() s := taxon.Path()
@ -35,7 +39,7 @@ func main() {
iterator = s.Iterator() iterator = s.Iterator()
if obifind.CLIWithQuery() { if obitaxonomy.CLIWithQuery() {
iterator = iterator.AddMetadata("query", taxon.String()) iterator = iterator.AddMetadata("query", taxon.String())
} }
@ -45,8 +49,8 @@ func main() {
iters := make([]*obitax.ITaxon, len(args)) iters := make([]*obitax.ITaxon, len(args))
for i, pat := range args { for i, pat := range args {
ii := obitax.DefaultTaxonomy().IFilterOnName(pat, obifind.CLIFixedPattern(), true) ii := obitax.DefaultTaxonomy().IFilterOnName(pat, obitaxonomy.CLIFixedPattern(), true)
if obifind.CLIWithQuery() { if obitaxonomy.CLIWithQuery() {
ii = ii.AddMetadata("query", pat) ii = ii.AddMetadata("query", pat)
} }
iters[i] = ii iters[i] = ii
@ -59,8 +63,8 @@ func main() {
} }
} }
iterator = obifind.CLITaxonRestrictions(iterator) iterator = obitaxonomy.CLITaxonRestrictions(iterator)
obifind.CLICSVTaxaWriter(iterator, true) obitaxonomy.CLICSVTaxaWriter(iterator, true)
obiutils.WaitForLastPipe() obiutils.WaitForLastPipe()

View File

@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// committed // committed
var _Commit = "7c4042d" var _Commit = "c50a0f4"
var _Version = "Release 4.2.0" var _Version = "Release 4.2.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

View File

@ -1,11 +1,10 @@
package obifind package obitax
import ( import (
"slices" "slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
) )
type __options__ struct { type __options__ struct {
@ -192,7 +191,7 @@ func OptionsWithMetadata(values ...string) WithOption {
return f return f
} }
func NewCSVTaxaIterator(iterator *obitax.ITaxon, options ...WithOption) *obiitercsv.ICSVRecord { func (iterator *ITaxon) CSVTaxaIterator(options ...WithOption) *obiitercsv.ICSVRecord {
opt := MakeOptions(options) opt := MakeOptions(options)
metakeys := make([]string, 0) metakeys := make([]string, 0)

View File

@ -11,6 +11,8 @@ import (
func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) { func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
log.Infof("Loading taxonomy from csv file: %s", path)
file, err := obiutils.Ropen(path) file, err := obiutils.Ropen(path)
if err != nil { if err != nil {
@ -47,7 +49,7 @@ func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
parentColIndex = i parentColIndex = i
case "scientific_name": case "scientific_name":
scientific_nameColIndex = i scientific_nameColIndex = i
case "rank": case "taxonomic_rank":
rankColIndex = i rankColIndex = i
} }
} }
@ -70,31 +72,45 @@ func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
name := obiutils.RemoveAllExt(path) name := obiutils.RemoveAllExt(path)
short := obiutils.Basename(path) short := obiutils.Basename(path)
taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
line, err := csvfile.Read() line, err := csvfile.Read()
if err == nil {
parts := strings.Split(line[taxidColIndex], " ")
parts = strings.Split(parts[0], ":")
if len(parts) > 1 {
short = parts[0]
}
}
for err != nil { log.Infof("Taxonomy name: %s", name)
log.Infof("Taxon code: %s", short)
taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
root := true
var taxon *Taxon
for err == nil {
taxid := line[taxidColIndex] taxid := line[taxidColIndex]
parent := line[parentColIndex] parent := line[parentColIndex]
scientific_name := line[scientific_nameColIndex] scientific_name := line[scientific_nameColIndex]
rank := line[rankColIndex] rank := line[rankColIndex]
parts := strings.Split(rank, ":") taxon, err = taxonomy.AddTaxon(taxid, parent, rank, root, false)
rank = parts[0]
root := len(parts) > 1 && parts[1] == "root"
taxon, err := taxonomy.AddTaxon(taxid, parent, rank, false, root)
taxon.SetName(scientific_name, "scientific name")
if err != nil { if err != nil {
return nil, err log.Fatalf("cannot add taxon %s: %v", taxid, err)
} }
root = false
taxon.SetName(scientific_name, "scientific name")
line, err = csvfile.Read()
} }
log.Infof("%d Taxa loaded", taxonomy.Len())
if !taxonomy.HasRoot() { if !taxonomy.HasRoot() {
return nil, errors.New("taxonomy file does not contain root node") return nil, errors.New("taxonomy file does not contain root node")
} }

View File

@ -1,11 +1 @@
package obitax package obitax
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
)
// WriteTaxonomyCSV is an unimplemented placeholder: it ignores iterator,
// terminalAction and filenames, and always returns nil.
func WriteTaxonomyCSV(iterator ITaxon,
terminalAction bool, filenames ...string) *obiitercsv.ICSVRecord {
return nil
}

View File

@ -2,6 +2,7 @@ package obitax
import ( import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
) )
// ITaxon represents an iterator for traversing Taxon instances. // ITaxon represents an iterator for traversing Taxon instances.
@ -189,12 +190,12 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
go func() { go func() {
for i := lpath - 1; i >= 0; i-- { for i := lpath - 1; i >= 0; i-- {
taxon := path.Taxon(i) taxon := path.Taxon(i)
parents[taxon.Node] = true
iter.Push(taxon) iter.Push(taxon)
} }
pushed := true pushed := true
log.Warn(parents)
for pushed { for pushed {
itaxo := taxo.Iterator() itaxo := taxo.Iterator()
pushed = false pushed = false
@ -215,3 +216,13 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
return iter return iter
} }
// ISubTaxonomy looks up the taxon identified by taxid in the taxonomy and
// delegates to that taxon's own ISubTaxonomy method to build the iterator.
//
// It returns nil when the lookup of taxid yields no taxon, so callers must
// be prepared to receive a nil iterator.
func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
	if node := taxonomy.Taxon(taxid); node != nil {
		return node.ISubTaxonomy()
	}

	return nil
}

View File

@ -197,7 +197,7 @@ func (taxonomy *Taxonomy) Len() int {
// - A pointer to the newly created Taxon instance. // - A pointer to the newly created Taxon instance.
// - An error if the taxon cannot be added (e.g., it already exists and replace is false). // - An error if the taxon cannot be added (e.g., it already exists and replace is false).
func (taxonomy *Taxonomy) AddTaxon(taxid, parent string, rank string, isRoot bool, replace bool) (*Taxon, error) { func (taxonomy *Taxonomy) AddTaxon(taxid, parent string, rank string, isRoot bool, replace bool) (*Taxon, error) {
taxonomy = taxonomy.OrDefault(false) taxonomy = taxonomy.OrDefault(true)
parentid, perr := taxonomy.Id(parent) parentid, perr := taxonomy.Id(parent)
if perr != nil { if perr != nil {

View File

@ -1,4 +1,4 @@
package obifind package obitaxonomy
import ( import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
@ -40,24 +40,34 @@ func CLIFilterRankRestriction(iterator *obitax.ITaxon) *obitax.ITaxon {
return iterator return iterator
} }
// CLISubTaxonomyIterator returns the iterator over the sub-taxonomy requested
// on the command line through the --dump option, taken from the default
// taxonomy.
//
// When no sub-taxonomy has been requested, the function logs a fatal error.
// The trailing return only satisfies the compiler in that case.
func CLISubTaxonomyIterator() *obitax.ITaxon {
	if !CLIDumpSubtaxonomy() {
		log.Fatalf("No sub-taxonomy specified use the --dump option")
		return nil
	}

	clade := CLISubTaxonomyNode()
	return obitax.DefaultTaxonomy().ISubTaxonomy(clade)
}
func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obiitercsv.ICSVRecord { func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obiitercsv.ICSVRecord {
if iterator == nil { if iterator == nil {
return nil return nil
} }
options := make([]WithOption, 0) options := make([]obitax.WithOption, 0)
options = append(options, options = append(options,
OptionsWithPattern(CLIWithQuery()), obitax.OptionsWithPattern(CLIWithQuery()),
OptionsWithParent(CLIWithParent()), obitax.OptionsWithParent(CLIWithParent()),
OptionsWithRank(CLIWithRank()), obitax.OptionsWithRank(CLIWithRank()),
OptionsWithScientificName(CLIWithScientificName()), obitax.OptionsWithScientificName(CLIWithScientificName()),
OptionsWithPath(CLIWithPath()), obitax.OptionsWithPath(CLIWithPath()),
OptionsRawTaxid(CLIRawTaxid()), obitax.OptionsRawTaxid(CLIRawTaxid()),
OptionsSource(obidefault.SelectedTaxonomy()), obitax.OptionsSource(obidefault.SelectedTaxonomy()),
) )
return NewCSVTaxaIterator(iterator, options...) return iterator.CSVTaxaIterator(options...)
} }
func CLICSVTaxaWriter(iterator *obitax.ITaxon, terminalAction bool) *obiitercsv.ICSVRecord { func CLICSVTaxaWriter(iterator *obitax.ITaxon, terminalAction bool) *obiitercsv.ICSVRecord {

View File

@ -1,4 +1,4 @@
package obifind package obitaxonomy
import ( import (
"fmt" "fmt"
@ -16,11 +16,12 @@ var __with_path__ = false
var __with_query__ = false var __with_query__ = false
var __without_rank__ = false var __without_rank__ = false
var __without_parent__ = false var __without_parent__ = false
var __with_scientific_name__ = false var __without_scientific_name__ = false
var __raw_taxid__ = false var __raw_taxid__ = false
var __taxid_path__ = "NA" var __taxid_path__ = "NA"
var __taxid_sons__ = "NA" var __taxid_sons__ = "NA"
var __restrict_rank__ = "" var __restrict_rank__ = ""
var __to_dump__ = ""
func FilterTaxonomyOptionSet(options *getoptions.GetOpt) { func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__rank_list__, "rank-list", false, options.BoolVar(&__rank_list__, "rank-list", false,
@ -40,27 +41,35 @@ func OptionSet(options *getoptions.GetOpt) {
options.Description("Match taxon names using a fixed pattern, not a regular expression")) options.Description("Match taxon names using a fixed pattern, not a regular expression"))
options.StringVar(&__taxid_path__, "parents", "NA", options.StringVar(&__taxid_path__, "parents", "NA",
options.Alias("p"), options.Alias("p"),
options.ArgName("TAXID"),
options.Description("Displays every parental tree's information for the provided taxid.")) options.Description("Displays every parental tree's information for the provided taxid."))
options.StringVar(&__restrict_rank__, "rank", "", options.StringVar(&__restrict_rank__, "rank", "",
options.ArgName("RANK"),
options.Description("Restrict to the given taxonomic rank.")) options.Description("Restrict to the given taxonomic rank."))
options.BoolVar(&__without_parent__, "without-parent", __without_parent__, options.BoolVar(&__without_parent__, "without-parent", __without_parent__,
options.Description("Adds a column containing the parent's taxonid for each displayed taxon.")) options.Description("Supress the column containing the parent's taxonid from the output."))
options.StringVar(&__taxid_sons__, "sons", "NA", options.StringVar(&__taxid_sons__, "sons", "NA",
options.Alias("s"), options.Alias("s"),
options.ArgName("TAXID"),
options.Description("Displays every sons' tree's information for the provided taxid.")) options.Description("Displays every sons' tree's information for the provided taxid."))
options.BoolVar(&__with_path__, "with-path", false, options.BoolVar(&__with_path__, "with-path", false,
options.Description("Adds a column containing the full path for each displayed taxon.")) options.Description("Adds a column containing the full path for each displayed taxon."))
options.BoolVar(&__without_rank__, "without-rank", __without_rank__, options.BoolVar(&__without_rank__, "without-rank", __without_rank__,
options.Alias("R"), options.Alias("R"),
options.Description("Adds a column containing the taxonomic rank for each displayed taxon.")) options.Description("Supress the column containing the taxonomic rank from the output."))
options.BoolVar(&__with_query__, "with-query", false, options.BoolVar(&__with_query__, "with-query", false,
options.Alias("P"), options.Alias("P"),
options.Description("Adds a column containing query used to filter taxon name for each displayed taxon.")) options.Description("Adds a column containing query used to filter taxon name for each displayed taxon."))
options.BoolVar(&__with_scientific_name__, "with-scientific-name", false, options.BoolVar(&__without_scientific_name__, "without-scientific-name", __without_scientific_name__,
options.Alias("S"), options.Alias("S"),
options.Description("Adds a column containing the scientific name for each displayed taxon.")) options.Description("Supress the column containing the scientific name from the output."))
options.BoolVar(&__raw_taxid__, "raw-taxid", false, options.BoolVar(&__raw_taxid__, "raw-taxid", false,
options.Description("Displays the raw taxid for each displayed taxon.")) options.Description("Displays the raw taxid for each displayed taxon."))
options.StringVar(&__to_dump__, "dump", __to_dump__,
options.Alias("D"),
options.ArgName("TAXID"),
options.Description("Dump a sub-taxonomy corresponding to the precised clade"),
)
} }
func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) { func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) {
@ -109,7 +118,7 @@ func CLIWithRank() bool {
} }
func CLIWithScientificName() bool { func CLIWithScientificName() bool {
return __with_scientific_name__ return !__without_scientific_name__
} }
func CLIRawTaxid() bool { func CLIRawTaxid() bool {
@ -127,3 +136,11 @@ func CLIFixedPattern() bool {
func CLIWithQuery() bool { func CLIWithQuery() bool {
return __with_query__ return __with_query__
} }
// CLIDumpSubtaxonomy reports whether a sub-taxonomy dump has been requested,
// i.e. whether the value bound to the --dump option is non-empty.
func CLIDumpSubtaxonomy() bool {
	return len(__to_dump__) > 0
}
// CLISubTaxonomyNode returns the value given to the --dump option: the taxid
// of the clade whose sub-taxonomy must be dumped. The returned string is
// empty when the option has not been set.
func CLISubTaxonomyNode() string {
return __to_dump__
}

4462
xxx.csv Normal file

File diff suppressed because it is too large Load Diff