Rename obifind to obitaxonomy and introduce the new CSV format for taxonomy.

This commit is contained in:
Eric Coissac
2025-01-29 10:45:26 +01:00
parent c50a0f409d
commit 00f2dc2697
11 changed files with 4571 additions and 55 deletions

View File

@ -7,6 +7,8 @@
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`.
- The command `obifind` is now renamed `obitaxonomy`.
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy
has been renamed to **--taxonomy**.
@ -21,6 +23,11 @@
### New features
- In `obitaxonomy`, a new **--dump|D** option allows for dumping a sub-taxonomy.
- Taxonomy dump can now be provided as a four-column CSV file to the **--taxonomy**
option.
- NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The
path of the gzipped tar dump file can be directly specified using the
**--taxonomy** option.

View File

@ -6,25 +6,29 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitaxonomy"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
func main() {
optionParser := obioptions.GenerateOptionParser(obifind.OptionSet)
optionParser := obioptions.GenerateOptionParser(obitaxonomy.OptionSet)
_, args := optionParser(os.Args)
var iterator *obitax.ITaxon
switch {
case obifind.CLIRequestsPathForTaxid() != "NA":
taxon := obitax.DefaultTaxonomy().Taxon(obifind.CLIRequestsPathForTaxid())
case obitaxonomy.CLIDumpSubtaxonomy():
iterator = obitaxonomy.CLISubTaxonomyIterator()
case obitaxonomy.CLIRequestsPathForTaxid() != "NA":
taxon := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid())
if taxon == nil {
log.Fatalf("Cannot identify the requested taxon: %s",
obifind.CLIRequestsPathForTaxid())
obitaxonomy.CLIRequestsPathForTaxid())
}
s := taxon.Path()
@ -35,7 +39,7 @@ func main() {
iterator = s.Iterator()
if obifind.CLIWithQuery() {
if obitaxonomy.CLIWithQuery() {
iterator = iterator.AddMetadata("query", taxon.String())
}
@ -45,8 +49,8 @@ func main() {
iters := make([]*obitax.ITaxon, len(args))
for i, pat := range args {
ii := obitax.DefaultTaxonomy().IFilterOnName(pat, obifind.CLIFixedPattern(), true)
if obifind.CLIWithQuery() {
ii := obitax.DefaultTaxonomy().IFilterOnName(pat, obitaxonomy.CLIFixedPattern(), true)
if obitaxonomy.CLIWithQuery() {
ii = ii.AddMetadata("query", pat)
}
iters[i] = ii
@ -59,8 +63,8 @@ func main() {
}
}
iterator = obifind.CLITaxonRestrictions(iterator)
obifind.CLICSVTaxaWriter(iterator, true)
iterator = obitaxonomy.CLITaxonRestrictions(iterator)
obitaxonomy.CLICSVTaxaWriter(iterator, true)
obiutils.WaitForLastPipe()

View File

@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "7c4042d"
var _Commit = "c50a0f4"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -1,11 +1,10 @@
package obifind
package obitax
import (
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
)
type __options__ struct {
@ -192,7 +191,7 @@ func OptionsWithMetadata(values ...string) WithOption {
return f
}
func NewCSVTaxaIterator(iterator *obitax.ITaxon, options ...WithOption) *obiitercsv.ICSVRecord {
func (iterator *ITaxon) CSVTaxaIterator(options ...WithOption) *obiitercsv.ICSVRecord {
opt := MakeOptions(options)
metakeys := make([]string, 0)

View File

@ -11,6 +11,8 @@ import (
func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
log.Infof("Loading taxonomy from csv file: %s", path)
file, err := obiutils.Ropen(path)
if err != nil {
@ -47,7 +49,7 @@ func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
parentColIndex = i
case "scientific_name":
scientific_nameColIndex = i
case "rank":
case "taxonomic_rank":
rankColIndex = i
}
}
@ -70,31 +72,45 @@ func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) {
name := obiutils.RemoveAllExt(path)
short := obiutils.Basename(path)
taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
line, err := csvfile.Read()
if err == nil {
parts := strings.Split(line[taxidColIndex], " ")
parts = strings.Split(parts[0], ":")
if len(parts) > 1 {
short = parts[0]
}
}
for err != nil {
log.Infof("Taxonomy name: %s", name)
log.Infof("Taxon code: %s", short)
taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet)
root := true
var taxon *Taxon
for err == nil {
taxid := line[taxidColIndex]
parent := line[parentColIndex]
scientific_name := line[scientific_nameColIndex]
rank := line[rankColIndex]
parts := strings.Split(rank, ":")
rank = parts[0]
root := len(parts) > 1 && parts[1] == "root"
taxon, err := taxonomy.AddTaxon(taxid, parent, rank, false, root)
taxon.SetName(scientific_name, "scientific name")
taxon, err = taxonomy.AddTaxon(taxid, parent, rank, root, false)
if err != nil {
return nil, err
log.Fatalf("cannot add taxon %s: %v", taxid, err)
}
root = false
taxon.SetName(scientific_name, "scientific name")
line, err = csvfile.Read()
}
log.Infof("%d Taxa loaded", taxonomy.Len())
if !taxonomy.HasRoot() {
return nil, errors.New("taxonomy file does not contain root node")
}

View File

@ -1,11 +1 @@
package obitax
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
)
func WriteTaxonomyCSV(iterator ITaxon,
terminalAction bool, filenames ...string) *obiitercsv.ICSVRecord {
return nil
}

View File

@ -2,6 +2,7 @@ package obitax
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// ITaxon represents an iterator for traversing Taxon instances.
@ -189,12 +190,12 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
go func() {
for i := lpath - 1; i >= 0; i-- {
taxon := path.Taxon(i)
parents[taxon.Node] = true
iter.Push(taxon)
}
pushed := true
log.Warn(parents)
for pushed {
itaxo := taxo.Iterator()
pushed = false
@ -215,3 +216,13 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
return iter
}
// ISubTaxonomy looks up the taxon identified by taxid in this taxonomy and
// returns the iterator produced by that taxon's own ISubTaxonomy method.
//
// Parameters:
//   - taxid: the identifier of the taxon to look up.
//
// Returns:
//   - The *ITaxon returned by Taxon.ISubTaxonomy for the matching taxon, or
//     nil when the lookup yields no taxon.
func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
	if found := taxonomy.Taxon(taxid); found != nil {
		return found.ISubTaxonomy()
	}

	return nil
}

View File

@ -197,7 +197,7 @@ func (taxonomy *Taxonomy) Len() int {
// - A pointer to the newly created Taxon instance.
// - An error if the taxon cannot be added (e.g., it already exists and replace is false).
func (taxonomy *Taxonomy) AddTaxon(taxid, parent string, rank string, isRoot bool, replace bool) (*Taxon, error) {
taxonomy = taxonomy.OrDefault(false)
taxonomy = taxonomy.OrDefault(true)
parentid, perr := taxonomy.Id(parent)
if perr != nil {

View File

@ -1,4 +1,4 @@
package obifind
package obitaxonomy
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
@ -40,24 +40,34 @@ func CLIFilterRankRestriction(iterator *obitax.ITaxon) *obitax.ITaxon {
return iterator
}
// CLISubTaxonomyIterator returns the iterator over the sub-taxonomy
// requested on the command line with the --dump option.
//
// The node is read through CLISubTaxonomyNode and resolved against the
// default taxonomy. When no sub-taxonomy was requested, a fatal message is
// logged instead.
func CLISubTaxonomyIterator() *obitax.ITaxon {
	if !CLIDumpSubtaxonomy() {
		log.Fatalf("No sub-taxonomy specified use the --dump option")
		// NOTE(review): log.Fatalf is expected to terminate the program, so
		// this return presumably only satisfies the compiler — confirm.
		return nil
	}

	node := CLISubTaxonomyNode()

	return obitax.DefaultTaxonomy().ISubTaxonomy(node)
}
func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obiitercsv.ICSVRecord {
if iterator == nil {
return nil
}
options := make([]WithOption, 0)
options := make([]obitax.WithOption, 0)
options = append(options,
OptionsWithPattern(CLIWithQuery()),
OptionsWithParent(CLIWithParent()),
OptionsWithRank(CLIWithRank()),
OptionsWithScientificName(CLIWithScientificName()),
OptionsWithPath(CLIWithPath()),
OptionsRawTaxid(CLIRawTaxid()),
OptionsSource(obidefault.SelectedTaxonomy()),
obitax.OptionsWithPattern(CLIWithQuery()),
obitax.OptionsWithParent(CLIWithParent()),
obitax.OptionsWithRank(CLIWithRank()),
obitax.OptionsWithScientificName(CLIWithScientificName()),
obitax.OptionsWithPath(CLIWithPath()),
obitax.OptionsRawTaxid(CLIRawTaxid()),
obitax.OptionsSource(obidefault.SelectedTaxonomy()),
)
return NewCSVTaxaIterator(iterator, options...)
return iterator.CSVTaxaIterator(options...)
}
func CLICSVTaxaWriter(iterator *obitax.ITaxon, terminalAction bool) *obiitercsv.ICSVRecord {

View File

@ -1,4 +1,4 @@
package obifind
package obitaxonomy
import (
"fmt"
@ -16,11 +16,12 @@ var __with_path__ = false
var __with_query__ = false
var __without_rank__ = false
var __without_parent__ = false
var __with_scientific_name__ = false
var __without_scientific_name__ = false
var __raw_taxid__ = false
var __taxid_path__ = "NA"
var __taxid_sons__ = "NA"
var __restrict_rank__ = ""
var __to_dump__ = ""
func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__rank_list__, "rank-list", false,
@ -40,27 +41,35 @@ func OptionSet(options *getoptions.GetOpt) {
options.Description("Match taxon names using a fixed pattern, not a regular expression"))
options.StringVar(&__taxid_path__, "parents", "NA",
options.Alias("p"),
options.ArgName("TAXID"),
options.Description("Displays every parental tree's information for the provided taxid."))
options.StringVar(&__restrict_rank__, "rank", "",
options.ArgName("RANK"),
options.Description("Restrict to the given taxonomic rank."))
options.BoolVar(&__without_parent__, "without-parent", __without_parent__,
options.Description("Adds a column containing the parent's taxonid for each displayed taxon."))
options.Description("Supress the column containing the parent's taxonid from the output."))
options.StringVar(&__taxid_sons__, "sons", "NA",
options.Alias("s"),
options.ArgName("TAXID"),
options.Description("Displays every sons' tree's information for the provided taxid."))
options.BoolVar(&__with_path__, "with-path", false,
options.Description("Adds a column containing the full path for each displayed taxon."))
options.BoolVar(&__without_rank__, "without-rank", __without_rank__,
options.Alias("R"),
options.Description("Adds a column containing the taxonomic rank for each displayed taxon."))
options.Description("Supress the column containing the taxonomic rank from the output."))
options.BoolVar(&__with_query__, "with-query", false,
options.Alias("P"),
options.Description("Adds a column containing query used to filter taxon name for each displayed taxon."))
options.BoolVar(&__with_scientific_name__, "with-scientific-name", false,
options.BoolVar(&__without_scientific_name__, "without-scientific-name", __without_scientific_name__,
options.Alias("S"),
options.Description("Adds a column containing the scientific name for each displayed taxon."))
options.Description("Supress the column containing the scientific name from the output."))
options.BoolVar(&__raw_taxid__, "raw-taxid", false,
options.Description("Displays the raw taxid for each displayed taxon."))
options.StringVar(&__to_dump__, "dump", __to_dump__,
options.Alias("D"),
options.ArgName("TAXID"),
options.Description("Dump a sub-taxonomy corresponding to the precised clade"),
)
}
func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) {
@ -109,7 +118,7 @@ func CLIWithRank() bool {
}
func CLIWithScientificName() bool {
return __with_scientific_name__
return !__without_scientific_name__
}
func CLIRawTaxid() bool {
@ -127,3 +136,11 @@ func CLIFixedPattern() bool {
// CLIWithQuery reports the value of the --with-query command line flag,
// which asks for a column containing the query used to filter taxon names.
func CLIWithQuery() bool {
return __with_query__
}
// CLIDumpSubtaxonomy reports whether a sub-taxonomy dump was requested, i.e.
// whether the --dump option holds a non-empty value.
func CLIDumpSubtaxonomy() bool {
	return len(__to_dump__) > 0
}
// CLISubTaxonomyNode returns the value given to the --dump option, or the
// empty string when the option was not set.
func CLISubTaxonomyNode() string {
return __to_dump__
}

4462
xxx.csv Normal file

File diff suppressed because it is too large Load Diff