From 8448783499490c73cad4f85f5d2c2c1e0d24d684 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 14 Mar 2025 14:22:22 +0100 Subject: [PATCH] Make sequence files recognized as a taxonomy --- cmd/obitools/obitaxonomy/main.go | 7 +- cmd/test/main.go | 4 +- pkg/obiformats/csviterator.go | 113 +++++++ pkg/{obitax => obiformats}/csvtaxdump_read.go | 9 +- .../ncbitaxdump_read.go | 13 +- .../ncbitaxdump_readtar.go | 7 +- pkg/obiformats/newick_write.go | 175 ++++++++++ pkg/obiformats/ngsfilter_read.go | 15 +- pkg/obiformats/options.go | 134 ++++++++ pkg/{obitax => obiformats}/taxonomy_read.go | 31 +- pkg/obiformats/universal_read.go | 68 +--- pkg/obioptions/options.go | 20 ++ pkg/obioptions/version.go | 2 +- pkg/obitax/csviterator.go | 299 ------------------ pkg/obitax/csvtaxdump_write.go | 38 --- pkg/obitax/default_taxonomy.go | 20 -- pkg/obitax/iterator.go | 6 + pkg/obitax/newick_write.go | 1 - pkg/obitools/obitaxonomy/obitaxonomy.go | 58 +++- pkg/obitools/obitaxonomy/options.go | 9 + pkg/obiutils/mimetypes.go | 95 ++++++ 21 files changed, 657 insertions(+), 467 deletions(-) create mode 100644 pkg/obiformats/csviterator.go rename pkg/{obitax => obiformats}/csvtaxdump_read.go (89%) rename pkg/{obitax => obiformats}/ncbitaxdump_read.go (93%) rename pkg/{obitax => obiformats}/ncbitaxdump_readtar.go (91%) create mode 100644 pkg/obiformats/newick_write.go rename pkg/{obitax => obiformats}/taxonomy_read.go (64%) delete mode 100644 pkg/obitax/csviterator.go delete mode 100644 pkg/obitax/csvtaxdump_write.go delete mode 100644 pkg/obitax/newick_write.go create mode 100644 pkg/obiutils/mimetypes.go diff --git a/cmd/obitools/obitaxonomy/main.go b/cmd/obitools/obitaxonomy/main.go index 284a778..ae9a3f8 100644 --- a/cmd/obitools/obitaxonomy/main.go +++ b/cmd/obitools/obitaxonomy/main.go @@ -115,7 +115,12 @@ func main() { } iterator = obitaxonomy.CLITaxonRestrictions(iterator) - obitaxonomy.CLICSVTaxaWriter(iterator, true) + + if obitaxonomy.CLIAsNewick() { + obitaxonomy.CLINewickWriter(iterator, true) + } else { + obitaxonomy.CLICSVTaxaWriter(iterator, true) + } obiutils.WaitForLastPipe() diff --git a/cmd/test/main.go b/cmd/test/main.go index 8a66f0c..d21821f 100644 --- a/cmd/test/main.go +++ b/cmd/test/main.go @@ -3,13 +3,13 @@ package main import ( "os" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) func main() { - obitax.DetectTaxonomyFormat(os.Args[1]) + obiformats.DetectTaxonomyFormat(os.Args[1]) println(obiutils.RemoveAllExt("toto/tutu/test.txt")) println(obiutils.Basename("toto/tutu/test.txt")) diff --git a/pkg/obiformats/csviterator.go b/pkg/obiformats/csviterator.go new file mode 100644 index 0000000..4beeeb8 --- /dev/null +++ b/pkg/obiformats/csviterator.go @@ -0,0 +1,113 @@ +package obiformats + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" +) + +func CSVTaxaIterator(iterator *obitax.ITaxon, options ...WithOption) *obiitercsv.ICSVRecord { + + opt := MakeOptions(options) + metakeys := make([]string, 0) + + newIter := obiitercsv.NewICSVRecord() + + newIter.Add(1) + + batch_size := opt.BatchSize() + + if opt.WithPattern() { + newIter.AppendField("query") + opt.pointer.with_metadata = append(opt.pointer.with_metadata, "query") + } + + newIter.AppendField("taxid") + rawtaxid := opt.RawTaxid() + + if opt.WithParent() { + newIter.AppendField("parent") + } + + if opt.WithRank() { + newIter.AppendField("taxonomic_rank") + } + + if opt.WithScientificName() { + newIter.AppendField("scientific_name") + } + + if opt.WithMetadata() != nil { + metakeys = opt.WithMetadata() + for _, metadata := range metakeys { + newIter.AppendField(metadata) + } + } + + if opt.WithPath() { + newIter.AppendField("path") + } + + go func() { + newIter.WaitAndClose() + }() + + go func() { + o := 0 + data := make([]obiitercsv.CSVRecord, 0, batch_size) + for iterator.Next() { + + taxon := iterator.Get() + record := make(obiitercsv.CSVRecord) + + if opt.WithPattern() { + record["query"] = taxon.MetadataAsString("query") + } + + if rawtaxid { + record["taxid"] = *taxon.Node.Id() + } else { + record["taxid"] = taxon.String() + } + + if opt.WithParent() { + if rawtaxid { + record["parent"] = *taxon.Node.ParentId() + } else { + record["parent"] = taxon.Parent().String() + } + } + + if opt.WithRank() { + record["taxonomic_rank"] = taxon.Rank() + } + + if opt.WithScientificName() { + record["scientific_name"] = taxon.ScientificName() + } + + if opt.WithPath() { + record["path"] = taxon.Path().String() + } + + for _, key := range metakeys { + record[key] = taxon.MetadataAsString(key) + } + + data = append(data, record) + if len(data) >= batch_size { + newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data)) + data = make([]obiitercsv.CSVRecord, 0, batch_size) + o++ + } + + } + + if len(data) > 0 { + newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data)) + } + + newIter.Done() + }() + + return newIter +} diff --git a/pkg/obitax/csvtaxdump_read.go b/pkg/obiformats/csvtaxdump_read.go similarity index 89% rename from pkg/obitax/csvtaxdump_read.go rename to pkg/obiformats/csvtaxdump_read.go index b25c90c..bc17aa6 100644 --- a/pkg/obitax/csvtaxdump_read.go +++ b/pkg/obiformats/csvtaxdump_read.go @@ -1,15 +1,16 @@ -package obitax +package obiformats import ( "encoding/csv" "errors" "strings" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" log "github.com/sirupsen/logrus" ) -func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) { +func LoadCSVTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) { log.Infof("Loading taxonomy from csv file: %s", path) @@ -85,10 +86,10 @@ func LoadCSVTaxonomy(path string, onlysn bool) (*Taxonomy, error) { log.Infof("Taxonomy name: %s", name) log.Infof("Taxon code: %s", short) - taxonomy := NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet) + taxonomy := obitax.NewTaxonomy(name, short, obiutils.AsciiAlphaNumSet) root := true - var taxon *Taxon + var taxon *obitax.Taxon for err == nil { taxid := line[taxidColIndex] diff --git a/pkg/obitax/ncbitaxdump_read.go b/pkg/obiformats/ncbitaxdump_read.go similarity index 93% rename from pkg/obitax/ncbitaxdump_read.go rename to pkg/obiformats/ncbitaxdump_read.go index dd97ab1..7e6c3f0 100644 --- a/pkg/obitax/ncbitaxdump_read.go +++ b/pkg/obiformats/ncbitaxdump_read.go @@ -1,4 +1,4 @@ -package obitax +package obiformats import ( "bufio" @@ -11,6 +11,7 @@ import ( log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) @@ -27,7 +28,7 @@ import ( // The function reads each record from the input, trims whitespace from the taxid, parent, and rank, // and adds the taxon to the taxonomy. If an error occurs while adding a taxon, the function logs // a fatal error and terminates the program. -func loadNodeTable(reader io.Reader, taxonomy *Taxonomy) { +func loadNodeTable(reader io.Reader, taxonomy *obitax.Taxonomy) { file := csv.NewReader(reader) file.Comma = '|' file.Comment = '#' @@ -65,7 +66,7 @@ func loadNodeTable(reader io.Reader, taxonomy *Taxonomy) { // The number of taxon names successfully loaded into the taxonomy. If a line is too long, -1 is returned. // The function processes each line, trims whitespace from the taxid, name, and class name, and sets // the name in the taxonomy if the conditions are met. -func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int { +func loadNameTable(reader io.Reader, taxonomy *obitax.Taxonomy, onlysn bool) int { // file := csv.NewReader(reader) // file.Comma = '|' // file.Comment = '#' @@ -117,7 +118,7 @@ func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int { // // The number of alias mappings successfully loaded into the taxonomy. The function processes // each record, trims whitespace from the old and new taxid, and adds the alias to the taxonomy. -func loadMergedTable(reader io.Reader, taxonomy *Taxonomy) int { +func loadMergedTable(reader io.Reader, taxonomy *obitax.Taxonomy) int { file := csv.NewReader(reader) file.Comma = '|' file.Comment = '#' @@ -148,9 +149,9 @@ func loadMergedTable(reader io.Reader, taxonomy *Taxonomy) int { // Returns: // - A pointer to the obitax.Taxonomy object containing the loaded taxonomy data, or an error // if any of the files cannot be opened or read. -func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) { +func LoadNCBITaxDump(directory string, onlysn bool) (*obitax.Taxonomy, error) { - taxonomy := NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet) + taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet) // // Load the Taxonomy nodes diff --git a/pkg/obitax/ncbitaxdump_readtar.go b/pkg/obiformats/ncbitaxdump_readtar.go similarity index 91% rename from pkg/obitax/ncbitaxdump_readtar.go rename to pkg/obiformats/ncbitaxdump_readtar.go index 9e91a8d..6b3dba2 100644 --- a/pkg/obitax/ncbitaxdump_readtar.go +++ b/pkg/obiformats/ncbitaxdump_readtar.go @@ -1,10 +1,11 @@ -package obitax +package obiformats import ( "archive/tar" "bufio" "fmt" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" log "github.com/sirupsen/logrus" @@ -62,9 +63,9 @@ func IsNCBITarTaxDump(path string) bool { return citations && division && gencode && names && delnodes && gc && merged && nodes } -func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) { +func LoadNCBITarTaxDump(path string, onlysn bool) (*obitax.Taxonomy, error) { - taxonomy := NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet) + taxonomy := obitax.NewTaxonomy("NCBI Taxonomy", "taxon", obiutils.AsciiDigitSet) // // Load the Taxonomy nodes diff --git a/pkg/obiformats/newick_write.go b/pkg/obiformats/newick_write.go new file mode 100644 index 0000000..e843189 --- /dev/null +++ b/pkg/obiformats/newick_write.go @@ -0,0 +1,175 @@ +package obiformats + +import ( + "fmt" + "io" + "os" + "strings" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + log "github.com/sirupsen/logrus" +) + +// Tree corresponds to any value representable in a Newick format. Each +// tree value corresponds to a single node. +type Tree struct { + // All children of this node, which may be empty. + Children []*Tree + + // The label of this node. If it's empty, then this node does + // not have a name. + TaxNode *obitax.TaxNode + + // The branch length of this node corresponding to the distance between + // it and its parent node. If it's `nil`, then no distance exists. + Length *float64 +} + +func (tree *Tree) Newick(level int, taxid, scientific_name, rank bool) string { + var buffer strings.Builder + + buffer.WriteString(strings.Repeat(" ", level)) + + if len(tree.Children) > 0 { + buffer.WriteString("(\n") + for i, c := range tree.Children { + if i > 0 { + buffer.WriteString(",\n") + } + buffer.WriteString(c.Newick(level+1, taxid, scientific_name, rank)) + } + buffer.WriteByte('\n') + buffer.WriteString(strings.Repeat(" ", level)) + buffer.WriteByte(')') + } + if scientific_name || taxid || rank { + buffer.WriteByte('\'') + } + if scientific_name { + sn := strings.ReplaceAll(tree.TaxNode.ScientificName(), ",", "") + buffer.WriteString(sn) + } + if taxid || rank { + if scientific_name { + buffer.WriteByte(' ') + } + buffer.WriteByte('-') + if taxid { + buffer.WriteString(*tree.TaxNode.Id()) + if rank { + buffer.WriteByte('@') + } + } + if rank { + buffer.WriteString(tree.TaxNode.Rank()) + } + buffer.WriteByte('-') + } + if scientific_name || taxid || rank { + buffer.WriteByte('\'') + } + + if tree.Length != nil { + buffer.WriteString(fmt.Sprintf(":%f", *tree.Length)) + } + + if level == 0 { + buffer.WriteString(";\n") + } + return buffer.String() +} + +func Newick(taxa *obitax.TaxonSet, taxid, scientific_name, rank bool) string { + if taxa == nil { + return "" + } + + iterator := taxa.Sort().Iterator() + + nodes := make(map[*string]*Tree, taxa.Len()) + trees := make([]*Tree, 0) + + for iterator.Next() { + taxon := iterator.Get() + + tree := &Tree{TaxNode: taxon.Node} + if parent, ok := nodes[taxon.Parent().Node.Id()]; ok { + parent.Children = append(parent.Children, tree) + } else { + trees = append(trees, tree) + } + nodes[taxon.Node.Id()] = tree + } + + return trees[0].Newick(0, taxid, scientific_name, rank) +} + +func WriteNewick(iterator *obitax.ITaxon, + file io.WriteCloser, + options ...WithOption) (*obitax.ITaxon, error) { + newiterator := obitax.NewITaxon() + + var taxonomy *obitax.Taxonomy + var taxa *obitax.TaxonSet + + opt := MakeOptions(options) + + file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) + obiutils.RegisterAPipe() + + go func() { + for iterator.Next() { + taxon := iterator.Get() + if taxonomy == nil { + taxonomy = taxon.Taxonomy + taxa = taxonomy.NewTaxonSet() + } + if taxon.Taxonomy != taxonomy { + log.Fatal("Newick writer cannot deal with multi-taxonomy iterator") + } + taxa.InsertTaxon(taxon) + newiterator.Push(taxon) + } + + newick := Newick(taxa, opt.WithTaxid(), opt.WithScientificName(), opt.WithRank()) + file.Write(obiutils.UnsafeBytes(newick)) + + newiterator.Close() + if opt.CloseFile() { + file.Close() + } + + obiutils.UnregisterPipe() + log.Debugf("Writing newick file done") + }() + + return newiterator, nil +} + +func WriteNewickToFile(iterator *obitax.ITaxon, + filename string, + options ...WithOption) (*obitax.ITaxon, error) { + + flags := os.O_WRONLY | os.O_CREATE + flags |= os.O_TRUNC + + file, err := os.OpenFile(filename, flags, 0660) + + if err != nil { + log.Fatalf("open file error: %v", err) + return nil, err + } + + options = append(options, OptionCloseFile()) + + iterator, err = WriteNewick(iterator, file, options...) + + return iterator, err +} + +func WriteNewickToStdout(iterator *obitax.ITaxon, + options ...WithOption) (*obitax.ITaxon, error) { + options = append(options, OptionCloseFile()) + return WriteNewick(iterator, os.Stdout, options...) +} diff --git a/pkg/obiformats/ngsfilter_read.go b/pkg/obiformats/ngsfilter_read.go index 42730f5..7eb2c55 100644 --- a/pkg/obiformats/ngsfilter_read.go +++ b/pkg/obiformats/ngsfilter_read.go @@ -15,6 +15,7 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "github.com/gabriel-vasile/mimetype" ) @@ -87,7 +88,7 @@ func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.T } func NGSFilterCsvDetector(raw []byte, limit uint32) bool { - r := csv.NewReader(bytes.NewReader(dropLastLine(raw, limit))) + r := csv.NewReader(bytes.NewReader(obiutils.DropLastLine(raw, limit))) r.Comma = ',' r.ReuseRecord = true r.LazyQuotes = true @@ -121,18 +122,6 @@ func NGSFilterCsvDetector(raw []byte, limit uint32) bool { } -func dropLastLine(b []byte, readLimit uint32) []byte { - if readLimit == 0 || uint32(len(b)) < readLimit { - return b - } - for i := len(b) - 1; i > 0; i-- { - if b[i] == '\n' { - return b[:i] - } - } - return b -} - func OBIMimeNGSFilterTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { // Create a buffer to store the read data diff --git a/pkg/obiformats/options.go b/pkg/obiformats/options.go index 2068b75..243390a 100644 --- a/pkg/obiformats/options.go +++ b/pkg/obiformats/options.go @@ -1,6 +1,8 @@ package obiformats import ( + "slices" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" ) @@ -34,6 +36,14 @@ type __options__ struct { paired_filename string source string with_feature_table bool + with_pattern bool + with_parent bool + with_path bool + with_rank bool + with_taxid bool + with_scientific_name bool + raw_taxid bool + with_metadata []string } type Options struct { @@ -72,6 +82,13 @@ func MakeOptions(setters []WithOption) Options { paired_filename: "", source: "unknown", with_feature_table: false, + with_pattern: true, + with_parent: false, + with_path: false, + with_rank: true, + with_taxid: true, + with_scientific_name: false, + raw_taxid: false, } opt := Options{&o} @@ -199,6 +216,60 @@ func (opt Options) WithFeatureTable() bool { return opt.pointer.with_feature_table } +// WithPattern returns whether the pattern option is enabled. +// It retrieves the setting from the underlying options. +func (o *Options) WithPattern() bool { + return o.pointer.with_pattern +} + +// WithParent returns whether the parent option is enabled. +// It retrieves the setting from the underlying options. +func (o *Options) WithParent() bool { + return o.pointer.with_parent +} + +// WithPath returns whether the path option is enabled. +// It retrieves the setting from the underlying options. +func (o *Options) WithPath() bool { + return o.pointer.with_path +} + +// WithRank returns whether the rank option is enabled. +// It retrieves the setting from the underlying options. +func (o *Options) WithRank() bool { + return o.pointer.with_rank +} + +func (o *Options) WithTaxid() bool { + return o.pointer.with_taxid +} + +// WithScientificName returns whether the scientific name option is enabled. +// It retrieves the setting from the underlying options. +func (o *Options) WithScientificName() bool { + return o.pointer.with_scientific_name +} + +// RawTaxid returns whether the raw taxid option is enabled. +// It retrieves the setting from the underlying options. +func (o *Options) RawTaxid() bool { + return o.pointer.raw_taxid +} + +// WithMetadata returns a slice of strings containing the metadata +// associated with the Options instance. It retrieves the metadata +// from the pointer's with_metadata field. +func (o *Options) WithMetadata() []string { + if o.WithPattern() { + idx := slices.Index(o.pointer.with_metadata, "query") + if idx >= 0 { + o.pointer.with_metadata = slices.Delete(o.pointer.with_metadata, idx, idx+1) + } + } + + return o.pointer.with_metadata +} + func OptionCloseFile() WithOption { f := WithOption(func(opt Options) { opt.pointer.closefile = true @@ -456,3 +527,66 @@ func WithFeatureTable(with bool) WithOption { return f } + +func OptionsWithPattern(value bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.with_pattern = value + }) + + return f +} + +func OptionsWithParent(value bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.with_parent = value + }) + + return f +} + +func OptionsWithPath(value bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.with_path = value + }) + + return f +} + +func OptionsWithRank(value bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.with_rank = value + }) + + return f +} + +func OptionsWithTaxid(value bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.with_taxid = value + }) + + return f +} + +func OptionsWithScientificName(value bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.with_scientific_name = value + }) + + return f +} + +func OptionsRawTaxid(value bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.raw_taxid = value + }) + + return f +} + +func OptionsWithMetadata(values ...string) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.with_metadata = values + }) + return f +} diff --git a/pkg/obitax/taxonomy_read.go b/pkg/obiformats/taxonomy_read.go similarity index 64% rename from pkg/obitax/taxonomy_read.go rename to pkg/obiformats/taxonomy_read.go index 2bd46c7..fe99526 100644 --- a/pkg/obitax/taxonomy_read.go +++ b/pkg/obiformats/taxonomy_read.go @@ -1,16 +1,17 @@ -package obitax +package obiformats import ( "fmt" "os" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "github.com/gabriel-vasile/mimetype" log "github.com/sirupsen/logrus" ) -type TaxonomyLoader func(path string, onlysn bool) (*Taxonomy, error) +type TaxonomyLoader func(path string, onlysn bool) (*obitax.Taxonomy, error) func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) { @@ -25,6 +26,8 @@ func DetectTaxonomyTarFormat(path string) (TaxonomyLoader, error) { func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) { + obiutils.RegisterOBIMimeType() + file, err := os.Open(path) if err != nil { return nil, err @@ -63,6 +66,28 @@ func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) { return LoadCSVTaxonomy, nil case "application/x-tar": return DetectTaxonomyTarFormat(path) + case "text/fasta": + return func(path string, onlysn bool) (*obitax.Taxonomy, error) { + input, err := ReadFastaFromFile(path) + + if err != nil { + return nil, err + } + _, data := input.Load() + + return data.ExtractTaxonomy(nil) + }, nil + case "text/fastq": + return func(path string, onlysn bool) (*obitax.Taxonomy, error) { + input, err := ReadFastqFromFile(path) + + if err != nil { + return nil, err + } + _, data := input.Load() + + return data.ExtractTaxonomy(nil) + }, nil } log.Fatalf("Detected file format: %s", mimetype.String()) @@ -71,7 +96,7 @@ func DetectTaxonomyFormat(path string) (TaxonomyLoader, error) { return nil, nil } -func LoadTaxonomy(path string, onlysn bool) (*Taxonomy, error) { +func LoadTaxonomy(path string, onlysn bool) (*obitax.Taxonomy, error) { loader, err := DetectTaxonomyFormat(path) if err != nil { diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index e8b08ef..0a09f89 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -3,11 +3,8 @@ package obiformats import ( "bufio" "bytes" - "encoding/csv" - "errors" "io" "path" - "regexp" "github.com/gabriel-vasile/mimetype" @@ -41,70 +38,7 @@ type SequenceReader func(reader io.Reader, options ...WithOption) (obiiter.IBioS // - io.Reader: A modified reader with the read data. // - error: Any error encountered during the process. func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { - csv := func(in []byte, limit uint32) bool { - in = dropLastLine(in, limit) - - br := bytes.NewReader(in) - r := csv.NewReader(br) - r.Comma = ',' - r.ReuseRecord = true - r.LazyQuotes = true - r.Comment = '#' - - lines := 0 - for { - _, err := r.Read() - if errors.Is(err, io.EOF) { - break - } - if err != nil { - return false - } - lines++ - } - - return r.FieldsPerRecord > 1 && lines > 1 - } - - fastaDetector := func(raw []byte, limit uint32) bool { - ok, err := regexp.Match("^>[^ ]", raw) - return ok && err == nil - } - - fastqDetector := func(raw []byte, limit uint32) bool { - ok, err := regexp.Match("^@[^ ].*\n[A-Za-z.-]+", raw) - return ok && err == nil - } - - ecoPCR2Detector := func(raw []byte, limit uint32) bool { - ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2")) - return ok - } - - genbankDetector := func(raw []byte, limit uint32) bool { - ok2 := bytes.HasPrefix(raw, []byte("LOCUS ")) - ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw) - return ok2 || (ok1 && err == nil) - } - - emblDetector := func(raw []byte, limit uint32) bool { - ok := bytes.HasPrefix(raw, []byte("ID ")) - return ok - } - - mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta") - mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq") - mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr") - mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq") - mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat") - mimetype.Lookup("text/plain").Extend(csv, "text/csv", ".csv") - - mimetype.Lookup("application/octet-stream").Extend(fastaDetector, "text/fasta", ".fasta") - mimetype.Lookup("application/octet-stream").Extend(fastqDetector, "text/fastq", ".fastq") - mimetype.Lookup("application/octet-stream").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr") - mimetype.Lookup("application/octet-stream").Extend(genbankDetector, "text/genbank", ".seq") - mimetype.Lookup("application/octet-stream").Extend(emblDetector, "text/embl", ".dat") - mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv") + obiutils.RegisterOBIMimeType() // Create a buffer to store the read data mimetype.SetLimit(1024 * 1024) diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index 1e5d56b..7e6f126 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -4,8 +4,10 @@ import ( "fmt" "os" "runtime" + "sync" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" log "github.com/sirupsen/logrus" "github.com/DavidGamba/go-getoptions" @@ -20,6 +22,8 @@ var _Pprof = false var _PprofMudex = 10 var _PprofGoroutine = 6060 +var __defaut_taxonomy_mutex__ sync.Mutex + type ArgumentParser func([]string) (*getoptions.GetOpt, []string) func GenerateOptionParser(program string, @@ -87,6 +91,22 @@ func GenerateOptionParser(program string, os.Exit(0) } + if options.Called("taxonomy") { + __defaut_taxonomy_mutex__.Lock() + defer __defaut_taxonomy_mutex__.Unlock() + taxonomy, err := obiformats.LoadTaxonomy( + obidefault.SelectedTaxonomy(), + !obidefault.AreAlternativeNamesSelected(), + ) + + if err != nil { + log.Fatalf("Cannot load default taxonomy: %v", err) + + } + + taxonomy.SetAsDefault() + } + log.SetLevel(log.InfoLevel) if options.Called("debug") { log.SetLevel(log.DebugLevel) diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index fe4d685..d86a527 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "7a9dc1a" +var _Commit = "d1c31c5" var _Version = "Release 4.4.0" // Version returns the version of the obitools package. diff --git a/pkg/obitax/csviterator.go b/pkg/obitax/csviterator.go deleted file mode 100644 index 2b07cf5..0000000 --- a/pkg/obitax/csviterator.go +++ /dev/null @@ -1,299 +0,0 @@ -package obitax - -import ( - "slices" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv" -) - -type __options__ struct { - batch_size int // Number of items to process in a batch - with_pattern bool - with_parent bool - with_path bool - with_rank bool - with_scientific_name bool - raw_taxid bool - with_metadata []string - source string // Source of the data -} - -// Options wraps the __options__ struct to provide a pointer to the options. -type Options struct { - pointer *__options__ // Pointer to the underlying options -} - -// WithOption is a function type that takes an Options parameter and modifies it. -type WithOption func(Options) - -// MakeOptions creates an Options instance with default settings and applies any provided setters. -// It returns the configured Options. -// -// Parameters: -// - setters: A slice of WithOption functions to customize the options. -// -// Returns: -// - An Options instance with the specified settings. -func MakeOptions(setters []WithOption) Options { - o := __options__{ - batch_size: obidefault.BatchSize(), // Number of items to process in a batch - with_pattern: true, - with_parent: false, - with_path: false, - with_rank: true, - with_scientific_name: false, - raw_taxid: false, - source: "unknown", - } - opt := Options{&o} - - for _, set := range setters { - set(opt) - } - - return opt -} - -// BatchSize returns the size of the batch to be processed. -// It retrieves the batch size from the underlying options. -func (o *Options) BatchSize() int { - return o.pointer.batch_size -} - -// WithPattern returns whether the pattern option is enabled. -// It retrieves the setting from the underlying options. -func (o *Options) WithPattern() bool { - return o.pointer.with_pattern -} - -// WithParent returns whether the parent option is enabled. -// It retrieves the setting from the underlying options. -func (o *Options) WithParent() bool { - return o.pointer.with_parent -} - -// WithPath returns whether the path option is enabled. -// It retrieves the setting from the underlying options. -func (o *Options) WithPath() bool { - return o.pointer.with_path -} - -// WithRank returns whether the rank option is enabled. -// It retrieves the setting from the underlying options. -func (o *Options) WithRank() bool { - return o.pointer.with_rank -} - -// WithScientificName returns whether the scientific name option is enabled. -// It retrieves the setting from the underlying options. -func (o *Options) WithScientificName() bool { - return o.pointer.with_scientific_name -} - -// RawTaxid returns whether the raw taxid option is enabled. -// It retrieves the setting from the underlying options. -func (o *Options) RawTaxid() bool { - return o.pointer.raw_taxid -} - -// Source returns the source of the data. -// It retrieves the source from the underlying options. -func (o *Options) Source() string { - return o.pointer.source -} - -// WithMetadata returns a slice of strings containing the metadata -// associated with the Options instance. It retrieves the metadata -// from the pointer's with_metadata field. -func (o *Options) WithMetadata() []string { - if o.WithPattern() { - idx := slices.Index(o.pointer.with_metadata, "query") - if idx >= 0 { - o.pointer.with_metadata = slices.Delete(o.pointer.with_metadata, idx, idx+1) - } - } - - return o.pointer.with_metadata -} - -// OptionsBatchSize returns a WithOption function that sets the batch_size option. -// Parameters: -// - size: An integer specifying the size of the batch to be processed. -func OptionsBatchSize(size int) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.batch_size = size - }) - - return f -} - -func OptionsWithPattern(value bool) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.with_pattern = value - }) - - return f -} - -func OptionsWithParent(value bool) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.with_parent = value - }) - - return f -} - -func OptionsWithPath(value bool) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.with_path = value - }) - - return f -} - -func OptionsWithRank(value bool) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.with_rank = value - }) - - return f -} - -func OptionsWithScientificName(value bool) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.with_scientific_name = value - }) - - return f -} - -func OptionsRawTaxid(value bool) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.raw_taxid = value - }) - - return f -} - -func OptionsSource(value string) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.source = value - }) - - return f -} - -func OptionsWithMetadata(values ...string) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.with_metadata = values - }) - return f -} - -func (iterator *ITaxon) CSVTaxaIterator(options ...WithOption) *obiitercsv.ICSVRecord { - - opt := MakeOptions(options) - metakeys := make([]string, 0) - - newIter := obiitercsv.NewICSVRecord() - - newIter.Add(1) - - batch_size := opt.BatchSize() - - if opt.WithPattern() { - newIter.AppendField("query") - opt.pointer.with_metadata = append(opt.pointer.with_metadata, "query") - } - - newIter.AppendField("taxid") - rawtaxid := opt.RawTaxid() - - if opt.WithParent() { - newIter.AppendField("parent") - } - - if opt.WithRank() { - newIter.AppendField("taxonomic_rank") - } - - if opt.WithScientificName() { - newIter.AppendField("scientific_name") - } - - if opt.WithMetadata() != nil { - metakeys = opt.WithMetadata() - for _, metadata := range metakeys { - newIter.AppendField(metadata) - } - } - - if opt.WithPath() { - newIter.AppendField("path") - } - - go func() { - newIter.WaitAndClose() - }() - - go func() { - o := 0 - data := make([]obiitercsv.CSVRecord, 0, batch_size) - for iterator.Next() { - - taxon := iterator.Get() - record := make(obiitercsv.CSVRecord) - - if opt.WithPattern() { - record["query"] = taxon.MetadataAsString("query") - } - - if rawtaxid { - record["taxid"] = *taxon.Node.Id() - } else { - record["taxid"] = taxon.String() - } - - if opt.WithParent() { - if rawtaxid { - record["parent"] = *taxon.Node.ParentId() - } else { - record["parent"] = taxon.Parent().String() - } - } - - if opt.WithRank() { - record["taxonomic_rank"] = taxon.Rank() - } - - if opt.WithScientificName() { - record["scientific_name"] = taxon.ScientificName() - } - - if opt.WithPath() { - record["path"] = taxon.Path().String() - } - - for _, key := range metakeys { - record[key] = taxon.MetadataAsString(key) - } - - data = append(data, record) - if len(data) >= batch_size { - newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data)) - data = make([]obiitercsv.CSVRecord, 0, batch_size) - o++ - } - - } - - if len(data) > 0 { - newIter.Push(obiitercsv.MakeCSVRecordBatch(opt.Source(), o, data)) - } - - newIter.Done() - }() - - return newIter -} diff --git a/pkg/obitax/csvtaxdump_write.go b/pkg/obitax/csvtaxdump_write.go deleted file mode 100644 index c2c6a1a..0000000 --- a/pkg/obitax/csvtaxdump_write.go +++ /dev/null @@ -1,38 +0,0 @@ -package obitax - -import ( - "strings" - - "github.com/TuftsBCB/io/newick" -) - -func (taxonomy *Taxonomy) Newick() string { - if taxonomy == nil { - return "" - } - - iterator := taxonomy.AsTaxonSet().Sort().Iterator() - - nodes := make(map[*string]*newick.Tree, taxonomy.Len()) - trees := make([]*newick.Tree, 0) - - for iterator.Next() { - taxon := iterator.Get() - tree := &newick.Tree{Label: taxon.String()} - nodes[taxon.Node.id] = tree - if parent, ok := nodes[taxon.Parent().Node.id]; ok { - parent.Children = append(parent.Children, *tree) - } else { - trees = append(trees, tree) - } - } - - rep := strings.Builder{} - - for _, tree := range trees { - rep.WriteString(tree.String()) - rep.WriteString("\n") - } - - return rep.String() -} diff --git a/pkg/obitax/default_taxonomy.go b/pkg/obitax/default_taxonomy.go index 219d321..4ddc9d3 100644 --- a/pkg/obitax/default_taxonomy.go +++ b/pkg/obitax/default_taxonomy.go @@ -3,7 +3,6 @@ package obitax import ( "sync" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" log "github.com/sirupsen/logrus" ) @@ -32,24 +31,5 @@ func IsDefaultTaxonomyDefined() bool { } func DefaultTaxonomy() *Taxonomy { - var err error - if __defaut_taxonomy__ == nil { - if obidefault.HasSelectedTaxonomy() { - __defaut_taxonomy_mutex__.Lock() - defer __defaut_taxonomy_mutex__.Unlock() - if __defaut_taxonomy__ == nil { - __defaut_taxonomy__, err = LoadTaxonomy( - obidefault.SelectedTaxonomy(), - !obidefault.AreAlternativeNamesSelected(), - ) - - if err != nil { - log.Fatalf("Cannot load default taxonomy: %v", err) - - } - } - } - } - return __defaut_taxonomy__ } diff --git a/pkg/obitax/iterator.go b/pkg/obitax/iterator.go index 42a8058..8c4d9c7 100644 --- a/pkg/obitax/iterator.go +++ b/pkg/obitax/iterator.go @@ -224,3 +224,9 @@ func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon { return taxon.ISubTaxonomy() } + +func (iterator *ITaxon) Consume() { + for iterator.Next() { + iterator.Get() + } +} diff --git a/pkg/obitax/newick_write.go b/pkg/obitax/newick_write.go deleted file mode 100644 index 9f4c3a6..0000000 --- a/pkg/obitax/newick_write.go +++ /dev/null @@ -1 +0,0 @@ -package obitax diff --git a/pkg/obitools/obitaxonomy/obitaxonomy.go b/pkg/obitools/obitaxonomy/obitaxonomy.go index da8dd1e..2ff330a 100644 --- a/pkg/obitools/obitaxonomy/obitaxonomy.go +++ b/pkg/obitools/obitaxonomy/obitaxonomy.go @@ -5,6 +5,7 @@ import ( "time" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -60,25 +61,64 @@ func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obiitercsv.ICSVRecord { return nil } - options := make([]obitax.WithOption, 0) + options := make([]obiformats.WithOption, 0) options = append(options, - obitax.OptionsWithPattern(CLIWithQuery()), - obitax.OptionsWithParent(CLIWithParent()), - obitax.OptionsWithRank(CLIWithRank()), - obitax.OptionsWithScientificName(CLIWithScientificName()), - obitax.OptionsWithPath(CLIWithPath()), - obitax.OptionsRawTaxid(obidefault.UseRawTaxids()), - obitax.OptionsSource(obidefault.SelectedTaxonomy()), + obiformats.OptionsWithPattern(CLIWithQuery()), + obiformats.OptionsWithParent(CLIWithParent()), + obiformats.OptionsWithRank(CLIWithRank()), + obiformats.OptionsWithScientificName(CLIWithScientificName()), + obiformats.OptionsWithPath(CLIWithPath()), + obiformats.OptionsRawTaxid(obidefault.UseRawTaxids()), + obiformats.OptionsSource(obidefault.SelectedTaxonomy()), ) - return iterator.CSVTaxaIterator(options...) + return obiformats.CSVTaxaIterator(iterator, options...) } func CLICSVTaxaWriter(iterator *obitax.ITaxon, terminalAction bool) *obiitercsv.ICSVRecord { return obicsv.CLICSVWriter(CLICSVTaxaIterator(iterator), terminalAction) } +func CLINewickWriter(iterator *obitax.ITaxon, + terminalAction bool) *obitax.ITaxon { + + var err error + var newIter *obitax.ITaxon + + options := make([]obiformats.WithOption, 0) + options = append(options, obiformats.OptionsCompressed(obidefault.CompressOutput()), + obiformats.OptionsWithRank(CLIWithRank()), + obiformats.OptionsWithScientificName(CLIWithScientificName()), + obiformats.OptionsWithTaxid(true), + ) + + filename := obiconvert.CLIOutPutFileName() + + if filename != "-" { + newIter, err = obiformats.WriteNewickToFile(iterator, filename, options...) + + if err != nil { + log.Fatalf("Cannot write to file : %+v", err) + } + + } else { + newIter, err = obiformats.WriteNewickToStdout(iterator, options...) + + if err != nil { + log.Fatalf("Cannot write to stdout : %+v", err) + } + + } + + if terminalAction { + newIter.Consume() + return nil + } + + return newIter +} + func CLIDownloadNCBITaxdump() error { now := time.Now() dateStr := now.Format("20060102") // In Go, this specific date is used as reference for formatting diff --git a/pkg/obitools/obitaxonomy/options.go b/pkg/obitools/obitaxonomy/options.go index 4a3079d..ccdc904 100644 --- a/pkg/obitools/obitaxonomy/options.go +++ b/pkg/obitools/obitaxonomy/options.go @@ -24,6 +24,7 @@ var __restrict_rank__ = "" var __to_dump__ = "" var __download_ncbi__ = false var __extract_taxonomy__ = false +var __newick__ = false func FilterTaxonomyOptionSet(options *getoptions.GetOpt) { options.BoolVar(&__rank_list__, "rank-list", false, @@ -77,6 +78,10 @@ func OptionSet(options *getoptions.GetOpt) { options.BoolVar(&__extract_taxonomy__, "extract-taxonomy", __extract_taxonomy__, options.Description("Extract taxonomy from a sequence file"), ) + options.BoolVar(&__newick__, "newick", __newick__, + options.Description("Format the resulting taxonomy as a newick tree"), + ) + } func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) { @@ -156,3 +161,7 @@ func CLIDownloadNCBI() bool { func CLIExtractTaxonomy() bool { return __extract_taxonomy__ } + +func CLIAsNewick() bool { + return __newick__ +} diff --git a/pkg/obiutils/mimetypes.go b/pkg/obiutils/mimetypes.go new file mode 100644 index 0000000..3192764 --- /dev/null +++ b/pkg/obiutils/mimetypes.go @@ -0,0 +1,95 @@ +package obiutils + +import ( + "bytes" + "encoding/csv" + "errors" + "io" + "regexp" + + "github.com/gabriel-vasile/mimetype" +) + +func DropLastLine(b []byte, readLimit uint32) []byte { + if readLimit == 0 || uint32(len(b)) < readLimit { + return b + } + for i := len(b) - 1; i > 0; i-- { + if b[i] == '\n' { + return b[:i] + } + } + return b +} + +var __obimimetype_registred__ = false + +func RegisterOBIMimeType() { + if !__obimimetype_registred__ { + csv := func(in []byte, limit uint32) bool { + in = DropLastLine(in, limit) + + br := bytes.NewReader(in) + r := csv.NewReader(br) + r.Comma = ',' + r.ReuseRecord = true + r.LazyQuotes = true + r.Comment = '#' + + lines := 0 + for { + _, err := r.Read() + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return false + } + lines++ + } + + return r.FieldsPerRecord > 1 && lines > 1 + } + + fastaDetector := func(raw []byte, limit uint32) bool { + ok, err := regexp.Match("^>[^ ]", raw) + return ok && err == nil + } + + fastqDetector := func(raw []byte, limit uint32) bool { + ok, err := regexp.Match("^@[^ ].*\n[A-Za-z.-]+", raw) + return ok && err == nil + } + + ecoPCR2Detector := func(raw []byte, limit uint32) bool { + ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2")) + return ok + } + + genbankDetector := func(raw []byte, limit uint32) bool { + ok2 := bytes.HasPrefix(raw, []byte("LOCUS ")) + ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw) + return ok2 || (ok1 && err == nil) + } + + emblDetector := func(raw []byte, limit uint32) bool { + ok := bytes.HasPrefix(raw, []byte("ID ")) + return ok + } + + mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta") + mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq") + mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr") + mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq") + mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat") + mimetype.Lookup("text/plain").Extend(csv, "text/csv", ".csv") + + mimetype.Lookup("application/octet-stream").Extend(fastaDetector, "text/fasta", ".fasta") + mimetype.Lookup("application/octet-stream").Extend(fastqDetector, "text/fastq", ".fastq") + mimetype.Lookup("application/octet-stream").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr") + mimetype.Lookup("application/octet-stream").Extend(genbankDetector, "text/genbank", ".seq") + mimetype.Lookup("application/octet-stream").Extend(emblDetector, "text/embl", ".dat") + mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv") + } + __obimimetype_registred__ = true +}