From 23758b00f6f118874b6d0bfacb9aa3f4df7c2b52 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 31 Jan 2024 15:43:02 +0100 Subject: [PATCH] Patch a bug in the embl reader and adds some doc Former-commit-id: 9b5f75fb14bcc3043da1647055279987a295d271 --- Release-notes.md | 1 + pkg/obiformats/embl_read.go | 3 +- pkg/obiiter/batchiterator.go | 7 +++ pkg/obiseq/revcomp.go | 5 +++ pkg/obitax/bioseq_classifier.go | 64 ++++++++++++++++++++++++++++ pkg/obitax/path.go | 28 ++++++++++-- pkg/obitools/obilandmark/taxostat.go | 1 + 7 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 pkg/obitax/bioseq_classifier.go create mode 100644 pkg/obitools/obilandmark/taxostat.go diff --git a/Release-notes.md b/Release-notes.md index 7d0f624..c14d102 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -36,6 +36,7 @@ - In `obimultiplex`, correct a bug leading to a miss-read of the ngsfilter file when tags where written in lower case. - In `obitag`, correct a bug leading to the annotation by taxid 1 (root) all the sequences having a 100% match with one the reference sequence. +- Correct a bug in the EMBL reader. ## November 16th, 2023. Release 4.1.0 diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index 31642e7..1754208 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -133,7 +133,8 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ } case strings.HasPrefix(line, " "): parts := strings.SplitN(line[5:], " ", 7) - for i := 0; i < 6; i++ { + np := len(parts) + for i := 0; i < np; i++ { seqBytes.WriteString(parts[i]) } case line == "//": diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index a629a19..3ab8793 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -762,3 +762,10 @@ func IBatchOver(data obiseq.BioSequenceSlice, } return newIter } + +// func IBatchOverClasses(data obiseq.BioSequenceSlice, +// classifier *obiseq.BioSequenceClassifier) IBioSequence { + +// newIter := MakeIBioSequence() +// classMap := make(map[string]int) +// } diff --git a/pkg/obiseq/revcomp.go b/pkg/obiseq/revcomp.go index 35b9e79..7530192 100644 --- a/pkg/obiseq/revcomp.go +++ b/pkg/obiseq/revcomp.go @@ -99,6 +99,11 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence { return sequence } +/** +* ReverseComplementWorker is a function that returns a SeqWorker which performs reverse complement operation on given BioSequence. +* @param inplace {bool}: If true, changes will be made to original sequence object else new sequence object will be created. Default value is false. +* @returns {SeqWorker} A function that accepts *BioSequence and returns its reversed-complement form. + */ func ReverseComplementWorker(inplace bool) SeqWorker { f := func(input *BioSequence) *BioSequence { return input.ReverseComplement(inplace) diff --git a/pkg/obitax/bioseq_classifier.go b/pkg/obitax/bioseq_classifier.go new file mode 100644 index 0000000..fd786bd --- /dev/null +++ b/pkg/obitax/bioseq_classifier.go @@ -0,0 +1,64 @@ +package obitax + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + log "github.com/sirupsen/logrus" +) + +// TaxonomyClassifier is a function that creates a new instance of the BioSequenceClassifier +// for taxonomic classification based on a given taxonomic rank, taxonomy, and abort flag. +// +// Parameters: +// - taxonomicRank: the taxonomic rank to classify the sequences at. +// - taxonomy: the taxonomy object used for classification. +// - abortOnMissing: a flag indicating whether to abort if a taxon is missing in the taxonomy. +// +// Return: +// - *obiseq.BioSequenceClassifier: the new instance of the BioSequenceClassifier. +func TaxonomyClassifier(taxonomicRank string, + taxonomy *Taxonomy, + abortOnMissing bool) *obiseq.BioSequenceClassifier { + + code := func(sequence *obiseq.BioSequence) int { + taxid := sequence.Taxid() + taxon, err := taxonomy.Taxon(taxid) + if err == nil { + taxon = taxon.TaxonAtRank(taxonomicRank) + } else { + taxon = nil + } + if taxon == nil { + if abortOnMissing { + if err != nil { + log.Fatalf("Taxid %d not found in taxonomy", taxid) + } else { + log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %d", taxonomicRank, taxid) + } + + } + + return 0 + } + return taxon.Taxid() + } + + value := func(k int) string { + taxon, _ := taxonomy.Taxon(k) + return taxon.ScientificName() + } + + reset := func() { + } + + clone := func() *obiseq.BioSequenceClassifier { + return TaxonomyClassifier(taxonomicRank, taxonomy, abortOnMissing) + } + + c := obiseq.BioSequenceClassifier{ + Code: code, + Value: value, + Reset: reset, + Clone: clone, + Type: "TaxonomyClassifier"} + return &c +} diff --git a/pkg/obitax/path.go b/pkg/obitax/path.go index ea69d96..543ef68 100644 --- a/pkg/obitax/path.go +++ b/pkg/obitax/path.go @@ -6,6 +6,11 @@ import ( log "github.com/sirupsen/logrus" ) +// Path generates the lineage path from the current taxon up to the root. +// +// This method does not take parameters as it is called on a TaxNode receiver. +// It returns a pointer to a TaxonSlice containing the path and an error if +// the taxonomy needs reindexing. func (taxon *TaxNode) Path() (*TaxonSlice, error) { path := make(TaxonSlice, 0, 30) @@ -24,6 +29,18 @@ func (taxon *TaxNode) Path() (*TaxonSlice, error) { return &path, nil } +// TaxonAtRank traverses up the taxonomy tree starting from the current +// node until it finds a node that matches the specified rank. +// +// If a node with the given rank is not found in the path to the root, +// or if the taxonomy tree is not properly indexed (i.e., a node's parent +// is itself), the function will return nil. In case the taxonomy needs +// reindexing, the function will panic. +// +// rank: the taxonomic rank to search for (e.g., "species", "genus"). +// +// Returns a pointer to a TaxNode representing the node at the +// specified rank, or nil if no such node exists in the path. func (taxon *TaxNode) TaxonAtRank(rank string) *TaxNode { for taxon.rank != rank && taxon != taxon.pparent { taxon = taxon.pparent @@ -33,13 +50,20 @@ func (taxon *TaxNode) TaxonAtRank(rank string) *TaxNode { } } - if taxon == taxon.pparent && taxon.rank != rank { + if taxon == taxon.pparent && taxon.rank != rank { taxon = nil } return taxon } +// Species retrieves the TaxNode corresponding to the species rank. +// +// This method does not take any parameters. It is a convenience +// wrapper around the TaxonAtRank method, specifically retrieving +// the species-level taxonomic classification for the calling TaxNode. +// +// Returns a pointer to the TaxNode representing the species. func (taxon *TaxNode) Species() *TaxNode { return taxon.TaxonAtRank("species") } @@ -52,8 +76,6 @@ func (taxon *TaxNode) Family() *TaxNode { return taxon.TaxonAtRank("family") } -// Returns a TaxonSet listing the requested taxon and all -// its ancestors in the taxonomy down to the root. func (taxonomy *Taxonomy) Path(taxid int) (*TaxonSlice, error) { taxon, err := taxonomy.Taxon(taxid) diff --git a/pkg/obitools/obilandmark/taxostat.go b/pkg/obitools/obilandmark/taxostat.go new file mode 100644 index 0000000..02eb1fe --- /dev/null +++ b/pkg/obitools/obilandmark/taxostat.go @@ -0,0 +1 @@ +package obilandmark