From 54a138196c4348a0c3964f1774b99d1fc934d233 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 21 Jun 2024 14:28:57 +0200 Subject: [PATCH] Patch a bug in fasta and fastq reading Former-commit-id: bcaa264b4c4a7c67617eb909b199176bf09913db --- pkg/obichunk/options.go | 16 +++-- pkg/obiformats/fastaseq_read.go | 9 ++- pkg/obiformats/fastqseq_read.go | 15 +++-- pkg/obiiter/merge.go | 4 +- pkg/obioptions/version.go | 2 +- pkg/obiseq/merge.go | 73 +++++++++++++++++------ pkg/obitax/lca.go | 2 +- pkg/obitools/obiclean/obiclean.go | 2 +- pkg/obitools/obiconsensus/obiconsensus.go | 6 +- 9 files changed, 85 insertions(+), 44 deletions(-) diff --git a/pkg/obichunk/options.go b/pkg/obichunk/options.go index 4d6a90d..7eb0114 100644 --- a/pkg/obichunk/options.go +++ b/pkg/obichunk/options.go @@ -1,9 +1,12 @@ package obichunk -import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) type __options__ struct { - statsOn []string + statsOn obiseq.StatsOnDescriptions categories []string navalue string cacheOnDisk bool @@ -21,7 +24,7 @@ type WithOption func(Options) func MakeOptions(setters []WithOption) Options { o := __options__{ - statsOn: make([]string, 0, 100), + statsOn: make(obiseq.StatsOnDescriptions, 10), categories: make([]string, 0, 100), navalue: "NA", cacheOnDisk: false, @@ -53,7 +56,7 @@ func (opt Options) PopCategories() string { return "" } -func (opt Options) StatsOn() []string { +func (opt Options) StatsOn() obiseq.StatsOnDescriptions { return opt.pointer.statsOn } @@ -114,7 +117,10 @@ func OptionNAValue(na string) WithOption { func OptionStatOn(keys ...string) WithOption { f := WithOption(func(opt Options) { - opt.pointer.statsOn = append(opt.pointer.categories, keys...) + for _, k := range keys { + d := obiseq.MakeStatsOnDescription(k) + opt.pointer.statsOn[d.Name] = d + } }) return f diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index e737101..b828b3d 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -50,13 +50,12 @@ func _ParseFastaFile(source string, var identifier string var definition string - state := 0 - - idBytes := new(bytes.Buffer) - defBytes := new(bytes.Buffer) - seqBytes := new(bytes.Buffer) + idBytes := bytes.Buffer{} + defBytes := bytes.Buffer{} + seqBytes := bytes.Buffer{} for chunks := range input { + state := 0 scanner := bufio.NewReader(chunks.raw) start, _ := scanner.Peek(20) if start[0] != '>' { diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index becef96..511fb1c 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -129,14 +129,13 @@ func _ParseFastqFile(source string, var identifier string var definition string - state := 0 - - idBytes := new(bytes.Buffer) - defBytes := new(bytes.Buffer) - qualBytes := new(bytes.Buffer) - seqBytes := new(bytes.Buffer) + idBytes := bytes.Buffer{} + defBytes := bytes.Buffer{} + qualBytes := bytes.Buffer{} + seqBytes := bytes.Buffer{} for chunks := range input { + state := 0 scanner := bufio.NewReader(chunks.raw) sequences := make(obiseq.BioSequenceSlice, 0, 100) previous := byte(0) @@ -257,7 +256,7 @@ func _ParseFastqFile(source string, } case 10: if is_end_of_line { - _storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift) + _storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift) if no_order { if len(sequences) == batch_size { @@ -286,7 +285,7 @@ func _ParseFastqFile(source string, if len(sequences) > 0 { if state == 10 { - _storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift) + _storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift) state = 1 } diff --git a/pkg/obiiter/merge.go b/pkg/obiiter/merge.go index d74c83a..7dea932 100644 --- a/pkg/obiiter/merge.go +++ b/pkg/obiiter/merge.go @@ -2,7 +2,7 @@ package obiiter import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" -func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequence { +func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) IBioSequence { batchsize := 100 if len(sizes) > 0 { @@ -36,7 +36,7 @@ func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, si return newIter } -func MergePipe(na string, statsOn []string, sizes ...int) Pipeable { +func MergePipe(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) Pipeable { f := func(iterator IBioSequence) IBioSequence { return iterator.IMergeSequenceBatch(na, statsOn, sizes...) } diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 46f03cd..cda2792 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "58bcc67" +var _Commit = "612868a" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index cfabb37..70824e4 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -11,6 +11,43 @@ import ( ) type StatsOnValues map[string]int +type StatsOnWeights func(sequence *BioSequence) int +type StatsOnDescription struct { + Name string + Key string + Weight StatsOnWeights +} +type StatsOnDescriptions map[string]StatsOnDescription + +func BioseqCount(sequence *BioSequence) int { + return sequence.Count() +} + +func MakeStatsOnDescription(descriptor string) StatsOnDescription { + parts := strings.SplitN(descriptor, ":", 2) + var ff StatsOnWeights + switch len(parts) { + case 1: + ff = func(s *BioSequence) int { + return s.Count() + } + + case 2: + ff = func(s *BioSequence) int { + v, ok := s.GetIntAttribute(parts[1]) + if !ok { + return 0 + } + return v + } + } + + return StatsOnDescription{ + Name: descriptor, + Key: parts[0], + Weight: ff, + } +} // StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute. // @@ -50,8 +87,8 @@ func (sequence *BioSequence) HasStatsOn(key string) bool { // // Return type: // - StatsOnValues -func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { - mkey := StatsOnSlotName(key) +func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOnValues { + mkey := StatsOnSlotName(desc.Name) annotations := sequence.Annotations() istat, ok := annotations[mkey] @@ -88,8 +125,8 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { newstat = true } - if newstat && sequence.StatsPlusOne(key, sequence, na) { - delete(sequence.Annotations(), key) + if newstat && sequence.StatsPlusOne(desc, sequence, na) { + delete(sequence.Annotations(), desc.Key) } return stats @@ -103,14 +140,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { // - na: the value to be used if the attribute is not present // Return type: // - bool -func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool { +func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSequence, na string) bool { sval := na annotations := sequence.Annotations() - stats := sequence.StatsOn(key, na) + stats := sequence.StatsOn(desc, na) retval := false if toAdd.HasAnnotation() { - value, ok := toAdd.Annotations()[key] + value, ok := toAdd.Annotations()[desc.Key] if ok { @@ -139,8 +176,8 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str if !ok { old = 0 } - stats[sval] = old + toAdd.Count() - annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary + stats[sval] = old + desc.Weight(toAdd) + annotations[StatsOnSlotName(desc.Name)] = stats // TODO: check if this is necessary return retval } @@ -170,7 +207,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues { // // Return type: // - *BioSequence: the merged sequence (BioSequence) -func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence { +func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn StatsOnDescriptions) *BioSequence { if !inplace { sequence = sequence.Copy() } @@ -183,14 +220,14 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool count := sequence.Count() + tomerge.Count() - for _, key := range statsOn { + for key, desc := range statsOn { if tomerge.HasStatsOn(key) { - smk := sequence.StatsOn(key, na) - mmk := tomerge.StatsOn(key, na) + smk := sequence.StatsOn(desc, na) + mmk := tomerge.StatsOn(desc, na) annotations[StatsOnSlotName(key)] = smk.Merge(mmk) } else { - sequence.StatsPlusOne(key, tomerge, na) + sequence.StatsPlusOne(desc, tomerge, na) } } @@ -237,19 +274,19 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool // // Return type: // - *BioSequence: the merged sequence (BioSequence) -func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence { +func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions) *BioSequence { seq := sequences[0] //sequences[0] = nil seq.SetQualities(nil) if len(sequences) == 1 { seq.Annotations()["count"] = seq.Count() - for _, v := range statsOn { - seq.StatsOn(v, na) + for _, desc := range statsOn { + seq.StatsOn(desc, na) } } else { for k, toMerge := range sequences[1:] { - seq.Merge(toMerge, na, true, statsOn...) + seq.Merge(toMerge, na, true, statsOn) toMerge.Recycle() sequences[1+k] = nil } diff --git a/pkg/obitax/lca.go b/pkg/obitax/lca.go index c54a6bf..84d6cea 100644 --- a/pkg/obitax/lca.go +++ b/pkg/obitax/lca.go @@ -43,7 +43,7 @@ func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) { } func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int { - taxids := sequence.StatsOn("taxid", "na") + taxids := sequence.StatsOn(obiseq.MakeStatsOnDescription("taxid"), "na") taxons := make(map[*TaxNode]int, len(taxids)) for k, v := range taxids { diff --git a/pkg/obitools/obiclean/obiclean.go b/pkg/obitools/obiclean/obiclean.go index 8ee93e7..de90af1 100644 --- a/pkg/obitools/obiclean/obiclean.go +++ b/pkg/obitools/obiclean/obiclean.go @@ -34,7 +34,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice, samples := make(map[string]*([]*seqPCR)) for _, s := range dataset { - stats := s.StatsOn(tag, NAValue) + stats := s.StatsOn(obiseq.MakeStatsOnDescription(tag), NAValue) for k, v := range stats { pcr, ok := samples[k] diff --git a/pkg/obitools/obiconsensus/obiconsensus.go b/pkg/obitools/obiconsensus/obiconsensus.go index 22e21d8..3ee3cad 100644 --- a/pkg/obitools/obiconsensus/obiconsensus.go +++ b/pkg/obitools/obiconsensus/obiconsensus.go @@ -129,7 +129,7 @@ func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func f := func(i int) float64 { - stats := (*seqs)[i].StatsOn(sample_key, "NA") + stats := (*seqs)[i].StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA") if value, ok := stats[sample]; ok { return float64(value) @@ -155,7 +155,7 @@ func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*o for _, s := range seqs { if s.HasStatsOn(sample_key) { - stats := s.StatsOn(sample_key, "NA") + stats := s.StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA") for k := range stats { if seqset, ok := samples[k]; ok { *seqset = append(*seqset, s) @@ -378,7 +378,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence { }() obiuniq.AddStatsOn(CLISampleAttribute()) - obiuniq.AddStatsOn("obiconsensus_weight") + obiuniq.AddStatsOn("sample:obiconsensus_weight") obiuniq.SetUniqueInMemory(false) obiuniq.SetNoSingleton(CLINoSingleton()) return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))