Patch a bug in fasta and fastq reading

Former-commit-id: bcaa264b4c4a7c67617eb909b199176bf09913db
This commit is contained in:
Eric Coissac
2024-06-21 14:28:57 +02:00
parent 818ce87bab
commit 54a138196c
9 changed files with 85 additions and 44 deletions

View File

@ -1,9 +1,12 @@
package obichunk package obichunk
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
type __options__ struct { type __options__ struct {
statsOn []string statsOn obiseq.StatsOnDescriptions
categories []string categories []string
navalue string navalue string
cacheOnDisk bool cacheOnDisk bool
@ -21,7 +24,7 @@ type WithOption func(Options)
func MakeOptions(setters []WithOption) Options { func MakeOptions(setters []WithOption) Options {
o := __options__{ o := __options__{
statsOn: make([]string, 0, 100), statsOn: make(obiseq.StatsOnDescriptions, 10),
categories: make([]string, 0, 100), categories: make([]string, 0, 100),
navalue: "NA", navalue: "NA",
cacheOnDisk: false, cacheOnDisk: false,
@ -53,7 +56,7 @@ func (opt Options) PopCategories() string {
return "" return ""
} }
func (opt Options) StatsOn() []string { func (opt Options) StatsOn() obiseq.StatsOnDescriptions {
return opt.pointer.statsOn return opt.pointer.statsOn
} }
@ -114,7 +117,10 @@ func OptionNAValue(na string) WithOption {
func OptionStatOn(keys ...string) WithOption { func OptionStatOn(keys ...string) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
opt.pointer.statsOn = append(opt.pointer.categories, keys...) for _, k := range keys {
d := obiseq.MakeStatsOnDescription(k)
opt.pointer.statsOn[d.Name] = d
}
}) })
return f return f

View File

@ -50,13 +50,12 @@ func _ParseFastaFile(source string,
var identifier string var identifier string
var definition string var definition string
state := 0 idBytes := bytes.Buffer{}
defBytes := bytes.Buffer{}
idBytes := new(bytes.Buffer) seqBytes := bytes.Buffer{}
defBytes := new(bytes.Buffer)
seqBytes := new(bytes.Buffer)
for chunks := range input { for chunks := range input {
state := 0
scanner := bufio.NewReader(chunks.raw) scanner := bufio.NewReader(chunks.raw)
start, _ := scanner.Peek(20) start, _ := scanner.Peek(20)
if start[0] != '>' { if start[0] != '>' {

View File

@ -129,14 +129,13 @@ func _ParseFastqFile(source string,
var identifier string var identifier string
var definition string var definition string
state := 0 idBytes := bytes.Buffer{}
defBytes := bytes.Buffer{}
idBytes := new(bytes.Buffer) qualBytes := bytes.Buffer{}
defBytes := new(bytes.Buffer) seqBytes := bytes.Buffer{}
qualBytes := new(bytes.Buffer)
seqBytes := new(bytes.Buffer)
for chunks := range input { for chunks := range input {
state := 0
scanner := bufio.NewReader(chunks.raw) scanner := bufio.NewReader(chunks.raw)
sequences := make(obiseq.BioSequenceSlice, 0, 100) sequences := make(obiseq.BioSequenceSlice, 0, 100)
previous := byte(0) previous := byte(0)
@ -257,7 +256,7 @@ func _ParseFastqFile(source string,
} }
case 10: case 10:
if is_end_of_line { if is_end_of_line {
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift) _storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
if no_order { if no_order {
if len(sequences) == batch_size { if len(sequences) == batch_size {
@ -286,7 +285,7 @@ func _ParseFastqFile(source string,
if len(sequences) > 0 { if len(sequences) > 0 {
if state == 10 { if state == 10 {
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift) _storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
state = 1 state = 1
} }

View File

@ -2,7 +2,7 @@ package obiiter
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequence { func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) IBioSequence {
batchsize := 100 batchsize := 100
if len(sizes) > 0 { if len(sizes) > 0 {
@ -36,7 +36,7 @@ func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, si
return newIter return newIter
} }
func MergePipe(na string, statsOn []string, sizes ...int) Pipeable { func MergePipe(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) Pipeable {
f := func(iterator IBioSequence) IBioSequence { f := func(iterator IBioSequence) IBioSequence {
return iterator.IMergeSequenceBatch(na, statsOn, sizes...) return iterator.IMergeSequenceBatch(na, statsOn, sizes...)
} }

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This means that the version // TODO: The version number is extracted from git. This means that the version
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// committed // committed
var _Commit = "58bcc67" var _Commit = "612868a"
var _Version = "Release 4.2.0" var _Version = "Release 4.2.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

View File

@ -11,6 +11,43 @@ import (
) )
type StatsOnValues map[string]int type StatsOnValues map[string]int
// StatsOnWeights is a function that computes an integer weight from a
// sequence. StatsPlusOne calls it (through StatsOnDescription.Weight) to
// obtain the amount added to a statistics entry for that sequence.
type StatsOnWeights func(sequence *BioSequence) int
// StatsOnDescription bundles what is needed to accumulate occurrence
// statistics on one attribute. Values are built by MakeStatsOnDescription.
type StatsOnDescription struct {
// Name is the full descriptor string; it is passed to StatsOnSlotName to
// derive the annotation slot holding the statistics.
Name string
// Key is the annotation key whose value is counted (the part of the
// descriptor before the first ":").
Key string
// Weight returns how much a given sequence adds to the counter of its
// attribute value.
Weight StatsOnWeights
}
// StatsOnDescriptions is a set of StatsOnDescription values indexed by a
// string key. NOTE(review): the callers visible here index it by the
// description's Name — confirm this holds for every producer.
type StatsOnDescriptions map[string]StatsOnDescription
// BioseqCount returns the value reported by the sequence's Count method.
// Its signature matches StatsOnWeights.
func BioseqCount(sequence *BioSequence) int {
	count := sequence.Count()
	return count
}
// MakeStatsOnDescription builds a StatsOnDescription from a textual
// descriptor of the form "key" or "key:weightAttribute".
//
// The descriptor is split on its first ":" only. The part before it becomes
// Key, and the complete descriptor is kept as Name.
//
//   - "key": each sequence weighs its Count().
//   - "key:weightAttribute": each sequence weighs the integer value returned
//     by GetIntAttribute(weightAttribute), or 0 when that call reports the
//     attribute as unavailable.
func MakeStatsOnDescription(descriptor string) StatsOnDescription {
	// SplitN with a limit of 2 yields either one or two fields.
	fields := strings.SplitN(descriptor, ":", 2)

	var weight StatsOnWeights
	if len(fields) == 2 {
		attribute := fields[1]
		weight = func(s *BioSequence) int {
			if value, found := s.GetIntAttribute(attribute); found {
				return value
			}
			return 0
		}
	} else {
		weight = func(s *BioSequence) int {
			return s.Count()
		}
	}

	return StatsOnDescription{
		Name:   descriptor,
		Key:    fields[0],
		Weight: weight,
	}
}
// StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute. // StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute.
// //
@ -50,8 +87,8 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
// //
// Return type: // Return type:
// - StatsOnValues // - StatsOnValues
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOnValues {
mkey := StatsOnSlotName(key) mkey := StatsOnSlotName(desc.Name)
annotations := sequence.Annotations() annotations := sequence.Annotations()
istat, ok := annotations[mkey] istat, ok := annotations[mkey]
@ -88,8 +125,8 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
newstat = true newstat = true
} }
if newstat && sequence.StatsPlusOne(key, sequence, na) { if newstat && sequence.StatsPlusOne(desc, sequence, na) {
delete(sequence.Annotations(), key) delete(sequence.Annotations(), desc.Key)
} }
return stats return stats
@ -103,14 +140,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
// - na: the value to be used if the attribute is not present // - na: the value to be used if the attribute is not present
// Return type: // Return type:
// - bool // - bool
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool { func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSequence, na string) bool {
sval := na sval := na
annotations := sequence.Annotations() annotations := sequence.Annotations()
stats := sequence.StatsOn(key, na) stats := sequence.StatsOn(desc, na)
retval := false retval := false
if toAdd.HasAnnotation() { if toAdd.HasAnnotation() {
value, ok := toAdd.Annotations()[key] value, ok := toAdd.Annotations()[desc.Key]
if ok { if ok {
@ -139,8 +176,8 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str
if !ok { if !ok {
old = 0 old = 0
} }
stats[sval] = old + toAdd.Count() stats[sval] = old + desc.Weight(toAdd)
annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary annotations[StatsOnSlotName(desc.Name)] = stats // TODO: check if this is necessary
return retval return retval
} }
@ -170,7 +207,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
// //
// Return type: // Return type:
// - *BioSequence: the merged sequence (BioSequence) // - *BioSequence: the merged sequence (BioSequence)
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence { func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn StatsOnDescriptions) *BioSequence {
if !inplace { if !inplace {
sequence = sequence.Copy() sequence = sequence.Copy()
} }
@ -183,14 +220,14 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
count := sequence.Count() + tomerge.Count() count := sequence.Count() + tomerge.Count()
for _, key := range statsOn { for key, desc := range statsOn {
if tomerge.HasStatsOn(key) { if tomerge.HasStatsOn(key) {
smk := sequence.StatsOn(key, na) smk := sequence.StatsOn(desc, na)
mmk := tomerge.StatsOn(key, na) mmk := tomerge.StatsOn(desc, na)
annotations[StatsOnSlotName(key)] = smk.Merge(mmk) annotations[StatsOnSlotName(key)] = smk.Merge(mmk)
} else { } else {
sequence.StatsPlusOne(key, tomerge, na) sequence.StatsPlusOne(desc, tomerge, na)
} }
} }
@ -237,19 +274,19 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
// //
// Return type: // Return type:
// - *BioSequence: the merged sequence (BioSequence) // - *BioSequence: the merged sequence (BioSequence)
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence { func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions) *BioSequence {
seq := sequences[0] seq := sequences[0]
//sequences[0] = nil //sequences[0] = nil
seq.SetQualities(nil) seq.SetQualities(nil)
if len(sequences) == 1 { if len(sequences) == 1 {
seq.Annotations()["count"] = seq.Count() seq.Annotations()["count"] = seq.Count()
for _, v := range statsOn { for _, desc := range statsOn {
seq.StatsOn(v, na) seq.StatsOn(desc, na)
} }
} else { } else {
for k, toMerge := range sequences[1:] { for k, toMerge := range sequences[1:] {
seq.Merge(toMerge, na, true, statsOn...) seq.Merge(toMerge, na, true, statsOn)
toMerge.Recycle() toMerge.Recycle()
sequences[1+k] = nil sequences[1+k] = nil
} }

View File

@ -43,7 +43,7 @@ func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) {
} }
func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int { func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int {
taxids := sequence.StatsOn("taxid", "na") taxids := sequence.StatsOn(obiseq.MakeStatsOnDescription("taxid"), "na")
taxons := make(map[*TaxNode]int, len(taxids)) taxons := make(map[*TaxNode]int, len(taxids))
for k, v := range taxids { for k, v := range taxids {

View File

@ -34,7 +34,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
samples := make(map[string]*([]*seqPCR)) samples := make(map[string]*([]*seqPCR))
for _, s := range dataset { for _, s := range dataset {
stats := s.StatsOn(tag, NAValue) stats := s.StatsOn(obiseq.MakeStatsOnDescription(tag), NAValue)
for k, v := range stats { for k, v := range stats {
pcr, ok := samples[k] pcr, ok := samples[k]

View File

@ -129,7 +129,7 @@ func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func
f := func(i int) float64 { f := func(i int) float64 {
stats := (*seqs)[i].StatsOn(sample_key, "NA") stats := (*seqs)[i].StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
if value, ok := stats[sample]; ok { if value, ok := stats[sample]; ok {
return float64(value) return float64(value)
@ -155,7 +155,7 @@ func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*o
for _, s := range seqs { for _, s := range seqs {
if s.HasStatsOn(sample_key) { if s.HasStatsOn(sample_key) {
stats := s.StatsOn(sample_key, "NA") stats := s.StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
for k := range stats { for k := range stats {
if seqset, ok := samples[k]; ok { if seqset, ok := samples[k]; ok {
*seqset = append(*seqset, s) *seqset = append(*seqset, s)
@ -378,7 +378,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
}() }()
obiuniq.AddStatsOn(CLISampleAttribute()) obiuniq.AddStatsOn(CLISampleAttribute())
obiuniq.AddStatsOn("obiconsensus_weight") obiuniq.AddStatsOn("sample:obiconsensus_weight")
obiuniq.SetUniqueInMemory(false) obiuniq.SetUniqueInMemory(false)
obiuniq.SetNoSingleton(CLINoSingleton()) obiuniq.SetNoSingleton(CLINoSingleton())
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false)) return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))