mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Patch a bug in fasta and fastq reading
Former-commit-id: bcaa264b4c4a7c67617eb909b199176bf09913db
This commit is contained in:
@ -1,9 +1,12 @@
|
|||||||
package obichunk
|
package obichunk
|
||||||
|
|
||||||
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
type __options__ struct {
|
type __options__ struct {
|
||||||
statsOn []string
|
statsOn obiseq.StatsOnDescriptions
|
||||||
categories []string
|
categories []string
|
||||||
navalue string
|
navalue string
|
||||||
cacheOnDisk bool
|
cacheOnDisk bool
|
||||||
@ -21,7 +24,7 @@ type WithOption func(Options)
|
|||||||
|
|
||||||
func MakeOptions(setters []WithOption) Options {
|
func MakeOptions(setters []WithOption) Options {
|
||||||
o := __options__{
|
o := __options__{
|
||||||
statsOn: make([]string, 0, 100),
|
statsOn: make(obiseq.StatsOnDescriptions, 10),
|
||||||
categories: make([]string, 0, 100),
|
categories: make([]string, 0, 100),
|
||||||
navalue: "NA",
|
navalue: "NA",
|
||||||
cacheOnDisk: false,
|
cacheOnDisk: false,
|
||||||
@ -53,7 +56,7 @@ func (opt Options) PopCategories() string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
func (opt Options) StatsOn() []string {
|
func (opt Options) StatsOn() obiseq.StatsOnDescriptions {
|
||||||
return opt.pointer.statsOn
|
return opt.pointer.statsOn
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,7 +117,10 @@ func OptionNAValue(na string) WithOption {
|
|||||||
|
|
||||||
func OptionStatOn(keys ...string) WithOption {
|
func OptionStatOn(keys ...string) WithOption {
|
||||||
f := WithOption(func(opt Options) {
|
f := WithOption(func(opt Options) {
|
||||||
opt.pointer.statsOn = append(opt.pointer.categories, keys...)
|
for _, k := range keys {
|
||||||
|
d := obiseq.MakeStatsOnDescription(k)
|
||||||
|
opt.pointer.statsOn[d.Name] = d
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
return f
|
return f
|
||||||
|
@ -50,13 +50,12 @@ func _ParseFastaFile(source string,
|
|||||||
var identifier string
|
var identifier string
|
||||||
var definition string
|
var definition string
|
||||||
|
|
||||||
state := 0
|
idBytes := bytes.Buffer{}
|
||||||
|
defBytes := bytes.Buffer{}
|
||||||
idBytes := new(bytes.Buffer)
|
seqBytes := bytes.Buffer{}
|
||||||
defBytes := new(bytes.Buffer)
|
|
||||||
seqBytes := new(bytes.Buffer)
|
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
|
state := 0
|
||||||
scanner := bufio.NewReader(chunks.raw)
|
scanner := bufio.NewReader(chunks.raw)
|
||||||
start, _ := scanner.Peek(20)
|
start, _ := scanner.Peek(20)
|
||||||
if start[0] != '>' {
|
if start[0] != '>' {
|
||||||
|
@ -129,14 +129,13 @@ func _ParseFastqFile(source string,
|
|||||||
var identifier string
|
var identifier string
|
||||||
var definition string
|
var definition string
|
||||||
|
|
||||||
state := 0
|
idBytes := bytes.Buffer{}
|
||||||
|
defBytes := bytes.Buffer{}
|
||||||
idBytes := new(bytes.Buffer)
|
qualBytes := bytes.Buffer{}
|
||||||
defBytes := new(bytes.Buffer)
|
seqBytes := bytes.Buffer{}
|
||||||
qualBytes := new(bytes.Buffer)
|
|
||||||
seqBytes := new(bytes.Buffer)
|
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
|
state := 0
|
||||||
scanner := bufio.NewReader(chunks.raw)
|
scanner := bufio.NewReader(chunks.raw)
|
||||||
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
||||||
previous := byte(0)
|
previous := byte(0)
|
||||||
@ -257,7 +256,7 @@ func _ParseFastqFile(source string,
|
|||||||
}
|
}
|
||||||
case 10:
|
case 10:
|
||||||
if is_end_of_line {
|
if is_end_of_line {
|
||||||
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
|
_storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||||
|
|
||||||
if no_order {
|
if no_order {
|
||||||
if len(sequences) == batch_size {
|
if len(sequences) == batch_size {
|
||||||
@ -286,7 +285,7 @@ func _ParseFastqFile(source string,
|
|||||||
|
|
||||||
if len(sequences) > 0 {
|
if len(sequences) > 0 {
|
||||||
if state == 10 {
|
if state == 10 {
|
||||||
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
|
_storeSequenceQuality(&qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||||
state = 1
|
state = 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ package obiiter
|
|||||||
|
|
||||||
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
|
||||||
func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequence {
|
func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) IBioSequence {
|
||||||
batchsize := 100
|
batchsize := 100
|
||||||
|
|
||||||
if len(sizes) > 0 {
|
if len(sizes) > 0 {
|
||||||
@ -36,7 +36,7 @@ func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, si
|
|||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
func MergePipe(na string, statsOn []string, sizes ...int) Pipeable {
|
func MergePipe(na string, statsOn obiseq.StatsOnDescriptions, sizes ...int) Pipeable {
|
||||||
f := func(iterator IBioSequence) IBioSequence {
|
f := func(iterator IBioSequence) IBioSequence {
|
||||||
return iterator.IMergeSequenceBatch(na, statsOn, sizes...)
|
return iterator.IMergeSequenceBatch(na, statsOn, sizes...)
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ import (
|
|||||||
// TODO: The version number is extracted from git. As a result, the version
|
// TODO: The version number is extracted from git. As a result, the version
|
||||||
// corresponds to the last commit, and not to the commit in which this file
|
// corresponds to the last commit, and not to the commit in which this file
|
||||||
// will be committed.
|
// will be committed.
|
||||||
var _Commit = "58bcc67"
|
var _Commit = "612868a"
|
||||||
var _Version = "Release 4.2.0"
|
var _Version = "Release 4.2.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
@ -11,6 +11,43 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type StatsOnValues map[string]int
|
type StatsOnValues map[string]int
|
||||||
|
// StatsOnWeights is the signature of a function computing the integer weight
// that a sequence contributes to a statistic.
type StatsOnWeights func(sequence *BioSequence) int
|
||||||
|
// StatsOnDescription describes one statistic collected while merging sequences.
type StatsOnDescription struct {
|
||||||
|
// Name is the full descriptor the description was built from; it is the
// value passed to StatsOnSlotName to locate the statistics slot.
Name string
|
||||||
|
// Key is the annotation key whose value is looked up on each sequence.
Key string
|
||||||
|
// Weight returns the amount added to the statistic for a given sequence.
Weight StatsOnWeights
|
||||||
|
}
|
||||||
|
// StatsOnDescriptions is a set of StatsOnDescription values indexed by a
// string key (callers in this change index it by the description's Name).
type StatsOnDescriptions map[string]StatsOnDescription
|
||||||
|
|
||||||
|
func BioseqCount(sequence *BioSequence) int {
|
||||||
|
return sequence.Count()
|
||||||
|
}
|
||||||
|
|
||||||
|
func MakeStatsOnDescription(descriptor string) StatsOnDescription {
|
||||||
|
parts := strings.SplitN(descriptor, ":", 2)
|
||||||
|
var ff StatsOnWeights
|
||||||
|
switch len(parts) {
|
||||||
|
case 1:
|
||||||
|
ff = func(s *BioSequence) int {
|
||||||
|
return s.Count()
|
||||||
|
}
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
ff = func(s *BioSequence) int {
|
||||||
|
v, ok := s.GetIntAttribute(parts[1])
|
||||||
|
if !ok {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return StatsOnDescription{
|
||||||
|
Name: descriptor,
|
||||||
|
Key: parts[0],
|
||||||
|
Weight: ff,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute.
|
// StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute.
|
||||||
//
|
//
|
||||||
@ -50,8 +87,8 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
|
|||||||
//
|
//
|
||||||
// Return type:
|
// Return type:
|
||||||
// - StatsOnValues
|
// - StatsOnValues
|
||||||
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOnValues {
|
||||||
mkey := StatsOnSlotName(key)
|
mkey := StatsOnSlotName(desc.Name)
|
||||||
annotations := sequence.Annotations()
|
annotations := sequence.Annotations()
|
||||||
istat, ok := annotations[mkey]
|
istat, ok := annotations[mkey]
|
||||||
|
|
||||||
@ -88,8 +125,8 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
|||||||
newstat = true
|
newstat = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if newstat && sequence.StatsPlusOne(key, sequence, na) {
|
if newstat && sequence.StatsPlusOne(desc, sequence, na) {
|
||||||
delete(sequence.Annotations(), key)
|
delete(sequence.Annotations(), desc.Key)
|
||||||
}
|
}
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
@ -103,14 +140,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
|||||||
// - na: the value to be used if the attribute is not present
|
// - na: the value to be used if the attribute is not present
|
||||||
// Return type:
|
// Return type:
|
||||||
// - bool
|
// - bool
|
||||||
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool {
|
func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSequence, na string) bool {
|
||||||
sval := na
|
sval := na
|
||||||
annotations := sequence.Annotations()
|
annotations := sequence.Annotations()
|
||||||
stats := sequence.StatsOn(key, na)
|
stats := sequence.StatsOn(desc, na)
|
||||||
retval := false
|
retval := false
|
||||||
|
|
||||||
if toAdd.HasAnnotation() {
|
if toAdd.HasAnnotation() {
|
||||||
value, ok := toAdd.Annotations()[key]
|
value, ok := toAdd.Annotations()[desc.Key]
|
||||||
|
|
||||||
if ok {
|
if ok {
|
||||||
|
|
||||||
@ -139,8 +176,8 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str
|
|||||||
if !ok {
|
if !ok {
|
||||||
old = 0
|
old = 0
|
||||||
}
|
}
|
||||||
stats[sval] = old + toAdd.Count()
|
stats[sval] = old + desc.Weight(toAdd)
|
||||||
annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary
|
annotations[StatsOnSlotName(desc.Name)] = stats // TODO: check if this is necessary
|
||||||
return retval
|
return retval
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -170,7 +207,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
|
|||||||
//
|
//
|
||||||
// Return type:
|
// Return type:
|
||||||
// - *BioSequence: the merged sequence (BioSequence)
|
// - *BioSequence: the merged sequence (BioSequence)
|
||||||
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence {
|
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn StatsOnDescriptions) *BioSequence {
|
||||||
if !inplace {
|
if !inplace {
|
||||||
sequence = sequence.Copy()
|
sequence = sequence.Copy()
|
||||||
}
|
}
|
||||||
@ -183,14 +220,14 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
|
|||||||
|
|
||||||
count := sequence.Count() + tomerge.Count()
|
count := sequence.Count() + tomerge.Count()
|
||||||
|
|
||||||
for _, key := range statsOn {
|
for key, desc := range statsOn {
|
||||||
if tomerge.HasStatsOn(key) {
|
if tomerge.HasStatsOn(key) {
|
||||||
smk := sequence.StatsOn(key, na)
|
smk := sequence.StatsOn(desc, na)
|
||||||
mmk := tomerge.StatsOn(key, na)
|
mmk := tomerge.StatsOn(desc, na)
|
||||||
|
|
||||||
annotations[StatsOnSlotName(key)] = smk.Merge(mmk)
|
annotations[StatsOnSlotName(key)] = smk.Merge(mmk)
|
||||||
} else {
|
} else {
|
||||||
sequence.StatsPlusOne(key, tomerge, na)
|
sequence.StatsPlusOne(desc, tomerge, na)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -237,19 +274,19 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
|
|||||||
//
|
//
|
||||||
// Return type:
|
// Return type:
|
||||||
// - *BioSequence: the merged sequence (BioSequence)
|
// - *BioSequence: the merged sequence (BioSequence)
|
||||||
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence {
|
func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions) *BioSequence {
|
||||||
seq := sequences[0]
|
seq := sequences[0]
|
||||||
//sequences[0] = nil
|
//sequences[0] = nil
|
||||||
seq.SetQualities(nil)
|
seq.SetQualities(nil)
|
||||||
|
|
||||||
if len(sequences) == 1 {
|
if len(sequences) == 1 {
|
||||||
seq.Annotations()["count"] = seq.Count()
|
seq.Annotations()["count"] = seq.Count()
|
||||||
for _, v := range statsOn {
|
for _, desc := range statsOn {
|
||||||
seq.StatsOn(v, na)
|
seq.StatsOn(desc, na)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for k, toMerge := range sequences[1:] {
|
for k, toMerge := range sequences[1:] {
|
||||||
seq.Merge(toMerge, na, true, statsOn...)
|
seq.Merge(toMerge, na, true, statsOn)
|
||||||
toMerge.Recycle()
|
toMerge.Recycle()
|
||||||
sequences[1+k] = nil
|
sequences[1+k] = nil
|
||||||
}
|
}
|
||||||
|
@ -43,7 +43,7 @@ func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int {
|
func (taxonomy *Taxonomy) TaxonomicDistribution(sequence *obiseq.BioSequence) map[*TaxNode]int {
|
||||||
taxids := sequence.StatsOn("taxid", "na")
|
taxids := sequence.StatsOn(obiseq.MakeStatsOnDescription("taxid"), "na")
|
||||||
taxons := make(map[*TaxNode]int, len(taxids))
|
taxons := make(map[*TaxNode]int, len(taxids))
|
||||||
|
|
||||||
for k, v := range taxids {
|
for k, v := range taxids {
|
||||||
|
@ -34,7 +34,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
|||||||
samples := make(map[string]*([]*seqPCR))
|
samples := make(map[string]*([]*seqPCR))
|
||||||
|
|
||||||
for _, s := range dataset {
|
for _, s := range dataset {
|
||||||
stats := s.StatsOn(tag, NAValue)
|
stats := s.StatsOn(obiseq.MakeStatsOnDescription(tag), NAValue)
|
||||||
|
|
||||||
for k, v := range stats {
|
for k, v := range stats {
|
||||||
pcr, ok := samples[k]
|
pcr, ok := samples[k]
|
||||||
|
@ -129,7 +129,7 @@ func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func
|
|||||||
|
|
||||||
f := func(i int) float64 {
|
f := func(i int) float64 {
|
||||||
|
|
||||||
stats := (*seqs)[i].StatsOn(sample_key, "NA")
|
stats := (*seqs)[i].StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
|
||||||
|
|
||||||
if value, ok := stats[sample]; ok {
|
if value, ok := stats[sample]; ok {
|
||||||
return float64(value)
|
return float64(value)
|
||||||
@ -155,7 +155,7 @@ func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*o
|
|||||||
|
|
||||||
for _, s := range seqs {
|
for _, s := range seqs {
|
||||||
if s.HasStatsOn(sample_key) {
|
if s.HasStatsOn(sample_key) {
|
||||||
stats := s.StatsOn(sample_key, "NA")
|
stats := s.StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
|
||||||
for k := range stats {
|
for k := range stats {
|
||||||
if seqset, ok := samples[k]; ok {
|
if seqset, ok := samples[k]; ok {
|
||||||
*seqset = append(*seqset, s)
|
*seqset = append(*seqset, s)
|
||||||
@ -378,7 +378,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
}()
|
}()
|
||||||
|
|
||||||
obiuniq.AddStatsOn(CLISampleAttribute())
|
obiuniq.AddStatsOn(CLISampleAttribute())
|
||||||
obiuniq.AddStatsOn("obiconsensus_weight")
|
obiuniq.AddStatsOn("sample:obiconsensus_weight")
|
||||||
obiuniq.SetUniqueInMemory(false)
|
obiuniq.SetUniqueInMemory(false)
|
||||||
obiuniq.SetNoSingleton(CLINoSingleton())
|
obiuniq.SetNoSingleton(CLINoSingleton())
|
||||||
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
|
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
|
||||||
|
Reference in New Issue
Block a user